In [19]:
import time
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

# Benchmark: time how long each of three unsupervised outlier detectors
# (Isolation Forest, LOF, One-Class SVM) takes to fit the same standardized
# feature matrix, then print the three durations.

# Load the dataset.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
data = pd.read_csv("C:\\Users\\HP\\Desktop\\labs\\nnv_final_dmpa\\cleaned_data_1.csv")

# Columns fed to every detector
features_for_anomaly_detection = ['line_item_quantity', 'unit_price', 'pack_price', 'line_item_value', 'line_item_insurance', 'weight']

# Keep only the selected columns
data_subset = data[features_for_anomaly_detection]

# Coerce anything non-numeric to NaN, then drop every row that contains a NaN
data_subset = data_subset.apply(pd.to_numeric, errors='coerce').dropna()

# Standardize the features; all three detectors are fitted on this same matrix
scaler = StandardScaler()
data_standardized = scaler.fit_transform(data_subset)

# Isolation Forest.
# random_state is fixed so the fitted forest is the same on every run; the value
# matches the seed used for train_test_split elsewhere in this notebook.
# time.perf_counter() is used instead of time.time() because it is the clock
# intended for measuring short durations.
start_time = time.perf_counter()
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(data_standardized)
iso_forest_time = time.perf_counter() - start_time

# Local Outlier Factor (100 neighbours, 5% contamination)
start_time = time.perf_counter()
lof = LocalOutlierFactor(n_neighbors=100, contamination=0.05)
lof.fit(data_standardized)
lof_time = time.perf_counter() - start_time

# One-Class SVM (nu=0.05)
start_time = time.perf_counter()
svm = OneClassSVM(nu=0.05)
svm.fit(data_standardized)
svm_time = time.perf_counter() - start_time

# Report the fit time of each detector
print(f"Isolation Forest Time: {iso_forest_time:.4f} seconds")
print(f"LOF Time: {lof_time:.4f} seconds")
print(f"One-Class SVM Time: {svm_time:.4f} seconds")


import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
import time

# Benchmark: time Z-score outlier flagging on a train/test split of the
# selected features, then print the duration.

# Load the dataset.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
data = pd.read_csv("C:\\Users\\HP\\Desktop\\labs\\nnv_final_dmpa\\cleaned_data_1.csv")

# Columns used for the Z-score check
features_for_anomaly_detection = ['line_item_quantity', 'unit_price', 'pack_price', 'line_item_value', 'line_item_insurance', 'weight']

# Keep only the selected columns
data_subset = data[features_for_anomaly_detection]

# Coerce anything non-numeric to NaN, then drop every row that contains a NaN
data_subset = data_subset.apply(pd.to_numeric, errors='coerce').dropna()

# Split into training and testing sets (5% hold-out, fixed seed).
# Each part is copied so that adding the 'is_anomaly' column below writes to an
# independent frame rather than to a selection of data_subset, which can raise
# pandas' SettingWithCopyWarning.
X_train, X_test = train_test_split(data_subset, test_size=0.05, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

# Start the timer; perf_counter() is the clock intended for short durations
start_time = time.perf_counter()

# Absolute Z-scores of the training rows (scipy's zscore uses ddof=0 by default)
z_scores_train = np.abs(zscore(X_train))

# A row is an outlier when any feature lies more than `threshold` standard
# deviations from the mean
threshold = 3
outliers_train = (z_scores_train > threshold).any(axis=1)

# Training mean/std, taken before 'is_anomaly' is added so that only the feature
# columns contribute. ddof=0 matches the convention zscore() used above.
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0, ddof=0)

# Add an 'is_anomaly' column (0/1) to the training set
X_train['is_anomaly'] = outliers_train.astype(int)

# Score the test rows against the TRAINING statistics. Standardizing the small
# hold-out with its own mean/std would judge it against a different threshold
# than the training data.
z_scores_test = np.abs((X_test - train_mean) / train_std)

# Identify outliers in the testing set
outliers_test = (z_scores_test > threshold).any(axis=1)

# Add an 'is_anomaly' column (0/1) to the testing set
X_test['is_anomaly'] = outliers_test.astype(int)

# Stop the timer
end_time = time.perf_counter()

# NOTE(review): y_true and y_pred are the same Z-score flags, so any metric
# computed from this pair would be trivially perfect. A real evaluation needs
# independent ground-truth labels. Both names are kept in case later cells
# read them.
y_true = X_test['is_anomaly']
y_pred = X_test['is_anomaly']

# Report the elapsed time
print(f"Z-Score: {end_time - start_time:.4f} seconds")

Isolation Forest Time: 0.2088 seconds
LOF Time: 0.1863 seconds
One-Class SVM Time: 0.0639 seconds
Z-Score: 0.0099 seconds


In [20]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
import time

# Benchmark: time four outlier-detection approaches on the same features --
# fitting Isolation Forest, LOF and One-Class SVM on the standardized matrix,
# and Z-score flagging on a train/test split -- then print all four durations.

# Load the dataset.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
data = pd.read_csv("C:\\Users\\HP\\Desktop\\labs\\nnv_final_dmpa\\cleaned_data_1.csv")

# Columns used by every method
features_for_anomaly_detection = ['line_item_quantity', 'unit_price', 'pack_price', 'line_item_value', 'line_item_insurance', 'weight']

# Keep only the selected columns
data_subset = data[features_for_anomaly_detection]

# Coerce anything non-numeric to NaN, then drop every row that contains a NaN
data_subset = data_subset.apply(pd.to_numeric, errors='coerce').dropna()

# Standardize the features for the three model-based detectors
scaler = StandardScaler()
data_standardized = scaler.fit_transform(data_subset)

# Isolation Forest.
# random_state is fixed so the fitted forest is the same on every run; the value
# matches the seed used for train_test_split below.
# time.perf_counter() is used for all timings because it is the clock intended
# for measuring short durations.
start_time_iso_forest = time.perf_counter()
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(data_standardized)
iso_forest_time = time.perf_counter() - start_time_iso_forest

# Local Outlier Factor (100 neighbours, 5% contamination)
start_time_lof = time.perf_counter()
lof = LocalOutlierFactor(n_neighbors=100, contamination=0.05)
lof.fit(data_standardized)
lof_time = time.perf_counter() - start_time_lof

# One-Class SVM (nu=0.05)
start_time_svm = time.perf_counter()
svm = OneClassSVM(nu=0.05)
svm.fit(data_standardized)
svm_time = time.perf_counter() - start_time_svm

# Split the (unstandardized) features for the Z-score method (5% hold-out,
# fixed seed). Each part is copied so that adding the 'is_anomaly' column below
# writes to an independent frame rather than to a selection of data_subset,
# which can raise pandas' SettingWithCopyWarning.
X_train, X_test = train_test_split(data_subset, test_size=0.05, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

# Start the Z-score timer
start_time_zscore = time.perf_counter()

# Absolute Z-scores of the training rows (scipy's zscore uses ddof=0 by default)
z_scores_train = np.abs(zscore(X_train))

# A row is an outlier when any feature lies more than `threshold` standard
# deviations from the mean
threshold = 3
outliers_train = (z_scores_train > threshold).any(axis=1)

# Training mean/std, taken before 'is_anomaly' is added so that only the feature
# columns contribute. ddof=0 matches the convention zscore() used above.
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0, ddof=0)

# Add an 'is_anomaly' column (0/1) to the training set
X_train['is_anomaly'] = outliers_train.astype(int)

# Score the test rows against the TRAINING statistics. Standardizing the small
# hold-out with its own mean/std would judge it against a different threshold
# than the training data.
z_scores_test = np.abs((X_test - train_mean) / train_std)

# Identify outliers in the testing set
outliers_test = (z_scores_test > threshold).any(axis=1)

# Add an 'is_anomaly' column (0/1) to the testing set
X_test['is_anomaly'] = outliers_test.astype(int)

# Stop the Z-score timer
end_time_zscore = time.perf_counter()

# Report the time taken by each method
print(f"Isolation Forest Time: {iso_forest_time:.4f} seconds")
print(f"LOF Time: {lof_time:.4f} seconds")
print(f"One-Class SVM Time: {svm_time:.4f} seconds")
print(f"Z-Score Time: {end_time_zscore-start_time_zscore:.4f} seconds")


Isolation Forest Time: 0.2268 seconds
LOF Time: 0.1942 seconds
One-Class SVM Time: 0.0647 seconds
Z-Score Time: 0.0160 seconds
