In [54]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the dataset
data = pd.read_csv("C:\\Users\\HP\\Desktop\\labs\\nnv_final_dmpa\\cleaned_data_1.csv")

# Feature Engineering
data['total price'] = data['pack_price'] * data['unit_price']

# Select relevant features for training
numeric_features = ['line_item_quantity', 'unit_price', 'pack_price', 'line_item_value', 'line_item_insurance', 'weight', 'total price']

# Convert non-numeric values to NaN in numeric columns
data[numeric_features] = data[numeric_features].apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values
data = data.dropna(subset=numeric_features)

# Scaling features
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data[numeric_features])

# Train the One-Class SVM model
model = OneClassSVM(nu=0.05) 
model.fit(data_scaled)

# Predict anomalies on the entire dataset
predictions = model.predict(data_scaled)

# Convert predictions to 0 (normal) and 1 (anomaly)
predictions[predictions == 1] = 0  # Predicted normal
predictions[predictions == -1] = 1  # Predicted anomaly

# Generate synthetic labels based on One-Class SVM predictions
data['predicted_labels'] = predictions

# Assuming anomalies are instances where the predicted label is 1
data['true_labels'] = 0  # Initialize all instances as normal
data.loc[data['predicted_labels'] == 1, 'true_labels'] = 1  # Assign 1 to instances predicted as anomalies

# Evaluate the model
# accuracy = accuracy_score(data['true_labels'], data['predicted_labels'])
precision = precision_score(data['true_labels'], data['predicted_labels'])
recall = recall_score(data['true_labels'], data['predicted_labels'])
f1 = f1_score(data['true_labels'], data['predicted_labels'])

# print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Count the number of anomalies (outliers) based on predicted labels
anomaly_count = len(data[data['predicted_labels'] == 1])
print(f"Number of anomalies: {anomaly_count}")

# Display anomalies
anomalies = data[data['true_labels'] == 1]
print("Anomalies:")
print(anomalies)

Precision: 1.00
Recall: 1.00
F1 Score: 1.00
Number of anomalies: 227
Anomalies:
         id  line_item_quantity  unit_price  pack_price  line_item_value  \
72     1179                   8        0.23       76.72           613.76   
82     1283               95500        0.03        1.51        144205.00   
84     1298               70000        0.12        7.50        525000.00   
97     1519                 100        0.55      183.33         18333.00   
169    2685               60784        0.56       16.75       1018132.00   
...     ...                 ...         ...         ...              ...   
7277  85985                  26        0.14        8.58           223.08   
7383  86140               21497        0.16       19.66        422631.02   
7646  86480               24470        0.25       29.68        726269.60   
7724  86579               69963        0.43       12.96        906720.48   
7773  86653              169447        0.06        3.75        635426.25   

      l

In [55]:
# from sklearn.metrics import silhouette_score
# # Calculate the Silhouette Score
# silhouette_avg = silhouette_score(data_scaled, predictions)

# print(f"Silhouette Score: {silhouette_avg:.2f}")
