In [20]:
# The code generates synthetic data, applies three different anomaly detection techniques (OC-SVM, Isolation Forest, LOF), 
# and evaluates their performance using various metrics. 
# Keep in mind that the choice of parameters and the quality of the synthetic data can impact the results,
# and in practice, real-world data should be used for a more accurate assessment of these techniques.

In [21]:
#Importing Libraries

In [22]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, auc


In [23]:
# Generating Synthetic DataWe create a synthetic dataset using make_blobs. 
# This dataset contains two clusters, and we introduce 30 outlier points manually.

X, y = make_blobs(n_samples=300, centers=2, random_state=42, cluster_std=1.0)
outliers = np.random.uniform(low=-10, high=10, size=(30, 2))
X = np.vstack([X, outliers])
y = np.hstack([y, [-1] * len(outliers)])


In [24]:
# Splitting Data
# We split the data into training (X_train, y_train) and testing (X_test, y_test) sets.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [25]:
# One-class SVM (OC-SVM)
# We create an OC-SVM model with a contamination rate of 5% 
# (meaning we expect 5% of data to be outliers). We train it on the training data and make predictions on the test data.

ocsvm = OneClassSVM(nu=0.05)
ocsvm.fit(X_train)
y_pred_ocsvm = ocsvm.predict(X_test)


In [26]:
# Isolation Forest
iforest = IsolationForest(contamination=0.05, random_state=42)
iforest.fit(X_train)
y_pred_iforest = iforest.predict(X_test)


In [27]:
# We create an Isolation Forest model with the same contamination rate as OC-SVM. 
# We fit it to the training data and make predictions on the test data.
iforest = IsolationForest(contamination=0.05, random_state=42)
iforest.fit(X_train)
y_pred_iforest = iforest.predict(X_test)


In [28]:
# Local Outlier Factor (LOF)
# We create an LOF model with a contamination rate of 5% and set 
# the number of neighbors to consider as 20. We fit it to the test data and predict anomalies.
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
y_pred_lof = lof.fit_predict(X_test)


In [29]:
# Evaluation Metrics:
#     We define a function evaluate to assess model performance.
#    We print the confusion matrix (although it may not be very informative for unsupervised anomaly detection).
#    We display a classification report showing precision, recall, F1-score, and support for each class 
#    (anomaly and non-anomaly).
#    We calculate the AUC-ROC score and plot the ROC curve if applicable.

def evaluate(y_true, y_pred):
    # Confusion Matrix
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

    # Classification Report
    print("\nClassification Report:\n", classification_report(y_true, y_pred))

    # AUC-ROC
    try:
        fpr, tpr, thresholds = roc_curve(y_true, y_pred)
        roc_auc = auc(fpr, tpr)
        print(f"\nAUC-ROC: {roc_auc}")
        # Plot ROC curve
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
        # Other ROC curve settings
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        plt.show()
    except ValueError:
        pass


In [30]:
#Model Evaluation

print("One-class SVM Results:")
evaluate(y_test, y_pred_ocsvm)

print("\nIsolation Forest Results:")
evaluate(y_test, y_pred_iforest)

print("\nLocal Outlier Factor (LOF) Results:")
evaluate(y_test, y_pred_lof)


One-class SVM Results:
Confusion Matrix:
 [[ 4  0  3]
 [ 0  0 27]
 [ 0  0 32]]

Classification Report:
               precision    recall  f1-score   support

          -1       1.00      0.57      0.73         7
           0       0.00      0.00      0.00        27
           1       0.52      1.00      0.68        32

    accuracy                           0.55        66
   macro avg       0.51      0.52      0.47        66
weighted avg       0.36      0.55      0.41        66


Isolation Forest Results:
Confusion Matrix:
 [[ 5  0  2]
 [ 0  0 27]
 [ 0  0 32]]

Classification Report:
               precision    recall  f1-score   support

          -1       1.00      0.71      0.83         7
           0       0.00      0.00      0.00        27
           1       0.52      1.00      0.69        32

    accuracy                           0.56        66
   macro avg       0.51      0.57      0.51        66
weighted avg       0.36      0.56      0.42        66


Local Outlier Factor (LOF

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
i see a