<a href="https://colab.research.google.com/github/Mide478/DeepLearning-LatentSpace-StabilityEvaluation/blob/master/Varying_Noise_Ratio_Experiments_for_LOF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Run the rest of the notebook from this Google Drive folder.
# NOTE(review): presumably this is where anomaly_ls_utils.py lives and Drive
# must already be mounted for the path to exist - confirm.
COLAB_WORKDIR = '/content/drive/MyDrive/Colab Notebooks'
os.chdir(COLAB_WORKDIR)

In [None]:
pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [None]:
# -*- coding: utf-8 -*-
"""
Created on Mon Apr  15 14:13:51 2024

@author: jlh7233
@author: aom478
"""
#%% Import required libraries

import numpy as np
import pandas as pd
import random as rand
import copy
import pickle
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import *
from torchvision import datasets
from torchvision import transforms
import anomaly_ls_utils as als

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from scipy.stats import gaussian_kde


from sklearn.metrics import silhouette_score, silhouette_samples, confusion_matrix
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold

In [None]:
# Generate one dataset per noise ratio while holding the noise mean and
# standard deviation of the anomalous points fixed, so the methodology can be
# checked against an increasing share of anomalies.

# Define the parameters
mean_noise = 1
stdev_noise = 3
seed = 78703
# Seed every random number generator this notebook imports. Which generator
# als.generate_dataset draws from is not visible here, so seeding torch alone
# may not be enough to make the generated data reproducible.
torch.manual_seed(seed)
np.random.seed(seed)
rand.seed(seed)
n_obs = 3000
dim = 4
initial_noise_fraction = 14 / n_obs  # Initial fraction of noise points
final_noise_fraction = 0.1  # Target fraction of noise points
offset = [0.90, 1.1]

# Create a dict to store the generated datasets, keyed by
# (mean_noise, stdev_noise, n_noise).
# NOTE: this name shadows `from torchvision import datasets` from the import
# cell; it is kept because the evaluation cell reads `datasets` by this name.
datasets = {}

# Iterate through increasing fractions of noise points
for noise_fraction in np.linspace(initial_noise_fraction, final_noise_fraction, num=25):
    # Number of noise points based on the fraction. The float product can land
    # a hair below the intended integer (e.g. 13.999999999999998), which a bare
    # int() would truncate to one point too few, so snap to 6 decimals first.
    n_noise = int(np.round(noise_fraction * n_obs, 6))

    # Generate the dataset
    X, X_abn, abn_dataset, X_abnds, scaler_data = als.generate_dataset(
        n_rows=n_obs,
        dim=dim,
        n_noise=n_noise,
        mean_noise=mean_noise,
        stdev_noise=stdev_noise,
        offset=offset,
    )

    # Store the dataset in the dictionary
    datasets[(mean_noise, stdev_noise, n_noise)] = {
        'X': X,
        'X_abn': X_abn,
        'abn_dataset': abn_dataset,
        'X_abnds': X_abnds,
        'scaler_data': scaler_data
    }

In [None]:
# Score Local Outlier Factor (LOF) on every generated dataset and collect one
# result row per dataset.
f1_scores_list = []

# Iterate through each dataset. The key holds the settings the dataset was
# generated with; unpacking it rebinds the notebook-level mean_noise,
# stdev_noise and n_noise on every iteration.
for (mean_noise, stdev_noise, n_noise), dataset_info in datasets.items():
    X_abnds = dataset_info['X_abnds']
    X = dataset_info['X']

    # Extract real outliers and inliers: the first X.shape[0] rows of X_abnds
    # are taken as inliers and every row after them as outliers.
    # NOTE(review): assumes generate_dataset appends the noise rows after the
    # clean rows - confirm against anomaly_ls_utils.
    n_clean = X.shape[0]
    real_outliers = X_abnds[n_clean::]
    real_inliers = X_abnds[0:n_clean, :]

    # Run PCA: project onto two components before outlier detection
    pca = PCA(n_components=2)
    real_pca = pca.fit_transform(X_abnds)

    # Run Local Outlier Factor (LOF). fit_predict fits the model itself, so a
    # separate fit() call beforehand would only repeat the same work.
    LOF = LocalOutlierFactor(n_neighbors=200, contamination='auto')
    LOF_pred = LOF.fit_predict(real_pca)  # 1 = inlier, -1 = outlier
    LOF_inlier_indices = np.where(LOF_pred == 1)[0]
    LOF_outlier_indices = np.where(LOF_pred == -1)[0]
    LOF_outliers = X_abnds[LOF_outlier_indices]
    LOF_inliers = X_abnds[LOF_inlier_indices]

    # Compute error labels (the fifth return value is not used here)
    TP, TN, FN, FP, _ = als.error_labels(data_array=X_abnds, real_outliers=real_outliers, pred_outliers=LOF_outliers,
                                         real_inliers=real_inliers, pred_inliers=LOF_inliers)

    # Compute metrics. accuracy_score and far_score are computed but not
    # stored in the results table; only the F1 score is recorded below.
    accuracy_score = (TP + TN) / (TP + TN + FP + FN)
    far_score = FP / (TN + FP)
    precision_score = als.precision(TP, FP)
    recall_score = als.recall(TP, FN)
    f1_score = als.f1(precision_score, recall_score)

    # Append f1 score to list. n_noise is recorded as well because it is the
    # only setting that differs between datasets in this experiment; without
    # it the rows cannot be matched to a noise level.
    f1_scores_list.append({
        'mean_noise': mean_noise,
        'stdev_noise': stdev_noise,
        'n_noise': n_noise,
        'f1_score_LOF': f1_score,
    })

# Convert the list of dictionaries to a DataFrame
f1_scores_df_LOF = pd.DataFrame(f1_scores_list)

# Save the DataFrame to a CSV file
f1_scores_df_LOF.to_csv('f1_scores_LOF_Different_N.csv', index=False)

In [None]:
# Display the last five rows of the LOF results table
f1_scores_df_LOF.tail(n=5)

Unnamed: 0,mean_noise,stdev_noise,f1_score_LOF
20,1,3,0.694497
21,1,3,0.69708
22,1,3,0.689408
23,1,3,0.690846
24,1,3,0.690117
