Preprocesarea datelor

In [112]:
from pathlib import Path
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import time

# Read the raw data set and discard the "Time" column before any splitting.
data = pd.read_csv(Path("./data/creditcard.csv"))
dataWithoutTime = data.drop(columns="Time")

train_ratio = 0.75
validation_ratio = 0.25

# splitterTest separates the hold-out fraction from the training rows;
# splitterVal afterwards divides that hold-out in half (test / validation).
splitterVal = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
splitterTest = StratifiedShuffleSplit(n_splits=1, test_size=validation_ratio, random_state=42)

train_index, val_index = next(splitterTest.split(dataWithoutTime, dataWithoutTime['Class']))
train_data = dataWithoutTime.iloc[train_index]
val_data = dataWithoutTime.iloc[val_index]

# Only Class == 0 rows stay in the training set; the Class == 1 rows of the
# training split are appended to the hold-out set instead.
normal_train_data = train_data[train_data.Class == 0]
outlier_train_data = train_data[train_data.Class == 1]
val_data = pd.concat([val_data, outlier_train_data], ignore_index=True)

# Stratified 50/50 split of the enlarged hold-out set.
test_index, val_index = next(splitterVal.split(val_data, val_data['Class']))
test_data = val_data.iloc[test_index]
val_data = val_data.iloc[val_index]

test_labels = test_data['Class'].to_numpy()
val_labels = val_data['Class'].to_numpy()

# Strip the label column and switch to plain NumPy feature matrices.
train_data = normal_train_data.drop(columns="Class").to_numpy()
test_data = test_data.drop(columns="Class").to_numpy()
val_data = val_data.drop(columns="Class").to_numpy()

# The scaler is fitted on the training matrix only and then applied
# unchanged to the validation and test matrices.
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
val_data = scaler.transform(val_data)
test_data = scaler.transform(test_data)


Salvam modelul deja antrenat

In [113]:
# Pickle files in which the fitted models are cached (one file per model).
OCSVM_MODEL_PATH = 'ocsvm.pkl'
GMM_MODEL_PATH = 'gmm.pkl'
KDE_MODEL_PATH = 'kde.pkl'

import pickle
def save_model(model, filename):
    """Serialize ``model`` with pickle into the file ``filename``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(model, sink)
        
def load_model(filename, model_name):
    """Return the model cached in ``filename``, training it first if needed.

    If ``filename`` exists it is unpickled and returned as-is; ``model_name``
    is not consulted in that case. Otherwise a new estimator is created,
    fitted on the module-level ``train_data`` and pickled to ``filename`` so
    that the next call with the same path hits the cache.

    Parameters
    ----------
    filename : str or path-like
        Location of the pickle cache for this model.
    model_name : str
        ``'OCSVM'`` builds a ``OneClassSVM``, ``'GMM'`` a ``GaussianMixture``;
        any other value falls back to a ``KernelDensity`` estimator.

    Returns
    -------
    The unpickled object, or the freshly fitted estimator.
    """
    import os.path

    if os.path.isfile(filename):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load model files that were written by this notebook.
        with open(filename, 'rb') as f:
            return pickle.load(f)

    # No cache yet: choose the estimator matching the requested name.
    if model_name == 'OCSVM':
        from sklearn.svm import OneClassSVM
        model = OneClassSVM(kernel='rbf', gamma=0.01, nu=0.0001)
    elif model_name == 'GMM':
        from sklearn.mixture import GaussianMixture
        model = GaussianMixture(n_components=4, random_state=0)
    else:
        from sklearn.neighbors import KernelDensity
        model = KernelDensity(bandwidth=9, kernel='tophat')

    model.fit(train_data)

    # Persist under the path that was probed above. Saving to a fixed
    # per-model constant instead would leave ``filename`` missing whenever the
    # two differ, forcing a retrain on every call.
    save_model(model, filename)

    return model

Incarcam modelele / Antrenam daca nu sunt deja antrenate

In [114]:
# Each call returns the pickled model when its cache file exists and
# otherwise trains the model first (see load_model).
ocsvm, gmm, kde = (
    load_model(OCSVM_MODEL_PATH, 'OCSVM'),
    load_model(GMM_MODEL_PATH, 'GMM'),
    load_model(KDE_MODEL_PATH, 'KDE'),
)

Prezicem pentru 300 de puncte din setul de test

In [115]:
def process_predictions(model_name, test_predictions, quantile=0.01):
    """Convert raw model output into 0 (normal) / 1 (anomaly) labels.

    Parameters
    ----------
    model_name : str
        ``'OCSVM'`` means ``test_predictions`` holds +1 / -1 predictions,
        which are remapped to 0 / 1. Any other value means it holds scores;
        entries at or below the ``quantile`` of the scores become 1 and
        entries above it become 0.
    test_predictions : array-like
        Raw predictions or scores. The input is never modified.
    quantile : float, optional
        Fraction of the lowest scores to flag as anomalies. Ignored for
        ``'OCSVM'``. Defaults to 0.01.

    Returns
    -------
    numpy.ndarray
        New array with the shape and dtype of the input, holding 0 / 1.
    """
    # Work on a private copy so the caller's array is left untouched.
    labels = np.array(test_predictions, copy=True)

    if model_name == 'OCSVM':
        # Compute both masks before writing so the remapped values cannot
        # be matched a second time.
        inliers = labels == 1
        outliers = labels == -1
        labels[inliers] = 0
        labels[outliers] = 1
        return labels

    if labels.size == 0:
        # A quantile of no data is undefined; there is nothing to label.
        return labels

    threshold = np.quantile(labels, quantile)

    # Masks are taken from the raw scores before any overwrite happens.
    anomalous = labels <= threshold
    normal = labels > threshold
    labels[anomalous] = 1
    labels[normal] = 0

    return labels

Comparam timpul pentru prezicere luand cate un punct in parte

In [116]:
import time

def print_anomaly_message(pred):
    """Print the predicted class for one label: 1 is an anomaly, anything else is normal."""
    message = 'Prezis Anomalie' if pred == 1 else 'Prezis Normal'
    print(message)

# Accumulated per-model prediction time, in seconds.
total_time_ocsvm = 0
total_time_gmm = 0
total_time_kde = 0

N = 300
np.random.seed(2)
# NOTE(review): np.random.choice draws with replacement by default, so the
# same test row may be sampled more than once - confirm this is intended.
sampled_idx = np.random.choice(len(test_data), N)
true_labels = test_labels[sampled_idx]
sampled_points = test_data[sampled_idx]
ocsvm_preds = []
gmm_preds = []
kde_preds = []
for point in sampled_points:

    # The models expect a 2-D (n_samples, n_features) input, so a single
    # point is reshaped into one row.
    p = point.reshape(1, -1)

    # time.perf_counter() is the high-resolution monotonic clock meant for
    # measuring short durations; time.time() is a wall clock whose resolution
    # can be too coarse for intervals this small.
    start_ts = time.perf_counter()
    ocsvm_preds.append(ocsvm.predict(p))
    end_ts = time.perf_counter()
    print(f"OCSVM [s]: {(end_ts-start_ts):.3f}")
    total_time_ocsvm += end_ts-start_ts

    start_ts = time.perf_counter()
    gmm_preds.append(gmm.score_samples(p))
    end_ts = time.perf_counter()
    print(f"GMM [s]: {(end_ts-start_ts):.3f}")
    total_time_gmm += end_ts-start_ts

    start_ts = time.perf_counter()
    kde_preds.append(kde.score_samples(p))
    end_ts = time.perf_counter()
    print(f"KDE [s]: {(end_ts-start_ts):.3f}")
    total_time_kde += end_ts-start_ts

    print('----------------------------------')

OCSVM [s]: 0.000
GMM [s]: 0.000
KDE [s]: 0.028
----------------------------------
OCSVM [s]: 0.000
GMM [s]: 0.000
KDE [s]: 0.026
----------------------------------
OCSVM [s]: 0.000
GMM [s]: 0.000
KDE [s]: 0.026
----------------------------------
OCSVM [s]: 0.000
GMM [s]: 0.000
KDE [s]: 0.023
----------------------------------
OCSVM [s]: 0.000
GMM [s]: 0.000
KDE [s]: 0.023
----------------------------------
OCSVM [s]: 0.000
GMM [s]: 0.000
KDE [s]: 0.023
----------------------------------
OCSVM [s]: 0.000
GMM [s]: 0.000
KDE [s]: 0.000
----------------------------------
OCSVM [s]: 0.000
GMM [s]: 0.000
KDE [s]: 0.025
----------------------------------
OCSVM [s]: 0.000
GMM [s]: 0.000
KDE [s]: 0.026
----------------------------------
OCSVM [s]: 0.000
GMM [s]: 0.000
KDE [s]: 0.028
----------------------------------
OCSVM [s]: 0.000
GMM [s]: 0.000
KDE [s]: 0.026
----------------------------------
OCSVM [s]: 0.000
GMM [s]: 0.000
KDE [s]: 0.026
----------------------------------
OCSVM [s]: 0.000

In [117]:
# Report the accumulated prediction time of each model.
for label, elapsed in (
    ("Total OCSVM= ", total_time_ocsvm),
    ("Total GMM= ", total_time_gmm),
    ("Total KDE= ", total_time_kde),
):
    print(label, elapsed)

Total OCSVM=  0.050276756286621094
Total GMM=  0.035904884338378906
Total KDE=  6.7128894329071045


Procesam prezicerile

In [118]:
# Turn the collected per-point outputs into 0/1 labels.
# NOTE: this cell rebinds the *_preds names, so running it a second time
# would re-process labels that were already converted.
ocsvm_preds, gmm_preds, kde_preds = (
    process_predictions('OCSVM', np.array(ocsvm_preds)),
    process_predictions('GMM', np.array(gmm_preds)),
    process_predictions('KDE', np.array(kde_preds)),
)

In [119]:
# Show only the sampled points on which the three models do not all agree.
for idx in range(N):
    svm_label = ocsvm_preds[idx]
    gmm_label = gmm_preds[idx]
    kde_label = kde_preds[idx]

    # Unanimous predictions are skipped.
    if svm_label == gmm_label and gmm_label == kde_label and kde_label == svm_label:
        continue

    for tag, label in (('OCSVM', svm_label), ('GMM', gmm_label), ('KDE', kde_label)):
        print(tag, end=' ')
        print_anomaly_message(label)

    print(
        "Eticheta adevarata: Anomalie"
        if true_labels[idx] == 1
        else "Eticheta adevarata: Normal"
    )

    print('--------------------------')

OCSVM Prezis Anomalie
GMM Prezis Normal
KDE Prezis Normal
Eticheta adevarata: Anomalie
--------------------------
OCSVM Prezis Anomalie
GMM Prezis Normal
KDE Prezis Anomalie
Eticheta adevarata: Normal
--------------------------
OCSVM Prezis Anomalie
GMM Prezis Anomalie
KDE Prezis Normal
Eticheta adevarata: Anomalie
--------------------------
