In [1]:
# Turn off the Jedi-based completer for this IPython session.
%config Completer.use_jedi = False

In [4]:
from scipy import stats
from scipy.stats import entropy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [3]:
# Print NumPy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

# LOADING DATA

In [92]:
# Directory holding the featurized BankChurners pickle that the next cell loads.
DIRTY_SINGLE_COLUMN_FEATURIZED_PATH = "./Datasets/Featurized/BankChurners"

In [93]:
# Load the featurized dataset. Later cells index it as
# dataset[column]['train' | 'anomaly' | 'control'].
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The context manager guarantees the file handle is closed once loading finishes.
with open(f"{DIRTY_SINGLE_COLUMN_FEATURIZED_PATH}/DirtySingleColumn.pickle", "rb") as dataset_file:
    dataset = pickle.load(dataset_file)

# TRAIN / EVALUATE

In [94]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
import scipy.stats as ss

### ANOMALY DETECTION

In [152]:
# Fixed seed so the randomized IsolationForest produces the same scores on every run.
RANDOM_STATE = 42

# Detectors under comparison, keyed by the name used in the results and metrics tables.
algorithms = {
    # novelty=True lets LOF score/predict on data it was not fitted on.
    'LocalOutlierFactor': LocalOutlierFactor(novelty=True),
    'OneClassSVM_Linear': OneClassSVM(kernel='linear'),
    # No kernel argument: relies on OneClassSVM's default kernel (RBF).
    'OneClassSVM_RBF': OneClassSVM(),
    # contamination controls the share of training points treated as outliers
    # when the decision threshold is chosen; it is kept very small here.
    'IsolationForest': IsolationForest(contamination=1e-5, random_state=RANDOM_STATE)
}

In [153]:
# Standardize every column's splits once up front. The scaling does not depend
# on the detector, so it is done outside the per-algorithm loop instead of being
# recomputed for each algorithm.
scaled_splits = dict()
for column in dataset.keys():
    scaler = StandardScaler()

    train_raw = np.array(dataset[column]['train']).astype(np.float32)
    anomaly_raw = np.array(dataset[column]['anomaly']).astype(np.float32)
    control_raw = np.array(dataset[column]['control']).astype(np.float32)

    # The scaler is fitted on the training split only; the evaluation splits
    # are transformed with the training statistics.
    scaled_splits[column] = (
        scaler.fit_transform(train_raw),
        scaler.transform(anomaly_raw),
        scaler.transform(control_raw),
    )

# results_per_algorithm[algorithm_name][column] -> {"score", "pred", "true"},
# each entry being the anomaly split followed by the control split.
results_per_algorithm = dict()

for algorithm_name, algorithm in algorithms.items():
    results = dict()
    for column, (train, anomaly, control) in scaled_splits.items():
        # The same estimator instance is refitted for every column.
        algorithm.fit(train)

        anomaly_score = algorithm.decision_function(anomaly)
        control_score = algorithm.decision_function(control)

        anomaly_prediction = algorithm.predict(anomaly)
        control_prediction = algorithm.predict(control)

        results[column] = {
            # Raw decision_function values (the evaluation cell negates them for AUC).
            "score": np.r_[anomaly_score, control_score],
            # 1 where the detector predicted -1 (outlier), else 0.
            "pred": np.r_[(anomaly_prediction == -1).astype(int), (control_prediction == -1).astype(int)],
            # Ground truth: 1 for the anomaly split, 0 for the control split.
            "true": np.r_[np.ones_like(anomaly_prediction), np.zeros_like(control_prediction)]
        }
    results_per_algorithm[algorithm_name] = results

### EVALUATE

In [154]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

In [155]:
# metrics[algorithm_name][column] -> {'precision', 'recall', 'auc'}
metrics = dict()

for algorithm_name, results in results_per_algorithm.items():
    per_column = dict()
    for column, outcome in results.items():
        y_true = outcome['true']
        y_pred = outcome['pred']
        # Scores are negated before ranking so that the anomaly class (label 1)
        # is associated with the larger values passed to roc_auc_score.
        ranking = -outcome['score']

        per_column[column] = {
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'auc': roc_auc_score(y_true, ranking),
        }
    metrics[algorithm_name] = per_column

In [156]:
# Precision / recall / AUC for LocalOutlierFactor; the transpose puts one dataset column per row.
pd.DataFrame(metrics['LocalOutlierFactor']).T

Unnamed: 0,precision,recall,auc
Education_Level,1.0,0.7625,0.915625
Credit_Limit,1.0,1.0,1.0
Income_Category,1.0,0.7625,0.88625
Total_Revolving_Bal,1.0,1.0,1.0
Total_Ct_Chng_Q4_Q1,1.0,0.990909,1.0
Card_Category,1.0,0.6625,0.85
Marital_Status,1.0,0.7375,0.91
CLIENTNUM,1.0,1.0,1.0
Total_Trans_Amt,1.0,1.0,1.0
Avg_Open_To_Buy,1.0,1.0,1.0


In [157]:
# Precision / recall / AUC for the linear-kernel OneClassSVM; the transpose puts one dataset column per row.
pd.DataFrame(metrics['OneClassSVM_Linear']).T

Unnamed: 0,precision,recall,auc
Education_Level,0.815789,0.3875,0.38
Credit_Limit,0.904762,0.518182,0.532727
Income_Category,0.833333,0.1875,0.21
Total_Revolving_Bal,0.947368,0.490909,0.494545
Total_Ct_Chng_Q4_Q1,0.96,0.654545,0.657273
Card_Category,0.809524,0.425,0.3575
Marital_Status,0.741935,0.2875,0.22625
CLIENTNUM,0.905405,0.609091,0.587273
Total_Trans_Amt,0.829787,0.354545,0.338182
Avg_Open_To_Buy,0.890909,0.445455,0.418182


In [158]:
# Precision / recall / AUC for the 'OneClassSVM_RBF' entry; the transpose puts one dataset column per row.
pd.DataFrame(metrics['OneClassSVM_RBF']).T

Unnamed: 0,precision,recall,auc
Education_Level,0.904762,0.95,0.91125
Credit_Limit,0.973451,1.0,1.0
Income_Category,0.916667,0.9625,0.905
Total_Revolving_Bal,0.932203,1.0,1.0
Total_Ct_Chng_Q4_Q1,0.964912,1.0,1.0
Card_Category,0.914634,0.9375,0.88
Marital_Status,0.935065,0.9,0.91125
CLIENTNUM,0.964912,1.0,1.0
Total_Trans_Amt,0.964912,1.0,1.0
Avg_Open_To_Buy,0.948276,1.0,1.0


In [159]:
# Precision / recall / AUC for IsolationForest; the transpose puts one dataset column per row.
pd.DataFrame(metrics['IsolationForest']).T

Unnamed: 0,precision,recall,auc
Education_Level,1.0,0.7375,0.91
Credit_Limit,1.0,0.090909,0.920909
Income_Category,1.0,0.7125,0.89375
Total_Revolving_Bal,1.0,0.409091,0.937273
Total_Ct_Chng_Q4_Q1,1.0,0.054545,0.958182
Card_Category,1.0,0.45,0.82375
Marital_Status,1.0,0.4375,0.91375
CLIENTNUM,1.0,0.818182,0.952727
Total_Trans_Amt,1.0,0.136364,0.935455
Avg_Open_To_Buy,1.0,0.118182,0.882727
