In [23]:
%config Completer.use_jedi = False

In [68]:
from scipy import stats
from scipy.stats import entropy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from itertools import groupby

In [25]:
np.set_printoptions(suppress=True)

# LOADING DATA

In [26]:
DIRTY_SINGLE_COLUMN_FEATURIZED_PATH = "./Datasets/Featurized/BankChurners"

In [27]:
dataset = pickle.load(open(f"{DIRTY_SINGLE_COLUMN_FEATURIZED_PATH}/DirtySingleColumn.pickle", "rb"))

In [35]:
datasets_control_novelty = dict()
datasets_control_outlier = dict()
datasets_train = dict()

for column, ds_column in dataset.items():
    datasets_control_novelty[column] = {
        'X': ds_column['control'] + ds_column['anomaly'],
        'y': [False] * len(ds_column['control']) + [True] * len(ds_column['anomaly']),
    }
    
    datasets_train[column] = {
        'X': ds_column['train'],
        'y': [False] * len(ds_column['train']),
    }
    
    datasets_control_outlier[column] = {
        'X': datasets_control_novelty[column]['X'] + datasets_train[column]['X'],
        'y': datasets_control_novelty[column]['y'] + datasets_train[column]['y'],
    }

# TRAIN / EVALUATE

In [36]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
import scipy.stats as ss

### ANOMALY DETECTION

In [71]:
algorithms_novelty = {
    'LocalOutlierFactorNovelty': LocalOutlierFactor(novelty=True),
    'OneClassSVM': OneClassSVM(),
}

algorithms_outlier = {
    'LocalOutlierFactor': LocalOutlierFactor(novelty=False),
    'IsolationForest': IsolationForest()
}

In [72]:
NUM_ITERATIONS = 20

In [73]:
def sample_from_dataset(dataset, rng, share = 0.1):
    _, counts = np.unique(dataset['y'], return_counts=True)
            
    positive_loc = np.argwhere(np.array(dataset['y']) == True).squeeze()
    negative_loc = np.argwhere(np.array(dataset['y']) == False).squeeze()

    num_negatives = counts[0]
    num_positives = int(num_negatives / (1 - share) - num_negatives)
    new_positives = rng.choice(positive_loc, num_positives)

    X_control_negatives = np.array(dataset['X'], dtype='object')[negative_loc].tolist()
    Y_control_negatives = np.array(dataset['y'], dtype='object')[negative_loc].tolist()

    X_control_positives = np.array(dataset['X'], dtype='object')[new_positives].tolist()
    Y_control_positives = np.array(dataset['y'], dtype='object')[new_positives].tolist()

    X_control_new = X_control_negatives + X_control_positives
    Y_control_new = np.r_[Y_control_negatives, Y_control_positives]
    
    return {
        'X': X_control_new,
        'y': Y_control_new,
    }

In [74]:
results = list()

for column in datasets_control_novelty.keys():
    for algorithm_name, algorithm in algorithms_novelty.items():
        scaler = StandardScaler()
        algorithm.fit(scaler.fit_transform(datasets_train[column]['X']))
        rng = np.random.default_rng(seed=42)
        for i in range(NUM_ITERATIONS):
            new_dataset = sample_from_dataset(datasets_control_novelty[column], rng, 0.1)
            
            control_X = scaler.transform(new_dataset['X'])

            control_score = algorithm.decision_function(control_X)
            control_prediction = algorithm.predict(control_X)

            results.append({
                "algorithm": algorithm_name,
                "column": column,
                "score": -control_score,
                "pred": (control_prediction == -1).astype(int),
                "true": new_dataset['y']
            })

In [75]:
for column in datasets_control_outlier.keys():
    for algorithm_name, algorithm in algorithms_outlier.items():
        rng = np.random.default_rng(seed=42)
        for i in range(NUM_ITERATIONS):
            new_dataset = sample_from_dataset(datasets_control_outlier[column], rng, 0.1)
            
            scaler = StandardScaler()
            scaler.fit(new_dataset['X'])
            
            control_prediction = algorithm.fit_predict(scaler.transform(new_dataset['X']))
            control_score = None
            
            if(hasattr(algorithm, 'decision_function')):
                control_score = algorithm.decision_function(scaler.transform(new_dataset['X']))
            elif isinstance(algorithm, LocalOutlierFactor):
                control_score = algorithm.negative_outlier_factor_
            else:    
                raise Exception(f"estimator for {column} has no decision_function")

            results.append({
                "algorithm": algorithm_name,
                "column": column,
                "score": -control_score,
                "pred": (control_prediction == -1).astype(int),
                "true": new_dataset['y']
            })

### EVALUATE

In [76]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

In [77]:
results_sorted = sorted(results, key=lambda x: f"{x['algorithm']}|{x['column']}")

results_new = list()

for gr, grp in groupby(results_sorted, lambda x: f"{x['algorithm']}|{x['column']}"):
    grp = list(grp)
    results_new.append({
        'algorithm': gr.split('|')[0],
        'column': gr.split('|')[1],
        'precision': np.mean([precision_score(result['true'], result['pred']) for result in grp]),
        'recall':np.mean([recall_score(result['true'], result['pred']) for result in grp]),
        'auc': np.mean([roc_auc_score(result['true'], result['pred']) for result in grp])
    })

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [79]:
pd.set_option('display.max_rows', 1000)

In [80]:
pd.DataFrame(results_new).sort_values(by=['column', 'algorithm'])

Unnamed: 0,algorithm,column,precision,recall,auc
0,IsolationForest,Attrition_Flag,0.612381,0.9875,0.960625
40,LocalOutlierFactor,Attrition_Flag,0.333333,1.0,0.9
20,LocalOutlierFactorNovelty,Attrition_Flag,0.85,0.85,0.925
60,OneClassSVM,Attrition_Flag,0.2,1.0,0.8
1,IsolationForest,Avg_Open_To_Buy,0.743413,1.0,0.97875
41,LocalOutlierFactor,Avg_Open_To_Buy,0.551984,1.0,0.95875
21,LocalOutlierFactorNovelty,Avg_Open_To_Buy,1.0,1.0,1.0
61,OneClassSVM,Avg_Open_To_Buy,0.142857,1.0,0.7
2,IsolationForest,Avg_Utilization_Ratio,0.790556,1.0,0.984375
42,LocalOutlierFactor,Avg_Utilization_Ratio,0.793333,1.0,0.985625
