In [155]:
# Turn off the Jedi backend for IPython tab completion.
%config Completer.use_jedi = False

In [156]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import entropy
from sklearn.metrics import precision_score, recall_score, roc_auc_score

In [157]:
# Print small floats in fixed-point form instead of scientific notation.
np.set_printoptions(suppress=True)

# LOADING DATA

In [226]:
# Directory that holds the featurized "DirtySingleAnomaly" pickle loaded below.
DIRTY_SINGLE_ANOMALY_FEATURIZED_PATH = "./Datasets/Featurized/BankChurners/DirtySingleAnomaly"

In [242]:
# Load the featurized dataset. The context manager guarantees the file handle is
# closed even if unpickling fails (the bare open(...) call leaked it).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file - only
# load pickles that come from a trusted source.
with open(f"{DIRTY_SINGLE_ANOMALY_FEATURIZED_PATH}/DirtySingleAnomaly.pickle", "rb") as dataset_file:
    dataset = pickle.load(dataset_file)

# META MODEL INTERFACE

In [243]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.base import clone as clone_estimator
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from itertools import groupby

In [244]:
class MetaNoveltyModel:
    """Novelty detector that trains one scaler/estimator pair per group label.

    Every sample carries a group label ``z``. ``fit`` clones ``base_scaler`` and
    ``base_clr`` once per distinct label and fits each clone on that group's
    samples only; ``predict`` and ``decision_function`` route every sample to the
    pair that was fitted for its label.

    Parameters
    ----------
    base_clr : estimator
        Unfitted prototype that is cloned per group. Must provide ``fit(X)`` and
        ``predict(X)``; ``decision_function(X)`` is only required by
        :meth:`decision_function`.
    base_scaler : transformer
        Unfitted prototype that is cloned per group. Must provide
        ``fit_transform(X)`` and ``transform(X)``.

    Attributes
    ----------
    estimators : dict
        Group label -> fitted estimator clone.
    scalers : dict
        Group label -> fitted scaler clone.
    """

    def __init__(self, base_clr, base_scaler):
        self.base_clr = base_clr
        self.base_scaler = base_scaler
        self.estimators = dict()
        self.scalers = dict()

    def fit(self, X, z):
        """Fit one scaler and one estimator per distinct value in ``z``.

        Parameters
        ----------
        X : sequence
            Samples; ``X[k]`` belongs to group ``z[k]``.
        z : sequence
            Group label of each sample. Labels must be mutually orderable
            because samples are sorted by label before grouping.

        Returns
        -------
        self : MetaNoveltyModel
            The fitted model, so calls can be chained.
        """
        # Build into fresh dicts: a refit must not keep models for groups that
        # are missing from the new data, and a failing fit must not leave a
        # half-updated model behind.
        estimators = dict()
        scalers = dict()

        # groupby only merges adjacent items, hence the sort by label first.
        sorted_by_column = sorted(zip(X, z), key=lambda pair: pair[-1])

        for z_val, members in groupby(sorted_by_column, key=lambda pair: pair[-1]):
            X_ = [sample for sample, _ in members]
            estimator = clone_estimator(self.base_clr, safe=True)
            scaler = clone_estimator(self.base_scaler, safe=True)
            estimator.fit(scaler.fit_transform(X_))
            estimators[z_val] = estimator
            scalers[z_val] = scaler

        self.estimators = estimators
        self.scalers = scalers
        return self

    def __require_group(self, column):
        """Raise a descriptive ``KeyError`` when no model exists for ``column``."""
        if column not in self.estimators or column not in self.scalers:
            raise KeyError(f"no estimator was fitted for group {column!r}")

    def __predict(self, column, x):
        """Scale ``x`` with the scaler of ``column`` and return its estimator's predictions."""
        self.__require_group(column)
        check_is_fitted(self.scalers[column])
        check_is_fitted(self.estimators[column])

        return self.estimators[column].predict(
            self.scalers[column].transform(
                np.atleast_2d(x)
            )
        )

    def __decision_function(self, column, x):
        """Scale ``x`` with the scaler of ``column`` and return its estimator's scores.

        Raises ``Exception`` when the estimator has no ``decision_function``.
        """
        self.__require_group(column)
        check_is_fitted(self.scalers[column])
        check_is_fitted(self.estimators[column])

        if not hasattr(self.estimators[column], 'decision_function'):
            raise Exception(f"estimator for {column} has no decision_function")

        return self.estimators[column].decision_function(
            self.scalers[column].transform(
                np.atleast_2d(x)
            )
        )

    def __apply_function(self, X, z, func):
        """Apply ``func(group, samples)`` per group and restore the input order."""
        # Carry the original position through the sort so every group's
        # results can be written back to the right slots.
        sorted_by_column = sorted(zip(X, range(len(z)), z), key=lambda item: item[-1])

        pred = np.zeros(len(z))

        for z_val, members in groupby(sorted_by_column, key=lambda item: item[-1]):
            X_, i_, _ = zip(*members)
            pred[list(i_)] = func(z_val, X_)

        return pred

    def predict(self, X, z):
        """Return the per-sample prediction of the model fitted for each sample's group.

        The result is a float array aligned with ``X``. Raises ``KeyError`` for a
        group label that was not seen during ``fit``.
        """
        return self.__apply_function(X, z, self.__predict)

    def decision_function(self, X, z):
        """Return the per-sample decision score of the model fitted for each sample's group.

        The result is a float array aligned with ``X``. Raises ``KeyError`` for a
        group label that was not seen during ``fit`` and ``Exception`` when a
        group's estimator has no ``decision_function``.
        """
        return self.__apply_function(X, z, self.__decision_function)

In [245]:
class MetaOutlierModel:
    """Outlier detector that fits one scaler/estimator pair per group label.

    Unlike a novelty model there is no separate training step: every call to
    ``fit_predict`` or ``fit_decision_function`` clones fresh copies of
    ``base_scaler`` and ``base_clr`` for each distinct label in ``z``, fits them
    on that group's samples and evaluates the same samples.

    Parameters
    ----------
    base_clr : estimator
        Unfitted prototype that is cloned per group. ``fit_predict`` needs
        ``fit_predict(X)``; ``fit_decision_function`` needs either ``fit`` plus
        ``decision_function`` or a ``LocalOutlierFactor``.
    base_scaler : transformer
        Unfitted prototype that is cloned per group. Must provide
        ``fit_transform(X)``.

    Attributes
    ----------
    estimators : dict
        Group label -> estimator clone fitted by the most recent call.
    scalers : dict
        Group label -> scaler clone fitted by the most recent call.
    """

    def __init__(self, base_clr, base_scaler):
        self.base_clr = base_clr
        self.base_scaler = base_scaler
        self.estimators = dict()
        self.scalers = dict()

    def __fit_predict(self, column, x):
        """Fit the pair of ``column`` on ``x`` and return the estimator's labels for ``x``."""
        return self.estimators[column].fit_predict(
            self.scalers[column].fit_transform(
                np.atleast_2d(x)
            )
        )

    def __fit_decision_function(self, column, x):
        """Fit the pair of ``column`` on ``x`` and return the estimator's scores for ``x``.

        Raises ``Exception`` when the estimator offers neither a
        ``decision_function`` nor is a ``LocalOutlierFactor``.
        """
        estimator = self.estimators[column]
        x_ = self.scalers[column].fit_transform(np.atleast_2d(x))

        if hasattr(estimator, 'decision_function'):
            estimator.fit(x_)
            return estimator.decision_function(x_)

        # Imported here so the class does not depend on an import made in a
        # later notebook cell.
        from sklearn.neighbors import LocalOutlierFactor

        if isinstance(estimator, LocalOutlierFactor):
            # No decision_function on this estimator: use the factors computed
            # during fitting as the score instead.
            estimator.fit_predict(x_)
            return estimator.negative_outlier_factor_

        raise Exception(f"estimator for {column} has no decision_function")

    def __fit_apply(self, X, z, func):
        """Clone and fit a pair per group via ``func`` and restore the input order."""
        # Drop pairs from an earlier call so groups that are absent from this
        # call do not linger in the dictionaries.
        self.estimators = dict()
        self.scalers = dict()

        # Carry the original position through the sort so every group's
        # results can be written back to the right slots; groupby only merges
        # adjacent items, hence the sort by label first.
        sorted_by_column = sorted(zip(X, range(len(z)), z), key=lambda item: item[-1])

        pred = np.zeros(len(z))

        for z_val, members in groupby(sorted_by_column, key=lambda item: item[-1]):
            self.estimators[z_val] = clone_estimator(self.base_clr, safe=True)
            self.scalers[z_val] = clone_estimator(self.base_scaler, safe=True)
            X_, i_, _ = zip(*members)
            pred[list(i_)] = func(z_val, X_)

        return pred

    def fit_predict(self, X, z):
        """Fit per group and return each sample's label as a float array aligned with ``X``."""
        return self.__fit_apply(X, z, self.__fit_predict)

    def fit_decision_function(self, X, z):
        """Fit per group and return each sample's score as a float array aligned with ``X``.

        This refits from scratch; it does not reuse models from ``fit_predict``.
        """
        return self.__fit_apply(X, z, self.__fit_decision_function)

# DATASET PREPARATION

In [246]:
# Per anomaly type, flatten the anomalous entries of every column into three
# parallel lists: 'X' (the entries), 'y' (True = anomalous) and 'z' (column key).
datasets_control_novelty = {}

for anomaly, ds_anomaly in dataset['anomaly'].items():
    rows, labels, columns = [], [], []

    for column, features in ds_anomaly.items():
        rows.extend(features)
        labels.extend([True] * len(features))
        columns.extend([column] * len(features))

    datasets_control_novelty[anomaly] = {'X': rows, 'y': labels, 'z': columns}

# Append the control entries as negatives (y = False), but only for columns that
# already occur in the anomaly's 'z' list.
for anomaly, entry in datasets_control_novelty.items():
    for column, features in dataset['control'].items():
        if column not in entry['z']:
            continue
        entry['X'].extend(features)
        entry['y'].extend([False] * len(features))
        entry['z'].extend([column] * len(features))

In [247]:
# Flatten the per-column training entries into two parallel lists:
# 'X' holds the entries and 'z' the column key of each entry.
train_rows, train_columns = [], []

for column, features in dataset['train'].items():
    train_rows.extend(features)
    train_columns.extend([column] * len(features))

dataset_train = {'X': train_rows, 'z': train_columns}

In [255]:
def _append_negatives(entry, column, features):
    """Append ``features`` to ``entry`` as non-anomalous rows of ``column``."""
    entry['X'].extend(features)
    entry['y'].extend([False] * len(features))
    entry['z'].extend([column] * len(features))


# Per anomaly type, flatten the anomalous entries of every column into three
# parallel lists: 'X' (the entries), 'y' (True = anomalous) and 'z' (column key).
datasets_control_outlier = {}

for anomaly, ds_anomaly in dataset['anomaly'].items():
    rows, labels, columns = [], [], []

    for column, features in ds_anomaly.items():
        rows.extend(features)
        labels.extend([True] * len(features))
        columns.extend([column] * len(features))

    datasets_control_outlier[anomaly] = {'X': rows, 'y': labels, 'z': columns}

# For columns that already occur in the anomaly's 'z' list, add both the control
# entries and the training entries of that column as negatives (y = False).
for anomaly, entry in datasets_control_outlier.items():
    for column, features in dataset['control'].items():
        if column not in entry['z']:
            continue
        _append_negatives(entry, column, features)
        _append_negatives(entry, column, dataset['train'][column])

# TRAIN / EVALUATE

In [256]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
import scipy.stats as ss

from itertools import groupby

### ANOMALY DETECTION

In [257]:
# Detectors wrapped by MetaNoveltyModel: fitted once on the training data and
# then applied to the resampled control sets.
algorithms_novelty = {
    'LocalOutlierFactorNovelty': LocalOutlierFactor(novelty=True),
    'OneClassSVM': OneClassSVM(),
}

# Detectors wrapped by MetaOutlierModel: fitted directly on the data they score.
algorithms_outlier = {
    'LocalOutlierFactor': LocalOutlierFactor(novelty=False),
    # IsolationForest is randomized. A fixed random_state makes the notebook
    # reproducible and keeps the separate fits behind fit_predict and
    # fit_decision_function on the same model.
    'IsolationForest': IsolationForest(random_state=42)
}

In [258]:
def sample_from_dataset(dataset, rng, share = 0.1):
    """Resample a labelled dataset so positives make up roughly ``share`` of it.

    Every negative row (``y`` false) is kept. Positive rows (``y`` true) are drawn
    with replacement until they account for about ``share`` of the result; the
    count is ``int(n_neg / (1 - share) - n_neg)``.

    Parameters
    ----------
    dataset : dict
        Parallel sequences under the keys ``'X'``, ``'y'`` (boolean labels) and
        ``'z'``.
    rng : numpy.random.Generator
        Random generator used to draw the positive rows.
    share : float, default 0.1
        Target fraction of positives in the result; must satisfy
        ``0 <= share < 1``.

    Returns
    -------
    dict
        ``'X'`` and ``'z'`` as lists and ``'y'`` as a boolean array, negatives
        first (in their original order) followed by the sampled positives.

    Raises
    ------
    ValueError
        If ``share`` is outside ``[0, 1)`` or positives are required but the
        dataset contains none.
    """
    if not 0 <= share < 1:
        raise ValueError(f"share must be in [0, 1), got {share}")

    labels = np.asarray(dataset['y'], dtype=bool)

    # flatnonzero always returns 1-D index arrays. argwhere(...).squeeze()
    # collapsed a single match to a 0-d array, which rng.choice interprets as
    # "sample from range(k)" and which turns row slicing into scalar indexing.
    positive_loc = np.flatnonzero(labels)
    negative_loc = np.flatnonzero(~labels)

    # Count negatives directly; the first np.unique count is the positive count
    # whenever the dataset has no negatives.
    num_negatives = negative_loc.size
    num_positives = int(num_negatives / (1 - share) - num_negatives)

    if num_positives > 0:
        if positive_loc.size == 0:
            raise ValueError("cannot sample positives: dataset has no positive rows")
        new_positives = rng.choice(positive_loc, num_positives)
    else:
        new_positives = positive_loc[:0]

    # Negatives first, sampled positives after - same layout as before.
    selected = np.concatenate([negative_loc, new_positives])

    # dtype=object keeps the entries as the Python objects they were stored as.
    X_control_new = np.array(dataset['X'], dtype='object')[selected].tolist()
    Z_control_new = np.array(dataset['z'], dtype='object')[selected].tolist()
    Y_control_new = labels[selected]

    return {
        'X': X_control_new,
        'y': Y_control_new,
        'z': Z_control_new
    }

In [259]:
# Number of resampling rounds evaluated per (algorithm, anomaly) combination.
NUM_ITERATIONS = 10

In [260]:
# Evaluate the novelty detectors. Each algorithm is fitted once on the training
# data; every anomaly set is then resampled NUM_ITERATIONS times (share of
# positives 0.1, generator re-seeded per anomaly) and scored.
results = []
for algorithm_name, algorithm in algorithms_novelty.items():
    clr = MetaNoveltyModel(algorithm, StandardScaler())
    clr.fit(dataset_train['X'], dataset_train['z'])

    # The loop variable is deliberately not called `dataset`: that would overwrite
    # the loaded dataset and break a re-run of the preparation cells.
    for anomaly, control_dataset in datasets_control_novelty.items():
        rng = np.random.default_rng(seed=42)
        for i in range(NUM_ITERATIONS):
            new_dataset = sample_from_dataset(control_dataset, rng, 0.1)

            preds = clr.predict(new_dataset['X'], new_dataset['z'])
            score = clr.decision_function(new_dataset['X'], new_dataset['z'])

            results.append({
                'algorithm': algorithm_name,
                'anomaly': anomaly,
                'iteration': i,
                'score': score,
                # Positive class = predicted anomaly (-1), encoded as 1.
                'pred': (preds == -1).astype(int),
                'true': new_dataset['y']
            })

In [261]:
# Evaluate the outlier detectors. A fresh MetaOutlierModel is fitted on every
# resampled set (share of positives 0.1, generator re-seeded per anomaly).
# NOTE: this cell appends to the `results` list created by the novelty
# evaluation cell, so re-running it alone duplicates its entries.
for algorithm_name, algorithm in algorithms_outlier.items():
    # The loop variable is deliberately not called `dataset`: that would overwrite
    # the loaded dataset and break a re-run of the preparation cells.
    for anomaly, control_dataset in datasets_control_outlier.items():
        rng = np.random.default_rng(seed=42)
        for i in range(NUM_ITERATIONS):
            new_dataset = sample_from_dataset(control_dataset, rng, 0.1)

            clr = MetaOutlierModel(algorithm, StandardScaler())

            # Each of the two calls clones and refits the estimators, so for a
            # randomized estimator without a fixed seed `preds` and `score` can
            # stem from different fits.
            preds = clr.fit_predict(new_dataset['X'], new_dataset['z'])
            score = clr.fit_decision_function(new_dataset['X'], new_dataset['z'])

            results.append({
                'algorithm': algorithm_name,
                'anomaly': anomaly,
                'iteration': i,
                'score': score,
                # Positive class = predicted anomaly (-1), encoded as 1.
                'pred': (preds == -1).astype(int),
                'true': new_dataset['y']
            })

In [262]:
def _result_group_key(result):
    """Sort/group key of one result record.

    Kept as the 'algorithm|anomaly' string (rather than a tuple) so the order of
    the aggregated rows stays the same as before.
    """
    return f"{result['algorithm']}|{result['anomaly']}"


# Average precision, recall and AUC over the iterations of every
# (algorithm, anomaly) combination. precision_score, recall_score and
# roc_auc_score come from sklearn.metrics (see the import cell).
results_sorted = sorted(results, key=_result_group_key)

results_new = list()

for _, grp in groupby(results_sorted, key=_result_group_key):
    grp = list(grp)
    results_new.append({
        # Read the names from a record instead of splitting the key, which
        # would break for names that contain '|'.
        'algorithm': grp[0]['algorithm'],
        'anomaly': grp[0]['anomaly'],
        'precision': np.mean([precision_score(result['true'], result['pred']) for result in grp]),
        'recall': np.mean([recall_score(result['true'], result['pred']) for result in grp]),
        # NOTE(review): the AUC is computed from the hard 0/1 predictions, not
        # from the continuous 'score' stored in each record - confirm that this
        # single-threshold AUC is what is intended.
        'auc': np.mean([roc_auc_score(result['true'], result['pred']) for result in grp])
    })

In [263]:
# Summary table of the averaged metrics, ordered by anomaly type and then algorithm.
pd.DataFrame(results_new).sort_values(by=['anomaly', 'algorithm'])

Unnamed: 0,algorithm,anomaly,precision,recall,auc
0,IsolationForest,categorical_category_miss,0.494899,1.0,0.944375
16,LocalOutlierFactor,categorical_category_miss,0.673954,1.0,0.97375
8,LocalOutlierFactorNovelty,categorical_category_miss,0.75,1.0,0.983333
24,OneClassSVM,categorical_category_miss,0.136364,1.0,0.683333
1,IsolationForest,categorical_distribution_changed,0.498601,1.0,0.945
17,LocalOutlierFactor,categorical_distribution_changed,0.661667,1.0,0.972292
9,LocalOutlierFactorNovelty,categorical_distribution_changed,0.75,1.0,0.983333
25,OneClassSVM,categorical_distribution_changed,0.136364,1.0,0.683333
2,IsolationForest,categorical_new_category_influx,0.434449,1.0,0.929375
18,LocalOutlierFactor,categorical_new_category_influx,0.682456,1.0,0.974792
