In [None]:
import pandas as pd
import os
import glob
from os.path import join
import numpy as np
import random
from pycaret.classification import *
from pycaret.datasets import get_data

from create_datasets import create_train_dataset_supervised, \
    create_test_dataset_balanced, \
    create_test_dataset_unbalanced

In [None]:
# Seed Python's built-in RNG so anything drawing from `random` is repeatable.
random.seed(42)

# NOTE: os.path.dirname('.') evaluates to '', so every directory below is
# resolved relative to the current working directory of the kernel.
WORK_DIR = os.path.dirname('.')
FM_DIR = os.path.join(WORK_DIR, 'feature_maps')
BENCHMARK_DIR = os.path.join(WORK_DIR, 'benchmark')
DATA_DIR = os.path.join(WORK_DIR, 'data')
OUTPUTS_DIR = os.path.join(WORK_DIR, 'outputs')
NUM_OF_DATA_SETS = 30

# One output folder per result kind. makedirs(exist_ok=True) also creates the
# parent OUTPUTS_DIR when it is missing (plain os.mkdir would raise
# FileNotFoundError in that case) and is a no-op when the folder already exists.
for results_subdir in ('results_train_sup',
                       'results_test_sup_balanced',
                       'results_test_sup_unbalanced'):
    os.makedirs(join(OUTPUTS_DIR, results_subdir), exist_ok=True)

In [None]:
# Model ids skipped in every compare_models() call of this cell.
_EXCLUDED_MODELS = ('gbc', 'ada', 'lightgbm', 'dt')
# Keyword arguments shared by every pycaret setup() call of this cell.
_SETUP_KWARGS = dict(target='Labels', session_id=42, n_jobs=25, use_gpu=True)


def _rank_models():
    """Run pycaret's compare_models with the settings used throughout this cell."""
    # A fresh list is passed each time so the shared tuple can never be mutated.
    return compare_models(fold=5, exclude=list(_EXCLUDED_MODELS), n_select=12)


def _evaluate_on_test_set(train_df, test_df, sizes):
    """Train on ``train_df``, score the selected models on ``test_df``.

    Parameters
    ----------
    train_df : pd.DataFrame
        Training data including the 'Labels' target column.
    test_df : pd.DataFrame
        Hold-out data including the 'Labels' target column.
    sizes : sequence
        ``sizes[0]`` is stored as 'n_normal' and ``sizes[1]`` as 'n_anomaly'.

    Returns
    -------
    tuple
        ``(models, results_df)`` where ``models`` is what compare_models
        returned and ``results_df`` holds one pulled metrics table per model.
    """
    setup(train_df, test_data=test_df, index=False, **_SETUP_KWARGS)
    best_models = _rank_models()

    # Collect the per-model tables first and concatenate once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    per_model_frames = []
    for model in best_models:
        predict_model(model)
        per_model_frames.append(pull())
    results_df = pd.concat(per_model_frames) if per_model_frames else pd.DataFrame()

    results_df['n_normal'] = sizes[0]
    results_df['n_anomaly'] = sizes[1]
    return best_models, results_df


counter = 0
df_results_test_sup_balanced = None
for data_set_idx in range(NUM_OF_DATA_SETS):
    X_train_sup, y_train_sup, sizes_train = create_train_dataset_supervised(data_set_idx)
    X_test_b, y_test_b, sizes_balanced = create_test_dataset_balanced(data_set_idx)
    X_test_ub, y_test_ub, sizes_unbalanced = create_test_dataset_unbalanced(data_set_idx)

    data_train_df = pd.DataFrame(X_train_sup)
    data_test_b_df = pd.DataFrame(X_test_b)
    data_test_ub_df = pd.DataFrame(X_test_ub)

    data_train_df['Labels'] = y_train_sup
    data_test_b_df['Labels'] = y_test_b
    data_test_ub_df['Labels'] = y_test_ub

    # TRAIN: cross-validated comparison on the training data only; the table
    # pulled right after compare_models() is what gets saved.
    setup(data_train_df, **_SETUP_KWARGS)
    _rank_models()
    sup_train_balanced_results_df = pull()
    sup_train_balanced_results_df['n_normal'] = sizes_train[0]
    sup_train_balanced_results_df['n_anomaly'] = sizes_train[1]
    sup_train_balanced_results_df.to_csv(join(OUTPUTS_DIR, 'results_train_sup', f'id_{data_set_idx}.csv'))

    # TEST BALANCED
    list_of_best_models_balanced, df_results_test_sup_balanced = _evaluate_on_test_set(
        data_train_df, data_test_b_df, sizes_balanced)
    df_results_test_sup_balanced.to_csv(join(OUTPUTS_DIR, 'results_test_sup_balanced', f'id_{data_set_idx}.csv'))

    # TEST UNBALANCED
    list_of_best_models_unbalanced, df_results_test_sup_unbalanced = _evaluate_on_test_set(
        data_train_df, data_test_ub_df, sizes_unbalanced)
    df_results_test_sup_unbalanced.to_csv(join(OUTPUTS_DIR, 'results_test_sup_unbalanced', f'id_{data_set_idx}.csv'))

    counter += 1

    print(f'Finnish: {counter} / {NUM_OF_DATA_SETS}')

print(f'Finnished all: {counter} / {NUM_OF_DATA_SETS}')

In [None]:
# Summary-table layout: for each test split ('b' = balanced, 'ub' = unbalanced)
# and each metric, six statistics columns; followed by the data-set size columns.
col_names = [
    f'{split}{metric}{stat}'
    for split in ('b', 'ub')
    for metric in ('Acc', 'AUC', 'F1')
    for stat in ('Avg', 'Std', 'Max', 'Min', 'MaxName', 'MinName')
] + [
    'NTrSOrig', 'ATrSOrig',
    'NTrSBalanced', 'ATrSBalanced',
    'NTsSOrig', 'ATsSOrig',
    'NTsSUnBalanced', 'ATsSUnBalanced',
]

# Empty frame that only carries the column layout.
sup_results_df = pd.DataFrame(columns=col_names)

In [None]:
def _dataset_order(csv_path):
    """Sort key placing 'id_<n>.csv' files in numeric order of <n>.

    Files whose name does not end in '_<digits>' are sorted after the numbered
    ones, alphabetically by path.
    """
    stem = os.path.splitext(os.path.basename(csv_path))[0]
    suffix = stem.rsplit('_', 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), csv_path)
    return (1, 0, csv_path)


def _list_result_files(results_subdir):
    """Return the CSV paths inside OUTPUTS_DIR/<results_subdir> in a stable order."""
    # glob.glob returns files in arbitrary order; sorting makes the row order of
    # everything derived from these lists reproducible across runs and machines.
    pattern = os.path.join(OUTPUTS_DIR, results_subdir, '*.csv')
    return sorted(glob.glob(pattern), key=_dataset_order)


def _load_results(csv_paths):
    """Read every CSV in ``csv_paths`` and stack them into one frame."""
    # pd.concat raises ValueError when csv_paths is empty (no results on disk).
    return pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)


all_train_files = _list_result_files('results_train_sup')
all_test_files_sup_balanced = _list_result_files('results_test_sup_balanced')
all_test_files_sup_unbalanced = _list_result_files('results_test_sup_unbalanced')

results_train_sup_balanced_df = _load_results(all_train_files)
results_test_files_sup_balanced_df = _load_results(all_test_files_sup_balanced)
results_test_files_sup_unbalanced_df = _load_results(all_test_files_sup_unbalanced)

In [None]:
# Names of the balanced-split summary columns: six statistics per metric.
columns = [
    f'b{metric}{stat}'
    for metric in ('Acc', 'AUC', 'F1')
    for stat in ('Avg', 'Std', 'Max', 'Min', 'MaxName', 'MinName')
]

def _summarize_metrics(df: pd.DataFrame, prefix: str) -> dict:
    """Aggregate the 'Accuracy', 'AUC' and 'F1' columns of a per-model table.

    For each metric the mean, standard deviation, maximum and minimum are
    stored together with the 'Model' value of the row holding the maximum and
    the minimum. Keys are ``<prefix><Acc|AUC|F1><Avg|Std|Max|Min|MaxName|MinName>``
    and are inserted metric by metric in that order.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'Model', 'Accuracy', 'AUC' and 'F1'.
    prefix : str
        Text prepended to every key of the returned dict.

    Returns
    -------
    dict
        The 18 summary entries described above.
    """
    # Work on a 0..n-1 index so that the label returned by idxmax()/idxmin()
    # is also the row position. Using the original index with .iloc picks the
    # wrong row (or raises) whenever the index is not a plain RangeIndex,
    # e.g. shuffled or duplicated labels.
    model_names = df['Model'].reset_index(drop=True)

    summary = {}
    for column, short in (('Accuracy', 'Acc'), ('AUC', 'AUC'), ('F1', 'F1')):
        values = df[column].reset_index(drop=True)
        key = f'{prefix}{short}'
        summary[f'{key}Avg'] = values.mean()
        summary[f'{key}Std'] = values.std()
        summary[f'{key}Max'] = values.max()
        summary[f'{key}Min'] = values.min()
        summary[f'{key}MaxName'] = model_names.iloc[values.idxmax()]
        summary[f'{key}MinName'] = model_names.iloc[values.idxmin()]
    return summary


def create_dataset_balanced_summary(df: pd.DataFrame):
    """Summarise a per-model result table; every key is prefixed with 'b'.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'Model', 'Accuracy', 'AUC' and 'F1'.

    Returns
    -------
    dict
        Keys 'bAccAvg', 'bAccStd', ..., 'bF1MinName' (18 entries).
    """
    return _summarize_metrics(df, 'b')


def create_dataset_unbalanced_summary(df: pd.DataFrame):
    """Summarise a per-model result table; every key is prefixed with 'ub'.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'Model', 'Accuracy', 'AUC' and 'F1'.

    Returns
    -------
    dict
        Keys 'ubAccAvg', 'ubAccStd', ..., 'ubF1MinName' (18 entries).
    """
    return _summarize_metrics(df, 'ub')

def _summarize_result_files(csv_paths, summarize):
    """Build one summary row per result CSV.

    Parameters
    ----------
    csv_paths : iterable of str
        Paths of the per-data-set result CSVs.
    summarize : callable
        Function mapping a per-model DataFrame to a dict of summary values.

    Returns
    -------
    pd.DataFrame
        One row per file: the summary dict plus 'n_normal' and 'n_anomaly'
        taken from the first row of the file. Empty when ``csv_paths`` is empty.
    """
    records = []
    for csv_path in csv_paths:
        # 'Unnamed: 0' is the index column that to_csv wrote alongside the data;
        # errors='ignore' keeps this working for files saved without an index.
        per_model_df = pd.read_csv(csv_path).drop(columns='Unnamed: 0', errors='ignore')
        record = summarize(per_model_df)
        record['n_normal'] = per_model_df['n_normal'].iloc[0]
        record['n_anomaly'] = per_model_df['n_anomaly'].iloc[0]
        records.append(record)
    # Build the frame once from all records: DataFrame.append was removed in
    # pandas 2.0 and growing a frame row by row is quadratic.
    return pd.DataFrame(records)


# The balanced loop previously called the undefined names
# `create_dataset_summary` and `results`, so this cell raised NameError.
results_sup_balanced_test = _summarize_result_files(
    all_test_files_sup_balanced, create_dataset_balanced_summary)

results_sup_unbalanced_test = _summarize_result_files(
    all_test_files_sup_unbalanced, create_dataset_unbalanced_summary)


In [None]:
# Show the summary table built from the balanced test-set result files.
results_sup_balanced_test


In [None]:
# Show the summary table built from the unbalanced test-set result files.
results_sup_unbalanced_test