# Import Libraries

In [None]:
import sys
import pandas as pd
import numpy as np
import multiprocessing as mp
import gc

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score, roc_curve, auc
from sklearn import svm
from sklearn.ensemble import IsolationForest

from itertools import product
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import pickle
import json

import time

# Number of chunks parallelize_dataframe splits a dataframe into.
num_partitions = 10 #number of partitions to split dataframe
# Size of the worker pool created inside parallelize_dataframe.
num_cores = mp.cpu_count() #number of cores on your machine

# Show (practically) every column when a dataframe is displayed.
pd.options.display.max_columns = 999
# NOTE(review): `p` is not referenced by any cell visible in this notebook, and the
# pool is never closed, so its worker processes stay alive for the kernel's
# lifetime. Confirm nothing else uses it before removing.
p = mp.Pool(mp.cpu_count()) # Data parallelism Object
# Make the project-local modules imported below (metric_processor, evaluation)
# importable; this must run before those imports.
sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

# Project-local modules, resolved through the sys.path entry added above.
from metric_processor import MetricProcessor
import evaluation

# Render matplotlib figures inside the notebook output.
%matplotlib inline

# Auxiliary functions and lists

In [None]:
def parallelize_dataframe(df, func):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel and concatenate the results.

    The dataframe is cut into ``num_partitions`` consecutive row chunks (same
    boundaries as ``np.array_split``), each chunk is sent to a pool of
    ``num_cores`` worker processes, and the returned pieces are concatenated
    in their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Picklable function taking a dataframe chunk and returning a dataframe.

    Returns
    -------
    pandas.DataFrame
        Concatenation of ``func(chunk)`` over all chunks.

    Raises
    ------
    Exception
        Whatever ``func`` raises in a worker is re-raised here; the pool is
        still shut down in that case.
    """
    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
    row_groups = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[rows] for rows in row_groups]

    pool = mp.Pool(num_cores)
    try:
        return pd.concat(pool.map(func, df_split))
    finally:
        # Release the worker processes even when func fails, otherwise every
        # failed call leaks num_cores processes in the kernel.
        pool.close()
        pool.join()

In [None]:
def convert_to_numpy(df):
    """Parse every cell of ``df`` from a bracketed string into a float array.

    Each cell is expected to be a string such as ``'[1. 2. 3.]'``: the square
    brackets are stripped and the remaining whitespace-separated numbers are
    parsed into a 1-D ``float64`` array.

    The conversion is done in place on ``df`` and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all hold bracketed, space-separated number strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with every cell replaced by a NumPy array.
    """
    def _parse(cell):
        # `float` replaces the `np.float` alias, which was removed in
        # NumPy 1.24 and raises AttributeError there.
        return np.fromstring(cell.replace('[', '').replace(']', ''),
                             dtype=float,
                             sep=' ')

    for column in df.columns:
        df[column] = df[column].apply(_parse)

    return df

In [None]:
def compute_mean(df):
    """Replace every cell of ``df`` with the mean of the array stored in it.

    Works in place: each column is overwritten with the per-cell result of
    calling ``.mean()`` on the cell value, and ``df`` itself is returned.
    """
    def _cell_mean(values):
        return values.mean()

    for name in df.columns:
        averaged = df[name].apply(_cell_mean)
        df[name] = averaged

    return df

In [None]:
def compute_mean_samples(df):
    """Replace every cell of ``df`` with the mean of a random draw from its array.

    For each cell, ``samples_number`` values are drawn from the cell's array
    with ``np.random.choice`` (default settings, i.e. with replacement) and
    averaged. ``samples_number`` is read from module scope at call time.

    Works in place and returns ``df`` itself.
    """
    def _resampled_mean(values):
        drawn = np.random.choice(values, samples_number)
        return np.mean(list(drawn))

    for name in df.columns:
        df[name] = df[name].apply(_resampled_mean)

    return df

In [None]:
def split_test_and_train(df, train_prop=0.8):
    """Split ``df`` into train, test and attack subsets keyed on ``attack_ID``.

    Rows with ``attack_ID < 10`` form the train/test pool: the first
    ``train_prop`` fraction (in row order) becomes the train set and the rest
    the test set. Rows with ``attack_ID >= 10`` become the attack set. Each
    subset is then shuffled independently.

    Returns
    -------
    tuple
        ``((x_train, x_test, x_attacks), (df_train, df_test, df_attacks))``
        where the first triple holds NumPy arrays without the ``attack_ID``
        column and the second triple the corresponding shuffled dataframes.
    """
    def _to_matrix(frame):
        # Feature matrix = every column except the label.
        return np.asarray(frame.drop(['attack_ID'], axis=1))

    low_id_rows = df[df['attack_ID'] < 10]
    high_id_rows = df[df['attack_ID'] >= 10]

    cutoff = int(low_id_rows.shape[0] * train_prop)

    # Shuffle order (train, test, attacks) is kept so the global RNG is
    # consumed in the same sequence.
    df_train = low_id_rows[:cutoff].sample(frac=1)
    df_test = low_id_rows[cutoff:].sample(frac=1)
    df_attacks = high_id_rows.sample(frac=1)

    x_train, x_test, x_attacks = (_to_matrix(part)
                                  for part in (df_train, df_test, df_attacks))

    return (x_train, x_test, x_attacks), (df_train, df_test, df_attacks)

In [None]:
# Column-name fragments: a mean-value column containing one of these is
# divided by the 'dimension' column in the data-preparation cells.
downscale_features = [
    'temporal_psnr',
    'temporal_ssim',
    'temporal_cross_correlation',
]

# Column-name fragments: a mean-value column containing one of these is
# multiplied by the 'dimension' column in the data-preparation cells.
upscale_features = [
    'temporal_difference',
    'temporal_dct',
    'temporal_canny',
    'temporal_gaussian_mse',
    'temporal_gaussian_difference',
    'temporal_histogram_distance',
    'temporal_entropy',
    'temporal_lbp',
    'temporal_texture',
    'temporal_match',
]

# NOTE(review): this list is not referenced by any cell visible in this
# notebook; presumably a feature selection kept for reference - confirm.
features = [
    'dimension',
    'size',
    'temporal_dct-mean',
    'temporal_gaussian_mse-mean',
    'temporal_gaussian_difference-mean',
    'temporal_threshold_gaussian_difference-mean',
]

# Data Preparation

In [None]:
path = '../../machine_learning/cloud_functions/data-large.csv'
# Falsy: keep every row. Truthy: used as the slice bound, i.e. keep only the
# first `reduced` rows of the CSV.
reduced = False

loaded_rows = pd.read_csv(path)
df = pd.DataFrame(loaded_rows[:reduced] if reduced else loaded_rows)

# Drop the temporary reference; only `df` is used from here on.
del loaded_rows

print('ORIGINAL DATASET:')
display(df.head())


In [None]:
# Map every raw 'attack' value through MetricProcessor.set_attack_name, then
# derive 'attack_ID' from the mapped value with MetricProcessor.set_attack_id.
# (Both helpers live in the project-local metric_processor module.)
df['attack'] = df['attack'].apply(MetricProcessor.set_attack_name)
df['attack_ID'] = df['attack'].apply(MetricProcessor.set_attack_id)

# Size-to-dimension ratio per row.
df['size_dimension_ratio'] = df['size'].div(df['dimension'])

# Not the last expression of the cell, so this value is not rendered.
df.shape
display(df.head(100))

In [None]:
print('Sampling dataframe')
time_series_df = df[[column for column in df.columns if 'series' in column]]
display(time_series_df.head())
samples_number = 60

start_time = time.time()

time_series_df = parallelize_dataframe(time_series_df, convert_to_numpy)

elapsed_time = time.time() - start_time
print('Conversion time:', elapsed_time)

start_time = time.time()
%reset -f out
display(time_series_df.head())
mean_values_df = parallelize_dataframe(time_series_df, compute_mean)
elapsed_time = time.time() - start_time
print('Mean computation time:', elapsed_time)
mean_values_df['dimension'] = df['dimension']
mean_values_df['size_dimension_ratio'] = df['size_dimension_ratio']
mean_values_df['attack_ID'] = df['attack_ID']
for column in time_series_df.columns:
    
    for label in downscale_features:
        if label in column:
            print('Upscaling', label)
            mean_values_df[column] = mean_values_df[column] / mean_values_df['dimension']
    for label in upscale_features:
        if label in column:
            print('Downscaling', label)
            mean_values_df[column] = mean_values_df[column] * mean_values_df['dimension']
display(mean_values_df)

In [None]:

(X_train, X_test, X_attacks), (df_train, df_test, df_attacks) = split_test_and_train(mean_values_df)

print('Shape of train: {}'.format(X_train.shape))
print('Shape of test: {}'.format(X_test.shape))
print('Shape of attacks: {}'.format(X_attacks.shape))

# Scaling the data: statistics come from the train split only and are reused
# for the test and attack splits.
ss = StandardScaler()
x_train = ss.fit_transform(X_train)
x_test = ss.transform(X_test)
x_attacks = ss.transform(X_attacks)

# One Class SVM

# Dataframe to store results
svm_results = pd.DataFrame(columns=['gamma', 'nu', 'n_components', 'TPR_test',
                                    'TNR', 'model', 'auc', 'f_beta', 'projection'])

# Train the models (project-local helper; it returns the filled results frame)
svm_results = evaluation.one_class_svm(x_train, x_test, x_attacks, svm_results)
ranked_svm_results = svm_results.sort_values('f_beta', ascending=False)
display(ranked_svm_results.head())

# Save the best model: the row with the highest f_beta
best_svm = ranked_svm_results.iloc[0]
projection = best_svm['projection']

reduction = None
if projection == 'PCA':
    reduction = PCA(n_components=best_svm['n_components'])
else:
    # No known projection: train and evaluate in the scaled feature space.
    print('Unknown projection type')
    X_reduced = x_train
    attack_reduced = x_attacks
    test_reduced = x_test

# Explicit None check instead of relying on the estimator's truthiness.
if reduction is not None:
    X_reduced = reduction.fit_transform(x_train)
    attack_reduced = reduction.transform(x_attacks)
    test_reduced = reduction.transform(x_test)
    # Context manager so the file is flushed and closed right after dumping.
    with open('../output/models/reduction_OCSVM.pickle.dat', 'wb') as reduction_file:
        pickle.dump(reduction, reduction_file)


OCSVM = svm.OneClassSVM(kernel='rbf',gamma=best_svm['gamma'], nu=best_svm['nu'], cache_size=5000)

OCSVM.fit(X_reduced)

    

In [None]:
samples_series = [5, 10, 15, 20 , 25, 30, 35, 40, 45, 50, 55]

result_columns = ['#samples', 'f20', 'tnr', 'tpr_train', 'tpr_test']
# Rows are collected in a plain list and turned into a frame once per sample
# size: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
sample_records = []
sample_df = pd.DataFrame(columns=result_columns)
for n in samples_series:
    print('Number of samples:', n)
    # Read by compute_mean_samples inside the worker processes.
    # NOTE(review): this only reaches the workers when pools are created by
    # fork; with the 'spawn' start method they would not see the update.
    samples_number = n
    for _ in range(1000):

        start_time = time.time()

        mean_values_df = parallelize_dataframe(time_series_df, compute_mean_samples)
        mean_values_df['dimension'] = df['dimension']
        mean_values_df['size_dimension_ratio'] = df['size_dimension_ratio']
        mean_values_df['attack_ID'] = df['attack_ID']
        for column in time_series_df.columns:

            for label in downscale_features:
                if label in column:
                    mean_values_df[column] = mean_values_df[column] / mean_values_df['dimension']
            for label in upscale_features:
                if label in column:
                    mean_values_df[column] = mean_values_df[column] * mean_values_df['dimension']


        (X_train, X_test, X_attacks), (df_train, df_test, df_attacks) = split_test_and_train(mean_values_df)

        # Scaling the data: fit on train only. The test split was previously
        # re-fitted (fit_transform), which scaled test and attack data with
        # test statistics, unlike the cell that trained the model.
        ss = StandardScaler()
        x_train = ss.fit_transform(X_train)
        x_test = ss.transform(X_test)
        x_attacks = ss.transform(X_attacks)

        # OCSVM was fitted on the projected data when a projection was
        # selected, so the same fitted projection has to be applied here.
        if reduction is not None:
            x_train = reduction.transform(x_train)
            x_test = reduction.transform(x_test)
            x_attacks = reduction.transform(x_attacks)

        fb, area, tnr, tpr_train, tpr_test = evaluation.unsupervised_evaluation(OCSVM, x_train,
                                                                             x_test, x_attacks)
        sample_records.append({'#samples': n,
                               'f20': fb,
                               'tnr': tnr,
                               'tpr_train': tpr_train,
                               'tpr_test': tpr_test})
        del mean_values_df, X_train, X_test, X_attacks, df_train, df_test, df_attacks
        gc.collect()
        elapsed_time = time.time() - start_time
        print('Computation time:', elapsed_time)
    # Cumulative results so far (all sample sizes processed up to n).
    sample_df = pd.DataFrame(sample_records, columns=result_columns)
    sample_df.to_csv('Samples-{}.csv'.format(n))
    display(sample_df)

# Compute PDFs and run t-tests to extract confidence intervals

In [None]:
# Load the result files written by the sampling experiment and stack them
# into a single frame.
path1 = 'Samples-5-20.csv'
path2 = 'Samples-25-55.csv'

sample_df = pd.concat([pd.read_csv(csv_path) for csv_path in (path1, path2)])
# 'Unnamed: 0' is the index column that to_csv wrote into the files.
sample_df = sample_df.drop(columns='Unnamed: 0')

print('ORIGINAL DATASET:')
print(sample_df.shape)
display(sample_df.head())

In [None]:
# Standard deviation of every metric column, grouped by the number of samples
# drawn ('#samples'); rendered as the cell output.
sample_df.groupby('#samples').std()

In [None]:
# Display PDFs: for every sample count, show summary statistics, a normal
# Q-Q plot and a histogram of the selected metric.
from matplotlib.ticker import StrMethodFormatter
import matplotlib.pyplot as plt
from scipy.stats import probplot

samples_series = [5, 10, 15, 20 , 25, 30, 35, 40, 45, 50, 55]
variable = 'tnr'
for sample_number in samples_series:

    samples_x_frames_df = sample_df[sample_df['#samples']==sample_number]
    display(samples_x_frames_df.describe())

    # Normal Q-Q plot. `qqplot` was never imported in this notebook, so the
    # SciPy equivalent is used; it draws the ordered data against normal
    # quantiles together with a least-squares reference line.
    qq_fig, qq_ax = plt.subplots(figsize=(6, 6))
    probplot(samples_x_frames_df[variable].values, dist='norm', plot=qq_ax)
    qq_ax.set_title('Normal Q-Q plot: {} (samples = {})'.format(variable, sample_number))

    ax = samples_x_frames_df.hist(column=variable, bins=25, grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9)

    # DataFrame.hist returns a 2-D array of axes; take its first row.
    ax = ax[0]
    for x in ax:

        # Despine
        x.spines['right'].set_visible(False)
        x.spines['top'].set_visible(False)
        x.spines['left'].set_visible(False)

        # Switch off ticks but keep the labels. Booleans are required: the
        # strings "off"/"on" are both truthy and would leave the ticks on.
        x.tick_params(axis="both", which="both", bottom=False, top=False, labelbottom=True, left=False, right=False, labelleft=True)

        # Draw horizontal axis lines
        vals = x.get_yticks()
        for tick in vals:
            x.axhline(y=tick, linestyle='dashed', alpha=0.4, color='#eeeeee', zorder=1)

        # Remove title
        x.set_title("")

        # Set x-axis label
        x.set_xlabel("Variable: {} (samples = {})".format(variable, sample_number), labelpad=20, weight='bold', size=12)

        # Set y-axis label
        x.set_ylabel("Predictions", labelpad=20, weight='bold', size=12)

        # Format y-axis label
        x.yaxis.set_major_formatter(StrMethodFormatter('{x:,g}'))

# The module is imported as `plt`; `pyplot` was an undefined name.
plt.show()

In [None]:
# Compute the data normality: run the Shapiro-Wilk and D'Agostino tests on
# the selected metric for every sample count.
from scipy.stats import shapiro
from scipy.stats import normaltest

test_methods = ['Shapiro', 'DAgostino']

# One row per entry of samples_series, in the same order. Rows are collected
# in lists because DataFrame.append was removed in pandas 2.0.
normality_stat_rows = []
normality_p_rows = []

for sample_number in samples_series:

    samples_x_frames_df = sample_df[sample_df['#samples']==sample_number]

    stat_shapiro, p_shapiro = shapiro(samples_x_frames_df[variable])
    stat_dagostino, p_dagostino = normaltest(samples_x_frames_df[variable])

    normality_p_rows.append([p_shapiro, p_dagostino])
    normality_stat_rows.append([stat_shapiro, stat_dagostino])

normality_stats_df = pd.DataFrame(normality_stat_rows, columns=test_methods)
normality_p_df = pd.DataFrame(normality_p_rows, columns=test_methods)

# Each label is printed before the table it describes.
print('P-values')
display(normality_p_df)
print('Stats')
display(normality_stats_df)

In [None]:
# Compute t-statistics and p-values for each pair
from scipy.stats import ttest_ind

def make_test(func, params=None):
    """Run a two-sample test on every pair of sample counts and display the results.

    For each ordered pair ``(a, b)`` of entries in ``samples_series`` the
    values of ``variable`` for ``#samples == a`` and ``#samples == b`` are
    taken from ``sample_df`` and passed to ``func``. The resulting statistics
    and p-values are shown as two square tables indexed by sample count on
    both axes. Pairs for which ``func`` raises are reported and left as NaN.

    Parameters
    ----------
    func : callable
        Test taking two samples and returning an object with ``statistic``
        and ``pvalue`` attributes (e.g. the ``scipy.stats`` tests).
    params : dict, optional
        Extra keyword arguments forwarded to ``func`` (previously accepted
        but silently ignored).

    Returns
    -------
    None
    """
    print(func)
    extra_kwargs = params or {}

    # Pre-sized NaN tables keep rows and columns aligned even when some
    # pairs fail.
    stats_values_df = pd.DataFrame(index=samples_series, columns=samples_series, dtype=float)
    p_values_df = pd.DataFrame(index=samples_series, columns=samples_series, dtype=float)

    # Slice each group once instead of once per pair.
    groups = {
        number: sample_df[sample_df['#samples'] == number][variable].values
        for number in samples_series
    }

    for row_number in samples_series:
        for column_number in samples_series:
            try:
                test = func(groups[row_number], groups[column_number], **extra_kwargs)
            except Exception as exc:
                # Best effort: some tests reject certain pairs (e.g. a group
                # compared with itself); report the reason and carry on.
                print('Error', column_number, exc)
                continue

            stats_values_df.loc[row_number, column_number] = test.statistic
            p_values_df.loc[row_number, column_number] = test.pvalue

    display(stats_values_df)
    print('stats')

    display(p_values_df)
    print('p-values')


In [None]:
from scipy.stats import bartlett, levene, f_oneway, wilcoxon, kruskal
#make_test(ttest_ind, {'equal_var': False})
# Show floats with six decimals in every dataframe displayed from here on.
pd.options.display.float_format = '{:,.6f}'.format

# Run each test over all pairs of sample counts, in this order.
for pairwise_test in (bartlett, levene, f_oneway, wilcoxon, kruskal):
    make_test(pairwise_test)

In [None]:
from scipy.stats import friedmanchisquare

# Friedman test across all sample counts at once: one group of `variable`
# values per entry of samples_series (5, 10, ..., 55), in that order.
# The two leftover assignments that used to open this cell were dropped: one
# referenced `samples_x_frames_df_x`, which exists only as a local inside
# make_test and therefore raised NameError here, and neither result was used.
friedman_groups = [
    sample_df[sample_df['#samples'] == group_size][variable].values
    for group_size in samples_series
]
test = friedmanchisquare(*friedman_groups)
print(test)

In [None]:
# Median of every column for each sample count, one row per entry of
# samples_series (same order). Built in a single constructor call because
# DataFrame.append was removed in pandas 2.0.
median_rows = [
    sample_df[sample_df['#samples'] == sample_number].median()
    for sample_number in samples_series
]
medians_df = pd.DataFrame(median_rows, columns=sample_df.columns).reset_index(drop=True)
display(medians_df)