# Import Libraries

In [None]:
import json
import pickle
import sys
from itertools import product

import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import random_projection
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import fbeta_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler

pd.options.display.max_columns = 999

sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

from metric_processor import MetricProcessor
import evaluation

%matplotlib inline

# Data Preparation

In [None]:
# Metrics handed to the processor. 'dimension' and 'size' are only used to
# derive the ratio column below and are dropped from the frame afterwards.
features = [
    'dimension',
    'size',
    'temporal_dct-mean',
    'temporal_gaussian_mse-mean',
    'temporal_gaussian_difference-mean',
    'temporal_threshold_gaussian_difference-mean',
    # 'temporal_match-mean'  (disabled)
]

path = '../../machine_learning/cloud_functions/data-large.csv'

metric_processor = MetricProcessor(features, 'UL', path, reduced=False, bins=0)
df = metric_processor.read_and_process_data()

# Swap the two raw columns for a single size/dimension ratio.
df['size_dimension_ratio'] = df['size'].div(df['dimension'])
df = df.drop(columns=['dimension', 'size'])
df.shape

In [None]:
def compute_mean_samples(input_values):
    """Return the mean of a random sub-sample drawn from a serialized series.

    Parameters
    ----------
    input_values : tuple
        ``(x, samples_number)`` where ``x`` is a string holding whitespace
        separated numbers, optionally wrapped in ``[`` / ``]``, and
        ``samples_number`` is how many values to draw.

    Returns
    -------
    float
        Mean of ``samples_number`` values drawn with replacement from the
        parsed series (``np.random.choice`` defaults).

    Raises
    ------
    ValueError
        If the series contains no values or a token is not a valid number.
    """
    x, samples_number = input_values

    # Split on any whitespace so malformed tokens raise instead of being
    # silently dropped; the builtin float is used because the np.float alias
    # no longer exists in current NumPy releases.
    tokens = x.replace('[', '').replace(']', '').split()
    if not tokens:
        raise ValueError('Cannot sample from an empty series: {!r}'.format(x))
    y = np.array(tokens, dtype=float)

    # Uses the global NumPy random state, so np.random.seed() controls the draw.
    random_sampler = np.random.choice(y, samples_number)

    return np.mean(random_sampler)

In [None]:
# Build sub-sampled versions of every temporal feature: for each sample count n
# the per-row '<metric>-series' string is randomly sampled n times and the mean
# of the draws is stored in a new '<n>-<feature>' column of df.
print('Sampling dataframe')
samples_number = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
samples_features = []
print('Features', features)
# Iterate over a copy because temporal features are removed from the list below.
for feature in features.copy():
    print('Sampling {}'.format(feature))
    if 'temporal' not in feature:
        continue

    series_feature = '{}-series'.format(feature.split('-')[0])
    # Strip the list brackets once per feature (not once per n). regex=False
    # because '[' is a regex metacharacter and a literal match is intended.
    df[series_feature] = (df[series_feature]
                          .str.replace('[', '', regex=False)
                          .str.replace(']', '', regex=False))

    for n in samples_number:
        print('n=', n)
        sample_feature = '{}-{}'.format(n, feature)
        # Bind n as a default argument so each lambda keeps its own sample count.
        df[sample_feature] = df[series_feature].apply(
            lambda x, n=n: compute_mean_samples((x, n)))
        samples_features.append(sample_feature)

    features.remove(feature)

print(samples_features)
# NOTE(review): `features` is mutated in place; if MetricProcessor kept a
# reference to this list it will see the new names too -- confirm this is wanted.
features.extend(samples_features)
print(features)
            

In [None]:
# Preview the first 100 rows, then list the distinct values of 'attack'.
preview_rows = df.head(100)
attack_values = pd.DataFrame(df['attack'].unique())
display(preview_rows)
display(attack_values)

In [None]:

# Sample counts iterated by the evaluation loop later in the notebook; the
# model in this cell is fitted on the columns sampled with `full_sampling` draws.
samples_number = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55]
full_sampling = 60
info_columns = ['attack_ID', 'title', 'attack', 'size_dimension_ratio']

# Keep only the '<n>-<feature>' columns. Any other non-info column (e.g. the
# '<metric>-series' ones) has no numeric prefix and would make int() raise.
sampled_temporal_columns = [
    column for column in df.columns
    if column not in info_columns and column.split('-')[0].isdigit()
]

features_full_sampling = [
    feature for feature in sampled_temporal_columns
    if int(feature.split('-')[0]) == full_sampling
] + info_columns
print('FEATURES:', features_full_sampling)
df_sampling = df[features_full_sampling]
(X_train, X_test, X_attacks), (df_train, df_test, df_attacks) = metric_processor.split_test_and_train(df_sampling)

print('Shape of train: {}'.format(X_train.shape))
print('Shape of test: {}'.format(X_test.shape))
print('Shape of attacks: {}'.format(X_attacks.shape))

# Scaling the data: fit on train only, then apply the same transform elsewhere.
ss = StandardScaler()
x_train = ss.fit_transform(X_train)
x_test = ss.transform(X_test)
x_attacks = ss.transform(X_attacks)

# One Class SVM

# Dataframe to store results
svm_results = pd.DataFrame(columns=['gamma', 'nu', 'n_components', 'TPR_test',
                                    'TNR', 'model', 'auc', 'f_beta', 'projection'])

# Train the models
svm_results = evaluation.one_class_svm(x_train, x_test, x_attacks, svm_results)
display(svm_results.sort_values('f_beta', ascending=False).head())

# Pick the configuration with the highest f_beta and rebuild its projection.
best_svm = svm_results.sort_values('f_beta', ascending=False).iloc[0]
projection = best_svm['projection']

reduction = None
if projection == 'PCA':
    reduction = PCA(n_components=best_svm['n_components'])
elif projection == 'RP':
    reduction = random_projection.SparseRandomProjection(n_components=best_svm['n_components'])
else:
    print('Unknown projection type')

# Default to the scaled, unreduced matrices; overwritten when a projection exists.
X_reduced = x_train
attack_reduced = x_attacks
test_reduced = x_test

if reduction is not None:
    X_reduced = reduction.fit_transform(x_train)
    attack_reduced = reduction.transform(x_attacks)
    test_reduced = reduction.transform(x_test)
    # Context manager guarantees the file is flushed and closed.
    with open('../output/models/reduction_OCSVM.pickle.dat', 'wb') as reduction_file:
        pickle.dump(reduction, reduction_file)


OCSVM = svm.OneClassSVM(kernel='rbf', gamma=best_svm['gamma'], nu=best_svm['nu'], cache_size=5000)

OCSVM.fit(X_reduced)

    

In [None]:
# Write the sampled dataframe to a CSV file in the working directory.
df.to_csv(path_or_buf='sampled_df.csv')

In [None]:
# Evaluate the one-class SVM on each sub-sampled feature set and collect one
# row of metrics per sample count.
# NOTE(review): OCSVM was fitted on X_reduced in the training cell, while the
# matrices passed here are only scaled; whether unsupervised_evaluation refits
# the model is not visible from this notebook -- confirm.
sample_records = []
for n in samples_number:
    # Columns sampled with exactly n draws; names without a numeric prefix are
    # skipped instead of crashing int().
    features = [feature for feature in sampled_temporal_columns
                if feature.split('-')[0].isdigit() and int(feature.split('-')[0]) == n]
    df_sampling = df[features + info_columns]
    (X_train, X_test, X_attacks), (df_train, df_test, df_attacks) = metric_processor.split_test_and_train(df_sampling)

    # Scaling the data: every split must go through the scaler fitted for this
    # n, otherwise a stale x_test from an earlier cell would be evaluated.
    ss = StandardScaler()
    x_train = ss.fit_transform(X_train)
    x_test = ss.transform(X_test)
    x_attacks = ss.transform(X_attacks)

    fb, area, tnr, tpr_train, tpr_test = evaluation.unsupervised_evaluation(OCSVM, x_train,
                                                                         x_test, x_attacks)
    sample_records.append({'f20': fb,
                           'tnr': tnr,
                           'tpr_train': tpr_train,
                           'tpr_test': tpr_test})

# Build the frame once from the collected records; DataFrame.append is not
# available in current pandas releases.
sample_df = pd.DataFrame(sample_records,
                         columns=['f20', 'tnr', 'tpr_train', 'tpr_test'])
display(sample_df)