# Import Libraries

In [None]:
import sys
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score, roc_curve, auc
from sklearn import svm
from sklearn.ensemble import IsolationForest


from keras.layers import Input, Dense
from keras.models import Model
from keras import regularizers
from keras.optimizers import Adam

from itertools import product
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import pickle
import json

# Let pandas render up to 999 columns before eliding them in displayed frames.
pd.options.display.max_columns = 999

# Put the project's modeling toolbox first on the import path (relative to the
# notebook's working directory) so the local modules below can be imported.
sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

# Project-local modules, resolved through the sys.path entry added above.
from metric_processor import MetricProcessor
import evaluation

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Data Preparation

In [None]:
# Column names handed to MetricProcessor. 'dimension' and 'size' are combined
# into a single ratio column further down and then dropped from the frame.
features = ['dimension',
            'size',
            'temporal_dct-mean', 
            'temporal_gaussian_mse-mean', 
            'temporal_gaussian_difference-mean',
            'temporal_threshold_gaussian_difference-mean',
            #'temporal_match-mean'
           ]


# Location of the input CSV, relative to the notebook's working directory.
path = '../../machine_learning/cloud_functions/data-large.csv'

metric_processor = MetricProcessor(features,'UL', path, reduced=False, bins=0)
df = metric_processor.read_and_process_data()
# Replace the two raw columns by their ratio (size per unit of dimension).
df['size_dimension_ratio'] = df['size'] / df['dimension']
df = df.drop(['dimension', 'size'], axis=1)
# NOTE(review): this is the same list object that was passed to MetricProcessor
# above, so the append is visible to it if it kept a reference -- confirm that
# is intended. The list also still names 'dimension' and 'size', which are no
# longer columns of df.
features.append('size_dimension_ratio')
# Last expression of the cell: displays (rows, columns) of the processed frame.
df.shape

In [None]:
# Show the first rows of the processed frame, followed by the distinct values
# found in its 'attack' column.
display(df.head(), pd.DataFrame(df['attack'].unique()))

In [None]:
# split_test_and_train yields two triples ordered (train, test, attacks): the
# first is unpacked into the X_* objects, the second into the df_* objects.
split_arrays, split_frames = metric_processor.split_test_and_train(df)
X_train, X_test, X_attacks = split_arrays
df_train, df_test, df_attacks = split_frames

print('Shape of train: {}'.format(X_train.shape))
print('Shape of test: {}'.format(X_test.shape))
print('Shape of attacks: {}'.format(X_attacks.shape))

In [None]:
# Scaling the data: the scaler is fitted on the training split only, and the
# same fitted transformation is then applied to the test and attack splits.
ss = StandardScaler()
x_train = ss.fit_transform(X_train)
x_test = ss.transform(X_test)
x_attacks = ss.transform(X_attacks)

# Save the scaler for inference. The context manager guarantees the file is
# flushed and closed (the bare open() previously leaked the handle).
with open('../output/models/UL_StandardScaler.pickle.dat', 'wb') as scaler_file:
    pickle.dump(ss, scaler_file)

# One Class SVM

In [None]:
# Columns of the table in which the One-Class SVM search stores its results.
svm_result_columns = ['gamma', 'nu', 'n_components', 'TPR_test',
                      'TNR', 'model', 'auc', 'f_beta', 'projection']

# Train the models: the helper receives an empty results table and hands back
# the table that is kept as svm_results.
svm_results = evaluation.one_class_svm(
    x_train, x_test, x_attacks, pd.DataFrame(columns=svm_result_columns))



In [None]:
# Five One-Class SVM configurations with the highest f_beta, best first.
svm_results.sort_values(by='f_beta', ascending=False).head(5)

In [None]:
# Save the best model: refit the One-Class SVM configuration with the highest
# f_beta on the training split and persist everything needed for inference.
best_svm = svm_results.sort_values('f_beta', ascending=False).iloc[0]
projection = best_svm['projection']

# Optional dimensionality reduction chosen by the search ('PCA' or 'RP').
reduction = None
if projection == 'PCA':
    reduction = PCA(n_components=best_svm['n_components'])
elif projection == 'RP':
    reduction = random_projection.SparseRandomProjection(n_components=best_svm['n_components'])
else:
    print('Unknown projection type')

if reduction is not None:
    # Fit the reduction on train only, then apply it to the other splits.
    X_reduced = reduction.fit_transform(x_train)
    attack_reduced = reduction.transform(x_attacks)
    test_reduced = reduction.transform(x_test)
    with open('../output/models/reduction_OCSVM.pickle.dat', 'wb') as reduction_file:
        pickle.dump(reduction, reduction_file)
else:
    # No recognised projection: work directly on the scaled features.
    X_reduced = x_train
    attack_reduced = x_attacks
    test_reduced = x_test


OCSVM = svm.OneClassSVM(kernel='rbf',gamma=best_svm['gamma'], nu=best_svm['nu'], cache_size=5000)

OCSVM.fit(X_reduced)

# Context managers close the output files deterministically (the bare open()
# calls previously leaked their handles).
with open('../output/models/OCSVM.pickle.dat', 'wb') as model_file:
    pickle.dump(OCSVM, model_file)

# Store the winning hyper-parameters together with the feature list.
best_svm_params = best_svm.to_dict()
best_svm_params['features'] = features
with open('../output/models/param_OCSVM.json', 'w') as fp:
    json.dump(best_svm_params, fp)

In [None]:
# Study the attacks that pass through the network, i.e. the attack samples for
# which the model predicts +1.

y_pred_outliers = OCSVM.predict(attack_reduced)
# Reset the index so rows line up positionally with the prediction vector.
df_attacks_reset = df_attacks.reset_index()

accurate_outliers_df = pd.DataFrame({'pred': y_pred_outliers})
passed_mask = accurate_outliers_df['pred'] == 1
undetected = df_attacks_reset.loc[passed_mask, ['attack', 'title']]
# Number of undetected samples per attack type.
undetected.groupby(['attack']).count()

In [None]:
# ROC plot for the saved One-Class SVM, using the test and attack splits.
evaluation.plot_roc(
    OCSVM,
    test_reduced,
    attack_reduced,
    'OCSVM ROC',
)

# Isolation Forest

In [None]:
# Columns of the table in which the Isolation Forest search stores its results.
isolation_result_columns = ['estimators', 'contamination', 'n_components', 'max_features',
                            'TPR_test','TPR_train' , 'TNR', 'model', 'auc', 'f_beta', 'projection']

# Train the models: the helper receives an empty results table and hands back
# the table that is kept as isolation_results.
isolation_results = evaluation.isolation_forest(
    x_train, x_test, x_attacks, pd.DataFrame(columns=isolation_result_columns))

In [None]:
# Five Isolation Forest configurations with the highest f_beta, best first.
isolation_results.sort_values(by='f_beta', ascending=False).head(5)

In [None]:
# Save the best model: refit the Isolation Forest configuration with the
# highest f_beta on the training split and persist it for inference.
best_isolation = isolation_results.sort_values('f_beta', ascending=False).iloc[0]
projection = best_isolation['projection']

# Start from a clean slate so a reduction object left over from an earlier
# cell can never be reused for this model by accident.
reduction = None
if projection == 'PCA':
    # n_components must come from the Isolation Forest row, not the SVM one.
    reduction = PCA(n_components=best_isolation['n_components'])
elif projection == 'RP':
    reduction = random_projection.SparseRandomProjection(n_components=best_isolation['n_components'])
else:
    print('Unknown projection type')

if reduction is not None:
    # Fit the reduction on train only, then apply it to the other splits.
    X_reduced = reduction.fit_transform(x_train)
    attack_reduced = reduction.transform(x_attacks)
    test_reduced = reduction.transform(x_test)
    with open('../output/models/reduction_IF.pickle.dat', 'wb') as reduction_file:
        pickle.dump(reduction, reduction_file)
else:
    # No recognised projection: use the scaled features for every split so the
    # evaluation cells below do not pick up stale reduced arrays.
    X_reduced = x_train
    attack_reduced = x_attacks
    test_reduced = x_test


isolation_forest = IsolationForest(n_estimators=best_isolation['estimators'],
                             contamination=best_isolation['contamination'],
                             max_features=best_isolation['max_features'],
                             n_jobs=7)

isolation_forest.fit(X_reduced)

# Context managers close the output files deterministically.
with open('../output/models/IF.pickle.dat', 'wb') as model_file:
    pickle.dump(isolation_forest, model_file)

# Store the winning Isolation Forest hyper-parameters with the feature list
# (previously the One-Class SVM parameters were written to this file).
best_isolation_params = best_isolation.to_dict()
best_isolation_params['features'] = features
with open('../output/models/param_IF.json', 'w') as fp:
    json.dump(best_isolation_params, fp)

In [None]:
# Study the attacks that pass through the network, i.e. the attack samples for
# which the model predicts +1.

y_pred_outliers = isolation_forest.predict(attack_reduced)
# Reset the index so rows line up positionally with the prediction vector.
df_attacks_reset = df_attacks.reset_index()

accurate_outliers_df = pd.DataFrame({'pred': y_pred_outliers})
passed_mask = accurate_outliers_df['pred'] == 1
undetected = df_attacks_reset.loc[passed_mask, ['attack', 'title']]
# Number of undetected samples per attack type.
undetected.groupby(['attack']).count()

In [None]:
# ROC plot for the saved Isolation Forest, using the test and attack splits.
evaluation.plot_roc(
    isolation_forest,
    test_reduced,
    attack_reduced,
    'Isolation Forest ROC',
)

# Autoencoder

In [None]:
# Autoencoder with a single hidden (bottleneck) layer of latent_dim ReLU units
# and a linear output layer as wide as the input.
latent_dim = 3
n_inputs = X_train.shape[1]

input_vector = Input(shape=(n_inputs,))
encoded = Dense(latent_dim, activation='relu')(input_vector)
# L1 activity regularisation on the reconstruction layer (10e-5 == 1e-4).
decoded = Dense(n_inputs, activity_regularizer=regularizers.l1(10e-5))(encoded)

# Full model (input -> reconstruction) and the encoder half (input -> latent),
# both built on the same layers.
autoencoder = Model(input_vector, decoded)
encoder = Model(input_vector, encoded)

autoencoder.compile(optimizer=Adam(lr=0.001), loss='mse')

# Train to reproduce the scaled training data; the test split is used only as
# validation data.
network_history = autoencoder.fit(
    x_train, x_train,
    shuffle=True,
    batch_size=16,
    epochs=100,
    validation_data=(x_test, x_test),
    verbose=True,
)


In [None]:
def plot_history(network_history, title):
    """Plot training and validation loss per epoch on a logarithmic y-axis.

    network_history: object whose ``history`` mapping holds the 'loss' and
        'val_loss' sequences (as returned by the model's ``fit`` call).
    title: text used as the figure title.
    """
    plt.figure(figsize=(10, 5))
    # Training curve first, validation second, to match the legend order.
    for curve in ('loss', 'val_loss'):
        plt.semilogy(network_history.history[curve])
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend(['Training', 'Validation'])
    plt.grid()
    plt.show()


plot_history(network_history, 'AE')

In [None]:
# Report the autoencoder loss on each split, reconstructing each split from itself.
for message, split in (('Mean loss on train: {}', x_train),
                       ('Mean loss on test: {}', x_test),
                       ('Mean loss on attacks: {}', x_attacks)):
    print(message.format(autoencoder.evaluate(split, split, batch_size=8, verbose=False)))

In [None]:
# Reconstruct every split with the trained autoencoder (train, test, attacks).
x_train_pred, x_test_pred, x_attacks_pred = (
    autoencoder.predict(split, batch_size=8)
    for split in (x_train, x_test, x_attacks)
)

In [None]:
# Per-sample reconstruction error: mean squared difference across features.
mse_train = (((x_train - x_train_pred)**2).mean(axis=1))
mse_test = (((x_test - x_test_pred)**2).mean(axis=1))
mse_attacks = (((x_attacks - x_attacks_pred)**2).mean(axis=1))

plt.figure()
# density=True normalises each histogram to unit area so splits of different
# sizes are comparable. It replaces the `normed` keyword, which matplotlib
# deprecated and later removed from hist().
kwargs = dict(histtype='stepfilled', alpha=0.3, density=True, bins=200)

plt.hist(mse_train, **kwargs)
plt.hist(mse_test, **kwargs)
plt.hist(mse_attacks, **kwargs)
plt.legend(['Train', 'Test', 'Attacks'])
plt.title('Histograms of mse')
# Restrict the view to errors in [0, 5].
plt.xlim([0, 5])


In [None]:
# Project every split into the latent space with the encoder half of the model.
x_train_red, x_test_red, x_attacks_red = (
    encoder.predict(split, batch_size=8)
    for split in (x_train, x_test, x_attacks)
)

In [None]:
# Scatter the first two latent coordinates of each split. Drawing order is
# attacks, then train, then test, so later groups are painted on top.
plt.figure(figsize=(20,10))
for points, colour, name in ((x_attacks_red, 'red', 'attack'),
                             (x_train_red, 'green', 'Train'),
                             (x_test_red, 'yellow', 'Test')):
    plt.scatter(points[:, 0], points[:, 1], color=colour, label=name)
plt.legend()

In [None]:
# One-Class SVM trained on the autoencoder's latent representation.
nus = [0.01]

ae_svm_columns = ['gamma', 'nu', 'n_components', 'TPR_train',
                  'TPR_test', 'TNR', 'model', 'auc', 'f_beta']

# Candidate gammas derived from the latent dimensionality, plus sklearn's 'auto'.
gammas = [x_train_red.shape[1], 2*x_train_red.shape[1], x_train_red.shape[1]/2, 'auto']

# Ground truth for the evaluation set: +1 for every test sample, -1 for every
# attack sample. It does not depend on the hyper-parameters, so build it once.
y_true = np.concatenate([np.ones(x_test_red.shape[0]),
                         -1*np.ones(x_attacks_red.shape[0])])

# Collect one record per configuration and build the frame once at the end
# (DataFrame.append was removed in pandas 2.0 and copied the frame every time).
ae_svm_records = []
for nu in nus:
    for gamma in gammas:
        classifier = svm.OneClassSVM(kernel='rbf',gamma=gamma, nu=nu, cache_size=5000)
        # Fit and score on the encoder outputs. The previous code used
        # X_reduced/test_reduced/attack_reduced, which are leftovers from the
        # Isolation Forest cell and unrelated to the autoencoder.
        classifier.fit(x_train_red)
        y_pred_train = classifier.predict(x_train_red)
        y_pred_test = classifier.predict(x_test_red)
        y_pred_outliers = classifier.predict(x_attacks_red)
        n_accurate_train = y_pred_train[y_pred_train == 1].size
        n_accurate_test = y_pred_test[y_pred_test == 1].size
        n_accurate_outliers = y_pred_outliers[y_pred_outliers == -1].size

        y_pred = np.concatenate([y_pred_test, y_pred_outliers])
        fpr, tpr, _ = roc_curve(y_true, y_pred, pos_label=1)
        fb = fbeta_score(y_true, y_pred, beta=20, pos_label=1)

        ae_svm_records.append({'nu': nu, 'gamma': gamma, 'n_components': latent_dim,
                               'TPR_train': n_accurate_train/x_train_red.shape[0],
                               'TPR_test': n_accurate_test/x_test_red.shape[0],
                               'TNR': n_accurate_outliers/x_attacks_red.shape[0],
                               'model': 'ae-svm', 'auc': auc(fpr, tpr), 'f_beta': fb})

ae_svm_results = pd.DataFrame(ae_svm_records, columns=ae_svm_columns)


In [None]:
# Columns of the table in which the autoencoder search stores its results.
ae_result_columns = ['gamma', 'nu', 'n_components', 'TPR_train',
                     'TPR_test', 'TNR', 'model', 'auc', 'f_beta']

# Train the models: the helper receives an empty results table and hands back
# the table that is (re)bound to ae_svm_results.
ae_svm_results = evaluation.autoencoder(
    x_train, x_test, x_attacks, pd.DataFrame(columns=ae_result_columns))

In [None]:
# Five autoencoder-based configurations with the highest f_beta, best first.
ae_svm_results.sort_values(by='f_beta', ascending=False).head(5)