# Import Libraries

In [None]:
import sys
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score, roc_curve, auc
from sklearn import svm
from sklearn.ensemble import IsolationForest

import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline
from plotly import tools

from itertools import product
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import pickle
import json

# Show up to 999 rows when a DataFrame is displayed
pd.options.display.max_rows = 999

# Put the toolbox directory first on the import path; presumably this is where the
# metric_processor and evaluation modules imported below live
sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

from metric_processor import MetricProcessor
import evaluation

# Render matplotlib figures inline and initialise plotly's offline notebook mode
%matplotlib inline
offline.init_notebook_mode()

# Data Preparation

In [None]:
# Metric columns handed to MetricProcessor
features = ['temporal_dct-mean', 'temporal_gaussian-mean', 'temporal_spatial_complexity-mean',
           'temporal_difference-mean', 'dimension', 'temporal_gaussian_difference-mean']


# Dataset CSV, relative to this notebook's directory
path = '../../machine_learning/cloud_functions/new-attacks-data-large.csv'

# NOTE(review): the meaning of the 'UL' argument is defined by MetricProcessor, which is
# not visible here — presumably "unsupervised learning"; confirm
metric_processor = MetricProcessor(features,'UL', path)
df = metric_processor.read_and_process_data()
df.shape

In [None]:
# Preview the first rows of the processed frame
df.head()

In [None]:
# The splitter returns two triples: the feature matrices (X_*) and the corresponding
# DataFrames (df_*) for the train, test and attack subsets
(X_train, X_test, X_attacks), (df_train, df_test, df_attacks) = metric_processor.split_test_and_train(df)

print('Shape of train: {}'.format(X_train.shape))
print('Shape of test: {}'.format(X_test.shape))
print('Shape of attacks: {}'.format(X_attacks.shape))

In [None]:
# Scaling the data: the scaler is fitted on the training split only and then applied
# unchanged to the test and attack splits
ss = StandardScaler()
x_train = ss.fit_transform(X_train)
x_test = ss.transform(X_test)
x_attacks = ss.transform(X_attacks)

# Fitting model

In [None]:
# One-class SVM (RBF kernel) fitted on the scaled training split only
OCSVM = svm.OneClassSVM(kernel='rbf',gamma='auto', nu=0.01, cache_size=5000)
OCSVM.fit(x_train)

In [None]:
# Project helper (evaluation module, not shown here); returns five values that are
# printed in the next cell as F20, AUC, TNR and train/test TPR
fb, area, tnr, tpr_train, tpr_test = evaluation.unsupervised_evaluation(OCSVM, x_train, x_test, x_attacks)

In [None]:
# Report the metrics returned by the evaluation helper
print('TNR: {}\nTPR_test: {}\nTPR_train: {}\n'.format(tnr, tpr_test, tpr_train))
print('F20: {}\nAUC: {}'.format(fb, area))

# Predicting Distances 

In [None]:
# Signed decision-function value of every sample in each split
# (scikit-learn: positive for inliers, negative for outliers)
train_scores = OCSVM.decision_function(x_train)
test_scores = OCSVM.decision_function(x_test)
attack_scores = OCSVM.decision_function(x_attacks)

# Compare the average score of the three splits
print('Mean score values:\n-Train: {}\n-Test: {}\n-Attacks: {}'.format(np.mean(train_scores),
                                                                       np.mean(test_scores),
                                                                       np.mean(attack_scores)))

In [None]:
# Box plots of the decision-function scores: test split next to the attack split
score_groups = (('test', test_scores), ('attacks', attack_scores))
data = [go.Box(y=group_scores, name=group_name) for group_name, group_scores in score_groups]

layout = dict(
    title='Boxplots',
    yaxis=dict(title='Distance to decision function'),
)

fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)

Negative distances correspond to points outside the decision boundary, which are therefore classified as attacks.

# Error study

Here we study the distances to the decision function, comparing them across attack types and resolutions, in order to gain insight into the model we have built.

In [None]:
# Replace each frame's index with a fresh 0..n-1 range (the old index is kept as a column).
# NOTE(review): the frames are rebound to their own reset_index() result, so running this
# cell again adds another index column each time — run it once per kernel session.
df_train, df_test, df_attacks = df_train.reset_index(), df_test.reset_index(), df_attacks.reset_index()

In [None]:
# Store each sample's decision-function score next to its metadata; the arrays are
# assigned positionally, so row order must match the matrices the scores were computed from
df_train['distance_to_dec_func'] = train_scores
df_test['distance_to_dec_func'] = test_scores
df_attacks['distance_to_dec_func'] = attack_scores

In [None]:
# Distinct values used to group the plots below: resolutions come from the test frame,
# attack labels from the attack frame
resolutions = df_test['dimension'].unique()
attacks = df_attacks['attack'].unique()

In [None]:
# Test set: one box of decision-function distances per resolution, in ascending order
resolutions = np.sort(resolutions)
test_by_resolution = [df_test[df_test['dimension'] == res] for res in resolutions]
data = [
    go.Box(y=subset['distance_to_dec_func'], name=str(res) + 'p', text=subset['title'])
    for res, subset in zip(resolutions, test_by_resolution)
]

# NOTE(review): ticktext is supplied without tickvals; plotly documents ticktext as being
# used together with tickvals, so confirm it has any effect here.
x_axis = go.layout.XAxis(
    title='Resolutions',
    tickmode='array',
    ticktext=[str(i) + 'p' for i in resolutions],
)
y_axis = go.layout.YAxis(title='Distance to decision function')
layout = go.Layout(title=go.layout.Title(text='Test Set'), yaxis=y_axis, xaxis=x_axis)

fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)

In [None]:
# Attack set: one box of decision-function distances per resolution, in ascending order
resolutions = np.sort(resolutions)
attacks_by_resolution = [df_attacks[df_attacks['dimension'] == res] for res in resolutions]
data = [
    go.Box(y=subset['distance_to_dec_func'], name=str(res) + 'p', text=subset['title'])
    for res, subset in zip(resolutions, attacks_by_resolution)
]

x_axis = go.layout.XAxis(
    title='Resolutions',
    tickmode='array',
    ticktext=[str(i) + 'p' for i in resolutions],
)
y_axis = go.layout.YAxis(title='Distance to decision function')
layout = go.Layout(title=go.layout.Title(text='Attack Set'), yaxis=y_axis, xaxis=x_axis)

fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)

In [None]:
# Attack set: one box per attack type, pooling all resolutions.
#
# The attack type is whatever follows the first underscore of the 'attack' label
# (presumably '<resolution>_<type>' — confirm against the dataset). Rows are matched by
# exact equality on that suffix: str.contains() treats its pattern as a regular expression
# and matches substrings, so a type whose name is contained in another type's name would
# also collect that other type's rows.
attack_suffix = df_attacks['attack'].str.split('_', n=1).str[1]

# sorted() keeps the box order stable between runs; iterating a bare set of strings does not.
attack_types = sorted({label.split('_', 1)[1] for label in attacks})

data = []
for attk in attack_types:
    selection = df_attacks[attack_suffix == attk]
    trace = go.Box(y = selection['distance_to_dec_func'], name = attk, text = selection['title'])
    data.append(trace)

layout = go.Layout(
            title=go.layout.Title(text='Attack Set'),
            yaxis = go.layout.YAxis(title = 'Distance to decision function'),
            xaxis = go.layout.XAxis(
                title = 'Attack Type',
                tickmode = 'array',
                ticktext = attack_types
            )
)

fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)

In [None]:
# One figure per resolution, each holding one box per attack type.
#
# Rows are matched by exact equality on the part of the 'attack' label after its first
# underscore (the same rule used to derive attack_types). str.contains() treats its
# pattern as a regular expression and matches substrings, so a type whose name is
# contained in another type's name would also collect that other type's rows.
attack_suffix = df_attacks['attack'].str.split('_', n=1).str[1]

for res in resolutions:
    at_resolution = df_attacks['dimension'] == res  # hoisted: same mask for every attack type

    # Start from an empty trace list for every figure
    data = []
    for attk in attack_types:
        selection = df_attacks[at_resolution & (attack_suffix == attk)]
        trace = go.Box(y = selection['distance_to_dec_func'], name = '{}p-{}'.format(res,attk),
                       text = selection['title'])
        data.append(trace)

    layout = go.Layout(
            title=go.layout.Title(text=str(res)+ 'p'),
            yaxis = go.layout.YAxis(title = 'Distance to decision function'),
            xaxis = go.layout.XAxis(
                title = 'Attack Type',
                tickmode = 'array',
                ticktext = attack_types
            )
    )

    fig = go.Figure(data=data, layout=layout)
    offline.iplot(fig)


# Modelling

In [None]:
from sklearn.neural_network import BernoulliRBM

In [None]:
# Two-component Bernoulli RBM fitted on the scaled training split.
# Earlier run with the same settings (the trailing number is presumably the
# pseudo-likelihood reported by verbose=1 — confirm):
# rbm = BernoulliRBM(n_components=2, verbose=1, batch_size=1024, learning_rate=0.00004, n_iter=1000) # -3.35
# NOTE(review): scikit-learn documents BernoulliRBM as expecting binary inputs or values in
# [0, 1], while x_train comes from StandardScaler — confirm this is intended.
# NOTE(review): no random_state is passed, so results can differ between runs.
rbm = BernoulliRBM(n_components=2, verbose=1, batch_size=1024, learning_rate=0.00004, n_iter=1000)

rbm.fit(x_train)

In [None]:
# Latent (hidden-layer) representation of each split under the fitted RBM
latent_train = rbm.transform(x_train)
latent_test = rbm.transform(x_test)
latent_attacks = rbm.transform(x_attacks)

In [None]:
# Per-component mean of the latent representation for train, test and attacks
np.mean(latent_train, axis=0), np.mean(latent_test, axis=0), np.mean(latent_attacks, axis=0)

In [None]:
th = 0.2

# A sample is flagged True when the sum of its latent components exceeds th.
# (An earlier variant required every one of the two components to exceed th on its own.)
train_pred, test_pred, attacks_pred = (
    [np.sum(row) > th for row in latent]
    for latent in (latent_train, latent_test, latent_attacks)
)

In [None]:
# Confusion counts: True in *_pred is counted as the positive class
true_positives = sum(test_pred)
false_negatives = len(test_pred) - true_positives
false_positives = sum(attacks_pred)
true_negatives = len(attacks_pred) - false_positives

print('Train TPR: {}'.format(sum(train_pred)/len(train_pred)))
print('Test TPR: {}'.format(true_positives/len(test_pred)))
print('TNR: {}'.format(true_negatives/len(attacks_pred)))

# F-beta with beta = 20, i.e. recall weighted far more heavily than precision
beta = 20
beta_squared = beta ** 2
precision = true_positives/(true_positives+false_positives)
recall = true_positives/(true_positives+false_negatives)
F20 = (1 + beta_squared)*precision*recall/(beta_squared*precision + recall)
print('F20: {}'.format(F20))

In [None]:
# Pseudo-likelihood of every sample under the fitted RBM (scikit-learn score_samples)
score_train = rbm.score_samples(x_train)
score_test = rbm.score_samples(x_test)
score_attacks = rbm.score_samples(x_attacks)

In [None]:
# Average score per split
np.mean(score_train), np.mean(score_test), np.mean(score_attacks)

In [None]:
# Threshold = 99th percentile of the training scores; scores below it count as positive.
# NOTE(review): score_samples is a pseudo-likelihood (higher = more probable under the
# model), yet samples BELOW the threshold are the ones treated as positive here — confirm
# the direction is intended.
th = np.quantile(score_train, 0.99)

print('Thresholding the 99% quantile')
print('Train TPR: {}'.format(1 - (score_train > th).sum() / len(score_train)))
print('Test TPR: {}'.format(1 - (score_test > th).sum() / len(score_test)))
print('TNR: {}'.format(1 - (score_attacks < th).sum() / len(score_attacks)))

# Confusion counts (scores exactly equal to th fall in neither bucket)
true_positives = (score_test < th).sum()
false_negatives = (score_test > th).sum()
false_positives = (score_attacks < th).sum()
true_negatives = (score_attacks > th).sum()

# F-beta with beta = 20
beta = 20
beta_squared = beta ** 2
precision = true_positives/(true_positives+false_positives)
recall = true_positives/(true_positives+false_negatives)
F20 = (1 + beta_squared)*precision*recall/(beta_squared*precision + recall)
print('F20: {}'.format(F20))

In [None]:
# First 100 latent points of each split on the same axes; drawing order is attacks,
# train, test, so the default colour cycle assigns colours in that order
plt.scatter(latent_attacks[:100,0], latent_attacks[:100,1])
plt.scatter(latent_train[:100,0], latent_train[:100,1])
plt.scatter(latent_test[:100,0], latent_test[:100,1])

In [None]:
from sklearn import svm

# Linear one-class SVM fitted on the RBM latent representation of the training split.
# This rebinds OCSVM, replacing the model fitted earlier on the scaled features.
# NOTE(review): scikit-learn documents gamma as the coefficient for the 'rbf', 'poly' and
# 'sigmoid' kernels, so gamma='auto' presumably has no effect with kernel='linear'.
OCSVM = svm.OneClassSVM(kernel='linear',gamma='auto', nu=0.01, cache_size=5000)

OCSVM.fit(latent_train)

In [None]:
# Same project evaluation helper as before, now fed the RBM latent representations
fb, area, tnr, tpr_train, tpr_test = evaluation.unsupervised_evaluation(OCSVM, latent_train, latent_test, latent_attacks)

In [None]:
# Report the metrics of the OCSVM fitted on the RBM latents
print('TNR: {}\nTPR_test: {}\nTPR_train: {}\n'.format(tnr, tpr_test, tpr_train))
print('F20: {}\nAUC: {}'.format(fb, area))

In [None]:
def plot_train_history_loss(history):
    """Plot the training and validation loss curves against the epoch number.

    Parameters
    ----------
    history : object
        Anything exposing a ``history`` mapping with 'loss' and 'val_loss' sequences,
        such as the value returned by ``vae.fit`` when validation data is supplied.
    """
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper right')
    plt.show()

# The function is only defined in this cell. `history` does not exist until the VAE is
# fitted further down (that cell calls plot_train_history_loss itself), so calling it
# here raised NameError on a fresh Restart & Run All.

In [None]:
from scipy.stats import norm

from keras.layers import Input, Dense, Lambda, Flatten, Reshape
from keras.layers import Conv2D, Conv2DTranspose
from keras.models import Model
from keras import backend as K
from keras import metrics

from keras.datasets import fashion_mnist

# Hyper-parameters of the variational auto-encoder
batch_size = 512
original_dim = x_train.shape[1]  # one input unit per feature column
latent_dim = 2
intermediate_dim = 2
epochs = 50
epsilon_std = 1.0


# Encoder: input -> hidden layer -> mean and log-variance of the latent distribution
x = Input(shape=(original_dim,))
h = Dense(intermediate_dim, activation='relu')(x)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)


def sampling(args):
    """Draw a latent sample as z_mean + exp(z_log_var / 2) * epsilon.

    ``args`` is the pair (z_mean, z_log_var); epsilon is Gaussian noise with mean 0 and
    standard deviation ``epsilon_std``, with one row per row of z_mean and ``latent_dim``
    columns.
    """
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0.,
                              stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# to reuse these later
decoder_h = Dense(intermediate_dim, activation='relu')
# NOTE(review): the sigmoid output is bounded to (0, 1) while x_train comes from
# StandardScaler (unbounded) — confirm this pairing is intended
decoder_mean = Dense(original_dim, activation='sigmoid')
h_decoded = decoder_h(z)
x_decoded_mean = decoder_mean(h_decoded)

# instantiate VAE model
vae = Model(x, x_decoded_mean)


In [None]:
# Compute VAE loss: reconstruction term (binary cross-entropy scaled by the number of
# input features) plus the KL term built from z_mean and z_log_var, averaged over the batch
xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
vae_loss = K.mean(xent_loss + kl_loss)

# The loss is attached to the model directly, so compile() is given no loss argument
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

In [None]:
# Train the VAE. No target array is passed (the loss was attached with add_loss), and the
# test split is supplied as validation data with None as its target.
history = vae.fit(x_train,
        shuffle=True,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(x_test, None))

# Training vs. validation loss per epoch
plot_train_history_loss(history)

In [None]:
# build a model to project inputs on the latent space
# (the encoder outputs z_mean directly, without the sampling step)
encoder = Model(x, z_mean)

# 'dimension' of each test row, used to colour the latent-space scatter plot
y_test = df_test['dimension']

def plot_latentSpace(encoder, x_test, y_test, batch_size):
    """Scatter the first two encoded dimensions of ``x_test``, coloured by ``y_test``.

    Parameters
    ----------
    encoder : model exposing ``predict(x, batch_size=...)`` that returns a 2-D array
    x_test : samples to encode
    y_test : one colour value per sample (mapped through the 'tab10' colormap)
    batch_size : batch size forwarded to ``encoder.predict``
    """
    encoded = encoder.predict(x_test, batch_size=batch_size)
    first_component, second_component = encoded[:, 0], encoded[:, 1]

    plt.figure(figsize=(6, 6))
    plt.scatter(first_component, second_component, c=y_test, cmap='tab10')
    plt.colorbar()
    plt.show()

# Plot the test split in latent space, then keep its encoding for the OCSVM below
# (the plotting helper returns nothing, so the split is encoded a second time here)
plot_latentSpace(encoder, x_test, y_test, batch_size)
test_latent = encoder.predict(x_test, batch_size=batch_size)


In [None]:
# Same for the training split: plot coloured by 'dimension', then keep the encoding
y_train = df_train['dimension']
plot_latentSpace(encoder, x_train, y_train, batch_size)
train_latent = encoder.predict(x_train, batch_size=batch_size)

In [None]:
# Same for the attack split: plot coloured by 'dimension', then keep the encoding
y_attacks = df_attacks['dimension']
plot_latentSpace(encoder, x_attacks, y_attacks, batch_size)
attacks_latent = encoder.predict(x_attacks, batch_size=batch_size)

In [None]:
from sklearn import svm

# RBF one-class SVM fitted on the VAE encoding of the training split.
# This rebinds OCSVM once more, replacing the previously fitted model.
OCSVM = svm.OneClassSVM(kernel='rbf',gamma='auto', nu=0.01, cache_size=5000)

OCSVM.fit(train_latent)

In [None]:
# Same project evaluation helper, now fed the VAE encodings
fb, area, tnr, tpr_train, tpr_test = evaluation.unsupervised_evaluation(OCSVM, train_latent, test_latent, attacks_latent)

In [None]:
# Report the metrics of the OCSVM fitted on the VAE encodings
print('TNR: {}\nTPR_test: {}\nTPR_train: {}\n'.format(tnr, tpr_test, tpr_train))
print('F20: {}\nAUC: {}'.format(fb, area))

In [None]:
from minisom import MiniSom

In [None]:
# 10x10 self-organising map. The input length is read from the training matrix rather
# than hard-coded, so the map stays valid if the feature list changes.
# NOTE(review): no random_seed is passed, so the initial weights can differ between runs.
som = MiniSom(10, 10, x_train.shape[1], sigma=1, learning_rate=0.05)
som.train_batch(x_train, 100) # trains the SOM with 100 iterations

In [None]:
# Quantization error per sample: Euclidean norm of the difference between the sample and
# the value som.quantization() returns for it. These rebind the score_* names used for
# the RBM scores earlier.
score_train = np.linalg.norm(som.quantization(x_train) - x_train, axis=1)
score_test = np.linalg.norm(som.quantization(x_test) - x_test, axis=1)
score_attacks = np.linalg.norm(som.quantization(x_attacks) - x_attacks, axis=1)

In [None]:
# Average quantization error per split
np.mean(score_train), np.mean(score_test), np.mean(score_attacks)

In [None]:
# Threshold = 99th percentile of the training scores; scores below it count as positive
th = np.quantile(score_train, 0.99)

print('Thresholding the 99% quantile')
print('Train TPR: {}'.format(1 - (score_train > th).sum() / len(score_train)))
print('Test TPR: {}'.format(1 - (score_test > th).sum() / len(score_test)))
print('TNR: {}'.format(1 - (score_attacks < th).sum() / len(score_attacks)))

# Confusion counts (scores exactly equal to th fall in neither bucket)
true_positives = (score_test < th).sum()
false_negatives = (score_test > th).sum()
false_positives = (score_attacks < th).sum()
true_negatives = (score_attacks > th).sum()

# F-beta with beta = 20
beta = 20
beta_squared = beta ** 2
precision = true_positives/(true_positives+false_positives)
recall = true_positives/(true_positives+false_negatives)
F20 = (1 + beta_squared)*precision*recall/(beta_squared*precision + recall)
print('F20: {}'.format(F20))

In [None]:
from sklearn.manifold import Isomap


In [None]:
# Two-component Isomap embedding fitted on the scaled training split
# (n_jobs=7 is hard-coded; adjust it to the cores available on the machine)
embedding = Isomap(n_components=2, n_jobs=7)
embedding.fit(x_train)

In [None]:
# Project train and test into the Isomap embedding; these rebind the latent_* names that
# held the RBM representation earlier
latent_train = embedding.transform(x_train)
latent_test = embedding.transform(x_test)


In [None]:
# Only the first 10000 attack rows are embedded (presumably to bound run time — confirm)
latent_attacks = embedding.transform(x_attacks[:10000])

In [None]:
# Train and test embeddings on the same axes (train is drawn first)
plt.scatter(latent_train[:, 0], latent_train[:, 1])
plt.scatter(latent_test[:, 0], latent_test[:, 1])


In [None]:
# Embedding of the attack subset, on its own axes
plt.scatter(latent_attacks[:, 0], latent_attacks[:, 1])
