In [None]:
import sys
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline
from sklearn.manifold import TSNE


# Initialise plotly's offline notebook mode so offline.iplot can render figures inline.
offline.init_notebook_mode(connected=True)

# Put the project's modeling toolbox first on the import path
# (the path is relative to this notebook's working directory).
sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

from metric_processor import MetricProcessor
import evaluation


In [None]:
# Feature subset handed to the metric processor for the first part of the notebook.
features = [
    'temporal_dct-mean',
    'temporal_gaussian-mean',
    'size',
    'dimension',
    'temporal_gaussian_difference-mean',
]

# Location of the metrics CSV, relative to this notebook.
path = '../../machine_learning/cloud_functions/data-large.csv'

# NOTE(review): 'UL' is passed as the processor's second positional argument;
# its meaning is defined by MetricProcessor — confirm there.
metric_processor = MetricProcessor(features, 'UL', path)
df = metric_processor.read_and_process_data()
df.shape

In [None]:
# The processor returns two triples: the feature arrays and the matching dataframes,
# each ordered as (train, test, attacks).
arrays, frames = metric_processor.split_test_and_train(df)
X_train, X_test, X_attacks = arrays
df_train, df_test, df_attacks = frames

print(f'Shape of train: {X_train.shape}')
print(f'Shape of test: {X_test.shape}')
print(f'Shape of attacks: {X_attacks.shape}')
df.head()


# Correlation

In [None]:
# Pearson correlation matrix, kept under its original name.
df_corr = df.corr()

# Spearman rank correlation between the columns of the dataset itself.
# Correlating the already-computed correlation matrix (df_corr.corr) would
# compare the features' correlation profiles, not the features.
corr = df.corr('spearman')

# The Styler renders as an HTML table, so no matplotlib figure is needed here.
corr.style.background_gradient().set_precision(2)

# Dimensionality Reduction

## PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise all three splits with statistics learnt on the training set only.
ss = StandardScaler()

x_train = ss.fit_transform(X_train)
x_test = ss.transform(X_test)
x_attacks = ss.transform(X_attacks)

# Fit PCA once with every component kept. The variance explained by the first
# k components is the running sum of the per-component ratios, so refitting
# one model per k is unnecessary (and a truncated fit may fall back to an
# approximate solver, depending on the scikit-learn version).
n_features = x_train.shape[1]
pca = PCA(n_components=n_features)
pca.fit(x_train)

components = list(range(1, n_features + 1))
variances = np.cumsum(pca.explained_variance_ratio_).tolist()

trace = go.Scatter(
    x=components,
    y=variances,
)

data = [trace]

layout = {
    'title': 'PCA',
    'xaxis': {'title': 'Number of components'},
    'yaxis': {'title': 'Variance explained'},
}

fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)

In [None]:
# Project the standardised splits onto the first two principal components
# (fitted on the training split).
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(x_train)
test_reduced = pca.transform(x_test)
attack_reduced = pca.transform(x_attacks)

# One panel per split, side by side.
f, ax = plt.subplots(1, 3, figsize=(25, 10))
for axis, panel_title in zip(ax, ("Train set", "Test set", "Attack set")):
    axis.set_title(panel_title)

ax[0].scatter(x=X_reduced[:, 0], y=X_reduced[:, 1], color='black')
ax[1].scatter(x=test_reduced[:, 0], y=test_reduced[:, 1], color='red')
ax[2].scatter(x=attack_reduced[:, 0], y=attack_reduced[:, 1], color='blue')

In [None]:
# All three splits in one figure; later layers are drawn on top of earlier ones,
# so the order below matters.
plt.figure(figsize=(20, 10))
layers = (
    (attack_reduced, 'red', 'attack'),
    (X_reduced, 'green', 'Train'),
    (test_reduced, 'yellow', 'Test'),
)
for points, colour, legend_label in layers:
    plt.scatter(points[:, 0], points[:, 1], color=colour, label=legend_label)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()


## Random Projections

In [None]:
from sklearn import random_projection

# Project the standardised data onto two random Gaussian directions.
# The seed is fixed so the projection, and every plot derived from it,
# is the same on each run of the notebook.
rp = random_projection.GaussianRandomProjection(n_components=2, random_state=42)

# Fit on the training split, then apply the same projection to the other splits.
X_reduced = rp.fit_transform(x_train)
test_reduced = rp.transform(x_test)
attack_reduced = rp.transform(x_attacks)

In [None]:
# One panel per split for the current 2-D projection.
f, ax = plt.subplots(1, 3, figsize=(25, 10))
for axis, panel_title in zip(ax, ("Train set", "Test set", "Attack set")):
    axis.set_title(panel_title)

ax[0].scatter(x=X_reduced[:, 0], y=X_reduced[:, 1], color='black')
ax[1].scatter(x=test_reduced[:, 0], y=test_reduced[:, 1], color='red')
ax[2].scatter(x=attack_reduced[:, 0], y=attack_reduced[:, 1], color='blue')

In [None]:
# Overlay the three splits; drawing order (attack, train, test) is preserved.
plt.figure(figsize=(20, 10))
layers = (
    (attack_reduced, 'red', 'attack'),
    (X_reduced, 'green', 'Train'),
    (test_reduced, 'yellow', 'Test'),
)
for points, colour, legend_label in layers:
    plt.scatter(points[:, 0], points[:, 1], color=colour, label=legend_label)
plt.legend()

## T-SNE

For t-SNE we use a different, larger set of features, so the dataset has to be reloaded with the new feature list.

In [None]:
# Three plain columns followed by five summary statistics for each of the two
# temporal metric families, in the same order as before.
features = ['dimension', 'size', 'fps'] + [
    f'{family}-{stat}'
    for family in ('temporal_gaussian', 'temporal_histogram_distance')
    for stat in ('euclidean', 'manhattan', 'max', 'mean', 'std')
]

# Same metrics CSV as in the first part of the notebook.
path = '../../machine_learning/cloud_functions/data-large.csv'

# Reload the dataset with the new feature list (reduced=False this time).
metric_processor = MetricProcessor(features, 'UL', path, reduced=False)
df = metric_processor.read_and_process_data()
df.shape

In [None]:
N = 20000  # maximum number of rows fed to t-SNE

# Columns that appear in the correlation matrix are used as the t-SNE inputs.
df_corr = df.corr()
feat_cols = df_corr.columns
# NOTE(review): the plotting cells read 'attack_ID' from df_tsne_unscaled, which
# is built from feat_cols, so 'attack_ID' is also part of the t-SNE input.
# Confirm that embedding the label together with the features is intended.

print(df['attack_ID'].unique())

# Standardise the selected columns (this refits the shared scaler `ss`).
X = np.asarray(df[feat_cols])
x = ss.fit_transform(X)

# Positional slice: exactly the first N rows. A label-based .loc[:N] is
# inclusive on both ends and would keep N + 1 rows.
df_tsne = pd.DataFrame(x, columns=feat_cols).iloc[:N]

# Same rows in the original units, for hover text and colouring.
df_tsne_unscaled = pd.DataFrame(ss.inverse_transform(df_tsne.values), columns=feat_cols)

# Attach the attack label by position. df_tsne_unscaled has a fresh 0..n-1
# index while df keeps whatever index the processor produced, so assigning the
# Series directly would align on labels and could mismatch rows or yield NaN.
df_tsne_unscaled['attack'] = df['attack'].values[:len(df_tsne_unscaled)]

print('Size of the dataframe: {}'.format(df_tsne.shape))

### 2D plot

In [None]:

# 2-D t-SNE embedding. The seed is fixed so the layout is reproducible.
tsne = TSNE(n_components=2, verbose=1, perplexity=25, n_iter=500, random_state=42)
tsne_results = tsne.fit_transform(df_tsne)

offline.init_notebook_mode(connected=True)

# WebGL scatter of the embedding, coloured (and hover-labelled) by attack_ID.
traceTSNE = go.Scattergl(
    x = tsne_results[:,0],
    y = tsne_results[:,1],
    mode = 'markers',
    text = df_tsne_unscaled['attack_ID'].values,
    showlegend = True,
    marker = dict(
        color = df_tsne_unscaled['attack_ID'],
        colorscale ='Jet',
        showscale = False,
        line = dict(
            width = 2,
            color = 'rgb(255, 255, 255)'
        ),
        opacity = 0.8
    )
)
data = [traceTSNE]

layout = dict(title = 'TSNE (T-Distributed Stochastic Neighbour Embedding)',
              hovermode= 'closest',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False),
              showlegend= True,
             )

fig = dict(data=data, layout=layout)
# Write to an explicit file: offline.plot defaults to 'temp-plot.html', which any
# other offline.plot call using the default name would overwrite.
offline.plot(fig, filename='tsne_2d.html')

### 3D plot

In [None]:
# 3-D t-SNE embedding. The seed is fixed so the layout is reproducible.
tsne = TSNE(n_components=3, verbose=1, perplexity=25, n_iter=500, random_state=42)
tsne_results = tsne.fit_transform(df_tsne)

# 3-D scatter of the embedding, coloured (and hover-labelled) by attack_ID.
traceTSNE = go.Scatter3d(
    x = tsne_results[:,0],
    y = tsne_results[:,1],
    z = tsne_results[:,2],
    mode = 'markers',
    text = df_tsne_unscaled['attack_ID'].values,
    showlegend = True,
    marker = dict(
        size = 1,
        color = df_tsne_unscaled['attack_ID'],
        colorscale ='Jet',
        showscale = False,
        opacity = 0.8
    )
)
data = [traceTSNE]

layout = dict(title = 'TSNE (T-Distributed Stochastic Neighbour Embedding)',
              hovermode= 'closest',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False),
              showlegend= True,
             )

fig = dict(data=data, layout=layout)
# Write to an explicit file: offline.plot defaults to 'temp-plot.html', which any
# other offline.plot call using the default name would overwrite.
offline.plot(fig, filename='tsne_3d.html')

# EDA

In [None]:
import seaborn as sns

In [None]:
# Distinct 'dimension' values found in the test split, in ascending order.
resolutions = np.sort(df_test['dimension'].unique())
# Distinct attack labels found in the attack split.
attacks = df_attacks['attack'].unique()

In [None]:
# Pairwise scatter/histogram grid over every column of the training split.
sns.pairplot(data=df_train)

In [None]:
# Pairwise scatter/histogram grid over every column of the attack split.
sns.pairplot(data=df_attacks)

## Temporal Gaussian Mean

In [None]:
# Training split: distribution of temporal_gaussian-mean, one trace per resolution.
data = [
    go.Histogram(
        x=df_train.loc[df_train['dimension'] == res, 'temporal_gaussian-mean'],
        name=f'{res}p',
        opacity=0.75,
    )
    for res in resolutions
]
fig = go.Figure(data=data)
# Bars are stacked; use 'overlay' instead to superimpose the traces.
fig.layout.update(barmode='stack')

offline.iplot(fig)

In [None]:
# Attack split: distribution of temporal_gaussian-mean, one trace per resolution.
data = [
    go.Histogram(
        x=df_attacks.loc[df_attacks['dimension'] == res, 'temporal_gaussian-mean'],
        name=f'{res}p',
        opacity=0.75,
    )
    for res in resolutions
]
fig = go.Figure(data=data)
# Bars are stacked; use 'overlay' instead to superimpose the traces.
fig.layout.update(barmode='stack')

offline.iplot(fig)

## Temporal DCT Mean

In [None]:
# Training split: distribution of temporal_dct-mean, one trace per resolution.
data = [
    go.Histogram(
        x=df_train.loc[df_train['dimension'] == res, 'temporal_dct-mean'],
        name=f'{res}p',
        opacity=0.75,
    )
    for res in resolutions
]
fig = go.Figure(data=data)
# Bars are stacked; use 'overlay' instead to superimpose the traces.
fig.layout.update(barmode='stack')

offline.iplot(fig)

In [None]:
# Attack split: distribution of temporal_dct-mean, one trace per resolution.
data = [
    go.Histogram(
        x=df_attacks.loc[df_attacks['dimension'] == res, 'temporal_dct-mean'],
        name=f'{res}p',
        opacity=0.75,
    )
    for res in resolutions
]
fig = go.Figure(data=data)
# Bars are stacked; use 'overlay' instead to superimpose the traces.
fig.layout.update(barmode='stack')

offline.iplot(fig)

## Temporal Gaussian Difference Mean

In [None]:
# Training split: distribution of temporal_gaussian_difference-mean, one trace per resolution.
data = [
    go.Histogram(
        x=df_train.loc[df_train['dimension'] == res, 'temporal_gaussian_difference-mean'],
        name=f'{res}p',
        opacity=0.75,
    )
    for res in resolutions
]
fig = go.Figure(data=data)
# Bars are stacked; use 'overlay' instead to superimpose the traces.
fig.layout.update(barmode='stack')

offline.iplot(fig)

In [None]:
# Attack split: distribution of temporal_gaussian_difference-mean, one trace per resolution.
data = [
    go.Histogram(
        x=df_attacks.loc[df_attacks['dimension'] == res, 'temporal_gaussian_difference-mean'],
        name=f'{res}p',
        opacity=0.75,
    )
    for res in resolutions
]
fig = go.Figure(data=data)
# Bars are stacked; use 'overlay' instead to superimpose the traces.
fig.layout.update(barmode='stack')

offline.iplot(fig)

# Relations between Main Features

In [None]:
# Pairwise relations between the three main temporal features (training split).
sns.pairplot(
    data=df_train[[
        'temporal_gaussian_difference-mean',
        'temporal_gaussian-mean',
        'temporal_dct-mean',
    ]]
)

In [None]:
# Pairwise relations between the three main temporal features (attack split).
sns.pairplot(
    data=df_attacks[[
        'temporal_gaussian_difference-mean',
        'temporal_gaussian-mean',
        'temporal_dct-mean',
    ]]
)

In [None]:
# Summary statistics of temporal_dct-mean per resolution: training split first, then attacks.
summary_cols = ['temporal_dct-mean', 'dimension']
display(
    df_train[summary_cols].groupby('dimension').describe(),
    df_attacks[summary_cols].groupby('dimension').describe(),
)

In [None]:
# Summary statistics of temporal_gaussian-mean per resolution: training split first, then attacks.
summary_cols = ['temporal_gaussian-mean', 'dimension']
display(
    df_train[summary_cols].groupby('dimension').describe(),
    df_attacks[summary_cols].groupby('dimension').describe(),
)

In [None]:
# Summary statistics of temporal_gaussian_difference-mean per resolution: training split first, then attacks.
summary_cols = ['temporal_gaussian_difference-mean', 'dimension']
display(
    df_train[summary_cols].groupby('dimension').describe(),
    df_attacks[summary_cols].groupby('dimension').describe(),
)

# Centrality and dispersion study

In [None]:
feature_list = ['temporal_dct-mean', 'temporal_gaussian-mean', 'temporal_gaussian_difference-mean']

# One figure per feature: mean and standard deviation of the training split
# at each resolution.
for feat in feature_list:
    per_resolution = [df_train.loc[df_train['dimension'] == res, feat] for res in resolutions]
    means = [values.mean() for values in per_resolution]
    stds = [values.std() for values in per_resolution]

    data = [
        go.Scatter(x=resolutions, y=means, name='mean', mode='markers'),
        go.Scatter(x=resolutions, y=stds, name='std', mode='markers'),
    ]

    # Tick the x axis exactly at the resolutions, labelled like "720p".
    x_axis = go.layout.XAxis(
        tickmode='array',
        tickvals=resolutions,
        ticktext=[str(res) + 'p' for res in resolutions],
    )
    layout = go.Layout(title=feat, xaxis=x_axis)

    fig = go.Figure(data=data, layout=layout)
    offline.iplot(fig)

In [None]:
# One figure per feature: mean and standard deviation of the attack split
# at each resolution.
for feat in feature_list:
    per_resolution = [df_attacks.loc[df_attacks['dimension'] == res, feat] for res in resolutions]
    means = [values.mean() for values in per_resolution]
    stds = [values.std() for values in per_resolution]

    data = [
        go.Scatter(x=resolutions, y=means, name='mean', mode='markers'),
        go.Scatter(x=resolutions, y=stds, name='std', mode='markers'),
    ]

    # Tick the x axis exactly at the resolutions, labelled like "720p".
    x_axis = go.layout.XAxis(
        tickmode='array',
        tickvals=resolutions,
        ticktext=[str(res) + 'p' for res in resolutions],
    )
    layout = go.Layout(title=feat, xaxis=x_axis)

    fig = go.Figure(data=data, layout=layout)
    offline.iplot(fig)

## Digging into centrality and dispersion of attacks

In [None]:
# Reduce each attack label to the part that follows the first 'p' plus one
# separator character.
# NOTE(review): presumably labels look like '<resolution>p_<attack type>' — confirm.
# The result is sorted so trace order is the same on every run; iterating a
# plain set of strings is not stable across kernel restarts.
attacks = sorted({attack[attack.find('p') + 2:] for attack in df_attacks['attack'].unique()})

metric = 'temporal_dct-mean'

# One figure per resolution: a box for the legit training rows followed by one
# box per attack type.
for res in resolutions:
    selection = df_train[(df_train['dimension'] == res)]
    data = []
    trace = go.Box(y=selection[metric], name='legit',
               text = selection['title'])
    data.append(trace)

    for attack in attacks:
        # regex=False: the attack type is a literal substring, so characters
        # such as '.' or '+' in a name must not be read as a pattern.
        selection = df_attacks[(df_attacks['dimension'] == res) &
                               (df_attacks['attack'].str.contains(attack, regex=False))]
        trace = go.Box(y=selection[metric], name=attack,
                       text = selection['title'])
        data.append(trace)

    layout = go.Layout(
                title=go.layout.Title(text=str(res) + 'p'),
                yaxis = go.layout.YAxis(title = metric),
                xaxis = go.layout.XAxis(
                    title = 'Attack type',
                    tickmode = 'array',
                    ticktext = attacks
                )
    )

    fig = go.Figure(data=data, layout=layout)
    offline.iplot(fig)

# Understanding the Model

In [None]:

metrics = ['temporal_gaussian-mean', 'size']
n_samples = 50  # upper bound on the points drawn per (resolution, group)
data = []

# A single figure: for each resolution, a sample of legit rows plus a sample of
# each attack type, plotted as metrics[0] (x) against metrics[1] (y).
for res in resolutions:
    legit = df_train[(df_train['dimension'] == res)]
    # Cap the sample at the group size: DataFrame.sample raises when asked for
    # more rows than are available.
    selection = legit.sample(min(n_samples, len(legit)))
    trace = go.Scatter(x=selection[metrics[0]], y=selection[metrics[1]], name='legit-' + str(res),
               text = selection['title'], mode='markers')
    data.append(trace)

    for attack in attacks:
        # regex=False: the attack type is matched as a literal substring.
        attacked = df_attacks[(df_attacks['dimension'] == res) &
                              (df_attacks['attack'].str.contains(attack, regex=False))]
        selection = attacked.sample(min(n_samples, len(attacked)))
        trace = go.Scatter(x=selection[metrics[0]], y=selection[metrics[1]], name=attack + '-' + str(res),
                       text = selection['title'], mode='markers')
        data.append(trace)

# The layout does not depend on the resolution, so it is built once.
# The x axis shows metrics[0]; it was previously mislabelled with metrics[1].
layout = go.Layout(
            title=go.layout.Title(text='Feature space'),
            yaxis = go.layout.YAxis(title = metrics[1]),
            xaxis = go.layout.XAxis(title = metrics[0])
)

fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)

In [None]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
import matplotlib.font_manager


# Keep columns 1 and 2 of every feature array.
# NOTE(review): presumably these are 'temporal_gaussian-mean' and 'size' (the
# `metrics` used for the axis labels further down) — confirm against the column
# order produced by MetricProcessor.
X_train_, X_test_, X_attacks_ = (split[:, 1:3] for split in (X_train, X_test, X_attacks))

# Scale with statistics learnt on the training split only.
ss = StandardScaler()
x_train = ss.fit_transform(X_train_)
x_test = ss.transform(X_test_)
x_attacks = ss.transform(X_attacks_)

# One-class SVM fitted on the (legit) training split.
clf = svm.OneClassSVM(nu=0.01, kernel='rbf', gamma='auto', cache_size=5000)
clf.fit(x_train)

y_pred_train, y_pred_test, y_pred_outliers = (
    clf.predict(split) for split in (x_train, x_test, x_attacks)
)

# Errors: train/test rows predicted as -1, attack rows predicted as +1.
n_error_train = np.count_nonzero(y_pred_train == -1)
n_error_test = np.count_nonzero(y_pred_test == -1)
n_error_outliers = np.count_nonzero(y_pred_outliers == 1)

In [None]:
# Score the fitted model with the project's evaluation helper and report the results.
scores = evaluation.unsupervised_evaluation(clf, x_train, x_test, x_attacks)
fb, area, tnr, tpr_train, tpr_test = scores
print(f'TNR: {tnr}\nTPR_test: {tpr_test}\nTPR_train: {tpr_train}\n')
print(f'F20: {fb}\nAUC: {area}')

In [None]:
# 500 x 500 evaluation grid covering [-1, 10] on both (scaled) axes.
grid_ticks = np.linspace(-1, 10, 500)
xx, yy = np.meshgrid(grid_ticks, grid_ticks)

In [None]:
# Evaluate the decision function at every grid point, then fold the flat
# result back into the grid's shape for contour plotting.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = clf.decision_function(grid_points).reshape(xx.shape)

In [None]:
plt.title("Novelty Detection")

# Background: graded bands where the decision function is negative, the
# zero-level frontier, and a flat fill where it is positive.
negative_levels = np.linspace(Z.min(), 0, 7)
plt.contourf(xx, yy, Z, levels=negative_levels, cmap=plt.cm.PuBu)
a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')

# Points: first 500 training rows, all test rows, first 500 attack rows.
s = 40
marker_style = dict(s=s, edgecolors='k')
b1 = plt.scatter(x_train[:500, 0], x_train[:500, 1], c='white', **marker_style)
b2 = plt.scatter(x_test[:, 0], x_test[:, 1], c='blueviolet', **marker_style)
c = plt.scatter(x_attacks[:500, 0], x_attacks[:500, 1], c='gold', **marker_style)

# Clip the view to the extent of the evaluation grid.
plt.axis('tight')
plt.xlim(-1, 10)
plt.ylim(-1, 10)

legend_handles = [a.collections[0], b1, b2, c]
legend_labels = ["learned frontier", "training observations",
                 "new regular observations", "new abnormal observations"]
plt.legend(legend_handles, legend_labels,
           loc="upper right",
           prop=matplotlib.font_manager.FontProperties(size=11))
plt.xlabel(metrics[0])
plt.ylabel(metrics[1])

plt.show()

In [None]:
# Store the model's decision-function value next to each row of every split.
for frame, scaled in ((df_train, x_train), (df_test, x_test), (df_attacks, x_attacks)):
    frame['dist_to_dec_funct'] = clf.decision_function(scaled)

In [None]:
# For each metric, compare training rows with a non-negative decision value
# against those with a negative one (i.e. legit rows the model rejects).
for metric in metrics:
    selection = df_train[(df_train['dist_to_dec_funct'] >= 0)]
    data = []
    trace = go.Box(y=selection[metric], name='legit',
                   text=selection['title'])
    data.append(trace)

    selection = df_train[(df_train['dist_to_dec_funct'] < 0)]

    # Spelling fixed ('misclasified' -> 'misclassified') to match the label
    # used in the attack-side plots.
    trace = go.Box(y=selection[metric], name='misclassified',
                   text=selection['title'])
    data.append(trace)

    layout = go.Layout(
                title=go.layout.Title(text='Classification Results'),
                yaxis = go.layout.YAxis(title = metric),
                xaxis = go.layout.XAxis(
                    title = 'Classification',
                    tickmode = 'array',
                    ticktext = ['Legit', 'Misclassified']
                )
    )

    fig = go.Figure(data=data, layout=layout)
    offline.iplot(fig)

In [None]:
# For each metric, compare attack rows with a non-negative decision value
# (labelled 'misclassified') against those with a negative one ('attacks').
for metric in metrics:
    groups = (
        ('misclassified', df_attacks[df_attacks['dist_to_dec_funct'] >= 0]),
        ('attacks', df_attacks[df_attacks['dist_to_dec_funct'] < 0]),
    )
    data = [
        go.Box(y=subset[metric], name=group_name, text=subset['title'])
        for group_name, subset in groups
    ]

    x_axis = go.layout.XAxis(
        title='Classification',
        tickmode='array',
        ticktext=['Misclassified', 'Attacks'],
    )
    layout = go.Layout(
        title=go.layout.Title(text='Classification Results'),
        yaxis=go.layout.YAxis(title=metric),
        xaxis=x_axis,
    )

    fig = go.Figure(data=data, layout=layout)
    offline.iplot(fig)

In [None]:

metrics = ['temporal_gaussian-mean', 'size']
n_samples = 50  # upper bound on the points drawn per (resolution, group)

# One figure per metric: decision-function value (y) against the metric (x),
# for a sample of legit rows and of each attack type at every resolution.
for metric in metrics:
    # The layout depends only on the metric, so it is built once per figure
    # and is always defined, even when there are no resolutions to iterate.
    layout = go.Layout(
                title=go.layout.Title(text='Feature space'),
                xaxis = go.layout.XAxis(title = metric),
                yaxis = go.layout.YAxis(title = 'Distance to decision Function')
    )

    data = []
    for res in resolutions:
        legit = df_train[(df_train['dimension'] == res)]
        # Cap the sample at the group size: DataFrame.sample raises when asked
        # for more rows than are available.
        selection = legit.sample(min(n_samples, len(legit)))
        trace = go.Scatter(y=selection['dist_to_dec_funct'], x=selection[metric], name='legit-' + str(res),
                   text = selection['title'], mode='markers')
        data.append(trace)

        for attack in attacks:
            # regex=False: the attack type is matched as a literal substring.
            attacked = df_attacks[(df_attacks['dimension'] == res) &
                                  (df_attacks['attack'].str.contains(attack, regex=False))]
            selection = attacked.sample(min(n_samples, len(attacked)))
            trace = go.Scatter(y=selection['dist_to_dec_funct'], x=selection[metric], name=attack + '-' + str(res),
                           text = selection['title'], mode='markers')
            data.append(trace)

    fig = go.Figure(data=data, layout=layout)
    offline.iplot(fig)