In [None]:
import sys
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline
from sklearn.manifold import TSNE


# Initialise plotly's offline notebook mode so that offline.iplot() figures
# can be rendered inside the notebook output cells.
offline.init_notebook_mode(connected=True)

# Put the modeling toolbox directory first on the import path; presumably this
# is where the metric_processor module imported below lives (verify the path).
sys.path.insert(0, '../../scripts/modeling_toolbox/')
# Load the IPython autoreload extension.
%load_ext autoreload
# Mode 2: reload all modules before each cell execution, so edits to
# local modules are picked up without restarting the kernel.
%autoreload 2

# Local helper class used by the cells below to read, process and split the dataset.
from metric_processor import MetricProcessor

In [None]:
# Load the dataset restricted to the feature subset used by the PCA and
# random-projection sections, then split it into train / test / attack sets.

features = [
    'temporal_canny-euclidean',
    'temporal_cross_correlation-euclidean',
    'temporal_difference-euclidean',
    'temporal_histogram_distance-euclidean',
    'temporal_dct-euclidean',
    'size',
    'dimension',
    'fps',
    'temporal_dct-std',
    'temporal_dct-manhattan',
]

path = '../../machine_learning/cloud_functions/data-large.csv'

metric_processor = MetricProcessor(features, 'UL', path, reduced=True)
df = metric_processor.read_and_process_data()
display(df.head())

# The splitter returns a pair; only its first element (the three feature
# sets) is used in this notebook.
splits, _ = metric_processor.split_test_and_train(df)
X_train, X_test, X_attacks = splits

split_sizes = [part.shape[0] for part in (X_train, X_test, X_attacks)]
print('{} training samples. {} test samples. {} attack samples'.format(*split_sizes))

# Correlation

In [None]:
# Spearman (rank) correlation between the columns of df, shown as a
# colour-graded table.
#
# The correlation is computed on the data itself. Previously the Spearman
# correlation was taken of the Pearson correlation *matrix*, which measures how
# similar the correlation profiles of two columns are, not how the columns
# themselves relate.
corr = df.corr(method='spearman')
df_corr = corr  # keep the earlier variable name bound for any cell that reads it

# The Styler renders as HTML, so no matplotlib figure is needed (the old
# plt.figure() call only produced an empty figure). An explicit format string
# is used instead of Styler.set_precision, which was removed in pandas 2.0.
corr.style.background_gradient().format('{:.2f}')

# Dimensionality Reduction

## PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise every split with statistics learned from the training split only.
ss = StandardScaler()

x_train = ss.fit_transform(X_train)
x_test = ss.transform(X_test)
x_attacks = ss.transform(X_attacks)

# Cumulative explained variance for 1..n_features components.
# A single full fit is enough: the first k entries of explained_variance_ratio_
# from the full decomposition are the ratios a k-component PCA would report,
# so refitting once per component count is unnecessary.
n_features = x_train.shape[1]
pca = PCA(n_components=n_features)
pca.fit(x_train)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Kept in descending component order (n_features .. 1), matching the order in
# which these lists were produced before.
components = list(range(n_features, 0, -1))
variances = cumulative_variance[::-1].tolist()

trace = go.Scatter(x=components, y=variances)

data = [trace]

layout = {
    'title': 'PCA',
    'xaxis': {'title': 'Number of components'},
    'yaxis': {'title': 'Variance explained'},
}

fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)

In [None]:
# Fit a 2-component PCA on the standardised training data and project all
# three splits into that plane.
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(x_train)
test_reduced, attack_reduced = pca.transform(x_test), pca.transform(x_attacks)

# One panel per split, side by side.
f, ax = plt.subplots(1, 3, figsize=(25, 10))
for panel_axis, panel_title in zip(ax, ("Train set", "Test set", "Attack set")):
    panel_axis.set_title(panel_title)

train_axis, test_axis, attack_axis = ax
train_axis.scatter(X_reduced[:, 0], X_reduced[:, 1], color='black')
test_axis.scatter(test_reduced[:, 0], test_reduced[:, 1], color='red')
attack_axis.scatter(attack_reduced[:, 0], attack_reduced[:, 1], color='blue')

In [None]:
# Overlay the three projected splits in a single figure. Drawing order matters
# for what ends up on top: attacks first, then train, then test.
plt.figure(figsize=(20, 10))
overlay_layers = (
    (attack_reduced, 'red', 'attack'),
    (X_reduced, 'green', 'Train'),
    (test_reduced, 'yellow', 'Test'),
)
for layer_points, layer_colour, layer_label in overlay_layers:
    plt.scatter(layer_points[:, 0], layer_points[:, 1], color=layer_colour, label=layer_label)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()


## Random Projections

In [None]:
from sklearn import random_projection

# Project the standardised splits onto 2 dimensions with a Gaussian random
# matrix. The projection matrix is drawn at fit time, so a fixed random_state
# is passed to make the resulting plots reproducible across kernel restarts.
rp = random_projection.GaussianRandomProjection(n_components=2, random_state=42)

# The matrix is drawn once on the training data and reused for the other splits.
X_reduced = rp.fit_transform(x_train)
test_reduced = rp.transform(x_test)
attack_reduced = rp.transform(x_attacks)

In [None]:
# One panel per split, side by side, for the current 2-D projection.
f, ax = plt.subplots(1, 3, figsize=(25, 10))
for panel_axis, panel_title in zip(ax, ("Train set", "Test set", "Attack set")):
    panel_axis.set_title(panel_title)

train_axis, test_axis, attack_axis = ax
train_axis.scatter(X_reduced[:, 0], X_reduced[:, 1], color='black')
test_axis.scatter(test_reduced[:, 0], test_reduced[:, 1], color='red')
attack_axis.scatter(attack_reduced[:, 0], attack_reduced[:, 1], color='blue')

In [None]:
# Overlay the three projected splits in one figure. Drawing order is kept:
# attacks first, then train, then test.
plt.figure(figsize=(20, 10))
overlay_layers = (
    (attack_reduced, 'red', 'attack'),
    (X_reduced, 'green', 'Train'),
    (test_reduced, 'yellow', 'Test'),
)
for layer_points, layer_colour, layer_label in overlay_layers:
    plt.scatter(layer_points[:, 0], layer_points[:, 1], color=layer_colour, label=layer_label)
plt.legend()

## T-SNE

For t-SNE, we will use a different set of features, so we need to redefine the dataset.

In [None]:
# Feature subset for the t-SNE section: three video properties followed by the
# temporal_gaussian and temporal_histogram_distance statistics.
features = [
    'dimension', 'size', 'fps',
    'temporal_gaussian-euclidean',
    'temporal_gaussian-manhattan',
    'temporal_gaussian-max',
    'temporal_gaussian-mean',
    'temporal_gaussian-std',
    'temporal_histogram_distance-euclidean',
    'temporal_histogram_distance-manhattan',
    'temporal_histogram_distance-max',
    'temporal_histogram_distance-mean',
    'temporal_histogram_distance-std',
]

path = '../../machine_learning/cloud_functions/data-large.csv'

# Rebuild the processor with the new feature list (reduced=False this time)
# and reload the frame; this replaces the `df` used by the sections above.
metric_processor = MetricProcessor(features, 'UL', path, reduced=False)
df = metric_processor.read_and_process_data()
df.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Build the two frames used by the t-SNE cells:
#   df_tsne          - standardised features, at most N rows (input to t-SNE)
#   df_tsne_unscaled - the same rows in original units, plus the 'attack' and
#                      'attack_ID' columns used for hover text / colouring.
X, y = None, None
N = 20000  # maximum number of rows embedded with t-SNE

# df.corr() is only used to obtain the columns pandas treats as numeric.
df_corr = df.corr()

print(df['attack_ID'].unique())

# attack_ID is what the plots are coloured by, so it is excluded from the
# embedding input; otherwise t-SNE would be fed the very label it is then
# coloured with. NOTE(review): attack_ID is assumed to be a label rather than a
# feature (it is not in `features`) - confirm against MetricProcessor.
feat_cols = df_corr.columns.drop('attack_ID', errors='ignore')

X = np.asarray(df[feat_cols])

# Fresh scaler: the feature set differs from the one the earlier scaler was
# fitted on, and a new instance avoids refitting that object in place.
ss = StandardScaler()
x = ss.fit_transform(X)

# iloc gives exactly the first N rows (label-based .loc[:N] is inclusive and
# returned N + 1 rows).
df_tsne = pd.DataFrame(x, columns=feat_cols).iloc[:N]

df_tsne_unscaled = pd.DataFrame(ss.inverse_transform(df_tsne.values), columns=feat_cols)

# Attach the label columns positionally: the rows above were taken from df by
# position, so index-based alignment would be wrong whenever df does not carry
# a default 0..n-1 index.
n_rows = len(df_tsne)
df_tsne_unscaled['attack'] = df['attack'].values[:n_rows]
df_tsne_unscaled['attack_ID'] = df['attack_ID'].values[:n_rows]

print('Size of the dataframe: {}'.format(df_tsne.shape))

### 2D plot

In [None]:

# 2-D t-SNE embedding of the standardised features, coloured by attack_ID.
# t-SNE is stochastic, so random_state is fixed to make the embedding
# reproducible between runs.
tsne = TSNE(n_components=2, verbose=1, perplexity=25, n_iter=500, random_state=42)
tsne_results = tsne.fit_transform(df_tsne)

traceTSNE = go.Scattergl(
    x = tsne_results[:,0],
    y = tsne_results[:,1],
    mode = 'markers',
    # Hover text and marker colour both come from the attack identifier.
    text = df_tsne_unscaled['attack_ID'].values,
    showlegend = True,
    marker = dict(
        color = df_tsne_unscaled['attack_ID'],
        colorscale ='Jet',
        showscale = False,
        line = dict(
            width = 2,
            color = 'rgb(255, 255, 255)'
        ),
        opacity = 0.8
    )
)
data = [traceTSNE]

layout = dict(title = 'TSNE (T-Distributed Stochastic Neighbour Embedding)',
              hovermode= 'closest',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False),
              showlegend= True,
             )

fig = dict(data=data, layout=layout)
# offline.plot writes a standalone HTML file. An explicit filename is given so
# this figure is not overwritten by other offline.plot calls, which all default
# to 'temp-plot.html'.
offline.plot(fig, filename='tsne_2d.html')

### 3D plot

In [None]:
# 3-D t-SNE embedding of the standardised features, coloured by attack_ID.
# t-SNE is stochastic, so random_state is fixed to make the embedding
# reproducible between runs.
tsne = TSNE(n_components=3, verbose=1, perplexity=25, n_iter=500, random_state=42)
tsne_results = tsne.fit_transform(df_tsne)

traceTSNE = go.Scatter3d(
    x = tsne_results[:,0],
    y = tsne_results[:,1],
    z = tsne_results[:,2],
    mode = 'markers',
    # Hover text and marker colour both come from the attack identifier.
    text = df_tsne_unscaled['attack_ID'].values,
    showlegend = True,
    marker = dict(
        size = 1,
        color = df_tsne_unscaled['attack_ID'],
        colorscale ='Jet',
        showscale = False,
        opacity = 0.8
    )
)
data = [traceTSNE]

# Axes of a 3-D trace are configured under layout.scene; the top-level
# xaxis / yaxis keys only affect 2-D cartesian plots and had no effect here.
axis_style = dict(zeroline = False)
layout = dict(title = 'TSNE (T-Distributed Stochastic Neighbour Embedding)',
              hovermode= 'closest',
              scene = dict(xaxis = axis_style,
                           yaxis = axis_style,
                           zaxis = axis_style),
              showlegend= True,
             )

fig = dict(data=data, layout=layout)
# offline.plot writes a standalone HTML file. An explicit filename is given so
# this figure does not overwrite (or get overwritten by) other offline.plot
# output, which defaults to 'temp-plot.html'.
offline.plot(fig, filename='tsne_3d.html')