In [None]:
import sys
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline


# Initialise plotly's offline notebook mode before any figure is drawn with offline.iplot.
offline.init_notebook_mode()

# Put the asset_processor scripts directory first on the module search path;
# presumably this is where the metric_processor module imported below lives — confirm.
sys.path.insert(0, '../../scripts/asset_processor/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

from metric_processor import MetricProcessor

In [None]:
# Load the metrics CSV through MetricProcessor and split it into train / test / attack sets.

features = [
    'temporal_canny-euclidean',
    'temporal_cross_correlation-euclidean',
    'temporal_difference-euclidean',
    'temporal_histogram_distance-euclidean',
    'temporal_dct-euclidean',
    'size',
    'attack_ID',
    'title',
    'attack',
    'dimension',
    'fps',
    'temporal_dct-std',
    'temporal_dct-manhattan',
]

path = '../../machine_learning/cloud_functions/data.csv'

# NOTE(review): 'UL' is passed through to MetricProcessor unchanged; presumably it
# selects an unsupervised-learning mode — confirm against MetricProcessor.
metric_processor = MetricProcessor(features, 'UL', path)
df = metric_processor.read_and_process_data()

# Show the first rows as a quick sanity check of the processed frame.
preview = df.head()
display(preview)

# The splitter returns a pair; only its first element (the three feature sets) is used here.
feature_splits, _ = metric_processor.split_test_and_train(df)
X_train, X_test, X_attacks = feature_splits

sample_counts = [split.shape[0] for split in (X_train, X_test, X_attacks)]
print('{} training samples. {} test samples. {} attack samples'.format(*sample_counts))



# Dimensionality Reduction

## PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise every split using statistics fitted on the training set only,
# so the test and attack sets are scaled exactly like the training data.
ss = StandardScaler()

x_train = ss.fit_transform(X_train)
x_test = ss.transform(X_test)
x_attacks = ss.transform(X_attacks)

# A single full PCA fit yields the variance ratio of every component; the
# variance explained by the first k components is the running sum of those
# ratios. This replaces refitting PCA once per candidate k, and it also avoids
# requesting more components than min(n_samples, n_features) allows.
pca = PCA()
pca.fit(x_train)

variances = np.cumsum(pca.explained_variance_ratio_).tolist()
components = list(range(1, len(variances) + 1))

trace = go.Scatter(x=components, y=variances)

data = [trace]

layout = {
    'title': 'PCA',
    'xaxis': {'title': 'Number of components'},
    'yaxis': {'title': 'Variance explained'},
}

fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)

In [None]:
# Fit a two-component PCA on the scaled training data, then project the
# test and attack sets with that same fitted model.
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(x_train)
test_reduced = pca.transform(x_test)
attack_reduced = pca.transform(x_attacks)

# One panel per split, laid out side by side.
f, ax = plt.subplots(1, 3, figsize=(25, 10))
train_ax, test_ax, attack_ax = ax

train_ax.set_title("Train set")
train_ax.scatter(X_reduced[:, 0], X_reduced[:, 1], color='black')

test_ax.set_title("Test set")
test_ax.scatter(test_reduced[:, 0], test_reduced[:, 1], color='red')

attack_ax.set_title("Attack set")
attack_ax.scatter(attack_reduced[:, 0], attack_reduced[:, 1], color='blue')

In [None]:
# Overlay the three projected splits on one figure.
# Layers are drawn in sequence: attack first, then train, then test.
plt.figure(figsize=(20, 10))

overlay_layers = (
    (attack_reduced, 'red', 'attack'),
    (X_reduced, 'green', 'Train'),
    (test_reduced, 'yellow', 'Test'),
)
for points, colour, name in overlay_layers:
    plt.scatter(points[:, 0], points[:, 1], color=colour, label=name)

plt.legend()


## Random Projections

In [None]:
from sklearn import random_projection

# The projection matrix is drawn at random; pinning the seed makes the reduced
# coordinates (and every plot built from them) identical on each run.
RP_SEED = 42

rp = random_projection.GaussianRandomProjection(n_components=2, random_state=RP_SEED)

# Fit on the scaled training data only, then reuse the same projection for the
# test and attack sets so all three live in the same 2-D space.
X_reduced = rp.fit_transform(x_train)
test_reduced = rp.transform(x_test)
attack_reduced = rp.transform(x_attacks)

In [None]:
# Three side-by-side panels, one per split, showing the 2-D reduced coordinates.
f, ax = plt.subplots(nrows=1, ncols=3, figsize=(25, 10))
left_panel, middle_panel, right_panel = ax

left_panel.set_title("Train set")
middle_panel.set_title("Test set")
right_panel.set_title("Attack set")

train_x, train_y = X_reduced[:, 0], X_reduced[:, 1]
test_x, test_y = test_reduced[:, 0], test_reduced[:, 1]
attack_x, attack_y = attack_reduced[:, 0], attack_reduced[:, 1]

left_panel.scatter(train_x, train_y, color='black')
middle_panel.scatter(test_x, test_y, color='red')
right_panel.scatter(attack_x, attack_y, color='blue')

In [None]:
# Overlay the three reduced splits on a single set of axes.
# Layers are added in sequence: attack first, then train, then test.
overlay_fig, overlay_ax = plt.subplots(figsize=(20, 10))
overlay_ax.scatter(attack_reduced[:, 0], attack_reduced[:, 1], color='red', label='attack')
overlay_ax.scatter(X_reduced[:, 0], X_reduced[:, 1], color='green', label='Train')
overlay_ax.scatter(test_reduced[:, 0], test_reduced[:, 1], color='yellow', label='Test')
overlay_ax.legend()