In [1]:
# Author: Antoine DELPLACE
# Last update: 17/01/2020
"""
Dimensionality reduction algorithms using PCA and t-SNE

Parameters
----------
data_window.h5         : extracted data from preprocessing1.py
data_window3.h5        : extracted data from preprocessing2.py
data_window_labels.npy : label numpy array from preprocessing1.py

Return
----------
Plot 2D representation of the data thanks to PCA or t-SNE
"""

'\nDimensionality reduction algorithms using PCA and t-SNE\n\nParameters\n----------\ndata_window.h5         : extracted data from preprocessing1.py\ndata_window3.h5        : extracted data from preprocessing2.py\ndata_window_labels.npy : label numpy array from preprocessing1.py\n\nReturn\n----------\nPlot 2D representation of the data thanks to PCA or t-SNE\n'

In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import h5py

In [3]:
from sklearn import model_selection, manifold, decomposition

In [4]:
print("Import data")

Import data


In [5]:
X = pd.read_hdf('data_window_botnet3.h5', key='data')
X.reset_index(drop=True, inplace=True)

In [6]:
X2 = pd.read_hdf('data_window3_botnet3.h5', key='data')
X2.reset_index(drop=True, inplace=True)

In [7]:
X = X.join(X2)

In [8]:
X.drop('window_id', axis=1, inplace=True)

In [9]:
y = X['Label_<lambda>']
X.drop('Label_<lambda>', axis=1, inplace=True)

In [10]:
labels = np.load("data_window_botnet3_labels.npy", allow_pickle=True)

In [11]:
print(X.columns.values)
print(labels)
print(np.where(labels == 'flow=From-Botne')[0][0])

['counts' 'Sport_nunique' 'DstAddr_nunique' 'Dport_nunique' 'Dur_sum'
 'Dur_mean' 'Dur_std' 'Dur_max' 'Dur_median' 'TotBytes_sum'
 'TotBytes_mean' 'TotBytes_std' 'TotBytes_max' 'TotBytes_median'
 'SrcBytes_sum' 'SrcBytes_mean' 'SrcBytes_std' 'SrcBytes_max'
 'SrcBytes_median' 'Sport_RU' 'DstAddr_RU' 'Dport_RU']
['flow=Background' 'flow=To-Backgro' 'flow=From-Backg' 'flow=From-Norma'
 'flow=To-Normal-' 'flow=Normal-V42' 'flow=From-Botne']
6


In [12]:
y_bin6 = y==np.where(labels == 'flow=From-Botne')[0][0]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y_bin6, test_size=0.33, random_state=123456)

In [13]:
print("y", np.unique(y, return_counts=True))
print("y_train", np.unique(X_train, return_counts=True))
print("y_test", np.unique(y_test, return_counts=True))

y (array([list([0]), list([1]), list([2]), list([3]), list([4]), list([6])],
      dtype=object), array([2207092,   18047,     263,     984,      48,     286]))
y_train (array([ -14.64152587,  -14.62718816,  -14.62477672, ..., 1018.93716717,
       1038.07597368, 1038.42284291]), array([5405,    1,    1, ...,    1,    1,    2]))
y_test (array([False,  True]), array([734736,     82]))


In [None]:
print("t-SNE") # Beware: this is very time-consuming
clf = manifold.TSNE(n_components=2, random_state=123456)
clf.fit(X[['Dport_nunique', 'TotBytes_sum', 'Dur_sum', 'Dur_mean', 'TotBytes_std']])

t-SNE




In [None]:
print(clf.embedding_)

In [None]:
y_plot = np.where(y_bin6 == True)[0]
print(len(y_plot))

In [None]:
y_plot2 = np.random.choice(np.where(y_bin6 == False)[0], size=len(y_plot)*100, replace=False)
print(len(y_plot2))

In [None]:
index = list(y_plot)+list(y_plot2)
print(len(index))

In [None]:
plt.scatter(clf.embedding_[index, 0], clf.embedding_[index, 1], c=y[index])
plt.colorbar()
plt.show()

In [None]:
print("PCA")
clf = decomposition.PCA(n_components=2, random_state=123456)
clf.fit(X[['Dport_nunique', 'TotBytes_sum', 'Dur_sum', 'Dur_mean', 'TotBytes_std']].transpose())

In [None]:
print(clf.components_)
print(clf.explained_variance_ratio_)

In [None]:
y_plot = np.where(y_bin6 == True)[0]
print(len(y_plot))

In [None]:
y_plot2 = np.random.choice(np.where(y_bin6 == False)[0], size=len(y_plot)*100, replace=False)
print(len(y_plot2))

In [None]:
index = list(y_plot)+list(y_plot2)
print(len(index))

In [None]:
plt.scatter(clf.components_[0, index], clf.components_[1, index], c=y[index])
plt.colorbar()
plt.show()