In [25]:
import numpy as np
import pandas as pd
import scipy.io
from scipy import sparse
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, f1_score
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import math
from matplotlib.pyplot import *

In [26]:
class Reservoir(object):
    """
    Build a reservoir and evaluate internal states
    
    Parameters:
        n_internal_units = processing units in the reservoir
        spectral_radius = largest eigenvalue of the reservoir matrix of connection weights
        leak = amount of leakage in the reservoir state update (optional)
        connectivity = percentage of nonzero connection weights (unused in circle reservoir)
        input_scaling = scaling of the input connection weights
        noise_level = deviation of the Gaussian noise injected in the state update
        circle = generate determinisitc reservoir with circle topology
    """
    
    def __init__(self, n_internal_units=100, spectral_radius=0.99, leak=None,
                 connectivity=0.3, input_scaling=0.2, noise_level=0.01, circle=False):
        
        # Initialize attributes
        self._n_internal_units = n_internal_units
        self._input_scaling = input_scaling
        self._noise_level = noise_level
        self._leak = leak

        # Input weights depend on input size: they are set when data is provided
        self._input_weights = None

        # Generate internal weights
        if circle:
            self._internal_weights = self._initialize_internal_weights_Circ(
                    n_internal_units,
                    spectral_radius)
        else:
            self._internal_weights = self._initialize_internal_weights(
                n_internal_units,
                connectivity,
                spectral_radius)


    def _initialize_internal_weights_Circ(self, n_internal_units, spectral_radius):
        
        internal_weights = np.zeros((n_internal_units, n_internal_units))
        internal_weights[0,-1] = spectral_radius
        for i in range(n_internal_units-1):
            internal_weights[i+1,i] = spectral_radius
                
        return internal_weights
    
    
    def _initialize_internal_weights(self, n_internal_units,
                                     connectivity, spectral_radius):

        # Generate sparse, uniformly distributed weights.
        internal_weights = sparse.rand(n_internal_units,
                                       n_internal_units,
                                       density=connectivity).todense()

        # Ensure that the nonzero values are uniformly distributed in [-0.5, 0.5]
        internal_weights[np.where(internal_weights > 0)] -= 0.5
        
        # Adjust the spectral radius.
        E, _ = np.linalg.eig(internal_weights)
        e_max = np.max(np.abs(E))
        internal_weights /= np.abs(e_max)/spectral_radius       

        return internal_weights


    def _compute_state_matrix(self, X, n_drop=0):
        N, T = X.shape
        previous_state = np.zeros((N, self._n_internal_units), dtype=float)

        # Storage
        state_matrix = np.empty((N, T - n_drop, self._n_internal_units), dtype=float)
        for t in range(T):
            X_row = X.shape[0]
            current_input = X[:, t].reshape(X_row,1)
            new_input_weights = self._input_weights.reshape(self._n_internal_units, 1)

            # Calculate state
            state_before_tanh = self._internal_weights.dot(previous_state.T) + new_input_weights.dot(current_input.T)

            # Add noise
            state_before_tanh += np.random.rand(self._n_internal_units, N)*self._noise_level

            # Apply nonlinearity and leakage (optional)
            if self._leak is None:
                previous_state = np.tanh(state_before_tanh).T
            else:
                previous_state = (1.0 - self._leak)*previous_state + np.tanh(state_before_tanh).T

            # Store everything after the dropout period
            if (t > n_drop - 1):
                state_matrix[:, t - n_drop] = previous_state

        return state_matrix


    def get_states(self, X, n_drop=0, bidir=True):
        N, T = X.shape
        if self._input_weights is None:
            self._input_weights = (2.0*np.random.binomial(1, 0.5 , [self._n_internal_units]) - 1.0)*self._input_scaling

        # compute sequence of reservoir states
        states = self._compute_state_matrix(X, n_drop)
    
        # reservoir states on time reversed input
        if bidir is True:
            X_r = X[:, ::-1]
            states_r = self._compute_state_matrix(X_r, n_drop)
            states = np.concatenate((states, states_r), axis=2)

        return states
    
    def getReservoirEmbedding(self, X,pca, ridge_embedding,  n_drop=5, bidir=True, test = False):

        res_states = self.get_states(X, n_drop=5, bidir=True)


        N_samples = res_states.shape[0]
        res_states = res_states.reshape(-1, res_states.shape[2])                   
        # ..transform..
        if test:
            red_states = pca.transform(res_states)
        else:
            red_states = pca.fit_transform(res_states)          
        # ..and put back in tensor form
        red_states = red_states.reshape(N_samples,-1,red_states.shape[1])  

        coeff_tr = []
        biases_tr = []   

        for i in range(X.shape[0]):
            ridge_embedding.fit(red_states[i, 0:-1], red_states[i, 1:])
            coeff_tr.append(ridge_embedding.coef_.ravel())
            biases_tr.append(ridge_embedding.intercept_.ravel())
        #print(np.array(coeff_tr).shape,np.array(biases_tr).shape)
        input_repr = np.concatenate((np.vstack(coeff_tr), np.vstack(biases_tr)), axis=1)
        return input_repr

In [27]:
data = pd.read_csv("Datasets/Raw_Dataset/Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv")
data.head()

# plot some of it
#figure(10).clear()
#plot(data[0:1000])
#title('A sample of data')

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,15/02/2018 08:25:18,112641158,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320579.0,704.2784,56321077,56320081,Benign
1,22,6,15/02/2018 08:29:05,37366762,14,12,2168,2993,712,0,...,32,1024353.0,649038.754495,1601183,321569,11431221.0,3644991.0,15617415,8960247,Benign
2,47514,6,15/02/2018 08:29:42,543,2,0,64,0,64,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
3,0,0,15/02/2018 08:28:07,112640703,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320351.5,366.9884,56320611,56320092,Benign
4,0,0,15/02/2018 08:30:56,112640874,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320437.0,719.8347,56320946,56319928,Benign


In [28]:
features = ['Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
       'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg',
       'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg',
       'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Subflow Fwd Pkts',
       'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts',
       'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts',
       'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max',
       'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']

In [29]:
def targetify(s):
    if s == 'Benign':
        return 0
    else:
        return 1

In [36]:
num_col = data.shape[1]
num_row = data.shape[0]

X_data = data[features]
print(max(X_data.max()))
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_data.values)
X = pd.DataFrame(np.nan_to_num(x_scaled))

y = data['Label'].apply(targetify)

120000000.0


ValueError: Input contains infinity or a value too large for dtype('float64').

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# Working with 0.01% of the data for now because my laptop sucks
X_train2 = X_train.sample(frac=0.0005, replace=True, random_state=1)
y_train2 = y_train.sample(frac=0.0005, replace=True, random_state=1)
X_test2 = X_test.sample(frac=0.0005, replace=True, random_state=1)
y_test2 = y_test.sample(frac=0.0005, replace=True, random_state=1)
print(X_train2.shape, y_train2.shape)
print(X_test2.shape, y_test2.shape)

(217, 76) (217,)
(217, 76) (217,)


In [9]:
y_train2, y_test2

(280610    1
 575303    0
 339149    0
 387454    0
 176816    0
          ..
 102919    1
 17289     1
 473524    1
 69493     1
 71242     1
 Name: Label, Length: 217, dtype: int64, 684385    0
 225389    1
 356268    1
 405951    1
 731484    1
          ..
 725702    1
 779509    0
 991       1
 2838      1
 85172     1
 Name: Label, Length: 217, dtype: int64)

In [10]:
pca = PCA() #n_components gives number of components to keep for linear dimensionality reduction
ridge_embedding = Ridge(alpha=10, fit_intercept=True)
readout = Ridge(alpha=5)

In [11]:
res = Reservoir(n_internal_units=450, spectral_radius=0.6, leak=0.6,
                 connectivity=0.25, input_scaling=0.1, noise_level=0.01, circle=False)

In [12]:
input_repr = res.getReservoirEmbedding(np.array(X_train2), pca, ridge_embedding,  n_drop=5, bidir=True, test = False)
input_repr

array([[ 2.79067803e-01, -4.52966542e-03, -9.53321760e-02, ...,
        -1.24577156e-04,  1.43241308e-04,  2.33797285e-04],
       [ 3.83648038e-01,  1.95321313e-02, -1.33223641e-01, ...,
         6.05123511e-04,  3.76582511e-04,  1.35710900e-04],
       [ 3.86092394e-01,  1.12259298e-02, -1.39078004e-01, ...,
        -4.40132740e-05,  3.14878467e-04,  3.53285844e-04],
       ...,
       [ 2.78495232e-01, -6.95508256e-03, -9.97249501e-02, ...,
         6.88239768e-04,  2.23203590e-04,  5.39162492e-05],
       [ 2.50645138e-01, -3.50369996e-02, -2.99649993e-02, ...,
         1.25176333e-05, -3.88477883e-04, -1.38675190e-04],
       [ 2.35697097e-01, -6.00979114e-02, -1.04561599e-02, ...,
        -3.94758300e-04, -3.96415758e-04, -1.68510555e-05]])

In [13]:
input_repr_te = res.getReservoirEmbedding(np.array(X_test2), pca, ridge_embedding,  n_drop=5, bidir=True, test = True)
input_repr_te

array([[ 3.87419579e-01,  1.84755793e-02, -1.36031842e-01, ...,
         1.27909923e-04, -1.09590484e-04, -3.21338929e-04],
       [ 2.82391638e-01, -2.73882310e-03, -9.94820062e-02, ...,
        -2.56772384e-04, -1.61103495e-05,  3.17644504e-04],
       [ 2.77460396e-01, -9.33554877e-04, -1.01468255e-01, ...,
         2.20166973e-04, -7.27238954e-05, -2.30160851e-04],
       ...,
       [ 2.49256094e-01, -4.30476305e-02, -3.90209133e-02, ...,
        -3.24588359e-04, -1.71286909e-04,  4.55467820e-04],
       [ 3.09271995e-01, -1.61734670e-02, -9.88342091e-02, ...,
        -6.05691345e-04, -4.49870529e-04, -3.80140340e-04],
       [ 2.35901905e-01, -6.13236810e-02, -1.52295534e-02, ...,
         2.47418458e-04, -2.35755201e-04,  1.17816276e-04]])

In [14]:
readout.fit(input_repr, y_train2)
pred_class = readout.predict(input_repr_te)
print(pred_class)
predictions = [int(round(x)) for x in pred_class]

[0.10684051 0.92983248 0.93443069 0.89726583 0.89349977 0.91078312
 0.91245775 0.93024863 0.08722222 0.09520192 0.91723061 0.92121319
 0.93458534 0.96839943 0.12767198 0.11660853 0.0977496  0.92543386
 0.95501082 0.92808709 0.92404588 0.12250746 0.94458688 0.10957271
 0.08823592 0.95505284 0.08896763 0.11686613 0.10327713 0.11080393
 0.92470462 0.11798038 0.12986482 0.15601605 0.93022622 0.10730854
 0.91047887 0.15678829 0.06634082 0.07493896 0.93763771 0.94419083
 0.90999526 0.13374329 0.9352181  0.39279999 0.94168984 0.10509894
 0.94178258 0.91402726 0.12016742 0.96464508 0.10455699 0.07169232
 0.07496622 0.94714287 0.88531756 0.39948227 0.91385189 0.10914914
 0.09756559 0.92483037 0.35500964 0.09876666 0.9292927  0.11151673
 0.95045733 0.92519636 0.11265909 0.12212483 0.96545494 0.92054592
 0.92490878 0.92403157 0.92496533 0.09074793 0.95155542 0.90776642
 0.10890745 0.93796717 0.89813517 0.90924762 0.94820203 0.94575699
 0.13856579 0.94039721 0.095375   0.10558182 0.92564439 0.1187

In [15]:
true_class = list(y_test2)
true_class

[0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1]

In [16]:
df = pd.DataFrame({'pred_class':pred_class, 'true_class':true_class})
df = df.sort_values('pred_class', ascending=False)
print(df.to_string())

     pred_class  true_class
180    0.972572           1
131    0.971240           1
13     0.968399           1
185    0.966542           1
70     0.965455           1
206    0.964887           1
51     0.964645           1
214    0.963814           1
187    0.958513           1
25     0.955053           1
18     0.955011           1
76     0.951555           1
66     0.950457           1
201    0.949756           1
150    0.948526           1
82     0.948202           1
55     0.947143           1
161    0.946223           1
171    0.946034           1
83     0.945757           1
22     0.944587           1
189    0.944336           1
41     0.944191           1
144    0.943709           1
135    0.943684           1
181    0.943493           1
215    0.943450           1
105    0.943158           1
96     0.942112           1
48     0.941783           1
46     0.941690           1
212    0.941681           1
85     0.940397           1
130    0.940026           1
190    0.939728     

In [17]:
def eqArray(a,b):
    return np.where(a == b, 1, 0)

np.sum(list(map(eqArray, predictions, true_class))) / len(true_class)

#accuracy_score(true_class, pred_class, normalize=False)

0.9631336405529954

In [157]:
#f1_score(true_class, pred_class, average='weighted')