In [1]:
import numpy as np
import pandas as pd
import scipy.io
from scipy import sparse
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import math
from matplotlib.pyplot import *

In [2]:
class Reservoir(object):
    """
    Build a reservoir and evaluate internal states
    
    Parameters:
        n_internal_units = processing units in the reservoir
        spectral_radius = largest eigenvalue of the reservoir matrix of connection weights
        leak = amount of leakage in the reservoir state update (optional)
        connectivity = percentage of nonzero connection weights (unused in circle reservoir)
        input_scaling = scaling of the input connection weights
        noise_level = deviation of the Gaussian noise injected in the state update
        circle = generate determinisitc reservoir with circle topology
    """
    
    def __init__(self, n_internal_units=100, spectral_radius=0.99, leak=None,
                 connectivity=0.3, input_scaling=0.2, noise_level=0.01, circle=False):
        
        # Initialize attributes
        self._n_internal_units = n_internal_units
        self._input_scaling = input_scaling
        self._noise_level = noise_level
        self._leak = leak

        # Input weights depend on input size: they are set when data is provided
        self._input_weights = None

        # Generate internal weights
        if circle:
            self._internal_weights = self._initialize_internal_weights_Circ(
                    n_internal_units,
                    spectral_radius)
        else:
            self._internal_weights = self._initialize_internal_weights(
                n_internal_units,
                connectivity,
                spectral_radius)


    def _initialize_internal_weights_Circ(self, n_internal_units, spectral_radius):
        
        internal_weights = np.zeros((n_internal_units, n_internal_units))
        internal_weights[0,-1] = spectral_radius
        for i in range(n_internal_units-1):
            internal_weights[i+1,i] = spectral_radius
                
        return internal_weights
    
    
    def _initialize_internal_weights(self, n_internal_units,
                                     connectivity, spectral_radius):

        # Generate sparse, uniformly distributed weights.
        internal_weights = sparse.rand(n_internal_units,
                                       n_internal_units,
                                       density=connectivity).todense()

        # Ensure that the nonzero values are uniformly distributed in [-0.5, 0.5]
        internal_weights[np.where(internal_weights > 0)] -= 0.5
        
        # Adjust the spectral radius.
        E, _ = np.linalg.eig(internal_weights)
        e_max = np.max(np.abs(E))
        internal_weights /= np.abs(e_max)/spectral_radius       

        return internal_weights


    def _compute_state_matrix(self, X, n_drop=0):
        N, T, _ = X.shape
        previous_state = np.zeros((N, self._n_internal_units), dtype=float)

        # Storage
        state_matrix = np.empty((N, T - n_drop, self._n_internal_units), dtype=float)
        for t in range(T):
            current_input = X[:, t, :]

            # Calculate state
            state_before_tanh = self._internal_weights.dot(previous_state.T) + self._input_weights.dot(current_input.T)

            # Add noise
            state_before_tanh += np.random.rand(self._n_internal_units, N)*self._noise_level

            # Apply nonlinearity and leakage (optional)
            if self._leak is None:
                previous_state = np.tanh(state_before_tanh).T
            else:
                previous_state = (1.0 - self._leak)*previous_state + np.tanh(state_before_tanh).T

            # Store everything after the dropout period
            if (t > n_drop - 1):
                state_matrix[:, t - n_drop, :] = previous_state

        return state_matrix


    def get_states(self, X, n_drop=0, bidir=True):
        N, T, V = X.shape
        if self._input_weights is None:
            self._input_weights = (2.0*np.random.binomial(1, 0.5 , [self._n_internal_units, V]) - 1.0)*self._input_scaling

        # compute sequence of reservoir states
        states = self._compute_state_matrix(X, n_drop)
    
        # reservoir states on time reversed input
        if bidir is True:
            X_r = X[:, ::-1, :]
            states_r = self._compute_state_matrix(X_r, n_drop)
            states = np.concatenate((states, states_r), axis=2)

        return states
    
    def getReservoirEmbedding(self, X,pca, ridge_embedding,  n_drop=5, bidir=True, test = False):

        res_states = self.get_states(X, n_drop=5, bidir=True)


        N_samples = res_states.shape[0]
        res_states = res_states.reshape(-1, res_states.shape[2])                   
        # ..transform..
        if test:
            red_states = pca.transform(res_states)
        else:
            red_states = pca.fit_transform(res_states)          
        # ..and put back in tensor form
        red_states = red_states.reshape(N_samples,-1,red_states.shape[1])  

        coeff_tr = []
        biases_tr = []   

        for i in range(X.shape[0]):
            ridge_embedding.fit(red_states[i, 0:-1, :], red_states[i, 1:, :])
            coeff_tr.append(ridge_embedding.coef_.ravel())
            biases_tr.append(ridge_embedding.intercept_.ravel())
        #print(np.array(coeff_tr).shape,np.array(biases_tr).shape)
        input_repr = np.concatenate((np.vstack(coeff_tr), np.vstack(biases_tr)), axis=1)
        return input_repr

In [3]:
def targetify(s):
    if s == 'Benign':
        return 0
    else:
        return 1

In [4]:
def eqArray(a,b):
    return np.where(a == b, 1, 0)

In [5]:
datasets = ["Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv", "Friday-16-02-2018_TrafficForML_CICFlowMeter.csv"]

features_Th15022018 = ['Fwd Seg Size Min', 'Init Fwd Win Byts', 'Fwd IAT Min', 'Bwd Pkt Len Max', 'Bwd IAT Mean', 'Pkt Len Max', 'Fwd IAT Tot', 'Fwd IAT Max', 'Bwd Pkt Len Std', 'Flow IAT Mean', 'ACK Flag Cnt', 'Fwd IAT Mean', 'Flow Duration', 'Flow IAT Min', 'Flow IAT Max', 'Bwd IAT Max', 'Idle Max', 'Init Bwd Win Byts', 'Idle Min', 'Flow IAT Std']
features_Fr16022018 = ['Fwd Pkt Len Std', 'Fwd Pkt Len Max', 'Fwd Pkt Len Mean', 'Pkt Len Std', 'Fwd Seg Size Avg', 'Pkt Len Mean', 'Bwd Pkt Len Mean', 'TotLen Fwd Pkts', 'Subflow Fwd Byts', 'Bwd Pkt Len Max', 'Bwd Pkt Len Std', 'Pkt Len Var', 'Flow IAT Mean', 'ACK Flag Cnt', 'Pkt Len Max']

#numFeatures = [10, 15, 20]
#fracOfData = [0.5, 0.75, 1]
#numInternalUnits = [5, 10, 15, 20]

In [6]:
dataset = "Friday-16-02-2018_TrafficForML_CICFlowMeter.csv"
path = "../Datasets/Raw_Dataset/" + dataset
df1 = pd.read_csv(path)
df2 = df1.groupby(['Timestamp'])['Flow Duration'].count()
df2 = pd.DataFrame(df2).reset_index()
df2.columns=['Timestamp','CountOfOtherBiFlowsAtThisTimestamp']
df = df1.merge(df2, left_on='Timestamp', right_on='Timestamp')
df = df.sort_values('Timestamp')
num_features = 15 # should be 10, 15, or 20
features = features_Fr16022018[0:num_features]
df

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,CountOfOtherBiFlowsAtThisTimestamp
140234,0,0,16/02/2018 01:00:32,112640723,3,0,0,0,0,0,...,0,0,0,0,5.63e+07,101.116,56300000,56300000,Benign,1
140233,22,6,16/02/2018 01:01:42,2146470,14,12,1335,2273,744,0,...,0,0,0,0,0,0,0,0,Benign,1
140236,0,0,16/02/2018 01:03:21,112640737,3,0,0,0,0,0,...,0,0,0,0,5.63e+07,17.6777,56300000,56300000,Benign,1
140264,67,17,16/02/2018 01:03:50,721,1,1,300,329,300,300,...,0,0,0,0,0,0,0,0,Benign,1
140238,0,0,16/02/2018 01:06:10,112640647,3,0,0,0,0,0,...,0,0,0,0,5.63e+07,58.6899,56300000,56300000,Benign,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140229,0,0,16/02/2018 12:54:54,112640695,3,0,0,0,0,0,...,0,0,0,0,5.63e+07,26.163,56300000,56300000,Benign,1
140232,0,0,16/02/2018 12:57:43,112640664,3,0,0,0,0,0,...,0,0,0,0,5.63e+07,28.2843,56300000,56300000,Benign,1
140230,22,6,16/02/2018 12:58:13,10162102,9,7,1063,1297,744,0,...,989739,0,989739,989739,9.17234e+06,0,9172344,9172344,Benign,1
140231,42453,6,16/02/2018 12:58:24,855,2,0,848,0,848,0,...,0,0,0,0,0,0,0,0,Benign,1


In [7]:
fraction = 0.5
print(str(num_features) + " features")
print("fraction:" + str(fraction))
data = df.sample(frac=fraction, replace=True, random_state=1)

# get X and y. Normalize X and make it into 3D shape for reservoir
num_col = data.shape[1]
num_row = data.shape[0]

X_data = data[features]
X_data[features] = X_data[features].apply(pd.to_numeric, errors='coerce', axis=1)
#norm_scaler = preprocessing.StandardScaler()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_data.values)
X = np.nan_to_num(x_scaled)
if len(X.shape) < 3:
    X = np.atleast_3d(X)
y = data['Label'].apply(targetify)
print("Finished loading X and y......")

# split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

print("X_train shape:" + str(X_train.shape), "y_train shape:" + str(y_train.shape))
print("X_test shape:" + str(X_test.shape), "y_test shape:" + str(y_test.shape))

pca = PCA() #n_components gives number of components to keep for linear dimensionality reduction
ridge_embedding = Ridge(alpha=10, fit_intercept=True)
readout = Ridge(alpha=5)

15 features
fraction:0.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Finished loading X and y......
X_train shape:(419430, 15, 1) y_train shape:(419430,)
X_test shape:(104858, 15, 1) y_test shape:(104858,)


In [8]:
n=10 #number of internal units
print(str(n) + " internal units")

#run through reservoir
res = Reservoir(n_internal_units=n, spectral_radius=0.9, leak=0.2,
     connectivity=0.25, input_scaling=0.3, noise_level=0.01, circle=False)
input_repr = res.getReservoirEmbedding(np.array(X_train), pca, ridge_embedding,  n_drop=5, bidir=False, test = False)
print("Finished loading training reservoir embedding......")
input_repr_te = res.getReservoirEmbedding(np.array(X_test), pca, ridge_embedding,  n_drop=5, bidir=False, test = True)
print("Finished loading testing reservoir embedding......")

#fit output
readout.fit(input_repr, y_train)
pred_class = readout.predict(input_repr_te)
#predictions = [int(round(x)) for x in pred_class]
true_class = list(y_test)

#analysis
compdf = pd.DataFrame({'pred_class':pred_class, 'true_class':true_class})
compdf = compdf.sort_values('pred_class', ascending=False)
print(str(compdf.head(10)))
#compdf.to_csv(str(dataset.split('_')[0]) + '_' + str(fraction) + '_' + str(num_features) + '_' + str(n) + '.csv')
#accuracy = np.sum(list(map(eqArray, predictions, true_class))) / len(true_class)
#f1 = f1_score(true_class, predictions)
#auc = roc_auc_score(true_class, predictions)

#print("# of nonzero:" + str(np.count_nonzero(predictions)))
#print("accuracy is " + str(accuracy))
#print("f1 is " + str(f1))
#print("auc is " + str(auc))
#print("*******************************************************************")

10 internal units
Finished loading training reservoir embedding......
Finished loading testing reservoir embedding......
       pred_class  true_class
16528    1.299044           1
28200    1.250129           1
65193    1.197335           1
47406    1.186611           1
24289    1.182642           1
47987    1.180157           1
25834    1.177142           1
16105    1.174702           1
37604    1.172053           1
81100    1.169738           1


In [9]:
def myRound(x, r):
    if x>r/float(1000):
        return 1
    else:
        return 0

In [10]:
predictions = list(compdf['pred_class'].apply(myRound, r=225))
true_class = list(compdf['true_class'])
accuracy = np.sum(list(map(eqArray, predictions, true_class))) / len(true_class)
accuracy

0.9957084819470141

# Calculating Confusion Matrix

In [11]:
from sklearn.metrics import confusion_matrix

In [12]:
confm = confusion_matrix(true_class, predictions)
confm

array([[44275,   450],
       [    0, 60133]])

In [13]:
tn, fp, fn, tp = confm.ravel()
(tn + tp)/(tn+tp+fn+fp)

0.9957084819470141

# Predicting Hulk Attack

In [14]:
hulkdf = pd.read_csv("../CustomDatasets/RegularHulkAttack.csv")
hulkdf2 = hulkdf.groupby(['Timestamp'])['Flow Duration'].count()
hulkdf2 = pd.DataFrame(hulkdf2).reset_index()
hulkdf2.columns=['Timestamp','CountOfOtherBiFlowsAtThisTimestamp']
hulkdf = hulkdf.merge(hulkdf2, left_on='Timestamp', right_on='Timestamp')
hulkdf = hulkdf.sort_values('Timestamp')
hulkdf.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,CountOfOtherBiFlowsAtThisTimestamp
700,192.168.0.15-224.0.0.1-59695-8612-17,192.168.0.15,59695,224.0.0.1,8612,17,6/12/2019 9:19,107,1,1,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,Benign,7
699,192.168.0.255-192.168.0.15-8612-59018-17,192.168.0.15,59018,192.168.0.255,8612,17,6/12/2019 9:19,68,1,1,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,Benign,7
698,192.168.0.12-52.177.165.30-63590-443-6,192.168.0.12,63590,52.177.165.30,443,6,6/12/2019 9:19,95804125,5,4,...,141099.0,3923.028422,143873.0,138325,47700000.0,3219230.901,49900000.0,45400000.0,Benign,7
697,192.168.0.12-224.0.0.22-0-0-0,192.168.0.12,0,224.0.0.22,0,0,6/12/2019 9:19,2505068,2,1,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,Benign,7
696,192.168.0.1-224.0.0.1-0-0-0,192.168.0.1,0,224.0.0.1,0,0,6/12/2019 9:19,174,1,1,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,Benign,7


In [15]:
features = ['Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
       'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg',
       'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg',
       'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Subflow Fwd Pkts',
       'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts',
       'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts',
       'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max',
       'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']

In [16]:
X_hulk_data = hulkdf[features]
X_hulk_data[features] = X_hulk_data[features].apply(pd.to_numeric, errors='coerce', axis=1)
min_max_scaler = preprocessing.MinMaxScaler()
x_hulk_scaled = min_max_scaler.fit_transform(X_hulk_data.values)
X_hulk = np.nan_to_num(x_hulk_scaled)
if len(X_hulk.shape) < 3:
    X_hulk = np.atleast_3d(X_hulk)
y_hulk = hulkdf['Label'].apply(targetify)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [17]:
input_repr_te_hulk = res.getReservoirEmbedding(np.array(X_hulk), pca, ridge_embedding,  n_drop=5, bidir=False, test = True)
print("Finished loading testing reservoir embedding......")

Finished loading testing reservoir embedding......


In [18]:
pred_class_hulk = readout.predict(input_repr_te_hulk)
predictions_hulk = [int(round(x)) for x in pred_class_hulk]
true_class_hulk = list(y_hulk)

In [19]:
compdf_hulk = pd.DataFrame({'pred_class':pred_class_hulk, 'true_class':true_class_hulk})
compdf_hulk = compdf_hulk.sort_values('pred_class', ascending=False)

In [20]:
def myRound(x, r):
    if x>r/float(1000):
        return 1
    else:
        return 0

In [21]:
def eqArray(a,b):
    return np.where(a == b, 1, 0)

In [22]:
predictions_hulk = list(compdf_hulk['pred_class'].apply(myRound, r=225))
true_class_hulk = list(compdf_hulk['true_class'])
accuracy_hulk = np.sum(list(map(eqArray, predictions_hulk, true_class_hulk))) / len(true_class_hulk)
accuracy_hulk

0.06591639871382636

# Calculating Confusion Matrix

In [23]:
from sklearn.metrics import confusion_matrix

In [24]:
confm = confusion_matrix(true_class_hulk, predictions_hulk)
confm

array([[ 251,    7],
       [8127,  323]])

In [25]:
tn, fp, fn, tp = confm.ravel()
(tn + tp)/(tn+tp+fn+fp)

0.06591639871382636