# Intrusion Detection 

### Dataset from https://github.com/defcom17/NSL_KDD/
* [more info](https://docs.google.com/spreadsheets/d/1oAx320Vo9Z6HrBrL6BcfLH6sh2zIk9EKCv2OlaMGmwY/edit#gid=0)

### Sample code used: https://www.kaggle.com/meesterwaffles/nicholas-brougher-neb5211-project4


In [2]:
%config IPCompleter.greedy=True
import pandas as pd
import seaborn as sns
import numpy as np
import re
import sklearn

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import matplotlib as matplot
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.model_selection import train_test_split

In [3]:
train = pd.read_csv('https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain%2B.csv')
test = pd.read_csv('https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest%2B.csv')
train.shape
test.shape 
train.columns = range(train.shape[1])
test.columns = range(test.shape[1])
labels = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
'num_access_files', 'num_outbound_cmds', 'is_host_login',
'is_guest_login', 'count', 'srv_count', 'serror_rate',
'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
'dst_host_srv_rerror_rate', 'attack_type', 'difficulty_level']# subclass - > attack_type
combined_data = pd.concat([train, test])
combined_data.shape
combined_data.head(5)

(125972, 43)

(22542, 43)

(148514, 43)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
2,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
3,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21


In [5]:
combined_data.columns = labels
combined_data = combined_data.drop('difficulty_level', 1)
combined_data.head(3)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
1,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
2,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal


### Reduce train size for faster trainin, remove when in production

### The following few cells are taken from the 'sample code'

In [6]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

print(set(list(combined_data['attack_type']))) # use print to make it print on single line 
combined_data['attack_type'] = le.fit_transform(combined_data['attack_type'])
combined_data['protocol_type'] = le.fit_transform(combined_data['protocol_type'])
combined_data['service'] = le.fit_transform(combined_data['service'])
combined_data['flag'] = le.fit_transform(combined_data['flag'])

print('\nDescribing attack_type: ')
print("min", combined_data['attack_type'].min())
print("max", combined_data['attack_type'].max())
print("mean", combined_data['attack_type'].mean())
print("mode", combined_data['attack_type'].mode())
print("looks like 16 is 'normal' ")


{'apache2', 'smurf', 'mscan', 'pod', 'guess_passwd', 'loadmodule', 'satan', 'worm', 'udpstorm', 'processtable', 'neptune', 'warezclient', 'snmpgetattack', 'ipsweep', 'ftp_write', 'xsnoop', 'ps', 'perl', 'phf', 'land', 'httptunnel', 'mailbomb', 'sqlattack', 'nmap', 'multihop', 'teardrop', 'portsweep', 'back', 'buffer_overflow', 'warezmaster', 'snmpguess', 'named', 'imap', 'xterm', 'spy', 'xlock', 'rootkit', 'sendmail', 'saint', 'normal'}

Describing attack_type: 
min 0
max 39
mean 15.782242751525109
mode 0    16
Name: attack_type, dtype: int64
looks like 16 is 'normal' 


# Reduce feature space

In [7]:
# select least correlated
corr_matrix = combined_data.corr().abs().sort_values('attack_type')
# tmp.head(10) # to view CORR matrix 
leastCorrelated = corr_matrix['attack_type'].nsmallest(10)
leastCorrelated = list(leastCorrelated.index)

# select least correlated
leastSTD =  combined_data.std().to_frame().nsmallest(5, columns=0)
leastSTD = list(leastSTD.transpose().columns)  #fuckin pandas.core.indexes.base.Index   -_-
#tmp = tmp.append('num_outbound_cmds')  # might not work...
featureElimination = set(leastCorrelated + leastSTD)
len(featureElimination)
featureElimination

14

{'dst_bytes',
 'is_host_login',
 'land',
 'logged_in',
 'num_access_files',
 'num_compromised',
 'num_file_creations',
 'num_outbound_cmds',
 'num_root',
 'num_shells',
 'root_shell',
 'srv_rerror_rate',
 'su_attempted',
 'urgent'}

## Drop features and preform train_test_split

In [8]:
# dont change combined_data, we will neeed it latter 
combined_data_reduced = combined_data.drop(featureElimination,axis=1)
data_x = combined_data_reduced.drop('attack_type', axis=1)
data_y = combined_data_reduced.loc[:,['attack_type']]
# del combined_data # free mem
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=.5, random_state=42) # TODO

In [10]:
print("Thats how to rid rid of {0} dimentions of data, from the 10 lowest STD and 5 lowest correlation".format(len(featureElimination)))

X_train
X_test.shape

Thats how to rid rid of 14 dimentions of data, from the 10 lowest STD and 5 lowest correlation


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,wrong_fragment,hot,num_failed_logins,is_guest_login,count,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
11354,0,1,24,9,241,0,0,0,0,54,...,255,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
120182,0,1,24,9,215,0,0,0,0,2,...,26,255,1.00,0.00,0.04,0.15,0.00,0.00,0.04,0.62
94589,2,1,54,9,1691,0,0,0,0,1,...,31,29,0.94,0.06,0.03,0.00,0.00,0.00,0.00,0.00
8334,0,1,24,9,54540,0,2,0,0,5,...,255,255,1.00,0.00,0.00,0.00,0.00,0.00,0.05,0.05
15671,0,1,49,1,0,0,0,0,0,252,...,255,11,0.04,0.07,0.00,0.00,0.00,0.00,1.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,0,1,49,5,0,0,0,0,0,126,...,255,17,0.07,0.07,0.00,0.00,1.00,1.00,0.00,0.00
103694,0,1,66,5,0,0,0,0,0,134,...,255,5,0.02,0.08,0.00,0.00,1.00,1.00,0.00,0.00
5960,0,2,49,9,1,0,0,0,0,24,...,255,1,0.00,0.69,0.98,0.00,0.00,0.00,0.02,0.00
20895,0,1,24,9,223,0,0,0,0,10,...,133,255,1.00,0.00,0.01,0.02,0.00,0.00,0.00,0.00


(74257, 27)

# Feature selection with ML 

In [11]:
from sklearn import linear_model

from sklearn.ensemble import VotingClassifier

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import IsolationForest

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import gc 
gc.collect()

179

In [None]:
LR = linear_model.LinearRegression()
LR.fit(X_train, y_train)
lr_score = LR.score(X_test, y_test)
print('Linear regression processing')
print('Linear regression Score: %.2f' % lr_score)

Linear regression processing
Linear regression Score: 0.41


In [24]:
AB = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, learning_rate=1.0)
RF = RandomForestClassifier(n_estimators=10, criterion='entropy', max_features='auto', bootstrap=True)
ET = ExtraTreesClassifier(n_estimators=10, criterion='gini', max_features='auto', bootstrap=False)
GB = GradientBoostingClassifier(loss='deviance', learning_rate=0.5, n_estimators=75, max_features='auto') # learning_rate=0.1 # very slow 

In [14]:

AB_feature = AB.feature_importances_

ab_score = AB.score(X_test, y_test)
print('AdaBoostClassifier processing ...')
















print('AdaBoostClassifier Score: %.3f' % ab_score)

AdaBoostClassifier processing ...
AdaBoostClassifier Score: 0.991


In [28]:
model=RF.fit(X_train, y_train)
RF_feature = RF.feature_importances_
print(model==RF)
rf_score = RF.score(X_test, y_test)
print('RandomForestClassifier processing ,,,')
print('RandomForestClassifier Score: %.3f' % rf_score)

True
RandomForestClassifier processing ,,,
RandomForestClassifier Score: 0.993


In [None]:
from pickle import dump, load
dump(RF,open("ff",'wb'))

In [13]:
ET.fit(X_train, y_train)
ET_feature = ET.feature_importances_

et_score = ET.score(X_test, y_test)
print(ET_feature)
print('ExtraTreesClassifier processing ,,,')
print('ExtraTreeClassifier: %.3f' % et_score)

[0.01073894 0.04521562 0.02921517 0.07518461 0.03440338 0.01298276
 0.01253607 0.00483246 0.00449011 0.03123397 0.02441589 0.04402394
 0.11119815 0.01255421 0.04460608 0.01935396 0.01219457 0.02567648
 0.02280916 0.10478765 0.03429681 0.04456769 0.0259068  0.01360972
 0.11666994 0.03036914 0.05212672]
ExtraTreesClassifier processing ,,,
ExtraTreeClassifier: 0.992


In [16]:
feature_df = pd.DataFrame({'features': X_train.columns.values, # names
                           'AdaBoost' : AB_feature,
                           'RandomForest' : RF_feature,
                           'ExtraTree' : ET_feature
                          })
feature_df.head(5)

NameError: name 'RF_feature' is not defined

In [15]:
n = 10
a_f = feature_df.nlargest(n, 'AdaBoost')
r_f = feature_df.nlargest(n, 'RandomForest')
e_f = feature_df.nlargest(n, 'ExtraTree')
result = pd.concat([a_f, e_f, r_f]).drop_duplicates() 
len(result.index)

NameError: name 'feature_df' is not defined

In [None]:
selected_features = result['features'].values.tolist()
X_train_SF = X_train[selected_features]
X_test_SF = X_test[selected_features]

y = y_train['attack_type'].ravel()

print(X_train.shape)
print(x.shape)

(74257, 27)
(74257, 19)


In [27]:
clf1 = DecisionTreeClassifier() 
clf2 = RandomForestClassifier(n_estimators=25, random_state=1)# .98
# clf3 = GradientBoostingClassifier(learning_rate=.001, n_estimators=200) # .99
ET = ExtraTreesClassifier(n_estimators=10, criterion='gini', max_features='auto', bootstrap=False) # .
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),('et',ET)], voting='hard') # .('gnb', clf3)

for clf, label in zip([clf1, clf2,ET, eclf], ['DecisionTreeClassifier', 'Random Forest', 'ExtraTreesClassifier', 'Ensemble']): 
    _ = clf.fit(X_train_SF, y_train)
    pred = clf.score(X_test_SF,y_test)
    print("Acc: %0.4f [%s]" % (pred,label))

NameError: name 'X_train_SF' is not defined

In [None]:
clf1 = DecisionTreeClassifier() 
clf2 = RandomForestClassifier(n_estimators=25, random_state=1)# .98
# clf3 = GradientBoostingClassifier(learning_rate=.001, n_estimators=200) # .99
ET = ExtraTreesClassifier(n_estimators=10, criterion='gini', max_features='auto', bootstrap=False) # .
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),('et',ET)], voting='hard') # .('gnb', clf3)

for clf, label in zip([clf1, clf2,ET, eclf], ['DecisionTreeClassifier', 'Random Forest', 'ExtraTreesClassifier', 'Ensemble']): 
    _ = clf.fit(X_train, y_train)
    pred = clf.score(X_test,y_test)
    print("Acc: %0.4f [%s]" % (pred,label))

Acc: 0.9905 [DecisionTreeClassifier]
Acc: 0.9930 [Random Forest]
Acc: 0.9917 [ExtraTreesClassifier]
Acc: 0.9932 [Ensemble]


# Comparison

In [None]:
DTC = DecisionTreeClassifier() 
RFC = RandomForestClassifier(n_estimators=25, random_state=1)# .98
ETC = ExtraTreesClassifier(n_estimators=10, criterion='gini', max_features='auto', bootstrap=False) # .

selection the top 2 features from our 3 models, given by `.feature_importances_`, our will be 
`0.9619160483` and our 
`number of features` will be 6

selection the top 5 features from our 3 models, given by `.feature_importances_`, our will be 
`0.9832743041` and our 
`number of features` will be 11

In [None]:
eclf = VotingClassifier(estimators=[('DecisionTreeClassifier', DTC), ('RandomForestClassifier', RFC),('ExtraTreesClassifier',ETC)], voting='hard')
_ = eclf.fit(X_train_SF,y)
pred = eclf.score(X_test_SF,y_test)
print("Acc: %0.10f" % (pred))

print("nubmer of features",X_train_SF.shape[1])

Acc: 0.9923508895
nubmer of features 19


In [None]:
eclf = VotingClassifier(estimators=[('DecisionTreeClassifier', DTC), ('RandomForestClassifier', RFC),('ExtraTreesClassifier',ETC)], voting='hard')
_ = eclf.fit(X_train, y)
pred = eclf.score(X_test,y_test)
print("Acc: %0.10f" % (pred))

print("nubmer of features",X_train.shape[1])

Acc: 0.9932531613
nubmer of features 27


In [None]:
XX_train, XX_test, yy_train, yy_test = train_test_split(combined_data.drop('attack_type', axis=1), combined_data.loc[:,['attack_type']], test_size=.5, random_state=42) 
eclf = VotingClassifier(estimators=[('DecisionTreeClassifier', DTC), ('RandomForestClassifier', RFC),('ExtraTreesClassifier',ETC)], voting='hard')
_ = eclf.fit(XX_train, yy_train)# y would work, but random_state is = 42, a hardcored variable 
pred = eclf.score(XX_test, yy_test)
print("Acc: %0.10f" % (pred))
print("nubmer of features",XX_train.shape[1])

Acc: 0.9935494297
nubmer of features 41


# Conclusion 

* Acc: 0.9619160483 from Feature Elimination(statistical) + extra feature Elimination(ML); 6 features
* Acc: 0.9832743041 from Feature Elimination(statistical) + extra feature Elimination(ML); 11 features
* Acc: 0.9865467229 from  Feature Elimination(statistical) + feature Elimination(ML); 18 features
* Acc: 0.9890784707 from  Feature Elimination(statistical); 27 features
* Acc: 0.9935763632 = from all data; 41 features

```
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),('et',ET)], voting='hard')
_ = eclf.fit(X_train, y_train['attack_type'].ravel())
pred = eclf.score(X_test,y_test)
print("Acc: %0.10f" % (pred))
```



# Bonus 
## What if we reduce the features will PCA, SVD, RFE 

In [None]:
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA,TruncatedSVD

from sklearn.svm import LinearSVC
n= 6 

### Both are realtively fast to fit ("train") 

In [None]:
rfe = RFE(DTC, n) # decision tree classifier

# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
rfe = rfe.fit(XX_train, yy_train['attack_type'].ravel())
# print(rfe.support_) # bool array, |true| = n, as specified above 
# print(rfe.ranking_) # not wanted, but you might...  

desiredIndices = np.where(rfe.support_==True)[0]
whitelist = XX_train.columns.values[desiredIndices]
XX_train_RFE,XX_test_RFE = XX_train[whitelist],XX_test[whitelist]

eclf = VotingClassifier(estimators=[('DecisionTreeClassifier', DTC), ('RandomForestClassifier', RFC),('ExtraTreesClassifier',ETC)], voting='hard')
_ = eclf.fit(XX_train_RFE,yy_train)
pred = eclf.score(XX_test_RFE,yy_test)
print("Acc: %0.10f" % (pred))
print("number of features",XX_train_RFE.shape[1])

Acc: 0.9798941514
number of features 6


In [None]:
rfe = RFE(RFC, n) # random forest classifier
rfe = rfe.fit(XX_train, yy_train['attack_type'].ravel())

desiredIndices = np.where(rfe.support_==True)[0]
whitelist = XX_train.columns.values[desiredIndices]
XX_train_RFE,XX_test_RFE = XX_train[whitelist],XX_test[whitelist]

eclf = VotingClassifier(estimators=[('DecisionTreeClassifier', DTC), ('RandomForestClassifier', RFC),('ExtraTreesClassifier',ETC)], voting='hard')
_ = eclf.fit(XX_train_RFE,yy_train)

pred = eclf.score(XX_test_RFE,yy_test)
print("Acc: %0.10f" % (pred))
print("number of features",XX_train_RFE.shape[1])

Acc: 0.9751403908
number of features 6


### PCA is much faster to train and gave worst accuracy

https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html#PCA-as-dimensionality-reduction

In [None]:
pca = PCA(n_components=n)
_ = pca.fit(XX_train)  
XX_train_pca = pca.transform(XX_train)
XX_test_pca = pca.transform(XX_test)
# print(pca.explained_variance_ratio_)  
# print(pca.singular_values_)  

eclf = VotingClassifier(estimators=[('DecisionTreeClassifier', DTC), ('RandomForestClassifier', RFC),('ExtraTreesClassifier',ETC)], voting='hard')
_ = eclf.fit(XX_train_pca,yy_train)

pred = eclf.score(XX_test_pca,yy_test)
print("Acc: %0.10f" % (pred))
print("nubmer of features",XX_train_pca.shape[1])

Acc: 0.9736186487
nubmer of features 6


In [None]:
print(f'explained_variance_ratio_ is: {np.sum(pca.explained_variance_ratio_):.15f}')  

print(pca.explained_variance_ratio_)  
print(pca.singular_values_)  

print(len(pca.components_))
print(len(pca.components_[0]))
print("6 components, 41 in length")

explained_variance_ratio_ is: 0.999999999952008
[5.50107175e-01 4.49892716e-01 1.08357432e-07 3.81375180e-10
 1.92710577e-10 1.17330222e-10]
[1.51474528e+09 1.36984186e+09 6.72272433e+05 3.98833991e+04
 2.83510226e+04 2.21218222e+04]
6
41
6 components, 41 in length


### TruncatedSVD

In [None]:
svd = TruncatedSVD(n_components=n) 
_ = svd.fit(XX_train)  

In [None]:
XX_train_svd = svd.transform(XX_train)
XX_test_svd = svd.transform(XX_test)

eclf = VotingClassifier(estimators=[('DecisionTreeClassifier', DTC), ('RandomForestClassifier', RFC),('ExtraTreesClassifier',ETC)], voting='hard')
_ = eclf.fit(XX_train_svd,yy_train)

pred = eclf.score(XX_test_svd,yy_test)
print("Acc: %0.10f" % (pred))
print("number of features",XX_train_svd.shape[1])

Acc: 0.9770796019
number of features 6


In [None]:
print(svd.explained_variance_ratio_)  
print(svd.singular_values_)  

print(len(svd.components_))
print(len(svd.components_[0]))
print("6 components, 41 in length")

[5.50107172e-01 4.49892719e-01 1.08344868e-07 2.10633443e-10
 3.37436622e-10 1.49984514e-10]
[1.51477859e+09 1.36986009e+09 6.76506292e+05 7.04951620e+04
 3.79438065e+04 2.51512669e+04]
6
41
6 components, 41 in length


# We conclude our conclusion
#### RFE; 6
* Acc: 0.9798537512

#### PCA; 6
* Acc: 0.9737667829

#### TruncatedSVD; 6
* Acc: 0.9765543989
  * if dims = 2, Acc: 0.9441938134

#### Feature Elimination(statistical) + extra feature Elimination(ML); 6 features
* Acc: 0.9619160483  

#### Feature Elimination(statistical) + extra feature Elimination(ML); 11 features
* Acc: 0.9832743041  

#### Feature Elimination(statistical) + feature Elimination(ML); 18 features
* Acc: 0.9865467229 

#### Feature Elimination(statistical); 27 features
* Acc: 0.9890784707 

#### from all data; 41 features
* Acc: 0.9935763632