## Case Study 3 - Network Intrusion Detection System

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn

In [2]:
# Column names to the dataset
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","attack","last_flag"]

df_train = pd.read_csv('E:/Train.txt', header=None, names = col_names)
df_test = pd.read_csv("E:/Test.txt", header=None, names = col_names)

In [3]:
df_train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [4]:
df_test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
1,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint,15
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan,11


In [5]:
print(df_train.shape)
print(df_test.shape)

(125973, 43)
(22544, 43)


In [6]:
print('Attack distribution of train data:')
print(df_train['attack'].value_counts())

Attack distribution of train data:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: attack, dtype: int64


In [7]:
print('Attack distribution of test data:')
print(df_test['attack'].value_counts())

Attack distribution of test data:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178
portsweep           157
ipsweep             141
httptunnel          133
nmap                 73
pod                  41
buffer_overflow      20
multihop             18
named                17
ps                   15
sendmail             14
rootkit              13
xterm                13
teardrop             12
xlock                 9
land                  7
xsnoop                4
ftp_write             3
sqlattack             2
phf                   2
perl                  2
loadmodule            2
worm                  2
udpstorm              2
imap                  1
Name: attack, dtype: int64


### Identify Categorical features

In [8]:
# Train data
print("Train Data")
for col_name in df_train.columns:
    if df_train[col_name].dtypes == 'object' :
        unique_cat = len(df_train[col_name].unique())
        print("Feature",col_name,"is having",unique_cat,"categories")

Train Data
Feature protocol_type is having 3 categories
Feature service is having 70 categories
Feature flag is having 11 categories
Feature attack is having 23 categories


In [9]:
print('Distribution of categories in service:')
print(df_train['service'].value_counts().sort_values(ascending=False).head())

Distribution of categories in service:
http        40338
private     21853
domain_u     9043
smtp         7313
ftp_data     6860
Name: service, dtype: int64


In [10]:
# Test data
print("Test Data")
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        unique_cat = len(df_test[col_name].unique())
        print("Feature",col_name,"is having",unique_cat,"categories")

Test Data
Feature protocol_type is having 3 categories
Feature service is having 64 categories
Feature flag is having 11 categories
Feature attack is having 38 categories


##### Need to create dummy vaiables for all the these categories

### Label Encoder

In [11]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

categorical_columns=['protocol_type', 'service', 'flag']

# Get the categorical values into a 2D numpy array
train_df_categorical_values = df_train[categorical_columns]
test_df_categorical_values = df_test[categorical_columns]

In [12]:
train_df_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF


In [13]:
test_df_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,private,REJ
1,tcp,private,REJ
2,tcp,ftp_data,SF
3,icmp,eco_i,SF
4,tcp,telnet,RSTO


### Add dummy variables

In [14]:
# Protocol_type
unique_protocol = sorted(df_train.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol_dummy = [string1 + x for x in unique_protocol]

# service
unique_service = sorted(df_train.service.unique())
string2 = 'service_'
unique_service_dummy = [string2 + x for x in unique_service]

# flag
unique_flag = sorted(df_train.flag.unique())
string3 = 'flag_'
unique_flag_dummy = [string3 + x for x in unique_flag]

# put together
train_dum_cols = unique_protocol_dummy + unique_service_dummy + unique_flag_dummy
print(train_dum_cols)

['Protocol_type_icmp', 'Protocol_type_tcp', 'Protocol_type_udp', 'service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_harvest', 'service_hostnames', 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_red_i', 'service_remote_job', 'serv

In [15]:
# For Test, counnt if unique service categories is less as compared to train data
unique_service_test = sorted(df_test.service.unique())
unique_service_test_dummy = [string2 + x for x in unique_service_test]
test_dum_cols = unique_protocol_dummy + unique_service_test_dummy + unique_flag_dummy
print(test_dum_cols)

['Protocol_type_icmp', 'Protocol_type_tcp', 'Protocol_type_udp', 'service_IRC', 'service_X11', 'service_Z39_50', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_hostnames', 'service_http', 'service_http_443', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_remote_job', 'service_rje', 'service_shell', 'service_smtp', 'service_sql_net', 'service_ssh', 'service_sunrpc'

### Converting categorical coulumns into numbers using LabelEncoder

In [16]:
# train data
print("Train data:")
train_df_categorical_values_enc = train_df_categorical_values.apply(LabelEncoder().fit_transform)
print(train_df_categorical_values_enc.head())

# test data
print()
print("Test data:")
test_df_categorical_values_enc=test_df_categorical_values.apply(LabelEncoder().fit_transform)
print(test_df_categorical_values_enc.head())

Train data:
   protocol_type  service  flag
0              1       20     9
1              2       44     9
2              1       49     5
3              1       24     9
4              1       24     9

Test data:
   protocol_type  service  flag
0              1       45     1
1              1       45     1
2              1       19     9
3              0       13     9
4              1       55     2


### One-hot Encoding

In [17]:
enc = OneHotEncoder()

# train data
train_df_categorical_values_encode = enc.fit_transform(train_df_categorical_values_enc)
train_df_cat_data = pd.DataFrame(train_df_categorical_values_encode.toarray(),columns=train_dum_cols)

# test data
test_df_categorical_values_encode = enc.fit_transform(test_df_categorical_values_enc)
test_df_cat_data = pd.DataFrame(test_df_categorical_values_encode.toarray(),columns=test_dum_cols)

In [18]:
train_df_cat_data.head()

Unnamed: 0,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
test_df_cat_data.head()

Unnamed: 0,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_auth,service_bgp,service_courier,service_csnet_ns,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
train_service = df_train['service'].tolist()
test_service = df_test['service'].tolist()
difference = list(set(train_service) - set(test_service))
string = 'service_'
difference = [string + x for x in difference]
difference

['service_aol',
 'service_http_8001',
 'service_urh_i',
 'service_red_i',
 'service_harvest',
 'service_http_2784']

In [21]:
for col in difference:
    test_df_cat_data[col] = 0

test_df_cat_data.head()

Unnamed: 0,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_auth,service_bgp,service_courier,service_csnet_ns,...,flag_S2,flag_S3,flag_SF,flag_SH,service_aol,service_http_8001,service_urh_i,service_red_i,service_harvest,service_http_2784
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0,0,0,0,0,0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0,0,0,0,0,0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0


### Merge the data

In [22]:
# Merge Train data
new_df_train = df_train.join(train_df_cat_data)
new_df_train.drop('flag', axis=1, inplace=True)
new_df_train.drop('protocol_type', axis=1, inplace=True)
new_df_train.drop('service', axis=1, inplace=True)

# test data
new_df_test = df_test.join(test_df_cat_data)
new_df_test.drop('flag', axis=1, inplace=True)
new_df_test.drop('protocol_type', axis=1, inplace=True)
new_df_test.drop('service', axis=1, inplace=True)

print(new_df_train.shape)
print(new_df_test.shape)

(125973, 124)
(22544, 124)


### Rename attack : 0 - Normal, 1 - DOS, 2 - Probe, 3 - R2L, 4 - U2R

In [23]:
# Attack column
train_attack = new_df_train['attack']
test_attack = new_df_test['attack']

# rename the attack column
new_train_attack = train_attack.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1,
                'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2,
                'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,
                'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})

new_test_attack = test_attack.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,
                'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2,
                'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,
                'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})

# put the new attack column in actual dataframe
new_df_train['attack'] = new_train_attack
new_df_test['attack'] = new_test_attack

In [24]:
new_df_train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,146,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,232,8153,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,199,420,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [25]:
drop_DoS = [2,3,4]
drop_Probe = [1,3,4]
drop_R2L = [1,2,4]
drop_U2R = [1,2,3]

# Train data
DoS_df_train = new_df_train[~new_df_train['attack'].isin(drop_DoS)];
Probe_df_train = new_df_train[~new_df_train['attack'].isin(drop_Probe)];
R2L_df_train = new_df_train[~new_df_train['attack'].isin(drop_R2L)];
U2R_df_train = new_df_train[~new_df_train['attack'].isin(drop_U2R)];

# Test data
DoS_df_test = new_df_test[~new_df_test['attack'].isin(drop_DoS)];
Probe_df_test = new_df_test[~new_df_test['attack'].isin(drop_Probe)];
R2L_df_test = new_df_test[~new_df_test['attack'].isin(drop_R2L)];
U2R_df_test = new_df_test[~new_df_test['attack'].isin(drop_U2R)];

print('Train:')
print('Dimensions of DoS:' ,DoS_df_train.shape)
print('Dimensions of Probe:' ,Probe_df_train.shape)
print('Dimensions of R2L:' ,R2L_df_train.shape)
print('Dimensions of U2R:' ,U2R_df_train.shape)

print('Test:')
print('Dimensions of DoS:' ,DoS_df_test.shape)
print('Dimensions of Probe:' ,Probe_df_test.shape)
print('Dimensions of R2L:' ,R2L_df_test.shape)
print('Dimensions of U2R:' ,U2R_df_test.shape)

Train:
Dimensions of DoS: (113270, 124)
Dimensions of Probe: (78999, 124)
Dimensions of R2L: (68338, 124)
Dimensions of U2R: (67395, 124)
Test:
Dimensions of DoS: (17171, 124)
Dimensions of Probe: (12132, 124)
Dimensions of R2L: (12596, 124)
Dimensions of U2R: (9778, 124)


### Split dataset into X and Y

In [26]:
# Split dataframes into X & Y
# assign X as a dataframe of feautures and Y as a outcome variables

# train data
X_DoS = DoS_df_train.drop('attack',1)
Y_DoS = DoS_df_train.attack
X_Probe = Probe_df_train.drop('attack',1)
Y_Probe = Probe_df_train.attack
X_R2L = R2L_df_train.drop('attack',1)
Y_R2L = R2L_df_train.attack
X_U2R = U2R_df_train.drop('attack',1)
Y_U2R = U2R_df_train.attack

# test data
X_DoS_test = DoS_df_test.drop('attack',1)
Y_DoS_test = DoS_df_test.attack
X_Probe_test = Probe_df_test.drop('attack',1)
Y_Probe_test = Probe_df_test.attack
X_R2L_test = R2L_df_test.drop('attack',1)
Y_R2L_test = R2L_df_test.attack
X_U2R_test = U2R_df_test.drop('attack',1)
Y_U2R_test = U2R_df_test.attack

In [27]:
colName1 = list(X_DoS)
colName1_test = list(X_DoS_test)

### Use Standard Scaler to scale the dataframes

In [28]:
from sklearn import preprocessing

# train data
scaler1 = preprocessing.StandardScaler().fit(X_DoS)
X_DoS = scaler1.transform(X_DoS) 
scaler2 = preprocessing.StandardScaler().fit(X_Probe)
X_Probe = scaler2.transform(X_Probe) 
scaler3 = preprocessing.StandardScaler().fit(X_R2L)
X_R2L = scaler3.transform(X_R2L) 
scaler4 = preprocessing.StandardScaler().fit(X_U2R)
X_U2R = scaler4.transform(X_U2R)

# test data
scaler5 = preprocessing.StandardScaler().fit(X_DoS_test)
X_DoS_test = scaler5.transform(X_DoS_test) 
scaler6 = preprocessing.StandardScaler().fit(X_Probe_test)
X_Probe_test = scaler6.transform(X_Probe_test) 
scaler7 = preprocessing.StandardScaler().fit(X_R2L_test)
X_R2L_test = scaler7.transform(X_R2L_test) 
scaler8 = preprocessing.StandardScaler().fit(X_U2R_test)
X_U2R_test = scaler8.transform(X_U2R_test)

In [29]:
X_DoS.std(axis=0);
X_Probe.std(axis=0);
X_R2L.std(axis=0);
X_U2R.std(axis=0);

## Feature Selection

### Univariate Feature Selection

In [30]:
from sklearn.feature_selection import SelectPercentile, f_classif
selector = SelectPercentile(f_classif, percentile=10)
np.seterr(divide='ignore', invalid='ignore');

### Features for DoS

In [31]:
X_newDoS = selector.fit_transform(X_DoS,Y_DoS)
X_newDoS.shape



(113270, 13)

In [32]:
true=selector.get_support()
newcolindex_DoS = [a for a, x in enumerate(true) if x]
newcolname_DoS = list( colName1[a] for a in newcolindex_DoS )
newcolname_DoS

['logged_in',
 'count',
 'serror_rate',
 'srv_serror_rate',
 'same_srv_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'service_http',
 'flag_S0',
 'flag_SF']

### Features for Probe

In [33]:
X_newProbe = selector.fit_transform(X_Probe,Y_Probe)
X_newProbe.shape



(78999, 13)

In [34]:
true = selector.get_support()
newcolindex_Probe = [b for b, x in enumerate(true) if x]
newcolname_Probe = list( colName1[b] for b in newcolindex_Probe )
newcolname_Probe

['logged_in',
 'rerror_rate',
 'srv_rerror_rate',
 'dst_host_srv_count',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate',
 'last_flag',
 'Protocol_type_icmp',
 'service_eco_i',
 'service_private',
 'flag_SF']

### Features for R2L

In [35]:
X_newR2L = selector.fit_transform(X_R2L,Y_R2L)
X_newR2L.shape

  69  71  72  73  74  75  77  78  79  80  81  82  83  84  87  88  90  93
  94  97  99 100 101 108 109 110 111 115] are constant.


(68338, 13)

In [36]:
true = selector.get_support()
newcolindex_R2L = [i for i, x in enumerate(true) if x]
newcolname_R2L = list( colName1[i] for i in newcolindex_R2L)
newcolname_R2L

['dst_bytes',
 'hot',
 'num_failed_logins',
 'is_guest_login',
 'dst_host_srv_count',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'last_flag',
 'service_ftp',
 'service_ftp_data',
 'service_http',
 'service_imap4',
 'flag_RSTO']

### Features for U2R

In [37]:
X_newU2R = selector.fit_transform(X_U2R,Y_U2R)
X_newU2R.shape

  69  71  72  73  74  75  76  77  78  79  80  81  82  83  84  87  88  90
  93  94  97  99 100 101 108 109 110 111 115] are constant.


(67395, 13)

In [38]:
true=selector.get_support()
newcolindex_U2R=[i for i, x in enumerate(true) if x]
newcolname_U2R=list( colName1[i] for i in newcolindex_U2R)
newcolname_U2R

['urgent',
 'hot',
 'root_shell',
 'num_file_creations',
 'num_shells',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'last_flag',
 'service_ftp_data',
 'service_http',
 'service_telnet']

## Features selected by Univariate Analysis

In [39]:
print('Features selected for DoS:',newcolname_DoS)
print()
print('Features selected for Probe:',newcolname_Probe)
print()
print('Features selected for R2L:',newcolname_R2L)
print()
print('Features selected for U2R:',newcolname_U2R)

Features selected for DoS: ['logged_in', 'count', 'serror_rate', 'srv_serror_rate', 'same_srv_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'service_http', 'flag_S0', 'flag_SF']

Features selected for Probe: ['logged_in', 'rerror_rate', 'srv_rerror_rate', 'dst_host_srv_count', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'last_flag', 'Protocol_type_icmp', 'service_eco_i', 'service_private', 'flag_SF']

Features selected for R2L: ['dst_bytes', 'hot', 'num_failed_logins', 'is_guest_login', 'dst_host_srv_count', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'last_flag', 'service_ftp', 'service_ftp_data', 'service_http', 'service_imap4', 'flag_RSTO']

Features selected for U2R: ['urgent', 'hot', 'root_shell', 'num_file_creations', 'num_shells', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_

## Recursive Feature Elimination for feature Ranking

In [40]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)

In [41]:
rfe = RFE(clf, n_features_to_select=1)
rfe.fit(X_newDoS, Y_DoS)
print ("DoS Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), newcolname_DoS)))

DoS Features sorted by their rank:
[(1, 'same_srv_rate'), (2, 'count'), (3, 'flag_SF'), (4, 'dst_host_serror_rate'), (5, 'dst_host_same_srv_rate'), (6, 'dst_host_srv_count'), (7, 'dst_host_count'), (8, 'logged_in'), (9, 'serror_rate'), (10, 'dst_host_srv_serror_rate'), (11, 'srv_serror_rate'), (12, 'service_http'), (13, 'flag_S0')]


In [42]:
rfe.fit(X_newProbe, Y_Probe)
print ("Probe Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), newcolname_Probe)))

Probe Features sorted by their rank:
[(1, 'last_flag'), (2, 'dst_host_diff_srv_rate'), (3, 'service_private'), (4, 'dst_host_srv_count'), (5, 'logged_in'), (6, 'dst_host_rerror_rate'), (7, 'dst_host_same_src_port_rate'), (8, 'service_eco_i'), (9, 'flag_SF'), (10, 'rerror_rate'), (11, 'dst_host_srv_rerror_rate'), (12, 'Protocol_type_icmp'), (13, 'srv_rerror_rate')]


In [43]:
rfe.fit(X_newR2L, Y_R2L)
print ("R2L Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), newcolname_R2L)))

R2L Features sorted by their rank:
[(1, 'last_flag'), (2, 'dst_host_srv_diff_host_rate'), (3, 'dst_host_same_src_port_rate'), (4, 'hot'), (5, 'service_ftp_data'), (6, 'dst_host_srv_count'), (7, 'dst_bytes'), (8, 'service_imap4'), (9, 'num_failed_logins'), (10, 'is_guest_login'), (11, 'service_ftp'), (12, 'flag_RSTO'), (13, 'service_http')]


In [44]:
rfe.fit(X_newU2R, Y_U2R)
print ("U2R Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), newcolname_U2R)))

U2R Features sorted by their rank:
[(1, 'last_flag'), (2, 'dst_host_srv_count'), (3, 'hot'), (4, 'num_file_creations'), (5, 'dst_host_count'), (6, 'service_ftp_data'), (7, 'dst_host_srv_diff_host_rate'), (8, 'dst_host_same_src_port_rate'), (9, 'service_telnet'), (10, 'service_http'), (11, 'num_shells'), (12, 'root_shell'), (13, 'urgent')]


### Recusive Feature Elimination

In [45]:
from sklearn.feature_selection import RFE
clf = DecisionTreeClassifier(random_state=0)
rfe = RFE(estimator=clf, n_features_to_select=13, step=1)

rfe.fit(X_DoS, Y_DoS)
X_rfeDoS = rfe.transform(X_DoS)
true = rfe.support_
rfecolindex_DoS = [i for i, x in enumerate(true) if x]
rfecolname_DoS = list(colName1[i] for i in rfecolindex_DoS)

In [47]:
rfe.fit(X_Probe, Y_Probe)
X_rfeProbe = rfe.transform(X_Probe)
true = rfe.support_
rfecolindex_Probe = [i for i, x in enumerate(true) if x]
rfecolname_Probe = list(colName1[i] for i in rfecolindex_Probe)

In [48]:
rfe.fit(X_R2L, Y_R2L)
X_rfeR2L = rfe.transform(X_R2L)
true = rfe.support_
rfecolindex_R2L = [i for i, x in enumerate(true) if x]
rfecolname_R2L = list(colName1[i] for i in rfecolindex_R2L)

In [49]:
rfe.fit(X_U2R, Y_U2R)
X_rfeU2R = rfe.transform(X_U2R)
true = rfe.support_
rfecolindex_U2R = [i for i, x in enumerate(true) if x]
rfecolname_U2R = list(colName1[i] for i in rfecolindex_U2R)

In [50]:
print('Features selected for DoS:',rfecolname_DoS)
print()
print('Features selected for Probe:',rfecolname_Probe)
print()
print('Features selected for R2L:',rfecolname_R2L)
print()
print('Features selected for U2R:',rfecolname_U2R)

Features selected for DoS: ['src_bytes', 'dst_bytes', 'wrong_fragment', 'count', 'same_srv_rate', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'last_flag', 'service_X11', 'service_ecr_i', 'service_ftp_data']

Features selected for Probe: ['src_bytes', 'dst_bytes', 'rerror_rate', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_rerror_rate', 'last_flag', 'service_finger', 'service_ftp_data', 'service_http', 'service_private', 'service_smtp']

Features selected for R2L: ['duration', 'src_bytes', 'dst_bytes', 'hot', 'logged_in', 'num_root', 'serror_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_srv_diff_host_rate', 'dst_host_srv_serror_rate', 'last_flag', 'service_ftp_data']

Features selected for U2R: ['dst_bytes', 'num_compromised', 'count', 'dst_host_count', 'dst_host_srv_diff_host_rate', 'last_flag', 'service_ftp_data', 'service_nntp', 'service_ntp_u', 'service_other', 'service

### Model building

In [51]:
# all features
clf_DoS = DecisionTreeClassifier(random_state=0)
clf_Probe = DecisionTreeClassifier(random_state=0)
clf_R2L = DecisionTreeClassifier(random_state=0)
clf_U2R = DecisionTreeClassifier(random_state=0)
clf_DoS.fit(X_DoS, Y_DoS)
clf_Probe.fit(X_Probe, Y_Probe)
clf_R2L.fit(X_R2L, Y_R2L)
clf_U2R.fit(X_U2R, Y_U2R)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [52]:
# selected features
clf_rfeDoS = DecisionTreeClassifier(random_state=0)
clf_rfeProbe = DecisionTreeClassifier(random_state=0)
clf_rfeR2L = DecisionTreeClassifier(random_state=0)
clf_rfeU2R = DecisionTreeClassifier(random_state=0)
clf_rfeDoS.fit(X_rfeDoS, Y_DoS)
clf_rfeProbe.fit(X_rfeProbe, Y_Probe)
clf_rfeR2L.fit(X_rfeR2L, Y_R2L)
clf_rfeU2R.fit(X_rfeU2R, Y_U2R)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

## Prediction and Validation

### Confusion Matrix for all features

In [53]:
# DoS
clf_DoS.predict(X_DoS_test)

array([1, 1, 0, ..., 0, 1, 0], dtype=int64)

In [54]:
# DoS
Y_DoS_pred=clf_DoS.predict(X_DoS_test)
pd.crosstab(Y_DoS_test, Y_DoS_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,1
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9454,257
1,1667,5793


In [55]:
# Probe
Y_Probe_pred=clf_Probe.predict(X_Probe_test)
pd.crosstab(Y_Probe_test, Y_Probe_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,2
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2861,6850
2,164,2257


In [56]:
# R2L
Y_R2L_pred=clf_R2L.predict(X_R2L_test)
pd.crosstab(Y_R2L_test, Y_R2L_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,3
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9692,19
3,2875,10


In [57]:
# U2R
Y_U2R_pred=clf_U2R.predict(X_U2R_test)
pd.crosstab(Y_U2R_test, Y_U2R_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,4
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9711,0
4,59,8


### Validation : Accuracy, Precision, Recall, F-measure

In [58]:
# DoS
from sklearn.model_selection import cross_val_score
from sklearn import metrics
accuracy = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring='f1')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99738 (+/- 0.00251)
Precision: 0.99626 (+/- 0.00392)
Recall: 0.99772 (+/- 0.00433)
F-measure: 0.99699 (+/- 0.00289)


In [59]:
# Probe
accuracy = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99769 (+/- 0.00178)
Precision: 0.99655 (+/- 0.00410)
Recall: 0.99623 (+/- 0.00222)
F-measure: 0.99639 (+/- 0.00277)


In [60]:
# R2L
accuracy = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99769 (+/- 0.00178)
Precision: 0.99655 (+/- 0.00410)
Recall: 0.99623 (+/- 0.00222)
F-measure: 0.99639 (+/- 0.00277)


In [61]:
# U2R
accuracy = cross_val_score(clf_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99847 (+/- 0.00333)
Precision: 0.93886 (+/- 0.11895)
Recall: 0.95549 (+/- 0.14716)
F-measure: 0.94502 (+/- 0.12321)


### Confusion Matrix for 13 features

In [None]:
# reduce test dataset to 13 features, use only features described in rfecolname_DoS etc.
X_DoS_test = X_DoS_test[:,rfecolindex_DoS]
X_Probe_test = X_Probe_test[:,rfecolindex_Probe]
X_R2L_test = X_R2L_test[:,rfecolindex_R2L]
X_U2R_test = X_U2R_test[:,rfecolindex_U2R]

In [64]:
# DoS
Y_DoS_pred2 = clf_rfeDoS.predict(X_DoS_test)
pd.crosstab(Y_DoS_test, Y_DoS_pred2, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,1
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9148,563
1,1701,5759


In [65]:
# Probe
Y_Probe_pred2 = clf_rfeProbe.predict(X_Probe_test)
pd.crosstab(Y_Probe_test, Y_Probe_pred2, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,2
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,8644,1067
2,818,1603


In [66]:
# R2L
Y_R2L_pred2 = clf_rfeR2L.predict(X_R2L_test)
pd.crosstab(Y_R2L_test, Y_R2L_pred2, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,3
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9707,4
3,2875,10


In [67]:
# U2R
Y_U2R_pred2 = clf_rfeU2R.predict(X_U2R_test)
pd.crosstab(Y_U2R_test, Y_U2R_pred2, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,4
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9711,0
4,59,8


### Validation : Accuracy, Precision, Recall, F-measure

### DoS

In [69]:
accuracy = cross_val_score(clf_rfeDoS, X_DoS_test, Y_DoS_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_rfeDoS, X_DoS_test, Y_DoS_test, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_rfeDoS, X_DoS_test, Y_DoS_test, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_rfeDoS, X_DoS_test, Y_DoS_test, cv=10, scoring='f1')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99785 (+/- 0.00209)
Precision: 0.99719 (+/- 0.00279)
Recall: 0.99786 (+/- 0.00299)
F-measure: 0.99752 (+/- 0.00240)


### Probe

In [70]:
accuracy = cross_val_score(clf_rfeProbe, X_Probe_test, Y_Probe_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_rfeProbe, X_Probe_test, Y_Probe_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_rfeProbe, X_Probe_test, Y_Probe_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_rfeProbe, X_Probe_test, Y_Probe_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99580 (+/- 0.00290)
Precision: 0.99308 (+/- 0.00652)
Recall: 0.99381 (+/- 0.00500)
F-measure: 0.99343 (+/- 0.00453)


### R2L

In [71]:
accuracy = cross_val_score(clf_rfeR2L, X_R2L_test, Y_R2L_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_rfeR2L, X_R2L_test, Y_R2L_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_rfeR2L, X_R2L_test, Y_R2L_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_rfeR2L, X_R2L_test, Y_R2L_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99301 (+/- 0.00375)
Precision: 0.99001 (+/- 0.00583)
Recall: 0.99023 (+/- 0.00657)
F-measure: 0.99011 (+/- 0.00530)


### U2R

In [72]:
accuracy = cross_val_score(clf_rfeU2R, X_U2R_test, Y_U2R_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_rfeU2R, X_U2R_test, Y_U2R_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_rfeU2R, X_U2R_test, Y_U2R_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_rfeU2R, X_U2R_test, Y_U2R_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99877 (+/- 0.00200)
Precision: 0.96660 (+/- 0.08669)
Recall: 0.94736 (+/- 0.12001)
F-measure: 0.95319 (+/- 0.07663)


### Stratified CV

In [75]:
# DoS
from sklearn.model_selection import StratifiedKFold
accuracy = cross_val_score(clf_rfeDoS, X_DoS_test, Y_DoS_test, cv=StratifiedKFold(10), scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99785 (+/- 0.00209)


In [76]:
# Probe
accuracy = cross_val_score(clf_rfeProbe, X_Probe_test, Y_Probe_test, cv=StratifiedKFold(10), scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99580 (+/- 0.00290)


In [78]:
# R2L
accuracy = cross_val_score(clf_rfeR2L, X_R2L_test, Y_R2L_test, cv=StratifiedKFold(10), scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99301 (+/- 0.00375)


In [79]:
# U2R
accuracy = cross_val_score(clf_rfeU2R, X_U2R_test, Y_U2R_test, cv=StratifiedKFold(10), scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99877 (+/- 0.00200)


### CV - 5, 10, 30, 50 fold

### DoS

In [80]:
accuracy = cross_val_score(clf_rfeDoS, X_DoS_test, Y_DoS_test, cv=5, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99750 (+/- 0.00114)


In [81]:
accuracy = cross_val_score(clf_rfeDoS, X_DoS_test, Y_DoS_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99785 (+/- 0.00209)


In [82]:
accuracy = cross_val_score(clf_rfeDoS, X_DoS_test, Y_DoS_test, cv=30, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99767 (+/- 0.00304)


In [83]:
accuracy = cross_val_score(clf_rfeDoS, X_DoS_test, Y_DoS_test, cv=50, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99744 (+/- 0.00555)


### Probe

In [84]:
accuracy = cross_val_score(clf_rfeProbe, X_Probe_test, Y_Probe_test, cv=5, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99604 (+/- 0.00298)


In [85]:
accuracy = cross_val_score(clf_rfeProbe, X_Probe_test, Y_Probe_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99580 (+/- 0.00290)


In [86]:
accuracy = cross_val_score(clf_rfeProbe, X_Probe_test, Y_Probe_test, cv=30, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99637 (+/- 0.00473)


In [87]:
accuracy = cross_val_score(clf_rfeProbe, X_Probe_test, Y_Probe_test, cv=50, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99637 (+/- 0.00731)


### R2L

In [88]:
accuracy = cross_val_score(clf_rfeR2L, X_R2L_test, Y_R2L_test, cv=5, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99270 (+/- 0.00217)


In [89]:
accuracy = cross_val_score(clf_rfeR2L, X_R2L_test, Y_R2L_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99301 (+/- 0.00375)


In [90]:
accuracy = cross_val_score(clf_rfeR2L, X_R2L_test, Y_R2L_test, cv=30, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99325 (+/- 0.00977)


In [91]:
accuracy = cross_val_score(clf_rfeR2L, X_R2L_test, Y_R2L_test, cv=50, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99365 (+/- 0.00940)


### U2R

In [93]:
accuracy = cross_val_score(clf_rfeU2R, X_U2R_test, Y_U2R_test, cv=5, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99847 (+/- 0.00145)


In [94]:
accuracy = cross_val_score(clf_rfeU2R, X_U2R_test, Y_U2R_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99877 (+/- 0.00200)


In [95]:
accuracy = cross_val_score(clf_rfeU2R, X_U2R_test, Y_U2R_test, cv=30, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99898 (+/- 0.00330)


In [96]:
accuracy = cross_val_score(clf_rfeU2R, X_U2R_test, Y_U2R_test, cv=50, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

Accuracy: 0.99908 (+/- 0.00489)
