In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV

In [2]:
# Load the data
Attack_Back = pd.read_csv('Data_of_Attack_Back.csv')
BufferOverflow = pd.read_csv('Data_of_Attack_Back_BufferOverflow.csv')
FTPWrite = pd.read_csv('Data_of_Attack_Back_FTPWrite.csv', names=['duration', ' protocol_type', ' service', ' flag', ' src_bytes',
       ' dst_bytes', ' land', ' wrong_fragment', ' urgent', ' hot',
       ' num_failed_logins', ' logged_in', ' num_compromised', ' root_shell',
       ' su_attempted', ' num_root', ' num_file_creations', ' num_shells',
       ' num_access_files', ' num_outbound_cmds', ' is_host_login',
       ' is_guest_login', ' count', ' srv_count', ' serror_rate',
       ' srv_error_rate', ' rerror_rate', ' srv_rerror_rate', ' same_srv_rate',
       ' diff_srv_rate', ' srv_diff_host_rate', ' dst_host_count',
       ' dst_host_srv_count', ' dst_host_same_srv_rate',
       ' dst_host_diff_srv_rate', ' dst_host_same_src_port_rate',
       ' dst_host_srv_diff_host_rate', ' dst_host_serror_rate',
       ' dst_host_srv_serror_rate', ' dst_host_rerror_rate',
       ' dst_host_srv_rerror_rate'])
GuessPassword = pd.read_csv('Data_of_Attack_Back_GuessPassword.csv')
Neptune = pd.read_csv('Data_of_Attack_Back_Neptune.csv')
NMap = pd.read_csv('Data_of_Attack_Back_NMap.csv')
Normal = pd.read_csv('Data_of_Attack_Back_Normal.csv')
PortSweep = pd.read_csv('Data_of_Attack_Back_PortSweep.csv')
RootKit = pd.read_csv('Data_of_Attack_Back_RootKit.csv')
Satan = pd.read_csv('Data_of_Attack_Back_Satan.csv')
Smurf = pd.read_csv('Data_of_Attack_Back_Smurf.csv')


In [3]:
dataFrame_ls = [Attack_Back,BufferOverflow,FTPWrite,GuessPassword,Neptune,NMap,Normal,PortSweep,RootKit,Satan,Smurf]
final_df = pd.concat(dataFrame_ls, axis=0,  keys=['Attack_Back','BufferOverflow','FTPWrite','GuessPassword','Neptune','NMap','Normal','PortSweep','RootKit','Satan','Smurf'])
final_df.reset_index(inplace=True)
final_df.drop(columns=['level_1'], inplace=True)
final_df.rename(columns={'level_0':'attack'},inplace=True)

In [4]:
final_df.rename(columns=lambda x: x.replace(' ', ''), inplace=True)

In [5]:
final_df.columns

Index(['attack', 'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_error_rate',
       'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
       'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
       'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
       'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
       'dst_host_serror_rate', 'dst_host_srv_serror_rate',
       'dst_host_rerror_rate', 'dst_host_srv_rerror_rate'],
      dtype='object')

In [6]:
#pd.set_option('display.max_columns', None)
#final_df.head(50)

In [7]:
#Checking Missing Values
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 817551 entries, 0 to 817550
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   attack                       817551 non-null  object 
 1   duration                     817551 non-null  float64
 2   protocol_type                817551 non-null  float64
 3   service                      817551 non-null  float64
 4   flag                         817551 non-null  float64
 5   src_bytes                    817551 non-null  float64
 6   dst_bytes                    817551 non-null  float64
 7   land                         817551 non-null  int64  
 8   wrong_fragment               817551 non-null  float64
 9   urgent                       817551 non-null  float64
 10  hot                          817551 non-null  float64
 11  num_failed_logins            817551 non-null  float64
 12  logged_in                    817551 non-null  float64
 13 

In [8]:
#Dropping variables with all 0 value
final_df.drop(columns=['land','root_shell','su_attempted','is_host_login','is_guest_login'], inplace = True)

In [9]:
#Dropping numeric variable with all 0
final_df.drop(columns=['num_outbound_cmds'], inplace=True)


In [10]:
var=final_df.select_dtypes(exclude=['object']).columns.difference(['flag','service','protocol_type','logged_in'])
final_df[var].quantile([0.0,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99,1]).T

Unnamed: 0,0.00,0.01,0.05,0.10,0.25,0.50,0.75,0.90,0.95,0.99,1.00
count,0.0,0.001,0.001,0.001,0.003,0.011,0.106,0.226,0.263,0.296,0.511
diff_srv_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.005,0.007,0.007,0.057,0.1
dst_bytes,0.0,0.0,0.0,0.0,0.0,0.00367,0.01969,0.06691,0.11762,0.309345,1.0
dst_host_count,0.0,0.001,0.006,0.013,0.054,0.255,0.255,0.255,0.255,0.255,0.255
dst_host_diff_srv_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.006,0.008,0.009,0.04,0.1
dst_host_rerror_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076,0.1,0.1
dst_host_same_src_port_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.003,0.014,0.035,0.1,0.1
dst_host_same_srv_rate,0.0,0.0,0.001,0.003,0.007,0.1,0.1,0.1,0.1,0.1,0.1
dst_host_serror_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.014,0.1,0.1,0.1,0.1
dst_host_srv_count,0.0,0.001,0.003,0.007,0.017,0.255,0.255,0.255,0.255,0.255,0.255


In [11]:
#Handling Outliers - Method2
def outlier_capping(x):
    x = x.clip(upper=x.quantile(0.99), lower =x.quantile(0.05) )
    return x

final_df[var] = final_df[var].apply(outlier_capping)

In [12]:
final_df[var].quantile([0.0,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99,1]).T

Unnamed: 0,0.00,0.01,0.05,0.10,0.25,0.50,0.75,0.90,0.95,0.99,1.00
count,0.001,0.001,0.001,0.001,0.003,0.011,0.106,0.226,0.263,0.296,0.296
diff_srv_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.005,0.007,0.007,0.057,0.057
dst_bytes,0.0,0.0,0.0,0.0,0.0,0.00367,0.01969,0.06691,0.11762,0.309337,0.309345
dst_host_count,0.006,0.006,0.006,0.013,0.054,0.255,0.255,0.255,0.255,0.255,0.255
dst_host_diff_srv_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.006,0.008,0.009,0.04,0.04
dst_host_rerror_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076,0.1,0.1
dst_host_same_src_port_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.003,0.014,0.035,0.1,0.1
dst_host_same_srv_rate,0.001,0.001,0.001,0.003,0.007,0.1,0.1,0.1,0.1,0.1,0.1
dst_host_serror_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.014,0.1,0.1,0.1,0.1
dst_host_srv_count,0.003,0.003,0.003,0.007,0.017,0.255,0.255,0.255,0.255,0.255,0.255


In [13]:
#Converting datatype of Nominal and categorical variable before passing to SMOTE
#final_df[['flag','service','protocol_type','logged_in']]=final_df[['flag','service','protocol_type','logged_in']].astype('object')

In [14]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 817551 entries, 0 to 817550
Data columns (total 36 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   attack                       817551 non-null  object 
 1   duration                     817551 non-null  float64
 2   protocol_type                817551 non-null  float64
 3   service                      817551 non-null  float64
 4   flag                         817551 non-null  float64
 5   src_bytes                    817551 non-null  float64
 6   dst_bytes                    817551 non-null  float64
 7   wrong_fragment               817551 non-null  float64
 8   urgent                       817551 non-null  float64
 9   hot                          817551 non-null  float64
 10  num_failed_logins            817551 non-null  float64
 11  logged_in                    817551 non-null  float64
 12  num_compromised              817551 non-null  float64
 13 

In [15]:
from imblearn.over_sampling import SMOTE

In [16]:
#Creating binary target variable
final_df['attack_Y_N'] = final_df['attack'].apply(lambda x: 0 if x == 'Normal' else 1)

In [17]:
final_df['attack_Y_N'].value_counts()

attack_Y_N
0    576710
1    240841
Name: count, dtype: int64

In [18]:
X=final_df.drop(columns=['attack_Y_N','attack'], axis=1)
y=final_df['attack_Y_N']
sm = SMOTE(random_state =123)
X_res, y_res = sm.fit_resample(X, y)

In [28]:
y_res.value_counts()

attack_Y_N
1    576710
0    576710
Name: count, dtype: int64

In [29]:
final_df1=pd.concat([X_res,y_res], axis=1)

In [30]:
#Dropping variables with 0 values
final_df1.drop(columns=['wrong_fragment','urgent','hot','num_failed_logins','num_file_creations','urgent','num_root','num_shells','num_compromised','num_access_files'],inplace=True)

In [31]:
var1 = var.difference(['wrong_fragment','urgent','hot','num_failed_logins','num_file_creations','urgent','num_root','num_shells','num_compromised','num_access_files'])
final_df1[var1].quantile([0.0,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99,1]).T

Unnamed: 0,0.00,0.01,0.05,0.10,0.25,0.50,0.75,0.90,0.95,0.99,1.00
count,0.001,0.001,0.001,0.001,0.005,0.029,0.178,0.256,0.279,0.296,0.296
diff_srv_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.006,0.007,0.008,0.057,0.057
dst_bytes,0.0,0.0,0.0,0.0,0.0,0.0,0.011,0.04362,0.09183,0.26039,0.309345
dst_host_count,0.006,0.006,0.008,0.021,0.105,0.255,0.255,0.255,0.255,0.255,0.255
dst_host_diff_srv_rate,0.0,0.0,0.0,0.0,0.0,0.005,0.007,0.008,0.009,0.04,0.04
dst_host_rerror_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.1
dst_host_same_src_port_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.01,0.033,0.1,0.1
dst_host_same_srv_rate,0.001,0.001,0.001,0.002,0.004,0.023,0.1,0.1,0.1,0.1,0.1
dst_host_serror_rate,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1
dst_host_srv_count,0.003,0.003,0.003,0.004,0.01,0.029,0.255,0.255,0.255,0.255,0.255


In [32]:
#((final_df1['protocol_type'].value_counts()/final_df1['protocol_type'].count())*100).head(20)

In [33]:
#Combining Nominal levels based on frequency 
final_df1['flag_new'] = final_df1['flag'].apply(lambda x: x if x in ([0.00,0.07,0.06,0.05,0.08]) else 'others')
final_df1['service_new'] = final_df1['service'].apply(lambda x: x if x in ([0.12,0.05,0.00,0.09,0.06,0.14,0.1]) else 'others')
final_df1['protocol_type_new'] = final_df1['protocol_type'].apply(lambda x: x if x in ([0.0,0.01,0.02]) else 'others')

In [34]:
final_df1.drop(columns=['flag','service','protocol_type'], inplace = True)

In [35]:
#Creating dummy Variables for Nominal 
final_df2 = pd.get_dummies(final_df1, columns = ['protocol_type_new','flag_new', 'service_new',], drop_first=True,dtype=int)

In [36]:
final_df2.rename(columns=lambda x: x.replace('.', '_'), inplace=True)

In [37]:
final_list = final_df2.columns.difference(['attack_Y_N','attack','protocol_type_new_0_02','srv_error_rate','dst_host_srv_serror_rate',
                                        'srv_rerror_rate','rerror_rate','dst_host_srv_rerror_rate','same_srv_rate','dst_host_same_srv_rate',
                                         'serror_rate','dst_host_serror_rate','dst_host_count'])

In [38]:
#multicolinieity
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_values = pd.concat([pd.Series(final_list), pd.Series([variance_inflation_factor(final_df2[final_list].values, i) for i in range(len(final_list))])], axis=1)

In [38]:
vif_values.columns = ['features', 'vif']
vif_values.sort_values(by='vif', ascending=False)

Unnamed: 0,features,vif
14,logged_in,21.497512
6,dst_host_srv_count,20.632601
11,flag_new_0_07,19.619911
9,flag_new_0_05,18.307611
4,dst_host_rerror_rate,18.127771
21,service_new_0_12,15.129505
0,count,6.773738
23,service_new_others,5.678787
3,dst_host_diff_srv_rate,5.054317
15,protocol_type_new_0_01,3.685929


In [39]:
list(vif_values[vif_values['vif'] <= 10]['features'])

['count',
 'diff_srv_rate',
 'dst_bytes',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'duration',
 'flag_new_0_06',
 'flag_new_0_08',
 'flag_new_others',
 'protocol_type_new_0_01',
 'protocol_type_new_others',
 'service_new_0_05',
 'service_new_0_06',
 'service_new_0_09',
 'service_new_0_1',
 'service_new_0_14',
 'service_new_others',
 'src_bytes',
 'srv_count',
 'srv_diff_host_rate']

In [125]:
#split the data into train & test
train, test = train_test_split(final_df2, test_size=0.3, random_state=123)

## Logistic Regression

In [254]:
formula = 'attack_Y_N~'+'+'.join(['count',
 #'diff_srv_rate',
 'dst_bytes',
# 'dst_host_diff_srv_rate',
 #'dst_host_same_src_port_rate',
# 'dst_host_srv_diff_host_rate',
 #'duration',
'logged_in',
 #'flag_new_0_05',
 'flag_new_0_08',
 'flag_new_others',
'service_new_0_05',
'protocol_type_new_0_01',
 'protocol_type_new_0_02',
 #'protocol_type_new_others',
 'service_new_0_1',
 'service_new_0_14',
 'service_new_others',
 'src_bytes',
 'srv_diff_host_rate'])

In [258]:
formula = 'attack_Y_N~'+'+'.join([
'count',
# 'diff_srv_rate',
 #'dst_bytes',
 #'dst_host_diff_srv_rate',
 #'dst_host_same_src_port_rate',
 #'dst_host_srv_diff_host_rate',
# 'duration',
 #'flag_new_0_06',
 'flag_new_0_08',
 'flag_new_others',
 'protocol_type_new_0_01',
 #'protocol_type_new_others',
 'service_new_0_05',
 'service_new_0_06',
 #'service_new_0_09',
 'service_new_0_1',
 #'service_new_0_14',
 #'service_new_others',
 #'src_bytes',
 #'srv_count',
 #'srv_diff_host_rate'
])

In [259]:
formula

'attack_Y_N~count+flag_new_0_08+flag_new_others+protocol_type_new_0_01+service_new_0_05+service_new_0_06+service_new_0_1'

In [260]:
import statsmodels.formula.api as smf

In [261]:
model = smf.logit(formula, data=train).fit()

Optimization terminated successfully.
         Current function value: 0.136676
         Iterations 10


In [262]:
print(model.summary())

                           Logit Regression Results                           
Dep. Variable:             attack_Y_N   No. Observations:               807394
Model:                          Logit   Df Residuals:                   807386
Method:                           MLE   Df Model:                            7
Date:                Sun, 03 Mar 2024   Pseudo R-squ.:                  0.8028
Time:                        20:24:07   Log-Likelihood:            -1.1035e+05
converged:                       True   LL-Null:                   -5.5964e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -3.4968      0.008   -430.205      0.000      -3.513      -3.481
count                     66.8359      0.196    341.233      0.000      66.452      67.220
flag

In [263]:
#Metrics based on the probability - AUC, Gini
train['prob'] = model.predict(train)
test['prob'] = model.predict(test)

In [264]:
train_auc = metrics.roc_auc_score(train.attack_Y_N, train.prob)
train_auc

0.9820693356871842

In [265]:
test_auc = metrics.roc_auc_score(test.attack_Y_N, test.prob)
test_auc

0.9821170626021145

In [266]:
#Calculating Gini (somerceD) for train & Test
train_gini = 2*metrics.roc_auc_score(train.attack_Y_N, train.prob)-1
train_gini

0.9641386713743685

In [267]:
test_gini = 2*metrics.roc_auc_score(test.attack_Y_N, test.prob)-1
test_gini

0.964234125204229

In [268]:
temp = train
roc_df = pd.DataFrame()
for cut_off in np.linspace(0,1):
    temp['y_pred'] = np.where(train.prob>cut_off, 1, 0)
    temp['TP'] = np.where(((train.attack_Y_N ==1) & (train.y_pred==1)), 1,0)
    temp['TN'] = np.where(((train.attack_Y_N ==0) & (train.y_pred==0)), 1,0)
    temp['FP'] = np.where(((train.attack_Y_N ==0) & (train.y_pred==1)), 1,0)
    temp['FN'] = np.where(((train.attack_Y_N ==1) & (train.y_pred==0)), 1,0)
    sensitivity = temp.TP.sum()/(temp.TP.sum()+temp.FN.sum())
    specificity = temp.TN.sum()/(temp.TN.sum()+temp.FP.sum())
    accuracy = (temp.TN.sum()+temp.TP.sum())/(temp.y_pred.count())
    temp_df = pd.DataFrame([cut_off, sensitivity, specificity, accuracy]).T
    temp_df.columns = ['cutoff', 'sensitivity', 'specificity', 'accuracy']
    roc_df = pd.concat([roc_df, temp_df], axis=0)

In [269]:
roc_df['total'] = roc_df.sensitivity + roc_df.specificity

In [270]:
roc_df.sort_values(by = 'total', ascending=False).head(1)

Unnamed: 0,cutoff,sensitivity,specificity,accuracy,total
0,0.346939,0.949119,0.985409,0.96726,1.934528


In [271]:
#From above step, the cut-off is: 0.265306
train['attack_pred'] = np.where(train.prob>0.346939, 1, 0)
test['attack_pred'] = np.where(test.prob>0.346939, 1, 0)

In [272]:
#Good ness of fit metrics based on categorical predicted output for train & test

print(metrics.classification_report(train.attack_Y_N, train.attack_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97    403610
           1       0.98      0.95      0.97    403784

    accuracy                           0.97    807394
   macro avg       0.97      0.97      0.97    807394
weighted avg       0.97      0.97      0.97    807394



In [273]:
print(metrics.classification_report(test.attack_Y_N, test.attack_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97    173100
           1       0.98      0.95      0.97    172926

    accuracy                           0.97    346026
   macro avg       0.97      0.97      0.97    346026
weighted avg       0.97      0.97      0.97    346026



## Random Forest

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
y=final_df2['attack_Y_N']
X=final_df2[['count',
 'diff_srv_rate',
 'dst_bytes',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'duration',
 'flag_new_0_08',
 'flag_new_others',
 'protocol_type_new_0_01',
 'protocol_type_new_others',
 'service_new_0_1',
 'service_new_0_14',
 'service_new_others',
 'src_bytes',
 'srv_diff_host_rate']]
train_X, test_X, train_y, test_y=train_test_split(X,y, test_size=0.3, random_state=123)

In [279]:
pargrid_ada = {'n_estimators': [10,20],
                'max_features': [5,6,7],
              'max_depth': [5,6,7]}
gscv_Rf = GridSearchCV(estimator=RandomForestClassifier(), 
                        param_grid=pargrid_ada, 
                        cv=5,
                        verbose=True, n_jobs=-1, scoring='accuracy')

In [280]:
#Apply gridsearchCV
gscv_results = gscv_Rf.fit(train_X, train_y)
gscv_results.best_score_

Fitting 5 folds for each of 18 candidates, totalling 90 fits


0.9993039334446756

In [281]:
gscv_results.best_params_

{'max_depth': 7, 'max_features': 7, 'n_estimators': 20}

In [290]:
#Checking accuracy of pridicted value for train and test data overall
train_y_rf = pd.DataFrame(train_y)
train_y_rf['pred'] = gscv_results.predict(train_X)
print(metrics.accuracy_score(train_y_rf['attack_Y_N'], train_y_rf['pred']))


test_y_rf = pd.DataFrame(test_y)
test_y_rf['pred'] = gscv_results.predict(test_X)
print(metrics.accuracy_score(test_y_rf['attack_Y_N'], test_y_rf['pred']))

0.9994067332677726
0.9993150803696832


In [291]:
#Train data - AUC Score
print(metrics.roc_auc_score(train_y_rf['attack_Y_N'], train_y_rf['pred']))

#Test data - AUC Score
print(metrics.roc_auc_score(test_y_rf['attack_Y_N'], test_y_rf['pred']))

0.9994068285300264
0.9993148113504677


## XG Boost

In [284]:
from xgboost import XGBClassifier

In [293]:
param_grid = {'learning_rate' : [0.01,0.05,0.1],
                               'n_estimators' : [40,50,60]}

xgcv = GridSearchCV(XGBClassifier(), param_grid, cv=5)

xgcv.fit(train_X, train_y)

print(xgcv.best_params_)

print(xgcv.best_score_)

{'learning_rate': 0.1, 'n_estimators': 60}
0.9996953160813691


In [294]:
#Checking accuracy of pridicted value for train and test data overall
train_y_xgbst = pd.DataFrame(train_y)
train_y_xgbst['pred'] = xgcv.predict(train_X)
print(metrics.accuracy_score(train_y_xgbst['attack_Y_N'], train_y_xgbst['pred']))


test_y_xgbst = pd.DataFrame(test_y)
test_y_xgbst['pred'] = xgcv.predict(test_X)
print(metrics.accuracy_score(test_y_xgbst['attack_Y_N'], test_y_xgbst['pred']))

0.9997683906494227
0.9997052244629017


In [295]:
#Train data - AUC Score
print(metrics.roc_auc_score(train_y_xgbst['attack_Y_N'], train_y_xgbst['pred']))

#Test data - AUC Score
print(metrics.roc_auc_score(test_y_xgbst['attack_Y_N'], test_y_xgbst['pred']))

0.9997683919732571
0.9997052011368162


### Multinomial Classification

In [18]:
from imblearn.over_sampling import SMOTE

In [19]:
final_df['attack'].value_counts()

attack
Normal            576710
Neptune           227228
Satan               5019
Smurf               3007
PortSweep           2964
NMap                1554
Attack_Back          968
GuessPassword         53
BufferOverflow        30
RootKit               10
FTPWrite               8
Name: count, dtype: int64

In [20]:
X_attack=final_df.drop(columns=['attack'], axis=1)
y_attack=final_df['attack']
sm = SMOTE(random_state =123)
X_res_attack, y_res_attack = sm.fit_resample(X_attack, y_attack)

In [21]:
y_res_attack.value_counts()

attack
Attack_Back       576710
BufferOverflow    576710
FTPWrite          576710
GuessPassword     576710
Neptune           576710
NMap              576710
Normal            576710
PortSweep         576710
RootKit           576710
Satan             576710
Smurf             576710
Name: count, dtype: int64

In [22]:
final_attack_df1=pd.concat([X_res_attack,y_res_attack], axis=1)

In [23]:
#Dropping variables with 0 values
final_attack_df1.drop(columns=['wrong_fragment','urgent','hot','num_failed_logins','num_file_creations','urgent','num_root','num_shells','num_compromised','num_access_files'],inplace=True)

In [24]:
#Combining Nominal levels based on frequency 
final_attack_df1['flag_new'] = final_attack_df1['flag'].apply(lambda x: x if x in ([0.00,0.07,0.06,0.05,0.08]) else 'others')
final_attack_df1['service_new'] = final_attack_df1['service'].apply(lambda x: x if x in ([0.12,0.05,0.00,0.09,0.06,0.14,0.1]) else 'others')
final_attack_df1['protocol_type_new'] = final_attack_df1['protocol_type'].apply(lambda x: x if x in ([0.0,0.01,0.02]) else 'others')

In [25]:
final_attack_df1.drop(columns=['flag','service','protocol_type'], inplace = True)

In [26]:
#Creating dummy Variables for Nominal 
final_attack_df2 = pd.get_dummies(final_attack_df1, columns = ['protocol_type_new','flag_new', 'service_new',], drop_first=True,dtype=int)

In [27]:
final_attack_df2.rename(columns=lambda x: x.replace('.', '_'), inplace=True)

In [28]:
final_list = final_attack_df2.columns.difference(['attack','protocol_type_new_0_02','service_new_0_09','service_new_0_06','srv_error_rate','dst_host_srv_serror_rate',
                                                 'srv_rerror_rate','rerror_rate','dst_host_srv_rerror_rate','same_srv_rate','serror_rate','dst_host_same_srv_rate',
                                                 'flag_new_0_07','dst_host_serror_rate'])

In [31]:
#multicolinieity
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_values = pd.concat([pd.Series(final_list), pd.Series([variance_inflation_factor(final_attack_df2[final_list].values, i) for i in range(len(final_list))])], axis=1)

In [56]:
vif_values.columns = ['features', 'vif']
vif_values.sort_values(by='vif', ascending=False)

Unnamed: 0,features,vif
0,attack_Y_N,27.294413
4,dst_host_count,9.637545
20,service_new_0_12,9.010895
18,service_new_0_05,8.336121
15,logged_in,7.705015
24,srv_count,6.602197
6,dst_host_rerror_rate,6.513239
7,dst_host_same_src_port_rate,6.389231
8,dst_host_srv_count,6.358425
1,count,6.312707


In [57]:
list(vif_values[vif_values['vif'] <= 10]['features'])

['count',
 'diff_srv_rate',
 'dst_bytes',
 'dst_host_count',
 'dst_host_diff_srv_rate',
 'dst_host_rerror_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_count',
 'dst_host_srv_diff_host_rate',
 'duration',
 'flag_new_0_05',
 'flag_new_0_06',
 'flag_new_0_08',
 'flag_new_others',
 'logged_in',
 'protocol_type_new_0_01',
 'protocol_type_new_others',
 'service_new_0_05',
 'service_new_0_1',
 'service_new_0_12',
 'service_new_0_14',
 'service_new_others',
 'src_bytes',
 'srv_count',
 'srv_diff_host_rate']

In [29]:
#split the data into train & test
X = final_attack_df2 [['count',
 'diff_srv_rate',
 'dst_bytes',
 'dst_host_count',
 'dst_host_diff_srv_rate',
 'dst_host_rerror_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_count',
 'dst_host_srv_diff_host_rate',
 'duration',
 'flag_new_0_05',
 'flag_new_0_06',
 'flag_new_0_08',
 'flag_new_others',
 'logged_in',
 'protocol_type_new_0_01',
 'protocol_type_new_others',
 'service_new_0_05',
 'service_new_0_1',
 'service_new_0_12',
 'service_new_0_14',
 'service_new_others',
 'src_bytes',
 'srv_count',
 'srv_diff_host_rate']]
y = final_attack_df2['attack']
train_multi_X, test_multi_X, train_multi_y, test_multi_y=train_test_split(X,y, test_size=0.3, random_state=123)

In [30]:
from sklearn.linear_model import LogisticRegression

In [43]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [44]:
model.fit(train_multi_X, train_multi_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
#Checking accuracy of pridicted value for train and test data overall
train_y_logit = pd.DataFrame(train_multi_y)
train_y_logit['pred'] = model.predict(train_multi_X)
print(metrics.accuracy_score(train_y_logit['attack'], train_y_logit['pred']))


test_y_logit = pd.DataFrame(test_multi_y)
test_y_logit['pred'] = model.predict(test_multi_X)
print(metrics.accuracy_score(test_y_logit['attack'], test_y_logit['pred']))

0.9796618390885874
0.9796174013198168


In [56]:
print(metrics.classification_report(train_y_logit['attack'], train_y_logit['pred']))

                precision    recall  f1-score   support

   Attack_Back       1.00      1.00      1.00    403358
BufferOverflow       0.98      0.93      0.95    404006
      FTPWrite       0.92      0.98      0.95    404258
 GuessPassword       1.00      1.00      1.00    403247
          NMap       0.97      0.98      0.98    403758
       Neptune       0.99      0.98      0.99    404012
        Normal       0.99      0.97      0.98    403575
     PortSweep       0.99      0.98      0.98    403342
       RootKit       0.98      0.99      0.98    403875
         Satan       0.98      0.96      0.97    403730
         Smurf       0.99      1.00      1.00    403506

      accuracy                           0.98   4440667
     macro avg       0.98      0.98      0.98   4440667
  weighted avg       0.98      0.98      0.98   4440667



In [58]:
print(metrics.classification_report(test_y_logit['attack'], test_y_logit['pred']))

                precision    recall  f1-score   support

   Attack_Back       1.00      1.00      1.00    173352
BufferOverflow       0.98      0.93      0.95    172704
      FTPWrite       0.92      0.98      0.95    172452
 GuessPassword       1.00      1.00      1.00    173463
          NMap       0.97      0.98      0.98    172952
       Neptune       0.99      0.98      0.99    172698
        Normal       0.99      0.97      0.98    173135
     PortSweep       0.99      0.98      0.98    173368
       RootKit       0.98      0.99      0.98    172835
         Satan       0.98      0.96      0.97    172980
         Smurf       0.99      1.00      1.00    173204

      accuracy                           0.98   1903143
     macro avg       0.98      0.98      0.98   1903143
  weighted avg       0.98      0.98      0.98   1903143



### Decision Tree

In [31]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text

In [30]:
final_attack_df2_sample =final_attack_df2.groupby('attack', group_keys=False).apply(lambda x: x.sample(frac=0.2))

  final_attack_df2_sample =final_attack_df2.groupby('attack', group_keys=False).apply(lambda x: x.sample(frac=0.2))


In [31]:
#split the data into train & test
X = final_attack_df2_sample [['count',
 'diff_srv_rate',
 'dst_bytes',
 'dst_host_count',
 'dst_host_diff_srv_rate',
 'dst_host_rerror_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_count',
 'dst_host_srv_diff_host_rate',
 'duration',
 'flag_new_0_05',
 'flag_new_0_06',
 'flag_new_0_08',
 'flag_new_others',
 'logged_in',
 'protocol_type_new_0_01',
 'protocol_type_new_others',
 'service_new_0_05',
 'service_new_0_1',
 'service_new_0_12',
 'service_new_0_14',
 'service_new_others',
 'src_bytes',
 'srv_count',
 'srv_diff_host_rate']]
y = final_attack_df2_sample['attack']


In [52]:
param_grid = {'max_depth': [10,11,12,13,14],
              'max_leaf_nodes': [12,13,14,15]
}
#model = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='f1_weighted')
dcsn_tree = model = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy')
dcsn_tree.fit(X, y)

In [59]:
model = DecisionTreeClassifier(max_depth=14, max_leaf_nodes=1)
model.fit(train_multi_X, train_multi_y)

In [60]:
#Checking accuracy of pridicted value for train and test data overall
train_y_dcsntree = pd.DataFrame(train_multi_y)
train_y_dcsntree['pred'] = model.predict(train_multi_X)
print(metrics.accuracy_score(train_y_dcsntree['attack'], train_y_dcsntree['pred']))


test_y_dcsntree = pd.DataFrame(test_multi_y)
test_y_dcsntree['pred'] = model.predict(test_multi_X)
print(metrics.accuracy_score(test_y_dcsntree['attack'], test_y_dcsntree['pred']))

0.938475233562886
0.9382484658273182


In [61]:
print(metrics.classification_report(train_y_dcsntree['attack'], train_y_dcsntree['pred']))

                precision    recall  f1-score   support

   Attack_Back       0.99      1.00      0.99    403358
BufferOverflow       0.93      1.00      0.96    404006
      FTPWrite       0.99      0.95      0.97    404258
 GuessPassword       0.95      0.98      0.97    403247
          NMap       0.91      0.95      0.93    403758
       Neptune       1.00      0.88      0.94    404012
        Normal       1.00      0.84      0.91    403575
     PortSweep       0.97      1.00      0.99    403342
       RootKit       0.84      0.85      0.85    403875
         Satan       0.83      0.94      0.88    403730
         Smurf       0.94      0.94      0.94    403506

      accuracy                           0.94   4440667
     macro avg       0.94      0.94      0.94   4440667
  weighted avg       0.94      0.94      0.94   4440667



In [62]:
print(metrics.classification_report(test_y_dcsntree['attack'], test_y_dcsntree['pred']))

                precision    recall  f1-score   support

   Attack_Back       0.99      1.00      0.99    173352
BufferOverflow       0.93      1.00      0.96    172704
      FTPWrite       0.99      0.95      0.97    172452
 GuessPassword       0.95      0.98      0.97    173463
          NMap       0.91      0.95      0.93    172952
       Neptune       1.00      0.88      0.93    172698
        Normal       1.00      0.84      0.91    173135
     PortSweep       0.97      1.00      0.98    173368
       RootKit       0.84      0.85      0.85    172835
         Satan       0.83      0.94      0.88    172980
         Smurf       0.94      0.94      0.94    173204

      accuracy                           0.94   1903143
     macro avg       0.94      0.94      0.94   1903143
  weighted avg       0.94      0.94      0.94   1903143



### Random Forest

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
pargrid_ada = {'n_estimators': [10,20],
                'max_features': [5,6,7],
              'max_depth': [5,6,7]}
gscv_Rf_multi = GridSearchCV(estimator=RandomForestClassifier(), 
                        param_grid=pargrid_ada, 
                        cv=5,
                        verbose=True, n_jobs=-1, scoring='accuracy')

In [41]:
#Apply gridsearchCV
gscv_results = gscv_Rf_multi.fit(X, y)
gscv_results.best_score_

Fitting 5 folds for each of 18 candidates, totalling 90 fits


0.9940477398683317

In [42]:
gscv_results.best_score_

0.9940477398683317

In [43]:
gscv_results.best_params_

{'max_depth': 7, 'max_features': 7, 'n_estimators': 20}

In [47]:
rndm_frst = RandomForestClassifier(max_depth= 7, max_features= 7, n_estimators= 20)

In [48]:
rndm_frst.fit(train_multi_X, train_multi_y)

In [50]:
#Checking accuracy of pridicted value for train and test data overall
train_y_rndmfrst = pd.DataFrame(train_multi_y)
train_y_rndmfrst['pred'] = rndm_frst.predict(train_multi_X)
print(metrics.accuracy_score(train_y_rndmfrst['attack'], train_y_rndmfrst['pred']))


test_y_rndmfrst = pd.DataFrame(test_multi_y)
test_y_rndmfrst['pred'] = rndm_frst.predict(test_multi_X)
print(metrics.accuracy_score(test_y_rndmfrst['attack'], test_y_rndmfrst['pred']))

0.9921518546650762
0.9921545569618258


In [53]:
print(metrics.classification_report(train_y_rndmfrst['attack'], train_y_rndmfrst['pred']))

                precision    recall  f1-score   support

   Attack_Back       1.00      1.00      1.00    403358
BufferOverflow       1.00      0.97      0.98    404006
      FTPWrite       0.97      0.99      0.98    404258
 GuessPassword       1.00      1.00      1.00    403247
          NMap       0.97      1.00      0.99    403758
       Neptune       1.00      1.00      1.00    404012
        Normal       1.00      0.98      0.99    403575
     PortSweep       1.00      1.00      1.00    403342
       RootKit       0.99      0.99      0.99    403875
         Satan       1.00      0.98      0.99    403730
         Smurf       1.00      1.00      1.00    403506

      accuracy                           0.99   4440667
     macro avg       0.99      0.99      0.99   4440667
  weighted avg       0.99      0.99      0.99   4440667



In [54]:
print(metrics.classification_report(test_y_rndmfrst['attack'], test_y_rndmfrst['pred']))

                precision    recall  f1-score   support

   Attack_Back       1.00      1.00      1.00    173352
BufferOverflow       1.00      0.97      0.98    172704
      FTPWrite       0.96      0.99      0.98    172452
 GuessPassword       1.00      1.00      1.00    173463
          NMap       0.97      1.00      0.99    172952
       Neptune       1.00      1.00      1.00    172698
        Normal       1.00      0.98      0.99    173135
     PortSweep       1.00      1.00      1.00    173368
       RootKit       0.99      0.99      0.99    172835
         Satan       1.00      0.98      0.99    172980
         Smurf       1.00      1.00      1.00    173204

      accuracy                           0.99   1903143
     macro avg       0.99      0.99      0.99   1903143
  weighted avg       0.99      0.99      0.99   1903143



### XG Boost

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

In [33]:
pargrid_ada = {'n_estimators': [400,600,800],
               'learning_rate': [10 ** x for x in range(-1, 1)]}

In [34]:
gscv_gbm = GridSearchCV(estimator=GradientBoostingClassifier(), 
                        param_grid=pargrid_ada, 
                        cv=5,
                        verbose=True, n_jobs=-1)

In [None]:
gscv_results = gscv_gbm.fit(X, y)
gscv_results.best_score_

In [None]:
gscv_results.best_score_

In [None]:
gscv_results.best_params_