In [None]:
import numpy as np
import pandas as pd
import sklearn.preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
train_data = pd.read_csv("/content/Train_data.csv")
#test_data = pd.read_csv("/content/Test_data.csv")

In [None]:
train_data_df = pd.DataFrame(train_data)
print(train_data_df.head())
print(train_data_df.shape)

   duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   
1         0           udp     other   SF        146          0     0   
2         0           tcp   private   S0          0          0     0   
3         0           tcp      http   SF        232       8153     0   
4         0           tcp      http   SF        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  25   
1               0       0    0  ...                   1   
2               0       0    0  ...                  26   
3               0       0    0  ...                 255   
4               0       0    0  ...                 255   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.17                    0.03   
1                    0.00                    0.60   
2                    0.10                    0.05   


In [None]:
test_data_df = pd.DataFrame(test_data)
print(test_data_df.head())
print(test_data_df.shape)

   duration protocol_type   service  flag  src_bytes  dst_bytes  land  \
0         0           tcp   private   REJ          0          0     0   
1         0           tcp   private   REJ          0          0     0   
2         2           tcp  ftp_data    SF      12983          0     0   
3         0          icmp     eco_i    SF         20          0     0   
4         1           tcp    telnet  RSTO          0         15     0   

   wrong_fragment  urgent  hot  ...  dst_host_count  dst_host_srv_count  \
0               0       0    0  ...             255                  10   
1               0       0    0  ...             255                   1   
2               0       0    0  ...             134                  86   
3               0       0    0  ...               3                  57   
4               0       0    0  ...              29                  86   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.04                    0.06   
1   

In [None]:
# Train Data with class label {normal, anomaly}
set(train_data_df).difference(set(test_data_df))

{'class'}

In [None]:
# filter is the features you select
filter = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'class']
train_data_df = train_data_df[filter]

In [None]:
train_data_df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes
count,25192.0,25192.0,25192.0
mean,305.054104,24330.63,3491.847
std,2686.55564,2410805.0,88830.72
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,44.0,0.0
75%,0.0,279.0,530.25
max,42862.0,381709100.0,5151385.0


In [None]:
train_data['class'].value_counts()

normal     13449
anomaly    11743
Name: class, dtype: int64

#Feature Engineering
##protocol_type

In [None]:
# Convert protocol_type type column into numerical values (For matrix calculation)
packets = train_data_df['protocol_type']
for i in range(len(packets)):
  packet = packets[i]
  if packet == 'tcp':
    train_data_df.loc[i, 'protocol_type_n'] = 0
  elif packet == 'udp':
    train_data_df.loc[i, 'protocol_type_n'] = 1
  elif packet == 'icmp':
    train_data_df.loc[i, 'protocol_type_n'] = 2

In [None]:
train_data_df['protocol_type_n'].unique()

array([0., 1., 2.])

##service

In [None]:
print(train_data_df['service'].unique())

['ftp_data' 'other' 'private' 'http' 'remote_job' 'name' 'netbios_ns'
 'eco_i' 'mtp' 'telnet' 'finger' 'domain_u' 'supdup' 'uucp_path' 'Z39_50'
 'smtp' 'csnet_ns' 'uucp' 'netbios_dgm' 'urp_i' 'auth' 'domain' 'ftp'
 'bgp' 'ldap' 'ecr_i' 'gopher' 'vmnet' 'systat' 'http_443' 'efs' 'whois'
 'imap4' 'iso_tsap' 'echo' 'klogin' 'link' 'sunrpc' 'login' 'kshell'
 'sql_net' 'time' 'hostnames' 'exec' 'ntp_u' 'discard' 'nntp' 'courier'
 'ctf' 'ssh' 'daytime' 'shell' 'netstat' 'pop_3' 'nnsp' 'IRC' 'pop_2'
 'printer' 'tim_i' 'pm_dump' 'red_i' 'netbios_ssn' 'rje' 'X11' 'urh_i'
 'http_8001']


In [None]:
service_dict = dict()
for i, service in enumerate(train_data_df['service'].unique()):
  service_dict[service] = i
print(service_dict)

{'ftp_data': 0, 'other': 1, 'private': 2, 'http': 3, 'remote_job': 4, 'name': 5, 'netbios_ns': 6, 'eco_i': 7, 'mtp': 8, 'telnet': 9, 'finger': 10, 'domain_u': 11, 'supdup': 12, 'uucp_path': 13, 'Z39_50': 14, 'smtp': 15, 'csnet_ns': 16, 'uucp': 17, 'netbios_dgm': 18, 'urp_i': 19, 'auth': 20, 'domain': 21, 'ftp': 22, 'bgp': 23, 'ldap': 24, 'ecr_i': 25, 'gopher': 26, 'vmnet': 27, 'systat': 28, 'http_443': 29, 'efs': 30, 'whois': 31, 'imap4': 32, 'iso_tsap': 33, 'echo': 34, 'klogin': 35, 'link': 36, 'sunrpc': 37, 'login': 38, 'kshell': 39, 'sql_net': 40, 'time': 41, 'hostnames': 42, 'exec': 43, 'ntp_u': 44, 'discard': 45, 'nntp': 46, 'courier': 47, 'ctf': 48, 'ssh': 49, 'daytime': 50, 'shell': 51, 'netstat': 52, 'pop_3': 53, 'nnsp': 54, 'IRC': 55, 'pop_2': 56, 'printer': 57, 'tim_i': 58, 'pm_dump': 59, 'red_i': 60, 'netbios_ssn': 61, 'rje': 62, 'X11': 63, 'urh_i': 64, 'http_8001': 65}


In [None]:
for row, service in enumerate(train_data_df['service']):
  train_data_df.loc[row, "service_n"] = service_dict[service]

In [None]:
train_data_df['service_n'].unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
       39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51.,
       52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64.,
       65.])

##flag

In [None]:
print(train_data_df['flag'].unique())

['SF' 'S0' 'REJ' 'RSTR' 'SH' 'RSTO' 'S1' 'RSTOS0' 'S3' 'S2' 'OTH']


In [None]:
flag_dict = {}
for i, flag in enumerate(train_data_df['flag'].unique()):
  flag_dict[flag] = i

In [None]:
for row, flag in enumerate(train_data_df['flag']):
  train_data_df.loc[row, 'flag_n'] = flag_dict[flag]

In [None]:
train_data_df['flag_n'].unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

##class (which we want to predict)

In [None]:
res_labels = train_data_df['class']
for i in range(len(res_labels)):
  res = res_labels[i]
  if res == 'normal':
    train_data_df.loc[i, 'class_n'] = 0
  elif res == 'anomaly':
    train_data_df.loc[i, 'class_n'] = 1

In [None]:
train_data_df['class_n'].unique()

array([0., 1.])

##train-to-test split

In [None]:
# Features we select
filter_features = ['duration', 'protocol_type_n', 'service_n', 'flag_n', 'src_bytes', 'dst_bytes']
filter_res = ['class_n']
X_train, y_train = train_data_df[filter_features], train_data_df[filter_res]

In [None]:
# Split training data into training data and validation data. (80%, 20%)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
print('Training set: ', X_train.shape, y_train.shape)
print('Validation set: ', X_test.shape, y_test.shape)

Training set:  (20153, 6) (20153, 1)
Validation set:  (5039, 6) (5039, 1)


In [None]:
y_train.value_counts()

class_n
0.0        10775
1.0         9378
dtype: int64

In [None]:
y_test.value_counts()

class_n
0.0        2674
1.0        2365
dtype: int64

##Null value check

In [None]:
X_train.isnull().any().sum()
X_test.isnull().any().sum()
y_train.isnull().any().sum()
y_test.isnull().any().sum()

0

#Modeling

In [None]:
random_forest = RandomForestClassifier(max_depth=2)
random_forest.fit(X_train, y_train)

  random_forest.fit(X_train, y_train)


In [None]:
# Predict the result of class
y_test_predict = random_forest.predict(X_test)

In [None]:
accuracy_score(y_test, y_test_predict)

0.9531653105774955

#Optimization
*   Method 1: GridSearchCV



In [None]:
# define a new random forest classifier
new_random_forest = RandomForestClassifier()

In [None]:
# define a range of hyperparameters to test
param_grid = {'n_estimators': list(range(2, 10)),
              'max_depth': list(range(16)),
              'min_samples_split': list(range(2, 11, 2))
              #'min_samples_leaf': list(range(1, 9, 2))
              }

In [None]:
# create a grid search object
grid_search = GridSearchCV(estimator=new_random_forest, param_grid=param_grid, cv=5)

In [None]:
# fit the grid search object to the data
grid_search.fit(X_train, np.ravel(y_train))

200 fits failed out of a total of 3200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.9/dist-packages/sklearn/base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.9/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
skl

In [None]:
# print the best hyperparameters
print('Best hyperparameters:', grid_search.best_params_)

Best hyperparameters: {'max_depth': 13, 'min_samples_split': 2, 'n_estimators': 5}


In [None]:
best_model = grid_search.best_estimator_

In [None]:
accuracy = best_model.score(X_test, y_test)
print('Test set accuracy:', accuracy)

Test set accuracy: 0.9928557253423298


#Outcome with hyperparameters
*   **n_estimators** is the number of trees in the forest
*   **max_depth** is the maximum depth of a tree
*   **min_samples_split** is the minimum number of samples in a node



In [None]:
best_random_forest = RandomForestClassifier(
                        max_depth=13,
                        min_samples_split=2,
                        n_estimators=5
                      )

In [None]:
best_random_forest.fit(X_train, y_train)

  best_random_forest.fit(X_train, y_train)


In [None]:
# Predict the result of class
best_y_test_predict = best_random_forest.predict(X_test)
print(best_y_test_predict.shape)
print(y_test.shape)

(5039,)
(5039, 1)


In [None]:
accuracy_score(y_test, best_y_test_predict)

0.9926572732685056