In [None]:
import pandas as pd

# import dataset
dataset = pd.read_csv('sklearn-multi.csv')
dataset.shape

In [32]:
import numpy as np

# split into features and labels
features = dataset.iloc[:, 0:17]
labels = dataset.iloc[:, 17]

# generate dummies from categorical values (if present)
features = pd.get_dummies(features)
features = features.replace(np.inf, np.nan)
features = features.fillna(0)

from sklearn.preprocessing import LabelEncoder

# encode labels
le = LabelEncoder()
le_fitted_labels = le.fit(labels)
labels = le.fit_transform(labels)

In [9]:
from sklearn.model_selection import train_test_split

# split in train subset (used for RandomizedSearch) and test dataset (used for later validation at the end)
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.25)

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# # simple baseline RandomForestClassifier (only for first testing)
# clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1)
# clf.fit(features_train, labels_train)
# labels_pred = clf.predict(features_test)

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Create random grid as parameters for RandomizedSearch

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [17]:
rf = RandomForestClassifier(verbose = 100)

# Find best model using RandomizedSearch

rf_random = RandomizedSearchCV(estimator = rf, param_distributions=random_grid, 
                               n_iter = 100, cv = 5, verbose = 100, n_jobs = -2)

rf_random.fit(features_train, labels_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=-2)]: Done   1 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-2)]: Done   2 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-2)]: Done   3 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-2)]: Done   5 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-2)]: Done   6 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-2)]: Done   7 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-2)]: Done   8 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-2)]: Done   9 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-2)]: Done  11 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-2)]: Done  12 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-2)]: Done  13 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-2)]: Done  14 tasks      | elapsed: 18

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(verbose=100),
                   n_jobs=-2,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [10, 20, 30, 40, 50, 60,
                                                         70, 80, 90, 100]},
                   verbose=100)

In [19]:
from joblib import dump, load

# print and save best result
print(rf_random.best_estimator_)

dump(rf_random.best_estimator_, 'best_estimator.joblib')

RandomForestClassifier(bootstrap=False, max_depth=70, max_features='sqrt',
                       min_samples_leaf=2, n_estimators=50, verbose=100)


['best_estimator.joblib']

In [20]:
# predict validation dataset

labels_pred = rf_random.best_estimator_.predict(features_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  

In [24]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# print scores

labels_test_tf = le.inverse_transform(labels_test)
labels_pred_tf = le.inverse_transform(labels_pred) 

print(metrics.classification_report(labels_test_tf, labels_pred_tf, digits=6))
print(metrics.balanced_accuracy_score(labels_test, labels_pred, adjusted=True))

              precision    recall  f1-score   support

      BENIGN   0.999544  0.999092  0.999318    658896
        DDoS   0.996623  0.997680  0.997152     15088
         DoS   0.996664  0.994731  0.995696     30933
 FTP-Patator   0.999204  1.000000  0.999602      1255
    PortScan   0.994287  0.998566  0.996422     79477
 SSH-Patator   1.000000  0.999329  0.999665      1491

    accuracy                       0.998843    787140
   macro avg   0.997720  0.998233  0.997976    787140
weighted avg   0.998845  0.998843  0.998843    787140

0.9978796351181313


In [30]:
from pycm import ConfusionMatrix
cm = ConfusionMatrix(actual_vector=labels_test_tf,predict_vector=labels_pred_tf)
print(cm)

Predict           BENIGN            DDoS              DoS               FTP-Patator       PortScan          SSH-Patator       
Actual
BENIGN            658298            49                94                1                 454               0                 

DDoS              32                15053             3                 0                 0                 0                 

DoS               159               2                 30770             0                 2                 0                 

FTP-Patator       0                 0                 0                 1255              0                 0                 

PortScan          108               0                 6                 0                 79363             0                 

SSH-Patator       1                 0                 0                 0                 0                 1490              





Overall Statistics : 

95% CI                                                            (0.99

In [25]:
# print feature impotances

feature_importances = pd.Series(rf_random.best_estimator_.feature_importances_, index=features.columns)
print(feature_importances)

hash_collision    0.001406
tcp_src_port      0.038294
tcp_dst_port      0.054940
ipv4_protocol     0.027074
pkt_ctr           0.074397
byte_ctr          0.212604
ack_ctr           0.038170
rst_ctr           0.028242
syn_ctr           0.029023
fin_ctr           0.013934
byte_ctr_sq       0.082390
flow_duration     0.064885
byte_avg          0.139667
byte_sd           0.042412
iat_avg           0.038837
bps               0.066056
pps               0.047669
dtype: float64


In [None]:
import matplotlib.pyplot as mpl

# plot feature importances

feature_importances.sort_values().plot(kind='barh')
mpl.tight_layout()
mpl.savefig('save.pdf')