In [1]:
"""
Use Random Forest with a bootstrap method to predict which flow is a malware.

Parameters
----------
data_window_botnetx.h5         : extracted data from preprocessing1.py
data_window3_botnetx.h5        : extracted data from preprocessing2.py
data_window_botnetx_labels.npy : label numpy array from preprocessing1.py

Return
----------
Print train and test accuracy, precison, recall, f1 and support
"""

'\nUse Random Forest with a bootstrap method to predict which flow is a malware.\n\nParameters\n----------\ndata_window_botnetx.h5         : extracted data from preprocessing1.py\ndata_window3_botnetx.h5        : extracted data from preprocessing2.py\ndata_window_botnetx_labels.npy : label numpy array from preprocessing1.py\n\nReturn\n----------\nPrint train and test accuracy, precison, recall, f1 and support\n'

In [2]:
! python3 -m pip install -U matplotlib==3.2
! python3 -m pip install --upgrade sklearn

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix
# import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import h5py
import csv

In [4]:
from sklearn import model_selection, feature_selection, utils, ensemble, linear_model, metrics

In [5]:
print("Import data")

Import data


In [6]:
X = pd.read_hdf('data_window_botnet3.h5', key='data')
X.reset_index(drop=True, inplace=True)

In [7]:
X2 = pd.read_hdf('data_window3_botnet3.h5', key='data')
X2.reset_index(drop=True, inplace=True)

In [8]:
X = X.join(X2)

In [9]:
X.drop('window_id', axis=1, inplace=True)

In [10]:
y = X['Label_<lambda>']
X.drop('Label_<lambda>', axis=1, inplace=True)

In [11]:
labels = np.load("data_window_botnet3_labels.npy", allow_pickle=True)

In [12]:
print(X.columns.values)
print(labels)
print(np.where(labels == 'flow=From-Botne')[0][0])

['counts' 'Sport_nunique' 'DstAddr_nunique' 'Dport_nunique' 'Dur_sum'
 'Dur_mean' 'Dur_std' 'Dur_max' 'Dur_median' 'TotBytes_sum'
 'TotBytes_mean' 'TotBytes_std' 'TotBytes_max' 'TotBytes_median'
 'SrcBytes_sum' 'SrcBytes_mean' 'SrcBytes_std' 'SrcBytes_max'
 'SrcBytes_median' 'Sport_RU' 'DstAddr_RU' 'Dport_RU']
['flow=Background' 'flow=To-Backgro' 'flow=From-Backg' 'flow=From-Norma'
 'flow=To-Normal-' 'flow=Normal-V42' 'flow=From-Botne']
6


In [13]:
y_bin6 = y==np.where(labels == 'flow=From-Botne')[0][0]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y_bin6, test_size=0.33, random_state=123456)

In [14]:
print(X_train.shape)
X_train_new, y_train_new = utils.resample(X_train, y_train, n_samples=X_train.shape[0]*20, random_state=123456)

(1491902, 22)


In [15]:
print("y", np.unique(y, return_counts=True))
print("y_train", np.unique(y_train_new, return_counts=True))
print("y_test", np.unique(y_test, return_counts=True))

y (array([list([0]), list([1]), list([2]), list([3]), list([4]), list([6])],
      dtype=object), array([2207092,   18047,     263,     984,      48,     286]))
y_train (array([False,  True]), array([29834014,     4026]))
y_test (array([False,  True]), array([734736,     82]))


In [16]:
## Embedded Method
print("Random Forest Classifier")

Random Forest Classifier


In [17]:
clf = ensemble.RandomForestClassifier(n_estimators=100, random_state=123456, verbose=1, class_weight=None)
clf.fit(X_train_new, y_train_new)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 50.5min finished


RandomForestClassifier(random_state=123456, verbose=1)

In [18]:
print("Train")
y_pred_train = clf.predict(X_train_new)
print("accuracy score = ", metrics.balanced_accuracy_score(y_train_new, y_pred_train))
precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_train_new, y_pred_train)
print("precision = ", precision[1])
print("recall = ", recall[1])
print("fbeta_score = ", fbeta_score[1])
print("support = ", support[1])

Train


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  3.5min finished


accuracy score =  1.0
precision =  1.0
recall =  1.0
fbeta_score =  1.0
support =  4026


In [19]:
print("Test")
y_pred_test = clf.predict(X_test)
print("accuracy score = ", metrics.balanced_accuracy_score(y_test, y_pred_test))
precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_test, y_pred_test)
print("precision = ", precision[1])
print("recall = ", recall[1])
print("fbeta_score = ", fbeta_score[1])
print("support = ", support[1])

Test


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    4.9s finished


accuracy score =  0.975609756097561
precision =  1.0
recall =  0.9512195121951219
fbeta_score =  0.975
support =  82
