In [1]:
# Author: Antoine DELPLACE
# Last update: 17/01/2020
"""
Use Random Forest with a bootstrap method to predict which flow is a malware.

Parameters
----------
data_window_botnetx.h5         : extracted data from preprocessing1.py
data_window3_botnetx.h5        : extracted data from preprocessing2.py
data_window_botnetx_labels.npy : label numpy array from preprocessing1.py

Return
----------
Print train and test accuracy, precison, recall, f1 and support
"""

'\nUse Random Forest with a bootstrap method to predict which flow is a malware.\n\nParameters\n----------\ndata_window_botnetx.h5         : extracted data from preprocessing1.py\ndata_window3_botnetx.h5        : extracted data from preprocessing2.py\ndata_window_botnetx_labels.npy : label numpy array from preprocessing1.py\n\nReturn\n----------\nPrint train and test accuracy, precison, recall, f1 and support\n'

In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix
# import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import h5py
import csv

In [3]:
from sklearn import model_selection, feature_selection, kernel_approximation, ensemble, linear_model, metrics, utils

In [4]:
print("Import data")

Import data


In [5]:
X = pd.read_hdf('data_window_botnet3.h5', key='data')
X.reset_index(drop=True, inplace=True)

In [6]:
X2 = pd.read_hdf('data_window3_botnet3.h5', key='data')
X2.reset_index(drop=True, inplace=True)

In [7]:
X = X.join(X2)

In [8]:
X.drop('window_id', axis=1, inplace=True)

In [9]:
y = X['Label_<lambda>']
X.drop('Label_<lambda>', axis=1, inplace=True)

In [10]:
labels = np.load("data_window_botnet3_labels.npy", allow_pickle=True)

In [11]:
print(X.columns.values)
print(labels)
print(np.where(labels == 'flow=From-Botne')[0][0])

['counts' 'Sport_nunique' 'DstAddr_nunique' 'Dport_nunique' 'Dur_sum'
 'Dur_mean' 'Dur_std' 'Dur_max' 'Dur_median' 'TotBytes_sum'
 'TotBytes_mean' 'TotBytes_std' 'TotBytes_max' 'TotBytes_median'
 'SrcBytes_sum' 'SrcBytes_mean' 'SrcBytes_std' 'SrcBytes_max'
 'SrcBytes_median' 'Sport_RU' 'DstAddr_RU' 'Dport_RU']
['flow=Background' 'flow=To-Backgro' 'flow=From-Backg' 'flow=From-Norma'
 'flow=To-Normal-' 'flow=Normal-V42' 'flow=From-Botne']
6


In [12]:
y_bin6 = y==np.where(labels == 'flow=From-Botne')[0][0]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y_bin6, test_size=0.33, random_state=123456)

In [13]:
print(len(y_train))
print(len(y_test))

1491902
734818


In [14]:
ynew = np.array(y_test)
ynew = ynew.reshape(-1,2)
# display the array
print(ynew)
  
# convert array into dataframe
DF = pd.DataFrame(ynew)
# save the dataframe as a csv file
DF.to_csv("ytest.csv", index = False)

[[False False]
 [False False]
 [False False]
 ...
 [False False]
 [False False]
 [False False]]


In [15]:
print(X_train.shape)
X_train_new, y_train_new = utils.resample(X_train, y_train, n_samples=X_train.shape[0]*20, random_state=123456)

(1491902, 22)


In [55]:
ynew = np.array(y_train_new)
ynew = ynew.reshape(-1,40)
# display the array
print(ynew)
  
# convert array into dataframe
DF = pd.DataFrame(ynew)
# save the dataframe as a csv file
DF.to_csv("ytrain.csv", index = False)

[[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]


In [17]:
print("y", np.unique(y, return_counts=True))
print("y_train", np.unique(y_train_new, return_counts=True))
print("y_test", np.unique(y_test, return_counts=True))

y (array([list([0]), list([1]), list([2]), list([3]), list([4]), list([6])],
      dtype=object), array([2207092,   18047,     263,     984,      48,     286]))
y_train (array([False,  True]), array([29834014,     4026]))
y_test (array([False,  True]), array([734736,     82]))


# Random Forest Model


In [18]:
clf = ensemble.RandomForestClassifier(n_estimators=80, random_state=123456, verbose=1, class_weight=None)
clf.fit(X_train_new, y_train_new)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 24.6min finished


RandomForestClassifier(n_estimators=80, random_state=123456, verbose=1)

In [19]:
print("Train")
y_pred_train = clf.predict(X_train_new)
print(y_pred_train)

Train


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.7min finished


[False False False ... False False False]


In [20]:
ynew = np.array(y_pred_train)
ynew = ynew.reshape(-1,40)
# display the array
print(ynew)
  
# convert array into dataframe
DF = pd.DataFrame(ynew)
  
# save the dataframe as a csv file
DF.to_csv("output_rf.csv", index=False)

[[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]


In [21]:
print("Test")
y_pred_test = clf.predict(X_test)
print(y_pred_test)

Test


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[False False False ... False False False]


[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    2.2s finished


In [23]:
print(len(y_pred_test))

734818


In [24]:
ynew = np.array(y_pred_test)
ynew = ynew.reshape(-1,2)
# display the array
print(ynew)
  
# convert array into dataframe
DF = pd.DataFrame(ynew)
  
# save the dataframe as a csv file
DF.to_csv("output_rf_test.csv", index=False)

[[False False]
 [False False]
 [False False]
 ...
 [False False]
 [False False]
 [False False]]


# Logistic Regression

In [25]:
tab_seed = np.random.randint(0, 1000000000)

In [26]:
clf = linear_model.LogisticRegression(penalty='l2', C=550, random_state=tab_seed, multi_class="auto", class_weight={0:0.044, 1:1-0.044}, solver="lbfgs", max_iter=700, verbose=0)
clf.fit(X_train_new, y_train_new)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=550, class_weight={0: 0.044, 1: 0.956}, max_iter=700,
                   random_state=828082930)

In [27]:
print("Train")
y_pred_train = clf.predict(X_train_new)
print(y_pred_train)

Train
[False False False ... False False False]


In [28]:
ynew = np.array(y_pred_train)
ynew = ynew.reshape(-1,40)
# display the array
print(ynew)
  
# convert array into dataframe
DF = pd.DataFrame(ynew)
  
# save the dataframe as a csv file
DF.to_csv("output_lr.csv",index=False)

[[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]


In [29]:
print("Test")
y_pred_test = clf.predict(X_test)
print(y_pred_test)

Test
[False False False ... False False False]


In [30]:
ynew = np.array(y_pred_test)
ynew = ynew.reshape(-1,2)
# display the array
print(ynew)
  
# convert array into dataframe
DF = pd.DataFrame(ynew)
  
# save the dataframe as a csv file
DF.to_csv("output_lr_test.csv", index=False)

[[False False]
 [False False]
 [False False]
 ...
 [False False]
 [False False]
 [False False]]


# Neural Networks

In [31]:
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Activation, AveragePooling2D, Dense, Dropout, Flatten, Lambda, MaxPool2D, Conv2DTranspose, UpSampling2D, Concatenate, Add
from tensorflow.keras import regularizers, optimizers
from keras.preprocessing import image
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [32]:
def get_model(inputs, dropout=0.5, batchnorm=True):
    x = Dense(256, input_shape=(22,))(inputs)
    if batchnorm:
        x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Dropout(dropout)(x)

    x = Dense(128, input_shape=(256,))(x)
    if batchnorm:
        x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Dropout(dropout)(x)

    x = Dense(1, input_shape=(128,))(x)
    outputs = Activation("sigmoid")(x)
    
    model = Model(inputs=[inputs], outputs=[outputs])
    return model

In [33]:
tab_seed = np.random.randint(0, 1000000000)
filename_weights = "model.h5"

In [34]:
def fprecision(y_true, y_pred):	
    """Precision metric.	
    Only computes a batch-wise average of precision. Computes the precision, a
    metric for multi-label classification of how many selected items are
    relevant.
    """	
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))	
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))	
    precision = true_positives / (predicted_positives + K.epsilon())	
    return precision

In [35]:
def frecall(y_true, y_pred):	
    """Recall metric.	
    Only computes a batch-wise average of recall. Computes the recall, a metric
    for multi-label classification of how many relevant items are selected.	
    """	
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))	
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))	
    recall = true_positives / (possible_positives + K.epsilon())	
    return recall

In [36]:
def ff1_score(y_true, y_pred):
    """Computes the F1 Score
    Only computes a batch-wise average of recall. Computes the recall, a metric
    for multi-label classification of how many relevant items are selected.	
    """
    p = fprecision(y_true, y_pred)
    r = frecall(y_true, y_pred)
    return (2 * p * r) / (p + r + K.epsilon())

In [37]:
def get_model(inputs, dropout=0.5, batchnorm=True):
    x = Dense(256, input_shape=(22,))(inputs)
    if batchnorm:
        x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Dropout(dropout)(x)

    x = Dense(128, input_shape=(256,))(x)
    if batchnorm:
        x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Dropout(dropout)(x)

    x = Dense(1, input_shape=(128,))(x)
    outputs = Activation("sigmoid")(x)
    
    model = Model(inputs=[inputs], outputs=[outputs])
    return model

In [38]:
import time

In [39]:
inputs = Input((22,), name='input')
model = get_model(inputs, dropout=0, batchnorm=1)

callbacks = [
    ModelCheckpoint(filename_weights, verbose=1, save_best_only=True, save_weights_only=True)
]

model.compile(optimizer=optimizers.Adam(lr=1e-3), loss=["binary_crossentropy"], metrics=[fprecision, frecall, ff1_score])
#model.summary()

tps = time.time()
results = model.fit(X_train, y_train, batch_size=32, epochs=13, validation_split=0.15, shuffle=True, class_weight=None, verbose=0, callbacks=callbacks)
print("Execution time = ", time.time()-tps)

model.load_weights(filename_weights)

  "The `lr` argument is deprecated, use `learning_rate` instead.")



Epoch 00001: val_loss improved from inf to 0.00099, saving model to model.h5

Epoch 00002: val_loss improved from 0.00099 to 0.00053, saving model to model.h5

Epoch 00003: val_loss did not improve from 0.00053

Epoch 00004: val_loss improved from 0.00053 to 0.00045, saving model to model.h5

Epoch 00005: val_loss did not improve from 0.00045

Epoch 00006: val_loss did not improve from 0.00045

Epoch 00007: val_loss improved from 0.00045 to 0.00041, saving model to model.h5

Epoch 00008: val_loss did not improve from 0.00041

Epoch 00009: val_loss improved from 0.00041 to 0.00036, saving model to model.h5

Epoch 00010: val_loss did not improve from 0.00036

Epoch 00011: val_loss did not improve from 0.00036

Epoch 00012: val_loss did not improve from 0.00036

Epoch 00013: val_loss did not improve from 0.00036
Execution time =  1639.5491924285889


In [40]:
y_pred_train = model.predict(X_train, batch_size=32, verbose=0)
y_pred_train_bin = (y_pred_train > 0.5).astype(np.uint8)
print(y_pred_train_bin)

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [41]:
ynew = np.array(y_pred_train_bin)
ynew = ynew.reshape(-1,2)
# display the array
print(ynew)
  
# convert array into dataframe
DF = pd.DataFrame(ynew)
  
# save the dataframe as a csv file
DF.to_csv("output_nn.csv", index=False)

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


In [42]:
print("Test")
y_pred_test = model.predict(X_test, batch_size=32, verbose=0)
y_pred_test_bin = (y_pred_test > 0.5).astype(np.uint8)
print(y_pred_test_bin)

Test
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [43]:
ynew = np.array(y_pred_test_bin)
ynew = ynew.reshape(-1,2)
# display the array
print(ynew)
  
# convert array into dataframe
DF = pd.DataFrame(ynew)
  
# save the dataframe as a csv file
DF.to_csv("output_nn_test.csv", index=False)

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


# Gradient Boosting

In [44]:
tab_seed = np.random.randint(0, 1000000000)

In [45]:
clf = ensemble.GradientBoostingClassifier(loss='exponential', learning_rate=0.1, n_estimators=33, max_depth=4, random_state=tab_seed, verbose=0)
clf.fit(X_train_new, y_train_new)
y_pred_train = clf.predict(X_train_new)
print(y_pred_train)

[False False False ... False False False]


In [46]:
ynew = np.array(y_pred_train)
ynew = ynew.reshape(-1,40)
# display the array
print(ynew)
  
# convert array into dataframe
DF = pd.DataFrame(ynew)
  
# save the dataframe as a csv file
DF.to_csv("output_gb.csv", index=False)

[[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]


In [47]:
print("Test")
y_pred_test = clf.predict(X_test)
print(y_pred_test)

Test
[False False False ... False False False]


In [48]:
ynew = np.array(y_pred_test)
ynew = ynew.reshape(-1,2)
# display the array
print(ynew)
  
# convert array into dataframe
DF = pd.DataFrame(ynew)
  
# save the dataframe as a csv file
DF.to_csv("output_gb_test.csv", index=False)

[[False False]
 [False False]
 [False False]
 ...
 [False False]
 [False False]
 [False False]]


# SVM

In [49]:
feature_map_nystroem = kernel_approximation.Nystroem(kernel='poly', gamma=None, degree=2, n_components=200, random_state=123456)
feature_map_nystroem.fit(X_train)
X_train_new = feature_map_nystroem.transform(X_train)
X_test_new = feature_map_nystroem.transform(X_test)

In [50]:
clf = linear_model.SGDClassifier(loss='hinge', penalty='l2', max_iter=100, alpha=1e-9, tol=1e-3, random_state=123456, class_weight=None, verbose=1)
clf.fit(X_train_new, y_train)
y_pred_train = clf.predict(X_train_new)
print(y_pred_train)

-- Epoch 1
Norm: 611434.98, NNZs: 200, Bias: -6873.173385, T: 1491902, Avg. loss: 36.683272
Total training time: 0.91 seconds.
-- Epoch 2
Norm: 499270.70, NNZs: 200, Bias: -10604.367865, T: 2983804, Avg. loss: 20.351496
Total training time: 1.83 seconds.
-- Epoch 3
Norm: 422247.09, NNZs: 200, Bias: -11536.796979, T: 4475706, Avg. loss: 10.730387
Total training time: 2.75 seconds.
-- Epoch 4
Norm: 366052.88, NNZs: 200, Bias: -12616.671616, T: 5967608, Avg. loss: 6.050920
Total training time: 3.66 seconds.
-- Epoch 5
Norm: 323215.92, NNZs: 200, Bias: -13370.153168, T: 7459510, Avg. loss: 3.559299
Total training time: 4.58 seconds.
-- Epoch 6
Norm: 289436.25, NNZs: 200, Bias: -14379.140825, T: 8951412, Avg. loss: 2.211512
Total training time: 5.49 seconds.
-- Epoch 7
Norm: 262122.66, NNZs: 200, Bias: -14646.605027, T: 10443314, Avg. loss: 1.414755
Total training time: 6.42 seconds.
-- Epoch 8
Norm: 239488.99, NNZs: 200, Bias: -14950.020574, T: 11935216, Avg. loss: 1.118370
Total training 

Norm: 40238.53, NNZs: 200, Bias: -8630.256580, T: 96973630, Avg. loss: 0.038898
Total training time: 59.33 seconds.
-- Epoch 66
Norm: 39666.50, NNZs: 200, Bias: -8552.784828, T: 98465532, Avg. loss: 0.038781
Total training time: 60.24 seconds.
-- Epoch 67
Norm: 39105.08, NNZs: 200, Bias: -8505.065505, T: 99957434, Avg. loss: 0.036739
Total training time: 61.15 seconds.
-- Epoch 68
Norm: 38565.09, NNZs: 200, Bias: -8429.725599, T: 101449336, Avg. loss: 0.037597
Total training time: 62.06 seconds.
-- Epoch 69
Norm: 38032.54, NNZs: 200, Bias: -8392.521706, T: 102941238, Avg. loss: 0.035574
Total training time: 62.97 seconds.
-- Epoch 70
Norm: 37522.22, NNZs: 200, Bias: -8319.230738, T: 104433140, Avg. loss: 0.034479
Total training time: 63.88 seconds.
-- Epoch 71
Norm: 37027.74, NNZs: 200, Bias: -8238.243873, T: 105925042, Avg. loss: 0.033899
Total training time: 64.79 seconds.
-- Epoch 72
Norm: 36542.52, NNZs: 200, Bias: -8175.985190, T: 107416944, Avg. loss: 0.033017
Total training time

In [51]:
ynew = np.array(y_pred_train)
ynew = ynew.reshape(-1,2)
# display the array
print(ynew)
  
# convert array into dataframe
DF = pd.DataFrame(ynew)
  
# save the dataframe as a csv file
DF.to_csv("output_svm.csv", index=False)

[[False False]
 [False False]
 [False False]
 ...
 [False False]
 [False False]
 [False False]]


In [53]:
print("Test")
y_pred_test = clf.predict(X_test_new)
print(y_pred_test)

Test
[False False False ... False False False]


In [54]:
ynew = np.array(y_pred_test)
ynew = ynew.reshape(-1,2)
# display the array
print(ynew)
  
# convert array into dataframe
DF = pd.DataFrame(ynew)
  
# save the dataframe as a csv file
DF.to_csv("output_svm_test.csv", index=False)

[[False False]
 [False False]
 [False False]
 ...
 [False False]
 [False False]
 [False False]]
