In [1]:
"""
Perform a statistical analysis of the Logistic regression classifier.

Parameters
----------
data_window_botnetx.h5         : extracted data from preprocessing1.py
data_window3_botnetx.h5        : extracted data from preprocessing2.py
data_window_botnetx_labels.npy : label numpy array from preprocessing1.py
nb_prediction                  : number of predictions to perform

Return
----------
Print train and test mean accuracy, precision, recall, f1
"""

'\nPerform a statistic analysis of the Logistic regression classifier.\n\nParameters\n----------\ndata_window_botnetx.h5         : extracted data from preprocessing1.py\ndata_window3_botnetx.h5        : extracted data from preprocessing2.py\ndata_window_botnetx_labels.npy : label numpy array from preprocessing1.py\nnb_prediction                  : number of predictions to perform\n\nReturn\n----------\nPrint train and test mean accuracy, precison, recall, f1\n'

In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import h5py

In [3]:
from sklearn import model_selection, feature_selection, utils, ensemble, linear_model, metrics

In [4]:
# Progress marker shown before the data-loading cells run.
print("Import data")

Import data


In [5]:
# Window features written by preprocessing1.py (see the header docstring).
# The stored index is discarded in favour of a fresh 0..n-1 index, which the
# later index-based join with X2 relies on.
X = pd.read_hdf('data_window_botnet3.h5', key='data').reset_index(drop=True)

In [6]:
# Additional window features written by preprocessing2.py (see the header
# docstring), re-indexed 0..n-1 like X so the two frames can be joined by row.
X2 = pd.read_hdf('data_window3_botnet3.h5', key='data').reset_index(drop=True)

In [7]:
# Append the X2 feature columns to X. DataFrame.join aligns rows on the index,
# which was reset to 0..n-1 on both frames when they were loaded.
# NOTE(review): this assumes both files list the windows in the same order — confirm.
# The cell rebinds X from itself, so re-running it without reloading X repeats the join.
X = X.join(X2)

In [8]:
# 'window_id' is removed so it is not passed to the classifier as a feature.
X = X.drop(columns='window_id')

In [9]:
# Move the label column out of the feature matrix: pop() returns the column
# as `y` and removes it from X in the same step.
y = X.pop('Label_<lambda>')

In [10]:
# Label-name array saved by preprocessing1.py (see the header docstring).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code from a tampered file — only load label files produced
# by your own preprocessing run.
labels = np.load("data_window_botnet3_labels.npy", allow_pickle=True)

In [11]:
# Sanity check: list the remaining feature columns and the label names, then
# show the position of the botnet label inside `labels` (the same expression
# is used afterwards to build the binary target).
print(X.columns.values)
print(labels)
print(np.where(labels == 'flow=From-Botne')[0][0])

['counts' 'Sport_nunique' 'DstAddr_nunique' 'Dport_nunique' 'Dur_sum'
 'Dur_mean' 'Dur_std' 'Dur_max' 'Dur_median' 'TotBytes_sum'
 'TotBytes_mean' 'TotBytes_std' 'TotBytes_max' 'TotBytes_median'
 'SrcBytes_sum' 'SrcBytes_mean' 'SrcBytes_std' 'SrcBytes_max'
 'SrcBytes_median' 'Sport_RU' 'DstAddr_RU' 'Dport_RU']
['flow=Background' 'flow=To-Backgro' 'flow=From-Backg' 'flow=From-Norma'
 'flow=To-Normal-' 'flow=Normal-V42' 'flow=From-Botne']
6


In [12]:
# Binary target: True where the window label equals the position of
# 'flow=From-Botne' in `labels`, False for every other class.
# NOTE(review): the recorded output of the print below shows each label stored
# as a one-element list, so this equality depends on how pandas/NumPy compare a
# list with a NumPy integer — confirm it still yields the expected number of
# True values on the installed library versions.
y_bin6 = y==np.where(labels == 'flow=From-Botne')[0][0]
# Class distribution of the original multi-class labels.
print("y", np.unique(y, return_counts=True))

y (array([list([0]), list([1]), list([2]), list([3]), list([4]), list([6])],
      dtype=object), array([2207092,   18047,     263,     984,      48,     286]))


# Train

In [13]:
# Number of independent split / fit / score repetitions.
nb_prediction = 3

# Seed NumPy's global generator once, then draw one integer seed per
# repetition so that every run is individually reproducible.
np.random.seed(123456)
tab_seed = np.random.randint(low=0, high=1000000000, size=nb_prediction)
print(tab_seed)

[545331265  64051946 930796018]


In [14]:
# One float slot per repetition for each training-set metric, filled in by the evaluation loop.
tab_train_precision = np.zeros(nb_prediction)
tab_train_recall = np.zeros(nb_prediction)
tab_train_fbeta_score = np.zeros(nb_prediction)

In [15]:
# One float slot per repetition for each test-set metric, filled in by the evaluation loop.
tab_test_precision = np.zeros(nb_prediction)
tab_test_recall = np.zeros(nb_prediction)
tab_test_fbeta_score = np.zeros(nb_prediction)

In [16]:
# Repeat the split / resample / fit / score cycle once per seed in tab_seed.
# Run i stores the metrics of the positive class (y_bin6 == True) at index i of
# the tab_train_* and tab_test_* arrays.
for i in range(nb_prediction):
    # Hold out 33% of the windows for testing; the per-run seed makes the split repeatable.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y_bin6, test_size=0.33, random_state=tab_seed[i])

    # Draw a sample ten times the size of the training split
    # (utils.resample samples with replacement unless told otherwise).
    X_train_new, y_train_new = utils.resample(
        X_train, y_train, n_samples=X_train.shape[0]*10, random_state=tab_seed[i])

    print(i)
    print("y_train", np.unique(y_train_new, return_counts=True))
    print("y_test", np.unique(y_test, return_counts=True))

    # class_weight gives class 0 a weight of 0.044 and class 1 a weight of 0.956.
    # NOTE(review): the recorded run printed "TOTAL NO. of ITERATIONS REACHED LIMIT"
    # for every fit, i.e. lbfgs stopped at max_iter=1000 without converging; the
    # message itself suggests scaling the features or raising max_iter. The model
    # settings are left untouched here so the recorded metrics stay comparable.
    clf = linear_model.LogisticRegression(penalty='l2', C=550, random_state=tab_seed[i], multi_class="auto", class_weight={0:0.044, 1:1-0.044}, solver="lbfgs", max_iter=1000, verbose=0)
    clf.fit(X_train_new, y_train_new)

    # labels=[False, True] pins the order of the per-class result arrays, so
    # index 1 is always the True class, even if a label vector or a prediction
    # vector happens to contain a single class.
    y_pred_train = clf.predict(X_train_new)
    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(
        y_train_new, y_pred_train, labels=[False, True])
    tab_train_precision[i] = precision[1]
    tab_train_recall[i] = recall[1]
    tab_train_fbeta_score[i] = fbeta_score[1]

    y_pred_test = clf.predict(X_test)
    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(
        y_test, y_pred_test, labels=[False, True])
    tab_test_precision[i] = precision[1]
    tab_test_recall[i] = recall[1]
    tab_test_fbeta_score[i] = fbeta_score[1]

0
y_train (array([False,  True]), array([14917125,     1895]))
y_test (array([False,  True]), array([734723,     95]))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


1
y_train (array([False,  True]), array([14917233,     1787]))
y_test (array([False,  True]), array([734716,    102]))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


2
y_train (array([False,  True]), array([14917177,     1843]))
y_test (array([False,  True]), array([734721,     97]))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [17]:
# Training-set summary: for each metric print its mean, its standard deviation
# and the individual per-run values.
print("Train")
for caption, scores in (
    ("precision = ", tab_train_precision),
    ("recall = ", tab_train_recall),
    ("fbeta_score = ", tab_train_fbeta_score),
):
    print(caption, scores.mean(), scores.std(), scores)

Train
precision =  0.7569736494912401 0.02442442080426817 [0.76046901 0.7849861  0.72546584]
recall =  0.9522975996348652 0.004389489522442692 [0.95831135 0.94795747 0.95062398]
fbeta_score =  0.8432446378737596 0.015032343165338203 [0.84800374 0.85880862 0.82292156]


In [18]:
# Test-set summary: for each metric print its mean, its standard deviation
# and the individual per-run values.
print("Test")
for caption, scores in (
    ("precision = ", tab_test_precision),
    ("recall = ", tab_test_recall),
    ("fbeta_score = ", tab_test_fbeta_score),
):
    print(caption, scores.mean(), scores.std(), scores)

Test
precision =  0.761033681765389 0.04311496353497057 [0.72357724 0.82142857 0.73809524]
recall =  0.9325219253916073 0.023389708157912154 [0.93684211 0.90196078 0.95876289]
fbeta_score =  0.836802521022943 0.017781338461466088 [0.81651376 0.85981308 0.83408072]
