In [2]:
"""
Use different embedded methods to extract relevant features :
- Lasso and Ridge Logistic Regression
- Support Vector Machine (SVM) method with Recursive Feature Elimination (RFE)

Parameters
----------
data_window.h5         : extracted data from preprocessing1.py
data_window3.h5        : extracted data from preprocessing2.py
data_window_labels.npy : label numpy array from preprocessing1.py

Return
----------
Print the results of the different methods (precision, recall, f1)
Plot the graphs of the different extractions
"""

'\nUse different embedded methods to extract relevant features :\n- Lasso and Ridge Logistic Regression\n- Support Vector Machine (SVM) method with Recursive Feature Elimination (RFE)\n\nParameters\n----------\ndata_window.h5         : extracted data from preprocessing1.py\ndata_window3.h5        : extracted data from preprocessing2.py\ndata_window_labels.npy : label numpy array from preprocessing1.py\n\nReturn\n----------\nPrint the results of the different methods (precision, recall, f1)\nPlot the graphs of the different extractions\n'

In [3]:
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import h5py

In [4]:
from sklearn import model_selection, feature_selection, linear_model, metrics

In [5]:
print("Import data")

Import data


In [6]:
X = pd.read_hdf('data_window_botnet3.h5', key='data')
X.reset_index(drop=True, inplace=True)

In [7]:
X2 = pd.read_hdf('data_window3_botnet3.h5', key='data')
X2.reset_index(drop=True, inplace=True)

In [8]:
X = X.join(X2)

In [9]:
X.drop('window_id', axis=1, inplace=True)

In [13]:
print(X)

           counts  Sport_nunique  DstAddr_nunique  Dport_nunique   Dur_sum  \
0       -0.029219      -0.063408        -0.053469      -0.004937  0.546187   
1       -0.029219      -0.084612        -0.053469      -0.013360  0.548073   
2       -0.029219      -0.063408        -0.053469      -0.004937 -0.160237   
3       -0.029219      -0.063408        -0.053469      -0.004937  0.447344   
4       -0.029219      -0.063408        -0.053469      -0.004937 -0.163826   
...           ...            ...              ...            ...       ...   
2024048 -0.029219      -0.063408        -0.053469      -0.004937 -0.163826   
2024049 -0.029219      -0.063408        -0.053469      -0.004937 -0.163826   
2024050 -0.029219      -0.063408        -0.053469      -0.004937 -0.163826   
2024051 -0.029219      -0.063408        -0.053469      -0.004937 -0.163826   
2024052 -0.029219      -0.063408        -0.053469      -0.004937 -0.163826   

         Dur_mean   Dur_std   Dur_max  Dur_median  TotBytes_sum

In [12]:
y = X['Label_<lambda>']
print(X['Label_<lambda>'])
X.drop('Label_<lambda>', axis=1, inplace=True)

KeyError: 'Label_<lambda>'

In [11]:
print(X)

0          [0]
1          [0]
2          [0]
3          [0]
4          [0]
          ... 
2024048    [0]
2024049    [0]
2024050    [0]
2024051    [0]
2024052    [0]
Name: Label_<lambda>, Length: 2024053, dtype: object


In [10]:
labels = np.load("data_window_botnet3_labels.npy",allow_pickle=True)

In [11]:
#print(X)
#print(y)
print(X.columns.values)
print(labels)

['counts' 'Sport_nunique' 'DstAddr_nunique' 'Dport_nunique' 'Dur_sum'
 'Dur_mean' 'Dur_std' 'Dur_max' 'Dur_median' 'TotBytes_sum'
 'TotBytes_mean' 'TotBytes_std' 'TotBytes_max' 'TotBytes_median'
 'SrcBytes_sum' 'SrcBytes_mean' 'SrcBytes_std' 'SrcBytes_max'
 'SrcBytes_median' 'Sport_RU' 'DstAddr_RU' 'Dport_RU']
['flow=Background' 'flow=To-Backgro' 'flow=From-Backg' 'flow=From-Norma'
 'flow=To-Normal-' 'flow=Normal-V42' 'flow=From-Botne']


In [15]:
y_bin6 = y==6
print(y==6)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y_bin6, test_size=0.33, random_state=123456)
#y_train_bin6 = y_train==6
#y_test_bin6 = y_test==6

0          True
1          True
2          True
3          True
4          True
           ... 
2024048    True
2024049    True
2024050    True
2024051    True
2024052    True
Name: Label_<lambda>, Length: 2024053, dtype: bool


In [25]:
print("y", np.unique(y, return_counts=True))
print("y_train", np.unique(y_train, return_counts=True))
print("y_test", np.unique(y_test, return_counts=True))

y (array([list([0]), list([1]), list([2]), list([3]), list([4]), list([6])],
      dtype=object), array([2207092,   18047,     263,     984,      48,     286]))
y_train (array([False]), array([2204452]))
y_test (array([False]), array([22268]))


In [26]:
## Embedded Method
print("Logistic Regression")

Logistic Regression


In [27]:
clf = linear_model.LogisticRegression(penalty='l2', C=1.0, random_state=123456, multi_class="auto", class_weight=None, solver="lbfgs", max_iter=1000, verbose=1)
clf.fit(X_train, y_train)
#print(clf.classes_)
print(clf.coef_)
print(clf.intercept_)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: False

In [None]:
y_pred = clf.predict(X_test)
#y_pred_bin6 = y_pred==6
#print(clf.predict_proba(X_test))
print("accuracy score = ", metrics.balanced_accuracy_score(y_test, y_pred))
precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_test, y_pred)
print("precision = ", precision[1])
print("recall = ", recall[1])
print("fbeta_score = ", fbeta_score[1])
print("support = ", support[1])

In [None]:
clf = linear_model.LogisticRegression(penalty='l2', C=1.0, random_state=123456, multi_class="auto", class_weight='balanced', solver="lbfgs", max_iter=1000, verbose=1)
clf.fit(X_train, y_train)
#print(clf.classes_)
print(clf.coef_)
print(clf.intercept_)

In [None]:
y_pred = clf.predict(X_test)
#y_pred_bin6 = y_pred==6
#print(clf.predict_proba(X_test))
print("accuracy score = ", metrics.balanced_accuracy_score(y_test, y_pred))
precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_test, y_pred)
print("precision = ", precision[1])
print("recall = ", recall[1])
print("fbeta_score = ", fbeta_score[1])
print("support = ", support[1])

In [None]:
clf = linear_model.LogisticRegression(penalty='l2', C=1.0, random_state=123456, multi_class="auto", class_weight={0:0.5, 1:0.5}, solver="lbfgs", max_iter=1000, verbose=1)
clf.fit(X_train, y_train)
#print(clf.classes_)
print(clf.coef_)
print(clf.intercept_)

In [None]:
y_pred = clf.predict(X_test)
#y_pred_bin6 = y_pred==6
#print(clf.predict_proba(X_test))
print("accuracy score = ", metrics.balanced_accuracy_score(y_test, y_pred))
precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_test, y_pred)
print("precision = ", precision[1])
print("recall = ", recall[1])
print("fbeta_score = ", fbeta_score[1])
print("support = ", support[1])

main problems:
with class_weight='balanced', super high recall but very low precision
without, high precision but very low recall
accuracy score not a good metric (even balanced_accuracy)

In [None]:
print("Logistic Regression Cross Validation")

In [None]:
def apply_logreg_cross_validation(X, y, svc_args={'penalty':'l2', 'C':1.0, 'random_state':123456, 'multi_class':"auto", 'class_weight':None, 'solver':"lbfgs", 'max_iter':1000, 'verbose':1}):
    clf = linear_model.LogisticRegression(**svc_args)
    cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.1, random_state=123456)
    scores = model_selection.cross_validate(clf, X, y, cv=cv, scoring=['precision', 'recall', 'f1'], return_train_score=True)
    print(scores)
    return [np.mean(scores['test_precision']), np.mean(scores['test_recall']), np.mean(scores['test_f1'])]

In [None]:
tab_class_weight = np.linspace(0, 0.1, 10)
print(tab_class_weight)

In [None]:
tab_score = np.array([apply_logreg_cross_validation(X_train, y_train, {'penalty':'l2', 'C':1.0, 'random_state':123456, 'multi_class':"auto", 'class_weight':{0:w, 1:1-w}, 'solver':"lbfgs", 'max_iter':1000, 'verbose':0}) for w in tab_class_weight])
print(tab_score)

In [None]:
plt.plot(tab_class_weight, tab_score[:, 0])
plt.plot(tab_class_weight, tab_score[:, 1])
plt.plot(tab_class_weight, tab_score[:, 2])
plt.legend(["test_precision", "test_recall", "test_f1"])
plt.xlabel("Botnet class weight")

In [None]:
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("cross_validation_class_weight.pdf", format="pdf")
plt.show()

Results: class_weight_best = 0.044

In [None]:
def apply_logreg_cross_validation_coeff(X, y, svc_args={'penalty':'l2', 'C':1.0, 'random_state':123456, 'multi_class':"auto", 'class_weight':None, 'solver':"lbfgs", 'max_iter':1000, 'verbose':1}):
    clf = linear_model.LogisticRegression(**svc_args)
    #cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.1, random_state=123456) #for l2
    cv = model_selection.ShuffleSplit(n_splits=3, test_size=0.1, random_state=123456) #for l1
    scores = model_selection.cross_validate(clf, X, y, cv=cv, scoring=['precision', 'recall', 'f1'], return_train_score=True, return_estimator=True)
    print(scores)
    return [np.mean(scores['test_precision']), np.mean(scores['test_recall']), np.mean(scores['test_f1']), np.mean([model.coef_[0] for model in scores['estimator']], axis=0)]

In [None]:
tab_C = np.logspace(-2, 6, 9)
tab_logC = np.log10(tab_C)
print(tab_C)
print(tab_logC)

In [None]:
tab_score = np.array([apply_logreg_cross_validation_coeff(X_train, y_train, {'penalty':'l2', 'C':C, 'random_state':123456, 'multi_class':"auto", 'class_weight':{0:0.044, 1:1-0.044}, 'solver':"lbfgs", 'max_iter':1000, 'verbose':0}) for C in tab_C])
print(tab_score)

In [None]:
plt.plot(tab_logC, tab_score[:, 0])
plt.plot(tab_logC, tab_score[:, 1])
plt.plot(tab_logC, tab_score[:, 2])
plt.legend(["test_precision", "test_recall", "test_f1"])
plt.xlabel("log(C)")

In [None]:
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("cross_validation_C.pdf", format="pdf")
plt.show()

In [None]:
matrix_coeff = np.stack(tab_score[:, 3], axis=0)
print(matrix_coeff)
print(matrix_coeff.shape)

In [None]:
ax = plt.subplot(111)
NUM_COLORS = matrix_coeff.shape[1]
LINE_STYLES = ['solid', 'dashed', 'dashdot', 'dotted']
NUM_STYLES = len(LINE_STYLES)
cm = plt.get_cmap('Set1')

In [None]:
for i in range(0, matrix_coeff.shape[1]):
    lines = ax.plot(tab_logC, matrix_coeff[:, i])
    lines[0].set_color(cm(i//NUM_STYLES))
    lines[0].set_linestyle(LINE_STYLES[i%NUM_STYLES])

In [None]:
plt.xlabel("log(C)")
plt.xticks(label=np.log(tab_C))

In [None]:
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.95, box.height])
ax.legend(np.arange(0, matrix_coeff.shape[1]), loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
plt.savefig("cross_validation_C_coeff.pdf", format="pdf")
plt.show()

In [None]:
print(matrix_coeff)

In [None]:
tab_C = np.linspace(550, 1000, 10)
print(tab_C)

In [None]:
tab_score = np.array([apply_logreg_cross_validation_coeff(X_train, y_train, {'penalty':'l2', 'C':C, 'random_state':123456, 'multi_class':"auto", 'class_weight':{0:0.044, 1:1-0.044}, 'solver':"lbfgs", 'max_iter':1000, 'verbose':0}) for C in tab_C])
print(tab_score)

In [None]:
plt.plot(tab_C, tab_score[:, 0])
plt.plot(tab_C, tab_score[:, 1])
plt.plot(tab_C, tab_score[:, 2])
plt.legend(["test_precision", "test_recall", "test_f1"])
plt.xlabel("C")

In [None]:
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("cross_validation_C.pdf", format="pdf")
plt.show()

In [None]:
matrix_coeff = np.stack(tab_score[:, 3], axis=0)
print(matrix_coeff)
print(matrix_coeff.shape)

In [None]:
ax = plt.subplot(111)
NUM_COLORS = matrix_coeff.shape[1]
LINE_STYLES = ['solid', 'dashed', 'dashdot', 'dotted']
NUM_STYLES = len(LINE_STYLES)
cm = plt.get_cmap('Set1')

In [None]:
for i in range(0, matrix_coeff.shape[1]):
    lines = ax.plot(tab_C, matrix_coeff[:, i])
    lines[0].set_color(cm(i//NUM_STYLES))
    lines[0].set_linestyle(LINE_STYLES[i%NUM_STYLES])

In [None]:
plt.xlabel("C")

In [None]:
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.95, box.height])
ax.legend(np.arange(0, matrix_coeff.shape[1]), loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
plt.savefig("cross_validation_C_coeff.pdf", format="pdf")
plt.show()

In [None]:
tab_C = np.linspace(50, 1000, 20)
print(tab_C)

In [None]:
print(tab_score)

In [None]:
plt.plot(tab_C, tab_score[:, 0])
plt.plot(tab_C, tab_score[:, 1])
plt.plot(tab_C, tab_score[:, 2])
plt.legend(["test_precision", "test_recall", "test_f1"], loc='upper right', bbox_to_anchor=(1, 0.9))
plt.xlabel("C")

In [None]:
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("cross_validation_C.pdf", format="pdf")
plt.show()

In [None]:
matrix_coeff = np.stack(tab_score[:, 3], axis=0)
print(matrix_coeff)
print(matrix_coeff.shape)

In [None]:
ax = plt.subplot(111)
NUM_COLORS = matrix_coeff.shape[1]
LINE_STYLES = ['solid', 'dashed', 'dashdot', 'dotted']
NUM_STYLES = len(LINE_STYLES)
cm = plt.get_cmap('Set1')

In [None]:
for i in range(0, matrix_coeff.shape[1]):
    lines = ax.plot(tab_C, matrix_coeff[:, i])
    lines[0].set_color(cm(i//NUM_STYLES))
    lines[0].set_linestyle(LINE_STYLES[i%NUM_STYLES])

In [None]:
plt.xlabel("C")

In [None]:
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.95, box.height])
ax.legend(np.arange(0, matrix_coeff.shape[1]), loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
plt.savefig("cross_validation_C_coeff.pdf", format="pdf")
plt.show()

In [None]:
#tab_C = np.logspace(-2, 6, 9)
tab_C = [1e6]
tab_logC = np.log10(tab_C)
print(tab_C)
print(tab_logC)

In [None]:
tab_score = np.array([apply_logreg_cross_validation_coeff(X_train, y_train, {'penalty':'l1', 'C':C, 'random_state':123456, 'multi_class':"auto", 'class_weight':{0:0.044, 1:1-0.044}, 'solver':"liblinear", 'max_iter':1000, 'verbose':1}) for C in tab_C])
print(tab_score)

In [None]:
plt.plot(tab_logC, tab_score[:, 0])
plt.plot(tab_logC, tab_score[:, 1])
plt.plot(tab_logC, tab_score[:, 2])
plt.legend(["test_precision", "test_recall", "test_f1"])
plt.xlabel("log(C)")

In [None]:
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("cross_validation_C.pdf", format="pdf")
plt.show()

In [None]:
matrix_coeff = np.stack(tab_score[:, 3], axis=0)
print(matrix_coeff)
print(matrix_coeff.shape)

In [None]:
ax = plt.subplot(111)
NUM_COLORS = matrix_coeff.shape[1]
LINE_STYLES = ['solid', 'dashed', 'dashdot', 'dotted']
NUM_STYLES = len(LINE_STYLES)
cm = plt.get_cmap('Set1')

In [None]:
for i in range(0, matrix_coeff.shape[1]):
    lines = ax.plot(tab_logC, matrix_coeff[:, i])
    lines[0].set_color(cm(i//NUM_STYLES))
    lines[0].set_linestyle(LINE_STYLES[i%NUM_STYLES])

In [None]:
plt.xlabel("log(C)")
plt.xticks(label=np.log(tab_C))

In [None]:
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.95, box.height])
ax.legend(np.arange(0, matrix_coeff.shape[1]), loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
plt.savefig("cross_validation_C_coeff.pdf", format="pdf")
plt.show()

In [None]:
print("SVM method with RFE")

In [None]:
def extract_feature(clf, X, y):
    pass

In [None]:
def rfe_svm(X, y):
    clf = linear_model.SGDClassifier(loss='hinge', penalty='elasticnet', max_iter=1000, alpha=1e-9, tol=1e-3, random_state=123456, class_weight={0:0.044, 1:1-0.044})
    cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.1, random_state=123456)

    nb_features = X.shape[1]
    print(nb_features)
    
    scores = model_selection.cross_validate(clf, X, y, cv=cv, scoring=['precision', 'recall', 'f1'], return_train_score=True)
    print(scores)
    
    if nb_features > 1:
        rfe = feature_selection.RFE(clf, n_features_to_select=nb_features-1, step=1)
        rfe.fit(X, y)
        output = rfe_svm(rfe.transform(X), y)
        
        output.append([nb_features, np.mean(scores['test_precision']), np.mean(scores['test_recall']), np.mean(scores['test_f1']), rfe.support_, rfe.ranking_])
        return output
    else:
        return [[nb_features, np.mean(scores['test_precision']), np.mean(scores['test_recall']), np.mean(scores['test_f1']), [True], [1]]]

In [None]:
results = np.array(rfe_svm(X_train, y_train))
print(results)

In [None]:
plt.plot(results[:, 0], results[:, 1])
plt.plot(results[:, 0], results[:, 2])
plt.plot(results[:, 0], results[:, 3])
plt.legend(["test_precision", "test_recall", "test_f1"])
plt.xlabel("Number of features")

In [None]:
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("cross_validation_rfe.pdf", format="pdf")
plt.show()

In [None]:
def apply_svm_cross_validation(X, y, svc_args={'loss':'hinge', 'penalty':'elasticnet', 'max_iter':1000, 'alpha':0.001, 'tol':1e-3, 'random_state':123456, 'class_weight':None}):
    clf = linear_model.SGDClassifier(**svc_args)
    cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.1, random_state=123456)
    scores = model_selection.cross_validate(clf, X, y, cv=cv, scoring=['precision', 'recall', 'f1'], return_train_score=True)
    print(scores)
    return [np.mean(scores['test_precision']), np.mean(scores['test_recall']), np.mean(scores['test_f1'])]

In [None]:
tab_class_weight = np.linspace(0, 0.1, 10)
print(tab_class_weight)

In [None]:
tab_score = np.array([apply_svm_cross_validation(X_train, y_train, {'loss':'hinge', 'penalty':'elasticnet', 'max_iter':1000, 'alpha':0.001, 'tol':1e-3, 'random_state':123456, 'class_weight':{0:w, 1:1-w}}) for w in tab_class_weight])
print(tab_score)

In [None]:
plt.plot(tab_class_weight, tab_score[:, 0])
plt.plot(tab_class_weight, tab_score[:, 1])
plt.plot(tab_class_weight, tab_score[:, 2])
plt.legend(["test_precision", "test_recall", "test_f1"])
plt.xlabel("Botnet class weight")

In [None]:
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("cross_validation_class_weight_svm.pdf", format="pdf")
plt.show()

In [None]:
tab_C = np.logspace(-16, -8, 9)
tab_logC = np.log10(tab_C)
print(tab_C)
print(tab_logC)

In [None]:
tab_score = np.array([apply_svm_cross_validation(X_train, y_train, {'loss':'hinge', 'penalty':'elasticnet', 'max_iter':1000, 'alpha':C, 'tol':1e-3, 'random_state':123456, 'class_weight':{0:0.044, 1:1-0.044}}) for C in tab_C])
print(tab_score)

In [None]:
plt.plot(tab_logC, tab_score[:, 0])
plt.plot(tab_logC, tab_score[:, 1])
plt.plot(tab_logC, tab_score[:, 2])
plt.legend(["test_precision", "test_recall", "test_f1"])
plt.xlabel("log(alpha)")

In [None]:
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("cross_validation_alpha.pdf", format="pdf")
plt.show()