# CIC-IDS2017 Logistic Regression

In [1]:
# Identifier for this experiment; it prefixes the saved model file,
# the confusion-matrix CSV and the entry appended to the model log.
model_id = 'logisticregression-multiclass'

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import glob

In [3]:
import os

# Root directory that holds the data ("IDS2017/"), "Models/", "Confusion Matrices/"
# and model_log.txt. It can be overridden with the NOTEBOOK_PATH environment variable
# so the notebook is not tied to one machine; the default keeps the original location.
NOTEBOOK_PATH = os.environ.get("NOTEBOOK_PATH", "C:/Users/Xetrov/Desktop/SciFair20/Code/")

# Later cells build file paths by plain string concatenation, so the root
# must always end with a path separator.
if not NOTEBOOK_PATH.endswith(("/", "\\")):
    NOTEBOOK_PATH += "/"

In [4]:
# Feature table (file name indicates it was scaled with a power transform).
features_csv = NOTEBOOK_PATH + "IDS2017/x_scaled_powertransform.csv"
x_scaled = pd.read_csv(features_csv)

# Label table; later cells take an argmax over its columns, so it is
# presumably one-hot encoded with one column per attack group.
labels_csv = NOTEBOOK_PATH + "IDS2017/y_grouped_1henc.csv"
y_df_enc = pd.read_csv(labels_csv)

# Split data into train, validation, and test sets

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 40% of the rows for validation + test; the fixed seed keeps the split repeatable.
x_train, x_valtest, y_train, y_valtest = train_test_split(
    x_scaled,
    y_df_enc,
    test_size=0.4,
    random_state=42,
)

In [7]:
# Cut the held-out 40% in half: one half for validation, the other for the final test.
x_val, x_test, y_val, y_test = train_test_split(
    x_valtest,
    y_valtest,
    test_size=0.5,
    random_state=42,
)

In [8]:
# Drop the names that are no longer needed: the model is trained on the
# ADASYN files loaded below, and the combined val+test split has already been divided.
del x_train, y_train, x_valtest, y_valtest

# Load ADASYN-resampled training data

In [9]:
# Training features from the ADASYN file, converted straight to a NumPy array.
adasyn_x_csv = NOTEBOOK_PATH + "IDS2017/x_adasyn.csv"
x_train_res = pd.read_csv(adasyn_x_csv).to_numpy()

In [10]:
# Training labels from the ADASYN file, kept as a DataFrame.
adasyn_y_csv = NOTEBOOK_PATH + "IDS2017/y_adasyn.csv"
y_train_res = pd.read_csv(adasyn_y_csv)

In [11]:
# Sanity check of the loaded training data: shapes first, then container types.
for resampled in (x_train_res, y_train_res):
    print(resampled.shape)
for resampled in (x_train_res, y_train_res):
    print(type(resampled))

(9549194, 68)
(9549194, 1)
<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>


# Train model

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
%%time
clf = LogisticRegression(solver="saga", multi_class='multinomial', max_iter=100, verbose=2).fit(x_train_res, y_train_res)

  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 2433 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 40.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 40.6min finished


Wall time: 40min 42s


**Save model**

In [14]:
import pickle, time

In [15]:
# File stem shared by the pickled model, the confusion-matrix CSV and the log entry:
# "<model_id> [YYYYMMDD HHMM]".
model_filename = model_id + " [" + time.strftime("%Y%m%d %H%M") + "]"

# The context manager closes the file even if pickling fails. pickle.dump returns
# None, so its result is not kept.
with open(NOTEBOOK_PATH + "Models/" + model_filename + ".pkl", "wb") as save_file:
    pickle.dump(clf, save_file)

**Evaluate model on the validation set**

In [18]:
# Predict class ids for the validation set. The model was fitted on a plain NumPy
# array, so the DataFrame is converted the same way to keep the input types consistent.
pred = clf.predict(x_val.to_numpy())

In [23]:
# Integer class ids: predictions come straight from the classifier, actual labels
# are the position of the largest entry in each row of y_val.
pred_index = pred
y_index = list(np.argmax(y_val.to_numpy(), axis=1))

# argmax positions refer to y_val's own column order, so names are looked up in
# that order. Re-sorting the names would mislabel classes if the columns are not
# already in alphabetical order.
# NOTE(review): assumes the classifier's class ids use the same numbering as the
# y_val columns (the F1 computation on the test set makes the same assumption) - confirm.
atktypes = list(y_val.columns)
pred_series = pd.Series(pred_index, name="Pred").replace(dict(enumerate(atktypes)))
y_series = pd.Series(y_index, name="Actual").replace(dict(enumerate(atktypes)))

# Rows are predicted classes, columns are actual classes; margins adds the "All" totals.
matrix = pd.crosstab(pred_series, y_series, margins=True)
matrix

Actual,BENIGN,Botnet,Brute Force,DoS/DDoS,Infiltration,PortScan,Web Attack,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BENIGN,400160,51,0,65,0,80,0,400356
Botnet,14963,339,0,0,0,11,0,15313
Brute Force,2586,0,2810,262,0,88,0,5746
DoS/DDoS,13482,0,0,74953,0,49,3,88487
Infiltration,7581,7,0,58,7,0,0,7653
PortScan,3760,0,0,647,0,31546,0,35953
Web Attack,11675,0,0,525,0,13,428,12641
All,454207,397,2810,76510,7,31787,431,566149


# Test Set

In [16]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [24]:
# Predict on the held-out test set. The model was fitted on a plain NumPy array,
# so the DataFrame is converted the same way to keep the input types consistent.
pred = clf.predict(x_test.to_numpy())
pred_index = pred
# Actual class id = position of the largest entry in each row of y_test.
y_index = list(np.argmax(y_test.to_numpy(), axis=1))

# Micro F1 pools all samples; macro F1 is the unweighted mean of the per-class F1
# scores, so rare classes count as much as frequent ones.
f1_micro = f1_score(y_index, pred_index, average='micro')
f1_macro = f1_score(y_index, pred_index, average='macro')

print(f1_micro)
print(f1_macro)

0.9014323084559012
0.505514839403138


In [25]:
# Class names in the column order of y_test, which is the order the argmax-based
# y_index refers to. Re-sorting the names (or taking them from another frame)
# would mislabel classes if the columns are not already in alphabetical order.
# NOTE(review): assumes the classifier's class ids use the same numbering as the
# y_test columns (the F1 computation makes the same assumption) - confirm.
atktypes = list(y_test.columns)
pred_series = pd.Series(pred_index, name="Pred").replace(dict(enumerate(atktypes)))
y_series = pd.Series(y_index, name="Actual").replace(dict(enumerate(atktypes)))

# Rows are predicted classes, columns are actual classes; margins adds the "All" totals.
matrix = pd.crosstab(pred_series, y_series, margins=True)
# Keep a copy of the test-set confusion matrix next to the saved model's name.
matrix.to_csv(NOTEBOOK_PATH + "Confusion Matrices/test_" + model_filename + ".csv")
matrix

Actual,BENIGN,Botnet,Brute Force,DoS/DDoS,Infiltration,PortScan,Web Attack,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BENIGN,400841,53,0,48,0,90,0,401032
Botnet,15176,322,0,0,0,10,0,15508
Brute Force,2613,0,2750,250,0,90,0,5703
DoS/DDoS,13522,0,0,74362,0,44,4,87932
Infiltration,7528,9,0,79,5,0,0,7621
PortScan,3650,0,0,608,0,31624,0,35882
Web Attack,11460,0,2,550,0,18,441,12471
All,454790,384,2752,75897,5,31876,445,566149


# Log results

In [52]:
# Summary metrics for the test-set predictions (y_index / pred_index come from the
# test-set cell). These names were never defined anywhere in the notebook, so this
# cell failed on a fresh kernel; they are computed here explicitly.
# Macro averaging is used so rare attack classes weigh as much as BENIGN; with micro
# averaging on single-label multiclass data, precision, recall and F1 would all equal accuracy.
# NOTE(review): confirm macro averaging matches the earlier entries in model_log.txt.
precision = precision_score(y_index, pred_index, average='macro')
recall = recall_score(y_index, pred_index, average='macro')
f1 = f1_score(y_index, pred_index, average='macro')

# Append one entry per run; the context manager closes the file even if a write fails.
with open(NOTEBOOK_PATH + "model_log.txt", "a") as model_log:
    model_log.write("\n" + model_filename)
    model_log.write("\n\tPrecision: " + str(precision))
    model_log.write("\n\tRecall: " + str(recall))
    model_log.write("\n\tF1: " + str(f1))