# CIC-IDS2017 Logistic Regression (binary labels, ADASYN-resampled training set)

In [1]:
# Run identifier "<estimator>-<label scheme + resampler>"; later cells use it as the
# prefix of the saved model's filename and of the log entry.
model_id = "-".join(["logisticregression", "binaryadasyn"])

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import glob

In [3]:
# Root directory that holds the IDS2017/ data folder, the Models/ output folder and
# model_log.txt. Set the SCIFAIR_NOTEBOOK_PATH environment variable to run the notebook
# on another machine without editing it; the original location stays the default.
import os

NOTEBOOK_PATH = os.environ.get("SCIFAIR_NOTEBOOK_PATH", "C:/Users/Xetrov/Desktop/SciFair20/Code/")

# Later cells build file paths by plain string concatenation, so the root must end
# with a path separator.
if not NOTEBOOK_PATH.endswith(("/", "\\")):
    NOTEBOOK_PATH += "/"

In [4]:
# Full feature matrix; the filename indicates it was power-transformed upstream.
x_scaled = pd.read_csv(f"{NOTEBOOK_PATH}IDS2017/x_scaled_powertransform.csv")

# Matching labels; later cells read the 'IsAttack' column (0 = benign, 1 = attack).
y_df_enc = pd.read_csv(f"{NOTEBOOK_PATH}IDS2017/y_all_binary.csv")

# Split data into train, validation, and test

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# First cut: 60% train, 40% held out (split again into validation and test below).
# The fixed seed keeps the partition identical between runs.
x_train, x_valtest, y_train, y_valtest = train_test_split(
    x_scaled,
    y_df_enc,
    test_size=0.4,
    random_state=42,
)

In [7]:
# Second cut: halve the held-out 40% into equal validation and test sets
# (20% of the full data each), again with a fixed seed.
x_val, x_test, y_val, y_test = train_test_split(
    x_valtest,
    y_valtest,
    test_size=0.5,
    random_state=42,
)

In [8]:
# Free memory: the raw training split is never used, because training data is loaded
# from the pre-computed ADASYN files further down, and the intermediate held-out
# frames have already been divided into validation and test.
# NOTE(review): presumably the ADASYN files were generated from this same training
# split (test_size=0.4, random_state=42) — confirm, otherwise validation/test rows
# could leak into training.
del x_train, y_train, x_valtest, y_valtest

In [9]:
# Show the benign/attack class balance of each held-out split.
for split_name, split_labels in (("Validation", y_val), ("Test", y_test)):
    print(f"\n{split_name}:")
    print(split_labels['IsAttack'].value_counts())


Validation:
0    454207
1    111942
Name: IsAttack, dtype: int64

Test:
0    454790
1    111359
Name: IsAttack, dtype: int64


# Load ADASYN-resampled training data

In [10]:
# Pre-computed, ADASYN-oversampled training features (generated outside this notebook).
x_train_res = pd.read_csv(f"{NOTEBOOK_PATH}IDS2017/x_adasyn_binary.csv")

In [11]:
# Matching oversampled labels, reduced to the 'IsAttack' column so the estimator
# receives a 1-D Series target.
y_adasyn_frame = pd.read_csv(f"{NOTEBOOK_PATH}IDS2017/y_adasyn_binary.csv")
y_train_res = y_adasyn_frame['IsAttack']

In [12]:
# Sanity check: features and labels must have the same number of rows, and the
# target should be a Series rather than a one-column DataFrame.
train_parts = (x_train_res, y_train_res)
print(*(part.shape for part in train_parts), sep="\n")
print(*(type(part) for part in train_parts), sep="\n")

(2727615, 68)
(2727615,)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


# Train model

In [13]:
from sklearn.linear_model import LogisticRegression

In [17]:
%%time
clf = LogisticRegression(solver="saga", max_iter=1000, verbose=2, n_jobs=-1).fit(x_train_res, y_train_res)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


max_iter reached after 2516 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 41.9min finished


Wall time: 41min 56s


**Save model**

In [18]:
import pickle, time

In [19]:
# Timestamped name "<model_id> [YYYYmmdd HHMM]": runs started in different minutes
# are written to different files.
model_filename = model_id + " [" + time.strftime("%Y%m%d %H%M") + "]"

# Persist the fitted classifier. The context manager closes the file even if pickling
# raises. pickle.dump returns None, so the former `saved_model = ...` binding never
# held the model and has been dropped.
with open(NOTEBOOK_PATH + "Models/" + model_filename + ".pkl", "wb") as save_file:
    pickle.dump(clf, save_file)

**Evaluate on validation set**

In [20]:
# Hard 0/1 class predictions for the validation split. `pred` is reassigned to the
# test-set predictions in the "Test Set" section, so run the cells in order.
pred = clf.predict(x_val)

In [21]:
# Confusion matrix for the validation split: rows are predictions, columns are the
# true labels, with row/column totals in the "All" margins.
class_names = {0: 'Benign', 1: 'Attack'}

# Both Series are built from plain arrays so they share a fresh 0..n-1 index;
# crosstab then pairs them by position rather than by y_val's shuffled index.
pred_series = pd.Series(pred, name="Pred").replace(class_names)
y_series = pd.Series(y_val.to_numpy().ravel(), name="Actual").replace(class_names)

matrix = pd.crosstab(index=pred_series, columns=y_series, margins=True)
matrix

Actual,Attack,Benign,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attack,108594,34611,143205
Benign,3348,419596,422944
All,111942,454207,566149


# Test Set

In [22]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [23]:
# Score the classifier on the untouched test split (positive class = 1 = attack).
pred = clf.predict(x_test)
y_test_npy = y_test.to_numpy().ravel()  # flatten the one-column frame to a 1-D array

precision = precision_score(y_test_npy, pred)
recall = recall_score(y_test_npy, pred)
f1 = f1_score(y_test_npy, pred)

# These three values are also appended to the model log at the end of the notebook.
for metric_name, metric_value in (("Precision", precision), ("Recall", recall), ("F1", f1)):
    print(f"{metric_name}:", metric_value)

Precision: 0.7565174205913419
Recall: 0.971219209942618
F1: 0.8505280707135049


In [24]:
# Confusion matrix for the test split: rows are predictions, columns are the true
# labels, with row/column totals in the "All" margins.
class_names = {0: 'Benign', 1: 'Attack'}

# Both Series are built from plain arrays so they share a fresh 0..n-1 index;
# crosstab then pairs them by position rather than by y_test's shuffled index.
pred_series = pd.Series(pred, name="Pred").replace(class_names)
y_series = pd.Series(y_test.to_numpy().ravel(), name="Actual").replace(class_names)

matrix = pd.crosstab(index=pred_series, columns=y_series, margins=True)
matrix

Actual,Attack,Benign,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attack,108154,34809,142963
Benign,3205,419981,423186
All,111359,454790,566149


In [25]:
# Column-normalised confusion matrix: each "Actual" column is divided by its own
# total, so the Attack/Attack cell is recall and the Attack/Benign cell is the
# false-positive rate.
confusion_counts = pd.crosstab(pred_series, y_series)
confusion_counts.div(confusion_counts.sum(axis=0), axis="columns")

Actual,Attack,Benign
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1
Attack,0.971219,0.076539
Benign,0.028781,0.923461


# Log results

In [52]:
# Append this run's test-set metrics to the shared text log, keyed by the saved
# model's filename. The context manager flushes and closes the file even if one of
# the writes raises, which the previous open()/close() pair did not.
with open(NOTEBOOK_PATH + "model_log.txt", "a") as model_log:
    model_log.write("\n" + model_filename)
    model_log.write("\n\tPrecision: " + str(precision))
    model_log.write("\n\tRecall: " + str(recall))
    model_log.write("\n\tF1: " + str(f1))