# IDS with Incremental Learning
In this notebook, we will use `creme`, a Python library for online learning, to train a logistic regression model one sample at a time. We will compare this approach with our previous models.

In [1]:
import string
import os
import pickle


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pprint import pprint

from creme import stream
from creme import model_selection
from creme import compose
from creme import feature_extraction
from creme import linear_model
from creme import metrics
from creme import preprocessing
from creme import stats


from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score, precision_score, recall_score

Reading in our data:

In [2]:
# Table of attack types and their categories, read from a space-separated file without a header row.
train_attack_types = pd.read_csv(
    "datasets/training_attack_types.txt",
    sep=" ",
    header=None,
    names=["attack_type", "attack_category"],
)

`cols` is a list of all of our column names (including 20+ columns that are the result of expertise-driven feature engineering); `reduced_cols` is the first nine column names plus the label. `reduced_cols` corresponds to basic packet metadata.

In [3]:
# Skip the first line of the names file and keep, for each remaining entry,
# the text before the first colon as the column name.
names_raw = pd.read_csv("datasets/kddcup.names.txt", skiprows=1, header=None)[0]
cols = [str(entry).split(":")[0] for entry in names_raw]
cols.append("label")

# First nine column names plus the label.
reduced_cols = cols[:9] + ["label"]

reduced_cols

['duration',
 'protocol_type',
 'service',
 'flag',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'label']

# Train/test split
In online learning, we do not partition our data into a training set and a test set. Since we learn on a per-observation basis, EVERY observation is a test observation, and after we've made our prediction we can then use it as a training observation. It's quite elegant, actually! KDD itself _is_ partitioned into a training and test set, and most importantly, the test set contains attacks that are not present in the train set. Thus, performance on the test set is useful to measure how generalizable our classifier is.

In [4]:
# Parse only the columns listed in `reduced_cols`; every other column is
# dropped in the next cell anyway, so skipping them at read time avoids
# materialising them for every row of this large file.
train = pd.read_csv(
    "datasets/kddcup.data.corrected.txt",
    header=None,
    names=cols,
    usecols=reduced_cols,
)

Discard the columns that are engineered features:

In [5]:
# Keep only the `reduced_cols` columns; copy so the later column assignments
# act on an independent frame.
train = train.loc[:, reduced_cols].copy()

In [6]:
# Parse only the columns listed in `reduced_cols`; every other column is
# dropped in the next cell anyway, so there is no need to load them.
test = pd.read_csv(
    "datasets/corrected.txt",
    header=None,
    names=cols,
    usecols=reduced_cols,
)

Discard the columns that are engineered features:

In [7]:
# Keep only the `reduced_cols` columns; copy so the later column assignments
# act on an independent frame.
test = test.loc[:, reduced_cols].copy()

In [8]:
# Preview the first rows of the reduced training frame.
train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,label
0,0,tcp,http,SF,215,45076,0,0,0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,normal.
4,0,tcp,http,SF,239,486,0,0,0,normal.


In [9]:
# Preview the first rows of the reduced test frame.
test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,label
0,0,udp,private,SF,105,146,0,0,0,normal.
1,0,udp,private,SF,105,146,0,0,0,normal.
2,0,udp,private,SF,105,146,0,0,0,normal.
3,0,udp,private,SF,105,146,0,0,0,snmpgetattack.
4,0,udp,private,SF,105,146,0,0,0,snmpgetattack.


# Preprocessing

## Process the label column
We want to generate a binary label. First, strip the trailing period from each raw label (e.g. `normal.` becomes `normal`):

In [10]:
# Keep only the text before the first "." of every raw label, in both frames.
for frame in (train, test):
    frame["label"] = frame["label"].map(lambda raw: raw.split(".")[0])

### Binary label (normal/malicious)

In [11]:
def _to_binary(label_name):
    """Return 0 for the "normal" label and 1 for every other label."""
    if label_name == "normal":
        return 0
    return 1


train["label_binary"] = train["label"].map(_to_binary)
test["label_binary"] = test["label"].map(_to_binary)

In [12]:
# Class balance of the binary label in the test frame (0 = "normal", 1 = any other label).
test["label_binary"].value_counts()

1    250436
0     60593
Name: label_binary, dtype: int64

In [13]:
# Class balance of the binary label in the training frame (0 = "normal", 1 = any other label).
train["label_binary"].value_counts()

1    3925650
0     972781
Name: label_binary, dtype: int64

In [15]:
# Shuffle the rows so the stream order is not the file order. Fixing the seed
# makes the shuffle, and therefore the train/holdout split taken from it,
# identical on every run.
RANDOM_SEED = 42
train_shuffled = train.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Incremental Logistic Regression
## Data stream
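Here we will build our data streams: the first `N` shuffled rows form the online-training stream, and the remaining rows form a holdout stream.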

In [16]:
# (rows, columns) of the training frame.
train.shape

(4898431, 11)

In [17]:
# Number of shuffled rows used for online training; the rows after them become the holdout set.
N = 500_000

In [18]:
# The first N shuffled rows feed the online learner; everything after them is held out.
train = train_shuffled.head(N).reset_index(drop=True)
holdout = train_shuffled.iloc[N:].reset_index(drop=True)

In [19]:
# Both label columns are removed from the features; the binary label is the target.
label_cols = ["label_binary", "label"]

datastream = stream.iter_pandas(
    train.drop(columns=label_cols),
    train["label_binary"],
)
teststream = stream.iter_pandas(
    holdout.drop(columns=label_cols),
    holdout["label_binary"],
)

In [20]:
# (rows, columns) of the holdout portion, i.e. the shuffled rows from position N onward.
holdout.shape

(4398431, 11)

## Pipeline
We will standard-scale the numeric features and one-hot-encode the string-typed ones.

In [21]:
# Column dtypes; the object-typed feature columns are the ones selected for one-hot encoding below.
train.dtypes

duration           int64
protocol_type     object
service           object
flag              object
src_bytes          int64
dst_bytes          int64
land               int64
wrong_fragment     int64
urgent             int64
label             object
label_binary       int64
dtype: object

In [22]:
# Names of the object-typed feature columns; the last two columns (the labels) are excluded.
cols_string = [
    name
    for name, kind in train.dtypes[:-2].items()
    if kind == "object"
]

In [24]:
# One OneHotEncoder per string-typed feature column, combined into a single TransformerUnion.
ohe_list = [preprocessing.OneHotEncoder(col) for col in cols_string]
string_processor = (compose.TransformerUnion(ohe_list))

# Numeric branch: Blacklister receives the string column names (presumably dropping
# them — confirm against the creme docs) and its output is piped into StandardScaler.
num_processor = compose.Blacklister(*cols_string)|preprocessing.StandardScaler()

# `+` combines the two branches and `|=` appends the classifier as the last step
# (presumably creme's union / pipeline operators — confirm against the creme docs).
model = string_processor + num_processor
model |= linear_model.LogisticRegression()
# NOTE(review): the recorded output of this cell names a .png file although
# format="svg" is requested here, so that output looks stale — re-run to confirm.
model.draw().render(format="svg")

'Digraph.gv.png'

In [24]:
# Test-then-train evaluation: each sample is predicted first, the running
# metrics are updated with that prediction, and only then does the model
# learn from the sample.
cm = metrics.ConfusionMatrix()
recall = metrics.Recall()
f1 = metrics.F1()
precision = metrics.Precision()

y_actual, y_preds = [], []
counter = 0

for x, y in datastream:
    # Progress report every 10,000 samples (the first one is printed before any update).
    if counter % 10000 == 0:
        print(f"Current iteration: {counter}")
        print(f"Current recall: {recall}")
        print(f"Current f1: {f1}")
        print(f"Current precision: {precision}")
        print()

    # Predict before fitting so the metrics only score samples not yet learned.
    y_pred = model.predict_one(x)
    y_actual.append(y)
    y_preds.append(y_pred)

    for tracker in (cm, recall, f1, precision):
        tracker.update(y, y_pred)

    model.fit_one(x, y)
    counter += 1

# Final values of the running metrics.
for final_metric in (recall, f1, precision):
    print(final_metric)

Current iteration: 0
Current recall: Recall: 0.
Current f1: F1: 0.
Current precision: Precision: 0.

Current iteration: 10000
Current recall: Recall: 0.998127
Current f1: F1: 0.983516
Current precision: Precision: 0.969326

Current iteration: 20000
Current recall: Recall: 0.998062
Current f1: F1: 0.990479
Current precision: Precision: 0.983009

Current iteration: 30000
Current recall: Recall: 0.998417
Current f1: F1: 0.992958
Current precision: Precision: 0.987558

Current iteration: 40000
Current recall: Recall: 0.998157
Current f1: F1: 0.993871
Current precision: Precision: 0.989622

Current iteration: 50000
Current recall: Recall: 0.998225
Current f1: F1: 0.994396
Current precision: Precision: 0.990596

Current iteration: 60000
Current recall: Recall: 0.998288
Current f1: F1: 0.994852
Current precision: Precision: 0.991439

Current iteration: 70000
Current recall: Recall: 0.998371
Current f1: F1: 0.995279
Current precision: Precision: 0.992207

Current iteration: 80000
Current recal

In [25]:
# ROC AUC over the predictions collected during the online pass.
# NOTE(review): y_preds holds the outputs of model.predict_one; if those are hard
# class labels rather than scores, this is the AUC of a single operating point —
# confirm whether probability scores were intended.
roc_auc_score(y_actual, y_preds)

0.9925471574805707

In [39]:
# Display the confusion matrix accumulated during the online pass.
cm

                   0         1
         0   72360.0     790.0
         1     453.0  295361.0

In [26]:
# Build a fresh stream over the holdout rows and predict each one
# (the model is only queried here, never fitted).
holdout_features = holdout.drop(columns=["label_binary", "label"])
teststream = stream.iter_pandas(holdout_features, holdout["label_binary"])

y_preds = []
for x, y in teststream:
    y_preds.append(model.predict_one(x))

In [28]:
# ROC AUC on the holdout rows.
# NOTE(review): y_preds holds the outputs of model.predict_one; if those are hard
# class labels rather than scores, this is the AUC of a single operating point —
# confirm whether probability scores were intended.
roc_auc_score(holdout["label_binary"], y_preds)

0.9943776764682737

In [29]:
# F1 score of the holdout predictions against the binary label.
f1_score(holdout["label_binary"], y_preds)

0.9980653071883044

In [30]:
# Confusion matrix of the holdout predictions against the binary label.
confusion_matrix(holdout["label_binary"], y_preds)

array([[ 864691,    8559],
       [   5088, 3520093]], dtype=int64)

In [31]:
# Precision of the holdout predictions against the binary label.
precision_score(holdout["label_binary"], y_preds)

0.9975744278551696

In [32]:
# Recall of the holdout predictions against the binary label.
recall_score(holdout["label_binary"], y_preds)

0.998556669856101