# Restricted Logistic Regression
In this notebook, we build a binary logistic regression classifier (malicious vs. background) that uses only the basic connection metadata of each record (duration, protocol, service, flag, byte counts, and similar header-level fields). In particular, we are *discarding* the features that were engineered with domain expertise.

In [1]:
import string
import os
import pickle
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

In [2]:
# Space-separated lookup table: one "<attack_type> <attack_category>" pair per line.
train_attack_types = pd.read_csv(
    "datasets/training_attack_types.txt",
    sep=" ",
    header=None,
    names=["attack_type", "attack_category"],
)

In [3]:
# Sanity check: look up the category of a single attack type.
target = "back"
is_target = train_attack_types["attack_type"] == target
train_attack_types.loc[is_target, "attack_category"].iloc[0]

'dos'

In [4]:
# The names file is read as one column; the first row is skipped and each
# remaining entry is cut at its first ":" to obtain the column name.
names_raw = pd.read_csv("datasets/kddcup.names.txt", skiprows=1, header=None)
cols = [str(entry).split(":")[0] for entry in names_raw[0]]
cols.append("label")

# Restrict to the first nine feature columns plus the label.
reduced_cols = cols[:9] + ["label"]

reduced_cols

['duration',
 'protocol_type',
 'service',
 'flag',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'label']

In [5]:
# Only the columns in `reduced_cols` are used by this notebook, so parse just
# those: this avoids materialising every column of the multi-million-row file
# before most of them are thrown away.
train = pd.read_csv(
    "datasets/kddcup.data.corrected.txt",
    header=None,
    names=cols,
    usecols=reduced_cols,
)

In [6]:
# Keep only the restricted feature set (plus the label) as an independent copy.
train = train.loc[:, reduced_cols].copy()

In [7]:
# Only the columns in `reduced_cols` are used by this notebook, so parse just
# those instead of loading every column and discarding most of them afterwards.
test = pd.read_csv(
    "datasets/corrected.txt",
    header=None,
    names=cols,
    usecols=reduced_cols,
)

In [8]:
# Keep only the restricted feature set (plus the label) as an independent copy.
test = test.loc[:, reduced_cols].copy()

In [9]:
# Peek at the first rows of the training frame.
train.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,label
0,0,tcp,http,SF,215,45076,0,0,0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,normal.
4,0,tcp,http,SF,239,486,0,0,0,normal.


In [10]:
# Peek at the first rows of the held-out test frame.
test.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,label
0,0,udp,private,SF,105,146,0,0,0,normal.
1,0,udp,private,SF,105,146,0,0,0,normal.
2,0,udp,private,SF,105,146,0,0,0,normal.
3,0,udp,private,SF,105,146,0,0,0,snmpgetattack.
4,0,udp,private,SF,105,146,0,0,0,snmpgetattack.


# EDA

## Process the label column
First we strip the trailing period from each label. Then we want to generate three label columns:

In [11]:
def _before_first_period(raw_label):
    """Return the part of ``raw_label`` that precedes its first period."""
    return raw_label.split(".")[0]


# Raw labels look like "normal." -- normalise both frames the same way.
for frame in (train, test):
    frame["label"] = frame["label"].map(_before_first_period)

### Binary label (normal/malicious)

In [12]:
# 0 = normal traffic, 1 = any attack label.
train["label_binary"] = (train["label"] != "normal").astype("int64")
test["label_binary"] = (test["label"] != "normal").astype("int64")

In [13]:
# Class balance of the held-out test set (1 = attack, 0 = normal).
test.loc[:, "label_binary"].value_counts()

1    250436
0     60593
Name: label_binary, dtype: int64

In [14]:
# Class balance of the training set (1 = attack, 0 = normal).
train.loc[:, "label_binary"].value_counts()

1    3925650
0     972781
Name: label_binary, dtype: int64

In [15]:
# One-hot encode the three categorical columns (dropping the first level of
# each) and remove both label columns so only features remain.
X = (
    pd.get_dummies(
        train,
        columns=["protocol_type", "service", "flag"],
        drop_first=True,
    )
    .drop(["label", "label_binary"], axis=1)
)

# Restricted Logistic Regression
How well do we do with only the baseline features?

In [16]:
# Binary target: 1 for any attack, 0 for normal traffic.
y = train.loc[:, "label_binary"]

In [17]:
# Fit on 10% of the rows and validate on the remaining 90%.
# A fixed seed makes the split reproducible across kernel restarts, and
# stratifying on the target keeps the attack/normal ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.9,
    random_state=42,
    stratify=y,
)

In [18]:
# (rows, encoded feature columns) of the training split.
X_train.shape

(489843, 87)

In [19]:
# Number of training targets; should match the row count of X_train.
y_train.shape

(489843,)

In [None]:
# Fit the unpenalised logistic regression once and cache it on disk; later runs
# reuse the cached model as-is (delete the file to force a refit).
filename = "./models/restricted_logreg.pickle"
if not os.path.exists(filename):
    # Create the cache directory *before* the expensive fit so the result is
    # not lost to a FileNotFoundError when the model is written out.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    now = time.time()
    # NOTE(review): recent scikit-learn releases replace penalty="none" with
    # penalty=None -- adjust if the installed version rejects the string.
    # SAG is a stochastic solver, so seed it for reproducible coefficients.
    logreg = LogisticRegression(
        penalty="none",
        solver="sag",
        max_iter=10000,
        verbose=1,
        n_jobs=-2,
        random_state=42,
    )
    logreg.fit(X_train, y_train)
    print(f"fitting completed in {time.time() - now}")
    with open(filename, 'wb') as f:
        pickle.dump(logreg, f)
else:
    # pickle.load can execute arbitrary code: only load model files that were
    # produced by this notebook.
    with open(filename, 'rb') as f:
        logreg = pickle.load(f)

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 7 concurrent workers.


In [None]:
# Accuracy on the training split.
logreg.score(X_train, y_train)

In [None]:
# Accuracy on the validation split held out by train_test_split.
logreg.score(X_test, y_test)

In [None]:
# ROC AUC must be computed from continuous scores: hard 0/1 predictions reduce
# the ROC curve to a single operating point. Use the positive-class probability.
roc_auc_score(y_train, logreg.predict_proba(X_train)[:, 1])

In [None]:
# ROC AUC must be computed from continuous scores: hard 0/1 predictions reduce
# the ROC curve to a single operating point. Use the positive-class probability.
roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])

In [None]:
# cross_val_score(logreg, X_train, y_train)

In [None]:
# Ground truth for the separate test file: 0 = normal, 1 = any attack.
y_actual = (test["label"] != "normal").astype("int64")
y_actual

In [None]:
# Encode the test file against the *training* design matrix.
# Using drop_first=True here would drop whichever category happens to sort
# first in the test data, which need not be the baseline dropped during
# training. Instead, create every dummy column and align to the columns the
# model was fitted on: levels unseen in training are discarded, levels absent
# from the test file are filled with 0.
X_big_test = (
    pd.get_dummies(test, columns=["protocol_type", "service", "flag"])
    .drop(columns=["label", "label_binary"])
    .reindex(columns=X_train.columns, fill_value=0)
)

In [None]:
# Columns produced by encoding the test file.
X_big_test.columns

In [None]:
# Columns of the design matrix derived from the training file, for comparison.
X_test.columns

In [None]:
# Training-side columns that the encoded test frame lacks are added as
# all-zero columns so the frame can be indexed with the training columns.
already_present = set(X_big_test.columns)
cols_to_zero = [name for name in X_test.columns if name not in already_present]
X_big_test = X_big_test.assign(**dict.fromkeys(cols_to_zero, 0))

In [None]:
# Accuracy on the separate test file, with columns in training order.
logreg.score(X_big_test.loc[:, X_test.columns], y_actual)

In [None]:
# ROC AUC must be computed from continuous scores: hard 0/1 predictions reduce
# the ROC curve to a single operating point. Use the positive-class probability.
roc_auc_score(y_actual, logreg.predict_proba(X_big_test[X_test.columns])[:, 1])

In [None]:
# confusion_matrix expects (y_true, y_pred). Passing the predictions first
# transposes the matrix, which silently swaps fp and fn in the unpacking below.
tn, fp, fn, tp = confusion_matrix(
    y_actual,
    logreg.predict(X_big_test[X_test.columns]),
).ravel()

In [None]:
# Sensitivity (recall): TP / (TP + FN)
tp / (tp + fn)

In [None]:
# Precision: TP / (TP + FP)
tp / (tp + fp)

In [None]:
# One row per feature, with a single "coef_" column holding its fitted weight.
coefs = pd.Series(logreg.coef_[0], index=X_test.columns, name="coef_").to_frame()

In [None]:
# Show every coefficient, largest first. The option key is fully qualified
# because the bare pattern "max_rows" can match more than one pandas option,
# in which case pandas raises an OptionError.
with pd.option_context("display.max_rows", None):
    display(coefs.sort_values(by="coef_", ascending=False))

In [None]:
# Distribution of the fitted coefficients; the log-scaled count axis keeps the
# sparsely populated bins visible next to the dominant ones.
fig, ax = plt.subplots()
ax.hist(logreg.coef_[0])
ax.set_yscale("log")
ax.set(
    title="Distribution of logistic regression coefficients",
    xlabel="Coefficient value",
    ylabel="Number of features (log scale)",
);

## Only using basic metadata
It's useful to see how we do with only the first few columns:

In [None]:
# Scatter of the `land` feature against the row index (markers only, no line).
plt.plot(X["land"], marker=".", linestyle="None")

## Residuals
TODO: analyze the residuals of the model.

Maybe simulate some internal attacks on your own network. Can you connect a networking component with a model scoring component?

Online learning (iterative model updating). See "Vowpal Wabbit".

See

http://onlineprediction.net/?n=Main.HomePage

https://duckduckgo.com/?q=vowpal+wabbit&t=ffab&ia=web
    