## Binary Decision Tree

Load the required packages:

In [1]:
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import re
import pydot
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.tree import export_graphviz

Load data:

In [2]:
logs_labels = pd.read_csv("logs_labels.csv")

Regular expression-based feature extraction:

In [3]:
timestamp = []
method = []
login = []
status = []
request_len = []
bytes_sent = []


for i in range(len(logs_labels.full_log)):
    
    r = re.compile(r"^\[([\w:/]+\s[+\-]\d{4})\] (\S+)\s?(\S+)?\s?(\S+)? (\d{3}|-) (\d+|-)")
    result = r.findall(logs_labels.full_log[i])
    timestamp.append(result[0][0])
    method.append(result[0][1])
    if "login" in result[0][2]:
        login.append(1)
    elif "admin" in result[0][2]:
        login.append(1)
    else: 
        login.append(0)
    status.append((result[0][4]))
    request_len.append(len(result[0][2]))
    bytes_sent.append(result[0][5])

Binary and full label extraction:

In [4]:
label_binary = [0 if i == 0 else 1 for i in logs_labels.label]

label_full = []
for i in logs_labels.label:
    if i == 0:
        label_full.append(0)
    elif i == 10:
        label_full.append(10)
    else:
        label_full.append(1)

Label distribution:

In [5]:
Counter(label_full)

Counter({0: 199890, 1: 27329, 10: 620})

Concatenate, one-hot encode and split data:

In [6]:
def one_hot(input_list):
    output = np.zeros((len(input_list), len(set(input_list))))
    index = {v: k for k, v in enumerate(sorted(set(input_list)))}
    for i in range(len(input_list)):
        output[i,index[input_list[i]]] = 1
    return output

In [7]:
data_discrete = np.vstack((np.array(login), np.array(request_len), np.array(bytes_sent)))
data_categorical = np.concatenate((one_hot(method), one_hot(status)), axis = 1)
data = np.concatenate((data_discrete.T, data_categorical), axis = 1)

column_names = ["login", "request_len", "bytes_sent"]
column_names += list((sorted(set(method))))
column_names += list((sorted(set(status))))

X = pd.DataFrame(data, columns = column_names)
test_split = int(.75*len(logs_labels.full_log))
data_train = X[:test_split]
label_train = label_binary[:test_split]
X_test = X[test_split:]
y_test = label_binary[test_split:]

Print all features:

In [8]:
for i,j in enumerate(column_names):
    print(str(i +1)+ ":",j)

1: login
2: request_len
3: bytes_sent
4: DEBUG
5: GET
6: HEAD
7: INDEX
8: OPTIONS
9: POST
10: PUT
11: SEARCH
12: TRACE
13: ZIZYKAFG
14: 200
15: 301
16: 302
17: 304
18: 400
19: 403
20: 404
21: 405
22: 429
23: 499
24: 500
25: 504


Set up and train Decision Tree:

In [9]:
X_train, X_val, y_train, y_val = train_test_split(data_train, label_train, test_size=0.25, random_state=2809)

model = DecisionTreeClassifier(max_depth = 5)
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Validate and evaluate:

In [10]:
print("Validation set evaluation:")
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))

print("Test set evaluation:")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

Validation set evaluation:
             precision    recall  f1-score   support

          0       1.00      0.99      1.00     38145
          1       0.94      0.98      0.96      4575

avg / total       0.99      0.99      0.99     42720

Test set evaluation:
             precision    recall  f1-score   support

          0       1.00      0.99      1.00     47679
          1       0.97      0.99      0.98      9281

avg / total       0.99      0.99      0.99     56960



Export tree visualisation:

In [11]:
#export_graphviz(model, feature_names=column_names, filled=True, rounded=True)

#(graph,) = pydot.graph_from_dot_file('tree.dot')
#graph.write_png('tree.png')

## Classification of escalated log entries

Load extra packages:

In [12]:
from datetime import datetime
from datetime import timedelta

Load IP data and split as previously configured:

In [13]:
ip = pd.read_csv("ips_sorted.txt", header = None)

In [14]:
ip_train = ip[:test_split]
ip_test = ip[test_split:]

Timestamps in original data have recurring instances, multiple entries on the same second. Append extra milliseconds to have unique timestamp for each entry:

In [15]:
times = []
times_appended_ms = []

for i in range(len(timestamp)):
    time = timestamp[i]
    time = time.replace(" +0200", "")
    time = datetime.strptime(time, "%d/%b/%Y:%H:%M:%S")
    times.append(time)
    if i == 0:
        times_appended_ms.append(time)
    else:
        if time == times[i-1]:
            time = times_appended_ms[i-1]
            times_appended_ms.append(time + timedelta(milliseconds=4))
        else:
            times_appended_ms.append(time)

real_times = times_appended_ms

Create new dataframe, concatenating IP and whether or not this entry is classified in X0 or X1. Timestamps are the new index, to allow for time-based computation:

In [16]:
ip_data = pd.concat([ip, pd.DataFrame(label_binary)], axis = 1)
ip_data.columns = ["ip", "classification"]

In [17]:
ip_data = pd.concat([ip_data, pd.DataFrame(data)], axis = 1)
ip_data.index = pd.to_datetime(real_times)

The column classification is summed with a rolling window of 5 minutes. So, if an IP is classified in X1 10 times in the preceding 5 minutes, the result of the following calculation for that entry is 10. This can be done with a single line of code, using Pandas:

In [18]:
ip_data["classification"] = ip_data["classification"].groupby(ip_data["ip"], group_keys=False).rolling('300s').sum()

Timestamps are no longer required, so reset the index:

In [None]:
ip_data.reset_index(drop = True)

Drop the IP column, since it was only used as an identifier for the previous calculation:

In [20]:
training_data = ip_data.drop("ip", axis = 1)

Split full IP data:

In [21]:
ip_data_train = training_data[:test_split]
ip_label_train = label_full[:test_split]
ip_data_test = training_data[test_split:]
ip_label_test = label_full[test_split:]

Split into train and validation sets, fit model, and evaluate validation set. Model performance seems low, because it is hard for the model to predict label 10 on the exact right log entry:

In [22]:
X_train, X_val, y_train, y_val = train_test_split(ip_data_train, ip_label_train, test_size=0.25, random_state=2809)
    
model = DecisionTreeClassifier(criterion ="entropy", max_depth = 25, max_features = .67, class_weight = "balanced")
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     38145
          1       0.97      0.81      0.88      4433
         10       0.07      0.45      0.12       142

avg / total       0.99      0.98      0.98     42720



Discovery rate calculations on the test set:

In [23]:
y_pred_test = model.predict(ip_data_test)
ip_and_label = pd.DataFrame({"y_val": ip_label_test, "y_pred": y_pred_test, "ip": list(ip_test[0])})

In [24]:
ip_and_true_label_10 = ip_and_label.loc[(ip_and_label["y_val"] == 10)]
ip_and_pred_label_10 = ip_and_label.loc[(ip_and_label["y_pred"] == 10)]
ip_true = set(ip_and_true_label_10.ip)
ip_pred = set(ip_and_pred_label_10.ip)

Calculate and print discovery rate. Getting discovery rate on the test set, so performance is lower than on the full dataset, which is what is reported in the thesis:

In [25]:
disc_ip = 0
for i in ip_pred:
    if i in ip_true:
        disc_ip += 1
        
print (disc_ip/len(ip_true))

0.6666666666666666
