In [None]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import numpy as np

from trustee import ClassificationTrustee

In [None]:
# Source addresses of the attacking hosts, grouped into pools.
# The loading cell matches these against the 'Src IP' column:
# pool 1 and pool 2 are labelled Class 1, pool 3 is labelled Class 2.
pool_1_ips = {"169.231.10.199", "169.231.210.93", "169.231.172.165"}

pool_2_ips = {"128.111.52.37"}

pool_3_ips = {"169.231.8.190", "169.231.123.195"}

We now have two different attacks, so let's build two versions of the dataset: one where both attacks share a single attack label (merged), and one where each attack keeps its own label (separated).

In [None]:
# Columns that identify a flow rather than describe its behaviour; they are
# removed before the frame is used for training.
_NON_FEATURE_COLUMNS = [
    'Flow ID',
    'Src IP',
    'Dst IP',
    'Timestamp',
    'Protocol',    # always tcp
    'Label',       # empty
]


def _label_and_attach_ttl(flows, ttl, class_by_pool):
    """Label flows by source IP and left-join the TTL table onto them.

    Parameters
    ----------
    flows : pd.DataFrame
        Flow records; must contain the 'Src IP' and 'Flow ID' columns.
    ttl : pd.DataFrame
        TTL records keyed by 'Flow ID'.
    class_by_pool : dict[int, set[str]]
        Maps a class label to the set of source IPs that receive it.
        Flows whose source IP is in no pool keep Class 0.

    Returns
    -------
    pd.DataFrame
        A new frame with a 'Class' column and the TTL columns merged in.
    """
    labelled = flows.assign(Class=0)
    for label, pool in class_by_pool.items():
        labelled.loc[labelled['Src IP'].isin(pool), 'Class'] = label
    return labelled.merge(ttl, on='Flow ID', how='left')


def _clean_flows(flows):
    """Drop rows containing inf/NaN, then drop the non-feature columns."""
    # NOTE(review): rows are dropped for NaNs in *any* column, including the
    # identifier columns removed right afterwards -- confirm this is intended.
    return (
        flows
        .replace([np.inf, -np.inf], np.nan)
        .dropna(axis=0)
        .drop(columns=_NON_FEATURE_COLUMNS)
    )


# Capture 1: pool 1 -> Class 1, pool 3 -> Class 2.
ttl_data_1 = pd.read_csv('campus_ttl_1.csv')
dataset_1 = _label_and_attach_ttl(
    pd.read_csv('campus_dataset_1.csv'),
    ttl_data_1,
    {1: pool_1_ips, 2: pool_3_ips},
)

# Capture 2: pool 2 -> Class 1, pool 3 -> Class 2.
ttl_data_2 = pd.read_csv('campus_ttl_2.csv')
dataset_2 = _label_and_attach_ttl(
    pd.read_csv('campus_dataset_2.csv'),
    ttl_data_2,
    {1: pool_2_ips, 2: pool_3_ips},
)

# ignore_index avoids duplicate row labels (both inputs start at 0 after merge).
dataset = _clean_flows(pd.concat([dataset_1, dataset_2], ignore_index=True))

# "separated" keeps classes 0/1/2; "merged" folds class 2 into class 1.
campus_dataset_separated = dataset
campus_dataset_merged = dataset.copy()
campus_dataset_merged.loc[campus_dataset_merged['Class'] == 2, 'Class'] = 1

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Every column except the label is a model input; sorting gives a stable order.
target_variable = 'Class'
features = sorted(set(campus_dataset_separated.columns).difference({target_variable}))

# Three-class labels (0 / 1 / 2).
x_train_separated, y_train_separated = (
    campus_dataset_separated[features],
    campus_dataset_separated[target_variable],
)

# Two-class labels (class 2 folded into class 1).
x_train_merged, y_train_merged = (
    campus_dataset_merged[features],
    campus_dataset_merged[target_variable],
)

In [None]:
# Models fitted on the merged (benign vs. attack) labels.
# Every estimator with a randomized training procedure gets a fixed
# random_state so that "Restart & Run All" reproduces the same scores.
# KNeighborsClassifier is left as is (no random_state is passed to it).
classifiers_merged = [
    KNeighborsClassifier(2),
    MLPClassifier(alpha=1, max_iter=100, random_state=42),
    GradientBoostingClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    RandomForestClassifier(max_depth=2, random_state=42),
]

## Explore merged attacks first

In [None]:
# Fit each merged-label model and report how well it reproduces the data it
# was trained on (no held-out split is used here).
for clf in classifiers_merged:
    print(clf)
    y_pred = clf.fit(x_train_merged, y_train_merged).predict(x_train_merged)
    report = metrics.classification_report(y_train_merged, y_pred)
    print("campus dataset training accuracy: ", report, sep="\n")

In [None]:
# Fit a single decision tree on the merged labels and show its top levels.
# random_state is fixed so the fitted tree (and the plot) is reproducible.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x_train_merged, y_train_merged)
y_pred = clf.predict(x_train_merged)
print(metrics.classification_report(y_train_merged, y_pred))
print(metrics.confusion_matrix(y_train_merged, y_pred))

# Draw on an explicit Axes; feature_names is passed as a plain list because
# some scikit-learn releases reject a pandas Index here.
fig, ax = plt.subplots(figsize=(25, 20))
plot_tree(
    clf,
    feature_names=list(x_train_merged.columns),
    class_names=['benign', 'attack'],
    filled=True,
    max_depth=2,   # only the first levels are drawn
    ax=ax,
)
plt.show()

## Explore separated classes

In [None]:
# Models fitted on the separated (benign / attack 1 / attack 2) labels.
# Every estimator with a randomized training procedure gets a fixed
# random_state so that "Restart & Run All" reproduces the same scores.
# KNeighborsClassifier is left as is (no random_state is passed to it).
classifiers_separated = [
    KNeighborsClassifier(3),
    MLPClassifier(alpha=1, max_iter=100, random_state=42),
    GradientBoostingClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    RandomForestClassifier(max_depth=2, random_state=42),
]

In [None]:
# Fit each separated-label model and report how well it reproduces the data it
# was trained on (no held-out split is used here).
for clf in classifiers_separated:
    print(clf)
    y_pred = clf.fit(x_train_separated, y_train_separated).predict(x_train_separated)
    report = metrics.classification_report(y_train_separated, y_pred)
    print("campus dataset training accuracy: ", report, sep="\n")

In [None]:
# Fit a single decision tree on the separated labels and show its top levels.
# random_state is fixed so the fitted tree (and the plot) is reproducible.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x_train_separated, y_train_separated)
y_pred = clf.predict(x_train_separated)
print(metrics.classification_report(y_train_separated, y_pred))
print(metrics.confusion_matrix(y_train_separated, y_pred))

# Draw on an explicit Axes; feature_names is passed as a plain list because
# some scikit-learn releases reject a pandas Index here.
fig, ax = plt.subplots(figsize=(25, 20))
plot_tree(
    clf,
    feature_names=list(x_train_separated.columns),
    class_names=['benign', 'attack_patator', 'attack_hydra'],
    filled=True,
    max_depth=2,   # only the first levels are drawn
    ax=ax,
)
plt.show()

## Previous OOD dataset exploration
Let's take the out-of-distribution (OOD) dataset from the previous experiment and evaluate the classifiers trained above on it.

In [None]:
# Source addresses treated as attackers in the OOD capture; every flow coming
# from one of them is labelled Class 2, everything else Class 0.
pool_ips = {
    "169.231.10.199",
    "169.231.210.93",
    "169.231.8.190",
    "169.231.123.195",
}

dataset = pd.read_csv('../0.3/ood_dataset.csv')
dataset['Class'] = 0
from_attacker = dataset['Src IP'].isin(pool_ips)
dataset.loc[from_attacker, 'Class'] = 2

ttl_data = pd.read_csv('../0.3/ood_ttl.csv')

# Attach TTL data, discard rows with inf/NaN, then remove identifier columns.
dataset = (
    dataset
    .merge(ttl_data, on="Flow ID", how='left')
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=0)
    .drop(
        columns=[
            'Flow ID',
            'Src IP',
            'Dst IP',
            'Timestamp',
            'Protocol',    # always tcp
            'Label',       # empty
        ]
    )
)

# "separated" keeps Class 2 for the attack; "merged" relabels it as Class 1.
hydra_dataset_separated = dataset
hydra_dataset_merged = dataset.copy()
is_class_two = hydra_dataset_merged['Class'] == 2
hydra_dataset_merged.loc[is_class_two, 'Class'] = 1

## Merged

In [None]:
# Score the merged-label models (already fitted above) on the OOD capture.
x_test = hydra_dataset_merged[features]
y_test = hydra_dataset_merged[target_variable]
for clf in classifiers_merged:
    print(clf)
    print("OOD test dataset accuracy: ")
    y_pred = clf.predict(x_test)
    print(metrics.classification_report(y_test, y_pred))
    # Explicit labels pin the matrix to 2x2 with rows/columns ordered
    # (0 = benign, 1 = attack), even if a class is never observed or predicted.
    print(metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]))

## Separated

In [None]:
# Score the separated-label models (already fitted above) on the OOD capture.
x_test = hydra_dataset_separated[features]
y_test = hydra_dataset_separated[target_variable]
for clf in classifiers_separated:
    print(clf)
    print("OOD test dataset accuracy: ")
    y_pred = clf.predict(x_test)
    print(metrics.classification_report(y_test, y_pred))
    # The OOD labels only contain classes 0 and 2, so without explicit labels
    # the matrix would be 2x2 or 3x3 depending on whether a model happens to
    # predict class 1. Pinning labels keeps it 3x3 in class order 0, 1, 2.
    print(metrics.confusion_matrix(y_test, y_pred, labels=[0, 1, 2]))