In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

from trustee import ClassificationTrustee

In [None]:
# Source addresses treated as attackers: flows whose 'Src IP' is in this set
# are labelled Class = 1 when the campus and Azure captures are loaded.
# Every address shares the 169.231. prefix, so only the last two octets vary.
attacker_ips = {
    f'169.231.{host_part}'
    for host_part in (
        '210.93',
        '28.232',
        '123.195',
        '172.165',
        '11.193',
        '8.190',
        '10.199',
    )
}

In [None]:
# Load the campus flows and label them in one step:
# Class is 1 when the flow's source address is a known attacker, 0 otherwise.
campus_dataset = pd.read_csv('campus_dataset_1.csv')
campus_dataset['Class'] = campus_dataset['Src IP'].isin(attacker_ips).astype('int64')

# Row count before the join, to compare with the count displayed after it.
print(len(campus_dataset))

# Left-join the contents of campus_ttl_1.csv on the flow identifier so that
# every campus flow is kept even when it has no match in the TTL file.
ttl_data = pd.read_csv('campus_ttl_1.csv')
campus_dataset = campus_dataset.merge(ttl_data, how='left', on="Flow ID")
len(campus_dataset)

In [None]:
# Strip the columns that are not used as model features.
campus_dataset = campus_dataset.drop(
    columns=[
        'Flow ID',
        'Src IP',
        'Dst IP',
        'Timestamp',
        'Protocol',    # always tcp
        'Label',       # empty
    ]
)

In [None]:
# Load the Azure flows and flag those coming from a known attacker address.
azure_dataset = pd.read_csv('azure_dataset_1.csv')
azure_dataset['Class'] = azure_dataset['Src IP'].isin(attacker_ips).astype('int64')

# Join the contents of azure_ttl_1.csv, then discard every row that holds an
# infinite or missing value in any column.
ttl_df_azure = pd.read_csv('azure_ttl_1.csv')
azure_dataset = (
    azure_dataset
    .merge(ttl_df_azure, how='left', on="Flow ID")
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=0)
)

In [None]:
# The multicloud capture has its own attacker hosts. They get a dedicated name
# instead of rebinding `attacker_ips`, so re-running the campus or Azure cell
# after this one still labels those datasets with the right addresses.
multicloud_attacker_ips = {
    '157.245.108.149',
    '34.214.149.122',
}

# Load the multicloud flows; Class is 1 for flows sent by a multicloud attacker.
multicloud_dataset = pd.read_csv('multicloud_dataset_1.csv')
multicloud_dataset['Class'] = 0
multicloud_dataset.loc[multicloud_dataset['Src IP'].isin(multicloud_attacker_ips), 'Class'] = 1

# Join the contents of multicloud_ttl_1.csv on the flow identifier, then drop
# every row containing an infinite or missing value.
ttl_df_multicloud = pd.read_csv('multicloud_ttl_1.csv')
multicloud_dataset = multicloud_dataset.merge(ttl_df_multicloud, on="Flow ID", how='left')
multicloud_dataset = multicloud_dataset.replace([np.inf, -np.inf], np.nan)
multicloud_dataset = multicloud_dataset.dropna(axis=0)

In [None]:
# CIC-IDS-2018: a single CICFlowMeter export (file dated 14-02-2018),
# read from the user's home directory.
dpath = "~/data/ids2018/Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv"
cic_dataset = pd.read_csv(filepath_or_buffer=dpath)

In [None]:
# Maps the abbreviated column names used in the CIC-IDS-2018 export onto the
# long-form names that the rest of the notebook selects features by.
# Entries are grouped by feature family; lookup order is irrelevant.
features_rename = {
    # packet / byte totals
    'Tot Fwd Pkts': 'Total Fwd Packet',
    'Tot Bwd Pkts': 'Total Bwd packets',
    'TotLen Fwd Pkts': 'Total Length of Fwd Packet',
    'TotLen Bwd Pkts': 'Total Length of Bwd Packet',

    # rates
    'Flow Byts/s': 'Flow Bytes/s',
    'Flow Pkts/s': 'Flow Packets/s',
    'Fwd Pkts/s': 'Fwd Packets/s',
    'Bwd Pkts/s': 'Bwd Packets/s',

    # packet length, both directions combined
    'Pkt Len Min': 'Packet Length Min',
    'Pkt Len Max': 'Packet Length Max',
    'Pkt Len Mean': 'Packet Length Mean',
    'Pkt Len Std': 'Packet Length Std',
    'Pkt Len Var': 'Packet Length Variance',
    'Pkt Size Avg': 'Average Packet Size',

    # packet length, forward direction
    'Fwd Pkt Len Min': 'Fwd Packet Length Min',
    'Fwd Pkt Len Max': 'Fwd Packet Length Max',
    'Fwd Pkt Len Mean': 'Fwd Packet Length Mean',
    'Fwd Pkt Len Std': 'Fwd Packet Length Std',

    # packet length, backward direction
    'Bwd Pkt Len Min': 'Bwd Packet Length Min',
    'Bwd Pkt Len Max': 'Bwd Packet Length Max',
    'Bwd Pkt Len Mean': 'Bwd Packet Length Mean',
    'Bwd Pkt Len Std': 'Bwd Packet Length Std',

    # header lengths
    'Fwd Header Len': 'Fwd Header Length',
    'Bwd Header Len': 'Bwd Header Length',

    # inter-arrival time totals
    'Fwd IAT Tot': 'Fwd IAT Total',
    'Bwd IAT Tot': 'Bwd IAT Total',

    # TCP flag counters
    'FIN Flag Cnt': 'FIN Flag Count',
    'SYN Flag Cnt': 'SYN Flag Count',
    'RST Flag Cnt': 'RST Flag Count',
    'PSH Flag Cnt': 'PSH Flag Count',
    'ACK Flag Cnt': 'ACK Flag Count',
    'URG Flag Cnt': 'URG Flag Count',
    'ECE Flag Cnt': 'ECE Flag Count',
    'CWE Flag Count': 'CWR Flag Count',

    # segment sizes
    'Fwd Seg Size Avg': 'Fwd Segment Size Avg',
    'Bwd Seg Size Avg': 'Bwd Segment Size Avg',

    # bulk statistics
    'Fwd Byts/b Avg': 'Fwd Bytes/Bulk Avg',
    'Fwd Pkts/b Avg': 'Fwd Packet/Bulk Avg',
    'Fwd Blk Rate Avg': 'Fwd Bulk Rate Avg',
    'Bwd Byts/b Avg': 'Bwd Bytes/Bulk Avg',
    'Bwd Pkts/b Avg': 'Bwd Packet/Bulk Avg',
    'Bwd Blk Rate Avg': 'Bwd Bulk Rate Avg',

    # subflows
    'Subflow Fwd Pkts': 'Subflow Fwd Packets',
    'Subflow Fwd Byts': 'Subflow Fwd Bytes',
    'Subflow Bwd Pkts': 'Subflow Bwd Packets',
    'Subflow Bwd Byts': 'Subflow Bwd Bytes',

    # initial window sizes
    'Init Fwd Win Byts': 'FWD Init Win Bytes',
    'Init Bwd Win Byts': 'Bwd Init Win Bytes',
}

In [None]:
# Bring the CIC-IDS-2018 column names in line with the other datasets:
# strip surrounding whitespace, then expand the abbreviations that
# `features_rename` knows about (unknown names are kept, stripped).
cic_dataset = cic_dataset.rename(
    columns={col: features_rename.get(col.strip(), col.strip()) for col in cic_dataset.columns}
)

# Seeded generator so the synthetic columns are the same on every run.
cic_rng = np.random.default_rng(42)

# Synthetic source ports, uniform in [20000, 40000).
# NOTE(review): presumably this export has no 'Src Port' column - confirm.
cic_dataset['Src Port'] = cic_rng.integers(20000, 40000, size=cic_dataset.shape[0])

# Everything whose label is not exactly 'Benign' counts as an attack.
cic_is_attack = cic_dataset['Label'] != 'Benign'

# Here's an important point: we couldn't extract the true TTL from CIC-IDS-2018,
# so we have to synthesize it. The outcome depends heavily on how it is
# synthesized, which is a good confirmation that TTL is a main feature used
# by the models:
#   'random'   - TTL uniform in 60..63, unrelated to the label   -> around 0.6 F1
#   'inverted' - benign = 63, attack = 60                        -> 0.1 total F1
#   'aligned'  - benign = 60, attack = 63                        -> 1.0 total F1!
# Only one strategy is applied; previously all three ran in sequence and the
# first two were overwritten immediately.
CIC_TTL_MODE = 'aligned'

if CIC_TTL_MODE == 'random':
    cic_dataset['TTL'] = cic_rng.integers(60, 64, size=cic_dataset.shape[0])
elif CIC_TTL_MODE == 'inverted':
    cic_dataset['TTL'] = np.where(cic_is_attack, 60, 63)
elif CIC_TTL_MODE == 'aligned':
    cic_dataset['TTL'] = np.where(cic_is_attack, 63, 60)
else:
    raise ValueError(f"Unknown CIC_TTL_MODE: {CIC_TTL_MODE!r}")

# Binary target: 0 for benign flows, 1 for everything else.
cic_dataset['Class'] = 0
cic_dataset.loc[cic_is_attack, 'Class'] = 1

# Drop every row that contains an infinite or missing value.
cic_dataset = cic_dataset.replace([np.inf, -np.inf], np.nan)
cic_dataset = cic_dataset.dropna(axis=0)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

Train on the campus dataset; evaluate on the Azure, multicloud, and CIC-IDS-2018 datasets.

In [None]:
target_variable = 'Class'

# Every campus column except the target is a feature. The list keeps the
# DataFrame's column order: going through a set would make the order depend on
# string hashing, which differs between kernel sessions and changes how
# feature-sampling models and tree plots come out.
features = [col for col in campus_dataset.columns if col != target_variable]

# Training data: campus capture.
x_train = campus_dataset[features]
y_train = campus_dataset[target_variable]

# Evaluation data: Azure, multicloud and CIC-IDS-2018, restricted to the same
# feature columns in the same order.
x_test = azure_dataset[features]
y_test = azure_dataset[target_variable]
x_test_2 = multicloud_dataset[features]
y_test_2 = multicloud_dataset[target_variable]
x_test_3 = cic_dataset[features]
y_test_3 = cic_dataset[target_variable]

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the standardisation (per-feature mean and variance) from the training
# data only and apply that same transform to every evaluation set. Fitting a
# separate scaler on each test set would put the test features in a different
# space from the one the models were trained in.
x_train_scaler = StandardScaler()
x_train = pd.DataFrame(x_train_scaler.fit_transform(x_train), columns = x_train.columns)

# The per-dataset scaler names are kept for backward compatibility; they all
# refer to the single scaler fitted on the training data.
x_test_scaler = x_test_2_scaler = x_test_3_scaler = x_train_scaler

x_test = pd.DataFrame(x_test_scaler.transform(x_test), columns = x_test.columns)
x_test_2 = pd.DataFrame(x_test_2_scaler.transform(x_test_2), columns = x_test_2.columns)
x_test_3 = pd.DataFrame(x_test_3_scaler.transform(x_test_3), columns = x_test_3.columns)

In [None]:
# Models compared in the baseline experiment. All three are stochastic
# (weight initialisation, subsampling, bootstrapping), so each gets a fixed
# random_state to make the reported scores repeatable.
classifiers = [
    MLPClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
]

In [None]:
# Evaluation sets as (printed header, features, labels), in reporting order.
evaluation_sets = [
    ("Azure dataset test accuracy: ", x_test, y_test),
    ("Multicloud dataset test accuracy: ", x_test_2, y_test_2),
    ("CIC-IDS-2018 dataset test accuracy: ", x_test_3, y_test_3),
]

for clf in classifiers:
    print(clf)
    clf.fit(x_train, y_train)

    # Fit quality on the data the model was trained on (report only).
    y_pred = clf.predict(x_train)
    print("campus dataset training accuracy: ")
    print(metrics.classification_report(y_train, y_pred))

    # Each evaluation set gets a classification report and a confusion matrix.
    for header, eval_x, eval_y in evaluation_sets:
        print(header)
        y_pred = clf.predict(eval_x)
        print(metrics.classification_report(eval_y, y_pred))
        print(metrics.confusion_matrix(eval_y, y_pred))

    print('#' * 10 + '\n')

Let's explore the reasons for this performance.

In [None]:
# Fit a single decision tree on the same data so that its top splits can be
# inspected visually. DecisionTreeClassifier and plot_tree come from the
# notebook's first import cell. random_state makes the drawn tree repeatable.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x_train, y_train)

# Fit quality on the training data (report only).
y_pred = clf.predict(x_train)
print("campus dataset training accuracy: ")
print(metrics.classification_report(y_train, y_pred))

# Classification report and confusion matrix for each evaluation set.
for header, eval_x, eval_y in [
    ("Azure dataset test accuracy: ", x_test, y_test),
    ("Multicloud dataset test accuracy: ", x_test_2, y_test_2),
    ("CIC-IDS-2018 dataset test accuracy: ", x_test_3, y_test_3),
]:
    y_pred = clf.predict(eval_x)
    print(header)
    print(metrics.classification_report(eval_y, y_pred))
    print(metrics.confusion_matrix(eval_y, y_pred))

# Draw only the first levels of the tree: they show which features the model
# splits on first. feature_names is passed as a plain list because some
# scikit-learn releases reject a pandas Index for this parameter.
fig = plt.figure(figsize=(25,20))
_ = plot_tree(clf, feature_names=list(x_train.columns), class_names=['benign', 'attack'], filled=True, max_depth=2)

The answer is a shortcut in the dataset: the use of TTL as a feature.  
Random forests pick a random subset of features for each split, so TTL is often left out of the candidates and the forest does not overfit to it.  

Anyway, TTL is a plain shortcut, let's remove it.

## Out of curiosity - dropping or noising the features
Here we use information from the future: we already know that TTL and Init Win Bytes are (or would become) shortcuts, so let's either add some noise to them or drop them, and check how performance changes.

In [None]:
target_variable = 'Class'

# Every campus column except the target, in the DataFrame's own column order.
# A list(set(...)) here would give an order that depends on string hashing and
# so changes between kernel sessions.
features = [col for col in campus_dataset.columns if col != target_variable]

# x_train is an explicit copy because it is modified in place afterwards;
# the campus DataFrame itself must stay untouched.
x_train = campus_dataset[features].copy()
y_train = campus_dataset[target_variable]

# Evaluation data restricted to the same feature columns.
x_test = azure_dataset[features]
y_test = azure_dataset[target_variable]
x_test_2 = multicloud_dataset[features]
y_test_2 = multicloud_dataset[target_variable]
x_test_3 = cic_dataset[features]
y_test_3 = cic_dataset[target_variable]

In [None]:
# Perturb the two suspected shortcut features in the training data only.
# The noise is symmetric integer noise: endpoint=True makes the upper bound
# inclusive, giving {-1, 0, 1} for TTL and -5..5 for the window size.
# np.random.randint excludes its upper bound, so randint(-1, 1) only ever
# produced -1 or 0 and randint(-5, 5) never produced +5.
# A seeded generator keeps the perturbation identical between runs.
noise_rng = np.random.default_rng(42)
x_train.loc[:, 'TTL'] += noise_rng.integers(-1, 1, size=len(x_train), endpoint=True)
x_train.loc[:, 'Bwd Init Win Bytes'] += noise_rng.integers(-5, 5, size=len(x_train), endpoint=True)

In [None]:
# Standardise with statistics learned from the (noised) training data only,
# then apply that same transform to every evaluation set so that all features
# live in the space the models are trained in.
noise_scaler = StandardScaler().fit(x_train)

x_train = pd.DataFrame(noise_scaler.transform(x_train), columns = x_train.columns)
x_test = pd.DataFrame(noise_scaler.transform(x_test), columns = x_test.columns)
x_test_2 = pd.DataFrame(noise_scaler.transform(x_test_2), columns = x_test_2.columns)
x_test_3 = pd.DataFrame(noise_scaler.transform(x_test_3), columns = x_test_3.columns)

In [None]:
# Same three model families as the baseline, trained on the noised features.
# random_state is fixed so the scores can be reproduced.
classifiers = [
    MLPClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
]

# Evaluation sets as (printed header, features, labels), in reporting order.
evaluation_sets = [
    ("Azure dataset test accuracy: ", x_test, y_test),
    ("Multicloud dataset test accuracy: ", x_test_2, y_test_2),
    ("CIC-IDS-2018 dataset test accuracy: ", x_test_3, y_test_3),
]

for clf in classifiers:
    print(clf)
    clf.fit(x_train, y_train)

    # Fit quality on the training data (report only).
    y_pred = clf.predict(x_train)
    print("campus dataset training accuracy: ")
    print(metrics.classification_report(y_train, y_pred))

    # Classification report and confusion matrix per evaluation set.
    for header, eval_x, eval_y in evaluation_sets:
        print(header)
        y_pred = clf.predict(eval_x)
        print(metrics.classification_report(eval_y, y_pred))
        print(metrics.confusion_matrix(eval_y, y_pred))

    print('#' * 10 + '\n')

### Or let's just drop them

In [None]:
# Remove the two suspected shortcut features from the training set and from
# all three evaluation sets.
shortcut_features = ['TTL', 'Bwd Init Win Bytes']

x_train = x_train.drop(columns=shortcut_features)
x_test = x_test.drop(columns=shortcut_features)
x_test_2 = x_test_2.drop(columns=shortcut_features)
x_test_3 = x_test_3.drop(columns=shortcut_features)

In [None]:
# Models trained without the shortcut features; the forest is limited to
# depth 4. random_state is fixed so the scores can be reproduced.
classifiers = [
    MLPClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    RandomForestClassifier(max_depth=4, random_state=42),
]

# Evaluation sets as (printed header, features, labels), in reporting order.
evaluation_sets = [
    ("Azure dataset test accuracy: ", x_test, y_test),
    ("Multicloud dataset test accuracy: ", x_test_2, y_test_2),
    ("CIC-IDS-2018 dataset test accuracy: ", x_test_3, y_test_3),
]

for clf in classifiers:
    print(clf)
    clf.fit(x_train, y_train)

    # Fit quality on the training data (report only).
    y_pred = clf.predict(x_train)
    print("campus dataset training accuracy: ")
    print(metrics.classification_report(y_train, y_pred))

    # Classification report and confusion matrix per evaluation set.
    for header, eval_x, eval_y in evaluation_sets:
        print(header)
        y_pred = clf.predict(eval_x)
        print(metrics.classification_report(eval_y, y_pred))
        print(metrics.confusion_matrix(eval_y, y_pred))

    print('#' * 10 + '\n')

In [None]:
# Explain the first classifier in the list (the MLP) with Trustee.
trustee = ClassificationTrustee(expert=classifiers[0])
trustee.fit(x_train, y_train, num_iter=10, num_stability_iter=3, samples_size=0.8)

# Only the second and fourth values returned by explain() are used here:
# a decision tree and its score.
# NOTE(review): presumably the second value is the pruned tree, as the printed
# message says - confirm against the Trustee documentation.
_, dt, _, score = trustee.explain()
print(f"Training score of pruned DT: {score}")
dt_y_pred = dt.predict(x_test)

# Fidelity: how closely the tree reproduces the expert's own predictions.
print("Model explanation global fidelity report:")
print(metrics.classification_report(classifiers[0].predict(x_test), dt_y_pred))

# Score: how well the tree predicts the true Azure labels.
print("Model explanation score report:")
print(metrics.classification_report(y_test, dt_y_pred))

# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index for this parameter; the result is assigned to `_` so
# the list of plot artists is not echoed as cell output.
fig = plt.figure(figsize=(25,20))
_ = plot_tree(dt, feature_names=list(x_train.columns), class_names=['benign', 'attack'], filled=True, max_depth=3)

### Checking SMOTE

In [None]:
import smote_variants as sv

In [None]:
target_variable = 'Class'

# Start again from the full feature set: every campus column except the
# target, in the DataFrame's own column order. A list(set(...)) here would
# give an order that depends on string hashing and so changes between kernel
# sessions.
features = [col for col in campus_dataset.columns if col != target_variable]

# Training data (copied so later processing cannot touch the campus frame).
x_train = campus_dataset[features].copy()
y_train = campus_dataset[target_variable]

# Evaluation data restricted to the same feature columns.
x_test = azure_dataset[features]
y_test = azure_dataset[target_variable]
x_test_2 = multicloud_dataset[features]
y_test_2 = multicloud_dataset[target_variable]
x_test_3 = cic_dataset[features]
y_test_3 = cic_dataset[target_variable]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise with statistics learned from the training data only, then apply
# that same transform to every evaluation set. Fitting a fresh scaler on each
# test set would put the test features in a different space from the one the
# models are trained in.
smote_scaler = StandardScaler().fit(x_train)

x_train = pd.DataFrame(smote_scaler.transform(x_train), columns = x_train.columns)
x_test = pd.DataFrame(smote_scaler.transform(x_test), columns = x_test.columns)
x_test_2 = pd.DataFrame(smote_scaler.transform(x_test_2), columns = x_test_2.columns)
x_test_3 = pd.DataFrame(smote_scaler.transform(x_test_3), columns = x_test_3.columns)

In [None]:
# Oversample the training data with SMOTE (seeded for repeatability).
oversampler = sv.SMOTE(random_state=42)
X_samp, y_samp = oversampler.sample(x_train.to_numpy(), y_train.to_numpy())

# The sampler works on bare arrays; restore the column names so the models
# are fitted with the same feature names that the evaluation DataFrames
# carry. Otherwise scikit-learn warns about a feature-name mismatch on every
# predict call.
X_samp = pd.DataFrame(X_samp, columns=x_train.columns)

classifiers = [
    MLPClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    RandomForestClassifier(max_depth=4, random_state=42),
]

# Evaluation sets as (printed header, features, labels), in reporting order.
evaluation_sets = [
    ("Azure dataset test accuracy: ", x_test, y_test),
    ("Multicloud dataset test accuracy: ", x_test_2, y_test_2),
    ("CIC-IDS-2018 dataset test accuracy: ", x_test_3, y_test_3),
]

for clf in classifiers:
    print(clf)
    clf.fit(X_samp, y_samp)

    # Fit quality on the oversampled training data (report only).
    y_pred = clf.predict(X_samp)
    print("campus dataset training accuracy: ")
    print(metrics.classification_report(y_samp, y_pred))

    # Classification report and confusion matrix per evaluation set.
    for header, eval_x, eval_y in evaluation_sets:
        print(header)
        y_pred = clf.predict(eval_x)
        print(metrics.classification_report(eval_y, y_pred))
        print(metrics.confusion_matrix(eval_y, y_pred))

    print('#' * 10 + '\n')

### SYMPROD SMOTE

In [None]:
# Oversample the training data with SYMPROD (seeded for repeatability).
oversampler = sv.SYMPROD(random_state=42)
X_samp, y_samp = oversampler.sample(x_train.to_numpy(), y_train.to_numpy())

# The sampler works on bare arrays; restore the column names so the models
# are fitted with the same feature names that the evaluation DataFrames
# carry. Otherwise scikit-learn warns about a feature-name mismatch on every
# predict call.
X_samp = pd.DataFrame(X_samp, columns=x_train.columns)

classifiers = [
    MLPClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    RandomForestClassifier(max_depth=4, random_state=42),
]

# Evaluation sets as (printed header, features, labels), in reporting order.
evaluation_sets = [
    ("Azure dataset test accuracy: ", x_test, y_test),
    ("Multicloud dataset test accuracy: ", x_test_2, y_test_2),
    ("CIC-IDS-2018 dataset test accuracy: ", x_test_3, y_test_3),
]

for clf in classifiers:
    print(clf)
    clf.fit(X_samp, y_samp)

    # Fit quality on the oversampled training data (report only).
    y_pred = clf.predict(X_samp)
    print("campus dataset training accuracy: ")
    print(metrics.classification_report(y_samp, y_pred))

    # Classification report and confusion matrix per evaluation set.
    for header, eval_x, eval_y in evaluation_sets:
        print(header)
        y_pred = clf.predict(eval_x)
        print(metrics.classification_report(eval_y, y_pred))
        print(metrics.confusion_matrix(eval_y, y_pred))

    print('#' * 10 + '\n')

### CCR

In [None]:
# Oversample the training data with CCR (seeded for repeatability).
oversampler = sv.CCR(random_state=42)
X_samp, y_samp = oversampler.sample(x_train.to_numpy(), y_train.to_numpy())

# The sampler works on bare arrays; restore the column names so the models
# are fitted with the same feature names that the evaluation DataFrames
# carry. Otherwise scikit-learn warns about a feature-name mismatch on every
# predict call.
X_samp = pd.DataFrame(X_samp, columns=x_train.columns)

classifiers = [
    MLPClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    RandomForestClassifier(max_depth=4, random_state=42),
]

# Evaluation sets as (printed header, features, labels), in reporting order.
evaluation_sets = [
    ("Azure dataset test accuracy: ", x_test, y_test),
    ("Multicloud dataset test accuracy: ", x_test_2, y_test_2),
    ("CIC-IDS-2018 dataset test accuracy: ", x_test_3, y_test_3),
]

for clf in classifiers:
    print(clf)
    clf.fit(X_samp, y_samp)

    # Fit quality on the oversampled training data (report only).
    y_pred = clf.predict(X_samp)
    print("campus dataset training accuracy: ")
    print(metrics.classification_report(y_samp, y_pred))

    # Classification report and confusion matrix per evaluation set.
    for header, eval_x, eval_y in evaluation_sets:
        print(header)
        y_pred = clf.predict(eval_x)
        print(metrics.classification_report(eval_y, y_pred))
        print(metrics.confusion_matrix(eval_y, y_pred))

    print('#' * 10 + '\n')