## Preprocessing

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Campus dataset: read the attack and benign flow exports and tag every row
# with a binary label in a new 'Class' column (1 = attack, 0 = benign).
campus_dataset_attack = pd.read_csv(
    'connectors-salt-attack-443.pcap_Flow.csv'
).assign(Class=1)
campus_dataset_benign = pd.read_csv(
    'connectors-salt-benign-443.pcap_Flow.csv'
).assign(Class=0)

# Attack rows first, benign rows after; each frame keeps its own row index.
campus_dataset = pd.concat(
    [campus_dataset_attack, campus_dataset_benign],
)

In [None]:
# Azure dataset: read the attack and benign flow exports and tag every row
# with a binary label in a new 'Class' column (1 = attack, 0 = benign).
azure_dataset_attack = pd.read_csv(
    'connectors-azure-attack-443.pcap_Flow.csv'
).assign(Class=1)
azure_dataset_benign = pd.read_csv(
    'connectors-azure-benign-443.pcap_Flow.csv'
).assign(Class=0)

# Attack rows first, benign rows after; each frame keeps its own row index.
azure_dataset = pd.concat(
    [azure_dataset_attack, azure_dataset_benign],
)

In [None]:
# Row counts, one per line, in the order: campus attack, campus benign,
# Azure attack, Azure benign.
for flows in (
    campus_dataset_attack,
    campus_dataset_benign,
    azure_dataset_attack,
    azure_dataset_benign,
):
    print(len(flows))

In [None]:
# Number of columns in the benign campus frame (shown as the cell output).
campus_dataset_benign.shape[1]

In [None]:
# Columns removed from both datasets before modelling.
unused_columns = [
    'Flow ID',
    'Src IP',
    'Dst IP',
    'Src Port',
    'Timestamp',
    'Protocol',    # always tcp
    'Label',       # empty
]

campus_dataset = campus_dataset.drop(columns=unused_columns)
azure_dataset = azure_dataset.drop(columns=unused_columns)

# Turn +/-inf into NaN so that a single dropna() removes every row holding
# a non-finite or missing value.
non_finite = [np.inf, -np.inf]

campus_dataset = campus_dataset.replace(non_finite, np.nan).dropna(axis=0)
azure_dataset = azure_dataset.replace(non_finite, np.nan).dropna(axis=0)

In [None]:
# Name of the label column added when the CSV files were loaded.
target_variable = 'Class'

# Every remaining column except the label is a model input. The list keeps
# the DataFrame's own column order: deriving it from a set would make the
# order depend on string hashing, which can change between interpreter
# sessions and so make the fitted trees and plots irreproducible.
features = [
    column for column in campus_dataset.columns if column != target_variable
]

# The campus-derived feature list is applied to both datasets so the two
# feature matrices share the same columns in the same order.
x_campus = campus_dataset[features]
y_campus = campus_dataset[target_variable]
x_azure = azure_dataset[features]
y_azure = azure_dataset[target_variable]

In [None]:
# Hold out 25% of each dataset for evaluation. A fixed seed makes the split,
# and therefore every metric reported from it, reproducible across runs.
split_seed = 42

x_campus_train, x_campus_test, y_campus_train, y_campus_test = train_test_split(
    x_campus, y_campus, test_size=0.25, random_state=split_seed
)
x_azure_train, x_azure_test, y_azure_train, y_azure_test = train_test_split(
    x_azure, y_azure, test_size=0.25, random_state=split_seed
)

### Campus exploration

In [None]:
# Fit a shallow (depth-2) decision tree on the campus training split only.
clf = DecisionTreeClassifier(max_depth=2)
clf.fit(x_campus_train, y_campus_train)

# Same-dataset evaluation: the model is scored on the held-out campus test
# split (the printed label refers to the dataset the model was trained on).
y_pred = clf.predict(x_campus_test)
print("campus dataset training accuracy: ")
print(metrics.classification_report(y_campus_test, y_pred))
print(metrics.confusion_matrix(y_campus_test, y_pred))

# Cross-dataset evaluation: the campus-trained model scored on the Azure
# test split.
y_pred = clf.predict(x_azure_test)
print("Azure dataset testing accuracy: ")
print(metrics.classification_report(y_azure_test, y_pred))
print(metrics.confusion_matrix(y_azure_test, y_pred))


# Draw the fitted tree. feature_names is passed as a plain list rather than
# a pandas Index, because some scikit-learn releases reject an Index during
# parameter validation.
fig = plt.figure(figsize=(25,20))
_ = plot_tree(
    clf,
    feature_names=list(x_campus.columns),
    class_names=['benign', 'attack'],
    filled=True,
    max_depth=2,
)

### Azure exploration

In [None]:
# Fit a shallow (depth-2) decision tree on the Azure *training* split only.
# Fitting on the full x_azure/y_azure would include the rows that are scored
# below as x_azure_test, leaking test data into training and inflating the
# reported Azure metrics.
clf = DecisionTreeClassifier(max_depth=2)
clf.fit(x_azure_train, y_azure_train)

# Same-dataset evaluation: the model is scored on the held-out Azure test
# split (the printed label refers to the dataset the model was trained on).
y_pred = clf.predict(x_azure_test)
print("Azure dataset training accuracy: ")
print(metrics.classification_report(y_azure_test, y_pred))
print(metrics.confusion_matrix(y_azure_test, y_pred))

# Cross-dataset evaluation: the Azure-trained model scored on the campus
# test split.
y_pred = clf.predict(x_campus_test)
print("campus dataset testing accuracy: ")
print(metrics.classification_report(y_campus_test, y_pred))
print(metrics.confusion_matrix(y_campus_test, y_pred))


# Draw the fitted tree. feature_names is passed as a plain list rather than
# a pandas Index, because some scikit-learn releases reject an Index during
# parameter validation.
fig = plt.figure(figsize=(25,20))
_ = plot_tree(
    clf,
    feature_names=list(x_azure.columns),
    class_names=['benign', 'attack'],
    filled=True,
    max_depth=2,
)