In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [23]:
# Load the preprocessed capture CSV into a DataFrame.
DATA_PATH = "data/preprocessed_CTU-IoT-Malware-Capture-21-1.csv"
df = pd.read_csv(DATA_PATH)

In [29]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id.resp_h,proto,service,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label
0,8,2,1,-0.031471,-0.12058,-0.091638,2,-0.028072,1,-0.031437,-0.033291,-0.032746,-0.035204,0
1,8,2,1,-0.031469,0.083804,0.114779,5,-0.028072,2,-0.025729,-0.022584,-0.021844,-0.016371,0
2,8,2,1,-0.025755,0.056852,-0.089369,2,-0.028072,1,-0.025729,-0.02421,-0.032746,-0.035204,0
3,8,2,1,-0.031469,0.083804,0.114779,5,-0.028072,2,-0.025729,-0.022584,-0.021844,-0.016371,0
4,8,2,1,-0.025717,0.144445,0.232731,5,-0.028072,2,-0.020022,-0.015129,-0.021844,-0.009663,0


# Train Test Split

In [58]:
# Build a train/test split that deliberately places most (80%) of each class,
# and therefore most malware examples, in the test set.
TEST_FRACTION = 0.8
SPLIT_SEED = 42

# Columns that must not be used as model inputs:
#   - 'label' is the target.
#   - 'Unnamed: 0' appears to be the row index that was saved into the CSV
#     (0, 1, 2, ... in df.head()). It is not a connection feature and, unlike
#     the standardised numeric columns, it is unscaled, so it would dominate
#     KNN distances and leak row order into the model.
NON_FEATURE_COLUMNS = ['label', 'Unnamed: 0']

malware = df[df['label'] == 1]
malware_test = malware.sample(frac=TEST_FRACTION, random_state=SPLIT_SEED)
malware_train = malware.drop(malware_test.index)

benign = df[df['label'] == 0]
benign_test = benign.sample(frac=TEST_FRACTION, random_state=SPLIT_SEED)
benign_train = benign.drop(benign_test.index)

train = pd.concat([malware_train, benign_train])
test = pd.concat([malware_test, benign_test])

# errors='ignore' keeps this cell working if the CSV is later loaded with
# index_col=0 and the 'Unnamed: 0' column no longer exists.
X_train = train.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_train = train['label']

X_test = test.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_test = test['label']



# Models
- Isolation Forest
- One-Class SVM (Support Vector Machine)

Other candidates
- K-Nearest Neighbors (KNN)
- DBSCAN


In [61]:
from sklearn.neighbors import KNeighborsClassifier

# Fit one KNN classifier per neighbourhood size on the training split.
# `fit` returns the estimator itself, so the fitted models can be collected
# directly from the comprehension.
neighbor_counts = (1, 2, 3, 5)
fitted_knns = [
    KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    for k in neighbor_counts
]
knn_1, knn_2, knn_3, knn_5 = fitted_knns

# Predict the test split with each fitted model, in the same order as above.
y_pred_1, y_pred_2, y_pred_3, y_pred_5 = [
    model.predict(X_test) for model in fitted_knns
]

In [65]:
# Compute a flattened confusion matrix (tn, fp, fn, tp) for each KNN model.
from sklearn.metrics import confusion_matrix

# Pin the label order explicitly. Without `labels`, confusion_matrix infers
# the classes from the data, so if only one class shows up in both y_test and
# the predictions the result is 1x1 and `.ravel()` no longer yields the four
# values (tn, fp, fn, tp) that the summary table expects.
CLASS_LABELS = [0, 1]  # 0 = benign, 1 = malware

cm_1, cm_2, cm_3, cm_5 = [
    confusion_matrix(y_test, y_pred, labels=CLASS_LABELS).ravel()
    for y_pred in (y_pred_1, y_pred_2, y_pred_3, y_pred_5)
]

In [70]:
# Gather the flattened confusion matrices into one table, one row per model,
# keyed by the number of neighbours used.
cm_by_neighbors = {1: cm_1, 2: cm_2, 3: cm_3, 5: cm_5}
cm_df = pd.DataFrame(
    list(cm_by_neighbors.values()),
    index=list(cm_by_neighbors),
    columns=['tn', 'fp', 'fn', 'tp'],
).rename_axis('neighbors')  # name the index after what it represents
cm_df.head()

Unnamed: 0_level_0,tn,fp,fn,tp
neighbors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2618,0,0,11
2,2618,0,2,9
3,2616,2,2,9
5,2618,0,11,0


# Conclusion
Here we can see that the model with a single neighbour outperformed the rest. Remarkably, 80% of the malicious cases were in the <b>test set</b>, so the model had very few anomalies to learn from and was still able to predict all of them correctly.

When comparing it to the Isolation Forest model, this is clearly a better model, as it was able to detect all the anomalies. However, it is worth noting that the Isolation Forest model is unsupervised, which gives it a clear disadvantage when compared to the KNN model, which is supervised.