In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("data/preprocessed_CTU-IoT-Malware-Capture-21-1.csv")
df.head()

Unnamed: 0,id.resp_h,proto,service,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label
0,8,2,1,-0.031471,-0.12058,-0.091638,2,-0.028072,1,-0.031437,-0.033291,-0.032746,-0.035204,0
1,8,2,1,-0.031469,0.083804,0.114779,5,-0.028072,2,-0.025729,-0.022584,-0.021844,-0.016371,0
2,8,2,1,-0.025755,0.056852,-0.089369,2,-0.028072,1,-0.025729,-0.02421,-0.032746,-0.035204,0
3,8,2,1,-0.031469,0.083804,0.114779,5,-0.028072,2,-0.025729,-0.022584,-0.021844,-0.016371,0
4,8,2,1,-0.025717,0.144445,0.232731,5,-0.028072,2,-0.020022,-0.015129,-0.021844,-0.009663,0


# Train Test Split

In [16]:
X = df.drop(['label'], axis=1)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, df["label"], test_size=0.2, random_state=42)

# Support Vector Machines


##### What's a hyperplane?

A hyperplane is a subspace of one dimension less than its ambient space. If a space is 3-dimensional then its hyperplanes are the 2-dimensional planes, while if the space is 2-dimensional, its hyperplanes are the 1-dimensional lines. In general, a hyperplane of an n-dimensional space is an (n-1)-dimensional subspace. So, in the xy plane, a hyperplane is a line. In the xyz space, a hyperplane is a plane. In the xyzw space, a hyperplane is a 3-dimensional space. And so on.

<img src="https://images.deepai.org/glossary-terms/3bb86574825445cba73a67222b744648/hyperplane.png" width=400>

The SVM algorithm tries to find a hyperplane that separates the data into two classes, as seen in the picture above. The hyperplane is chosen so that the distance from it to the nearest data point on each side (the margin) is maximized. The data points that are closest to the hyperplane are called support vectors.

<img src="https://www.researchgate.net/publication/332248436/figure/fig5/AS:864758303563793@1583185854982/Support-Vector-Machine-visualization.png" width=400>

#### Choosing hyperparameters
- Kernel: linear, polynomial, radial basis function (RBF), sigmoid. 
- Gamma: Kernel coefficient for RBF, polynomial, sigmoid.
- C: Penalty parameter C of the error term.
    - C controls the trade-off between maximizing the margin and minimizing the classification error. A smaller C makes the decision boundary more robust to misclassified data points, while a larger C allows the model to classify more training points correctly.
    - A higher C may lead to overfitting, while a lower C may lead to underfitting.  


In [21]:
svm_linear = svm.SVC(kernel='linear', C=1, gamma=1)
svm_linear.fit(X_train, y_train)
y_pred_linear = svm_linear.predict(X_test)

svm_rbf = svm.SVC(kernel='rbf', C=1, gamma=1)
svm_rbf.fit(X_train, y_train)
y_pred_rbf = svm_rbf.predict(X_test)

svm_sig = svm.SVC(kernel='sigmoid', C=1, gamma=1)
svm_sig.fit(X_train, y_train)
y_pred_sig = svm_sig.predict(X_test)

svm_poly = svm.SVC(kernel='poly', C=1, gamma=1)
svm_poly.fit(X_train, y_train)
y_pred_poly = svm_poly.predict(X_test)

In [23]:
# Compare the accuracy of the models with F1 score
from sklearn.metrics import f1_score

print("Linear SVM F1 score: ", f1_score(y_test, y_pred_linear, average='weighted'))
print("RBF SVM F1 score: ", f1_score(y_test, y_pred_rbf, average='weighted'))
print("Sigmoid SVM F1 score: ", f1_score(y_test, y_pred_sig, average='weighted'))
print("Polynomial SVM F1 score: ", f1_score(y_test, y_pred_poly, average='weighted'))


Linear SVM F1 score:  0.9985882167180219
RBF SVM F1 score:  1.0
Sigmoid SVM F1 score:  0.9931663028355677
Polynomial SVM F1 score:  1.0


As we can see here, most models perform almost perfectly, taking into account that we are using a rather small dataset.

# Big dataset


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm

df_big = pd.read_csv("/home/pedranji/Projects/iot/data/preprocessed_CTU-IoT-Malware-Capture-48-1.csv")

X = df_big.drop(['label'], axis=1)
y = df_big['label']

X_train, X_test, y_train, y_test = train_test_split(X, df_big["label"], test_size=0.2, random_state=42)

In [9]:
y_train

1384437    1
3233363    1
1941322    1
1618321    1
3162842    1
          ..
1692743    1
2356330    1
2229084    1
2768307    1
2219110    1
Name: label, Length: 2715470, dtype: int64

In [6]:
# print shapes
print("X_train: ", X_train.shape)
print("X_test: ", X_test.shape)
print("y_train: ", y_train.shape)
print("y_test: ", y_test.shape)

X_train:  (2715470, 7)
X_test:  (678868, 7)
y_train:  (2715470,)
y_test:  (678868,)


In [7]:
# how many malicious cases are in each set?
print("Malicious cases in y_train: ", sum(y_train))
print("Malicious cases in y_test: ", sum(y_test))

Malicious cases in y_train:  2712468
Malicious cases in y_test:  678136


In [4]:
svm_linear = svm.SVC(kernel='linear', C=1, gamma=1)
svm_linear.fit(X_train, y_train)
y_pred_linear = svm_linear.predict(X_test)

svm_rbf = svm.SVC(kernel='rbf', C=1, gamma=1)
svm_rbf.fit(X_train, y_train)
y_pred_rbf = svm_rbf.predict(X_test)

svm_sig = svm.SVC(kernel='sigmoid', C=1, gamma=1)
svm_sig.fit(X_train, y_train)
y_pred_sig = svm_sig.predict(X_test)

svm_poly = svm.SVC(kernel='poly', C=1, gamma=1)
svm_poly.fit(X_train, y_train)
y_pred_poly = svm_poly.predict(X_test)

In [5]:
# Compare the accuracy of the models with F1 score
from sklearn.metrics import f1_score

print("Linear SVM F1 score: ", f1_score(y_test, y_pred_linear, average='weighted'))
print("RBF SVM F1 score: ", f1_score(y_test, y_pred_rbf, average='weighted'))
print("Sigmoid SVM F1 score: ", f1_score(y_test, y_pred_sig, average='weighted'))
print("Polynomial SVM F1 score: ", f1_score(y_test, y_pred_poly, average='weighted'))

Linear SVM F1 score:  1.0
RBF SVM F1 score:  0.9999985264566973
Sigmoid SVM F1 score:  0.9998667870362798
Polynomial SVM F1 score:  1.0


As we can see, the models perform almost perfectly, even now that we are using the big dataset. Using bigger datasets is usually better for the model, since it has more examples to train. However, a larger dataset can have more noise, or outliers, which can lead to overfitting.
