In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
from sklearn import svm
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
# columns = ['Timestamp', 'CAN ID', 'DLC', 'DATA[0]', 'DATA[1]', 'DATA[2]', 'DATA[3]', 'DATA[4]', 'DATA[5]', 'DATA[6]', 'DATA[7]', 'Flag']
# read_table is called with its default separator, so every comma-separated
# record ends up whole, as one string, in column 0; the following cells split
# that string on ',' to build the individual feature columns.
dos = pd.read_table('../dataset/DoS_dataset.csv', header=None)
# File name fixed: it previously read 'uzzy_dataset.csv' (leading 'F' missing).
fuzzy = pd.read_table('../dataset/Fuzzy_dataset.csv', header=None)
gear = pd.read_table('../dataset/gear_dataset.csv', header=None)
rpm = pd.read_table('../dataset/RPM_dataset.csv', header=None)

In [4]:
# Class label (column 4), derived from the last field of each raw record:
# a trailing 'T' marks an attack frame of the file's attack type, anything
# else is labelled 'Normal'. Labels are the strings Normal / DoS / Fuzzy / Spoof.
# NOTE(review): the recorded classifier reports in this notebook list separate
# 'Spoof Gear' and 'Spoof RPM' classes, while this cell gives both files the
# single label 'Spoof' - confirm which labelling is intended.
def _label_with(attack_name):
    """Build a record -> label function for one attack type."""
    def _label(row):
        flag = row.rsplit(',', 1)[-1]
        if flag == 'T':
            return attack_name
        return 'Normal'
    return _label

for frame, attack_name in ((dos, 'DoS'), (fuzzy, 'Fuzzy'), (gear, 'Spoof'), (rpm, 'Spoof')):
    frame[4] = frame[0].apply(_label_with(attack_name))

In [5]:
# Combine all hex bytes of DATA into a single integer feature (column 2).
def _payload_to_int(row):
    """Parse the DATA fields of one raw record as a single hexadecimal integer.

    The DATA fields are every comma-separated field after the DLC (index 2)
    and before the trailing flag. They are concatenated in order and read as
    one base-16 number.

    A record that carries no DATA fields returns 0; previously the empty
    string led to int("0x", 16), which raises ValueError.
    """
    hex_digits = "".join(row.split(',')[3:-1])
    return int("0x" + (hex_digits or "0"), 16)

for frame in (dos, fuzzy, gear, rpm):
    frame[2] = frame[0].apply(_payload_to_int)

In [6]:
# DLC: third comma-separated field of the raw record, stored as text in column 3.
def _dlc_field(row):
    """Return the DLC field (index 2) of a raw record, unchanged, as a string."""
    return row.split(',', 3)[2]

for frame in (dos, fuzzy, gear, rpm):
    frame[3] = frame[0].apply(_dlc_field)

In [7]:
# CAN ID: second field of the raw record, hexadecimal text -> integer (column 1).
def _can_id(row):
    """Parse the hexadecimal CAN ID (field index 1) of a raw record into an int."""
    fields = row.split(',')
    return int("0x" + fields[1], 16)

for frame in (dos, fuzzy, gear, rpm):
    frame[1] = frame[0].apply(_can_id)

In [8]:
# Column 0 has held the whole raw record up to here; overwrite it with just the
# first field (the timestamp text). The cells that split column 0 into the other
# columns must therefore run before this one, and this cell is not meant to be
# followed by a re-run of them.
for frame in (dos, fuzzy, gear, rpm):
    frame.loc[:, 0] = frame[0].apply(lambda row: row.split(',')[0])

In [9]:
# Stack the four per-file frames row-wise into one dataset.
# The original row labels of each frame are kept (no ignore_index).
frames = [dos, fuzzy, gear, rpm]
data = pd.concat(frames)

In [10]:
# Separate the label column (4) from the features.
# pop() removes the column in place, so x_data and data refer to the same
# frame, which no longer contains column 4 after this cell.
x_data = data
y_data = x_data.pop(4)

In [11]:
# Hold out 20% of the rows as the test set; the seed is fixed so the split
# is the same on every run.
train_x, test_x, train_y, test_y = train_test_split(
    x_data,
    y_data,
    test_size=0.20,
    random_state=0,
)

In [12]:
# Renumber the rows of every split from 0, discarding the old index.
for split_part in (train_x, test_x, train_y, test_y):
    split_part.reset_index(drop=True, inplace=True)

In [13]:
# Sanity check: feature and label shapes of the train split, then the test split.
for features, labels in ((train_x, train_y), (test_x, test_y)):
    print(features.shape, labels.shape)

(13255580, 4) (13255580,)
(3313895, 4) (3313895,)


In [14]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied to both splits. train_x / test_x are rebound to the scaled
# output, so this cell should be run once per split.
scaler = StandardScaler()
scaler.fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [15]:
def printReport(test_y,predict_y):
  """Print accuracy, confusion matrix and per-class report for a set of predictions.

  test_y    -- true labels
  predict_y -- predicted labels, passed with test_y to the sklearn metric functions

  Returns None; all results are written to stdout.
  """
  print('Accuracy: %.6f' % accuracy_score(test_y, predict_y))

  # One row per true class and one column per predicted class: in the recorded
  # runs each row sums to that class's support. Correct predictions sit on the
  # diagonal.
  print('Confusion Matrix')
  matrix = confusion_matrix(test_y,predict_y)
  print(matrix)

  # digits=6 prints six decimals; zero_division=1 makes a metric with a zero
  # denominator (e.g. precision of a class that is never predicted) show as 1.
  report = classification_report(test_y,predict_y,digits=6,zero_division=1)
  print(report)

In [16]:
# k-nearest-neighbours classifier: n neighbours vote, Minkowski metric,
# KD-tree neighbour search, all CPU cores.
n=5
classifier = KNeighborsClassifier(
    n_neighbors=n,
    metric='minkowski',
    algorithm='kd_tree',
    n_jobs=-1,
).fit(train_x,train_y)

predict_y = classifier.predict(test_x)

print('K-Nearest Neighbour Classifier')
printReport(test_y,predict_y)

K-Nearest Neighbour Classifier
Accuracy: 0.999748
Confusion Matrix
[[ 117727       0       0       0       0]
 [      0   97406     796       0       0]
 [      0      40 2847870       0       0]
 [      0       0       0  119226       0]
 [      0       0       0       0  130830]]
              precision    recall  f1-score   support

         DoS   1.000000  1.000000  1.000000    117727
       Fuzzy   0.999590  0.991894  0.995727     98202
      Normal   0.999721  0.999986  0.999853   2847910
  Spoof Gear   1.000000  1.000000  1.000000    119226
   Spoof RPM   1.000000  1.000000  1.000000    130830

    accuracy                       0.999748   3313895
   macro avg   0.999862  0.998376  0.999116   3313895
weighted avg   0.999748  0.999748  0.999747   3313895



In [17]:
# Multinomial logistic regression fitted with the SAG solver.
# `multi_class='multinomial'` is no longer passed: the argument is deprecated in
# current scikit-learn releases, and with a non-liblinear solver and more than
# two classes the default already fits a multinomial model, so the fitted
# classifier is the same.
classifier = LogisticRegression(solver='sag', random_state=0, max_iter=1000, n_jobs=-1)
classifier.fit(train_x,train_y)

predict_y = classifier.predict(test_x)

print('Logistic Regression')
printReport(test_y,predict_y)

Logistic Regression
Accuracy: 0.886328
Confusion Matrix
[[ 117727       0       0       0       0]
 [      2       0   97680     520       0]
 [      0       0 2819472   26520    1918]
 [      0       0  119226       0       0]
 [      0       0  130830       0       0]]
              precision    recall  f1-score   support

         DoS   0.999983  1.000000  0.999992    117727
       Fuzzy   1.000000  0.000000  0.000000     98202
      Normal   0.890207  0.990014  0.937462   2847910
  Spoof Gear   0.000000  0.000000  0.000000    119226
   Spoof RPM   0.000000  0.000000  0.000000    130830

    accuracy                       0.886328   3313895
   macro avg   0.578038  0.398003  0.387491   3313895
weighted avg   0.830189  0.886328  0.841165   3313895



In [18]:
# Random forest with default hyper-parameters, a fixed seed and all CPU cores.
model = RandomForestClassifier(n_jobs=-1, random_state=0).fit(train_x, train_y)
predict_y = model.predict(test_x)

print('Random Forest Classifier')
printReport(test_y,predict_y)

Random Forest Classifier
Accuracy: 0.999985
Confusion Matrix
[[ 117727       0       0       0       0]
 [      0   98154      48       0       0]
 [      0       1 2847909       0       0]
 [      0       0       0  119226       0]
 [      0       0       0       0  130830]]
              precision    recall  f1-score   support

         DoS   1.000000  1.000000  1.000000    117727
       Fuzzy   0.999990  0.999511  0.999750     98202
      Normal   0.999983  1.000000  0.999991   2847910
  Spoof Gear   1.000000  1.000000  1.000000    119226
   Spoof RPM   1.000000  1.000000  1.000000    130830

    accuracy                       0.999985   3313895
   macro avg   0.999995  0.999902  0.999948   3313895
weighted avg   0.999985  0.999985  0.999985   3313895



In [None]:
# RBF-kernel support vector machine with a one-vs-rest decision function.
# NOTE(review): this cell has no execution count and no recorded output; fitting
# a kernel SVC on a training set of the size printed earlier in the notebook may
# not finish in practical time - confirm, or train on a subsample.
classifier = svm.SVC(
    kernel='rbf',
    decision_function_shape='ovr',
    random_state=0,
)
classifier.fit(train_x,train_y)
predict_y = classifier.predict(test_x)

print('Support Vector Machine')
printReport(test_y,predict_y)