In [1]:
import os
import sys
sys.path.insert(0, os.getcwd() + "\\code")

import numpy as np
import pandas as pd
import math

import json
import helper_functions
from importlib import reload

from timeit import default_timer as timer
import sklearn.metrics
import sklearn.preprocessing
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM               # Takes too long with more than 200,000 rows
# from sklearn.linear_model import SGDOneClassSVM

In [2]:
# Show the working directory: the relative "road/..." paths used below are resolved against it
print(os.getcwd())

c:\Users\sshar\Desktop\CAN IDS Benchmarking


# Ambient Data

In [3]:
# Read in required dataset
# df_ambient = helper_functions.make_can_df("road/ambient/ambient_dyno_drive_extended_long.log")
df_ambient = helper_functions.make_can_df("road/ambient/ambient_dyno_drive_basic_long.log")

# Take subset only: OneClassSVM training takes too long with more than 200,000 rows.
# .copy() detaches the subset from the full frame so that the column assignments in the
# following cells modify an independent DataFrame instead of a slice of the original
# (which would raise SettingWithCopyWarning).
N_AMBIENT_ROWS = 200000
df_ambient = df_ambient.iloc[:N_AMBIENT_ROWS].copy()
df_ambient.head()

Unnamed: 0,time,aid,data
0,0.0,813,0000042758010000
1,0.001019,1694,0440047E1FC01542
2,0.00102,293,9000401F41BE7960
3,0.002916,737,0000000000000004
4,0.003937,852,1FFF40000003B680


In [4]:
# Break the hex payload string into eight 2-character columns data0 .. data7
for byte_idx in range(8):
    offset = 2 * byte_idx
    df_ambient[f"data{byte_idx}"] = df_ambient["data"].str[offset:offset + 2]

# The combined payload column is no longer needed once the byte columns exist
df_ambient.drop(columns=["data"], inplace=True)
df_ambient.head()

Unnamed: 0,time,aid,data0,data1,data2,data3,data4,data5,data6,data7
0,0.0,813,00,00,4,27,58,01,00,0
1,0.001019,1694,04,40,4,7E,1F,C0,15,42
2,0.00102,293,90,00,40,1F,41,BE,79,60
3,0.002916,737,00,00,0,00,00,00,00,4
4,0.003937,852,1F,FF,40,00,00,03,B6,80


In [5]:
# Label all messages in ambient dataframe as benign
# (the scalar False is broadcast to every row of the new "actual_attack" column)
df_ambient["actual_attack"] = False
df_ambient.head()

Unnamed: 0,time,aid,data0,data1,data2,data3,data4,data5,data6,data7,actual_attack
0,0.0,813,00,00,4,27,58,01,00,0,False
1,0.001019,1694,04,40,4,7E,1F,C0,15,42,False
2,0.00102,293,90,00,40,1F,41,BE,79,60,False
3,0.002916,737,00,00,0,00,00,00,00,4,False
4,0.003937,852,1F,FF,40,00,00,03,B6,80,False


In [6]:
# Total number of messages (row count of the ambient frame after the subset taken above)
print("Total no. of messages =", len(df_ambient))

Total no. of messages = 200000


In [7]:
# Breakdown by AIDs
print("No. of unique AIDs =", df_ambient["aid"].nunique())

# One row per arbitration ID with its message count, most frequent first
aid_counts = (
    df_ambient["aid"]
    .value_counts()
    .rename_axis("aid")
    .reset_index(name="counts")
    .sort_values(by=["counts"], ascending=False)
)

# Written relative to the working directory (the project folder printed at the top of the
# notebook) instead of a hard-coded absolute user path, so the notebook runs on other machines.
aid_counts.to_csv("ambient_aidcounts.csv", index=False)
del aid_counts

No. of unique AIDs = 105


In [8]:
byte_cols = [f"data{k}" for k in range(8)]
feature_cols = ["aid"] + byte_cols

# Parse each 2-character hex string into its integer value
for name in byte_cols:
    df_ambient[name] = df_ambient[name].apply(lambda hex_byte: int(hex_byte, 16))

# Scale to [0, 1]. The scaler is fitted on the ambient data only; the same fitted
# scaler is later applied (transform only) to the attack captures.
scaler = sklearn.preprocessing.MinMaxScaler()
df_ambient[feature_cols] = scaler.fit_transform(df_ambient[feature_cols])
display(df_ambient.head())

Unnamed: 0,time,aid,data0,data1,data2,data3,data4,data5,data6,data7,actual_attack
0,0.0,0.452862,0.0,0.0,0.015686,0.152941,0.345098,0.003922,0.0,0.0,False
1,0.001019,0.94725,0.015686,0.25098,0.015686,0.494118,0.121569,0.752941,0.082353,0.258824,False
2,0.00102,0.161055,0.564706,0.0,0.25098,0.121569,0.254902,0.745098,0.47451,0.376471,False
3,0.002916,0.410213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015686,False
4,0.003937,0.474747,0.121569,1.0,0.25098,0.0,0.0,0.011765,0.713725,0.501961,False


# Attack Data

In [9]:
reload(helper_functions)

# Directory holding the attack captures and their metadata file
attacks_dir = os.path.join(os.getcwd(), "road", "attacks")

with open(os.path.join(attacks_dir, "capture_metadata.json"), "r") as read_file:
    attack_dict = json.load(read_file)

attack_metadata = []   # per capture: [(injection interval as a tuple)]
attack_names = []      # per capture: file name without extension (= key in attack_dict)
df_attack_aggr = []    # per capture: labelled DataFrame

# Files whose name contains one of these substrings are not loaded
skip_tokens = ("metadata", "accelerator", "engine")

# os.listdir() returns entries in arbitrary order; sorting keeps the capture indices
# (used to look captures up by position later on) stable between runs and machines.
attack_files = [
    entry for entry in sorted(os.listdir(attacks_dir))
    if not any(token in entry for token in skip_tokens)
]

for count, filename in enumerate(attack_files):
    print(count, filename)

    capture_name = os.path.splitext(filename)[0]
    capture_info = attack_dict[capture_name]

    # creating dataframe from log
    df_attack = helper_functions.make_can_df(os.path.join(attacks_dir, filename))

    # printing AIDs of injections; the metadata uses "XXX" instead of a hex ID for
    # some captures (the fuzzing captures in the recorded output), which is passed on as-is
    injection_id = capture_info["injection_id"]
    if injection_id != "XXX":
        injection_aid = int(injection_id, 16)
        print(injection_id, injection_aid)
    else:
        injection_aid = "XXX"
        print(injection_id)

    # adding injection interval to attack_metadata
    attack_metadata.append([tuple(capture_info["injection_interval"])])

    # adding attack name to attack_names
    attack_names.append(capture_name)

    # creating new column to label attack messages
    df_attack = helper_functions.add_actual_attack_col(
        df_attack,
        attack_metadata[count],
        injection_aid,
        capture_info["injection_data_str"],
    )

    # Append the dataframe to the aggregate
    df_attack_aggr.append(df_attack)

# Drop the loop's temporary name; it only exists if at least one capture was loaded
if attack_files:
    del df_attack

0 correlated_signal_attack_1.log
0x6e0 1760
1 correlated_signal_attack_1_masquerade.log
0x6e0 1760
2 correlated_signal_attack_2.log
0x6e0 1760
3 correlated_signal_attack_2_masquerade.log
0x6e0 1760
4 correlated_signal_attack_3.log
0x6e0 1760
5 correlated_signal_attack_3_masquerade.log
0x6e0 1760
6 fuzzing_attack_1.log
XXX
7 fuzzing_attack_2.log
XXX
8 fuzzing_attack_3.log
XXX
9 max_speedometer_attack_1.log
0xd0 208
10 max_speedometer_attack_1_masquerade.log
0xd0 208
11 max_speedometer_attack_2.log
0xd0 208
12 max_speedometer_attack_2_masquerade.log
0xd0 208
13 max_speedometer_attack_3.log
0xd0 208
14 max_speedometer_attack_3_masquerade.log
0xd0 208
15 reverse_light_off_attack_1.log
0xd0 208
16 reverse_light_off_attack_1_masquerade.log
0xd0 208
17 reverse_light_off_attack_2.log
0xd0 208
18 reverse_light_off_attack_2_masquerade.log
0xd0 208
19 reverse_light_off_attack_3.log
0xd0 208
20 reverse_light_off_attack_3_masquerade.log
0xd0 208
21 reverse_light_on_attack_1.log
0xd0 208
22 reverse_

In [10]:
# Sanity check: name and labelled contents of the first loaded attack capture
print(attack_names[0])
display(df_attack_aggr[0])

correlated_signal_attack_1


Unnamed: 0,time,aid,data,actual_attack
0,0.000000,1505,893FC00B0A013880,False
1,0.000001,651,0000000000000000,False
2,0.000003,167,0010FA24D12E00A0,False
3,0.000004,208,4A7704600201F000,False
4,0.000997,51,000698000E4207D0,False
...,...,...,...,...
76231,33.095972,651,0000000000000000,False
76232,33.096941,51,177FA9788DC007D0,False
76233,33.096943,167,00510BA5212BA0A0,False
76234,33.096944,61,0000020000000000,False


In [11]:
# Splitting the data column into 8 data columns, and then rearranging the columns in each attack dataframe
byte_cols = ["data" + str(i) for i in range(8)]
column_order = ["time", "aid"] + byte_cols + ["actual_attack"]

for j, frame in enumerate(df_attack_aggr):
    # Byte i of the payload is the 2-character slice starting at position 2*i
    for i, name in enumerate(byte_cols):
        frame[name] = frame["data"].str[2 * i:2 * i + 2]

    # Remove the combined payload column and store the frame with the final column order
    df_attack_aggr[j] = frame.drop(columns=["data"])[column_order]

In [12]:
# Check the split byte columns and the column order on the first attack capture
df_attack_aggr[0].head()

Unnamed: 0,time,aid,data0,data1,data2,data3,data4,data5,data6,data7,actual_attack
0,0.0,1505,89,3F,C0,0B,0A,01,38,80,False
1,1e-06,651,00,00,00,00,00,00,00,00,False
2,3e-06,167,00,10,FA,24,D1,2E,00,A0,False
3,4e-06,208,4A,77,04,60,02,01,F0,00,False
4,0.000997,51,00,06,98,00,0E,42,07,D0,False


In [13]:
# for df in df_attack_aggr:
#     display(df[['data0', 'data1', 'data2', 'data3', 'data4', 'data5', 'data6', 'data7']].apply(min))
#     display(df[['data0', 'data1', 'data2', 'data3', 'data4', 'data5', 'data6', 'data7']].apply(max))

In [14]:
# Min-max scale every attack capture with the scaler that was fitted on the ambient data
# (transform only, so the attack data does not influence the scaling parameters)
byte_cols = [f"data{k}" for k in range(8)]
feature_cols = ["aid"] + byte_cols

for frame in df_attack_aggr:
    # Parse each 2-character hex string into its integer value
    for name in byte_cols:
        frame[name] = frame[name].apply(lambda hex_byte: int(hex_byte, 16))

    # Scale
    frame[feature_cols] = scaler.transform(frame[feature_cols])

In [15]:
# Confirm the column dtypes of the first attack capture after hex conversion and scaling
print(df_attack_aggr[0].dtypes)

time             float64
aid              float64
data0            float64
data1            float64
data2            float64
data3            float64
data4            float64
data5            float64
data6            float64
data7            float64
actual_attack       bool
dtype: object


In [16]:
# Getting class balance
class_rows = []
for name, frame in zip(attack_names, df_attack_aggr):
    total = len(frame)
    # Summing the boolean label column counts the True entries. Unlike
    # value_counts()[True] / [False], this does not raise KeyError when a capture
    # contains only one of the two classes.
    n_attack = int(frame["actual_attack"].sum())
    n_benign = total - n_attack
    class_rows.append({
        "attack_name": name,
        "total_messages": total,
        "benign_messages": n_benign,
        "attack_messages": n_attack,
        # Percentages are undefined for an empty capture
        "benign_percent": n_benign / total * 100 if total else float("nan"),
        "attack_percent": n_attack / total * 100 if total else float("nan"),
    })

df_class = pd.DataFrame(
    class_rows,
    columns=["attack_name", "total_messages", "benign_messages", "attack_messages", "benign_percent", "attack_percent"],
)
display(df_class)
# df_class.to_csv("c:\\Users\\sshar\\Desktop\\CAN IDS Benchmarking\\attack_class_breakdown.csv", index = None)
del df_class

Unnamed: 0,attack_name,total_messages,benign_messages,attack_messages,benign_percent,attack_percent
0,correlated_signal_attack_1,76236,74150,2086,97.26376,2.73624
1,correlated_signal_attack_1_masquerade,74150,72064,2086,97.186784,2.813216
2,correlated_signal_attack_2,65398,63258,2140,96.727729,3.272271
3,correlated_signal_attack_2_masquerade,63258,61118,2140,96.617029,3.382971
4,correlated_signal_attack_3,39265,38001,1264,96.780848,3.219152
5,correlated_signal_attack_3_masquerade,38001,36737,1264,96.673772,3.326228
6,fuzzing_attack_1,46246,45655,591,98.722052,1.277948
7,fuzzing_attack_2,30315,29964,351,98.842157,1.157843
8,fuzzing_attack_3,12401,12287,114,99.080719,0.919281
9,max_speedometer_attack_1,200106,197662,2444,98.778647,1.221353


# Evaluation

In [17]:
def saveMetrics(df_results, alg_num, out_dir="."):
    """Compute per-capture detection metrics, display them and save them as a CSV file.

    Parameters
    ----------
    df_results : list of pandas.DataFrame
        One frame per attack capture, in the same order as the notebook-level
        ``attack_names`` list. Each frame must provide the columns
        ``actual_attack`` and ``predicted_attack`` with True marking an attack.
    alg_num : int or str
        Algorithm identifier; only used to build the output file name
        ``alg<alg_num>_confusion.csv``.
    out_dir : str, optional
        Directory the CSV file is written to. Defaults to the current working
        directory, so no machine-specific absolute path is needed.

    Returns
    -------
    None
        The metrics table is shown with ``display`` and written to disk.
    """

    def _ratio(numerator, denominator):
        # An empty denominator makes the ratio undefined: report NaN instead of
        # triggering a divide-by-zero warning.
        return numerator / denominator if denominator else float("nan")

    columns = ["Attack name", "TN", "FP", "FN", "TP", "Accuracy", "Precision", "Recall", "F1-score",
               "Balanced accuracy", "Informedness", "Markedness", "MCC"]
    rows = []
    for i, result in enumerate(df_results):
        y_true = result["actual_attack"]
        y_pred = result["predicted_attack"]

        # labels=[False, True] forces a 2x2 matrix even when a capture (or the predictions
        # for it) contains a single class; otherwise the 4-way unpacking would fail.
        tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_true, y_pred, labels=[False, True]).ravel()

        acc = sklearn.metrics.accuracy_score(y_true, y_pred)                     # Accuracy
        # zero_division=0 keeps the value scikit-learn already returns when nothing is
        # predicted/labelled positive (0.0) but without emitting a warning per capture.
        prec = sklearn.metrics.precision_score(y_true, y_pred, zero_division=0)  # Precision
        recall = sklearn.metrics.recall_score(y_true, y_pred, zero_division=0)   # Recall
        fscore = sklearn.metrics.f1_score(y_true, y_pred, zero_division=0)       # F1-score
        mcc = sklearn.metrics.matthews_corrcoef(y_true, y_pred)                  # Matthews Correlation Coefficient
        bacc = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)           # Balanced Accuracy
        inf = recall + _ratio(tn, fp + tn) - 1                                   # Informedness = TPR + TNR - 1
        mark = prec + _ratio(tn, fn + tn) - 1                                    # Markedness = PPV + NPV - 1

        rows.append([attack_names[i], tn, fp, fn, tp, acc, prec, recall, fscore, bacc, inf, mark, mcc])

    # Build the table once instead of growing it row by row
    new_df = pd.DataFrame(rows, columns=columns)
    display(new_df)
    new_df.to_csv(os.path.join(out_dir, "alg" + str(alg_num) + "_confusion.csv"), index=False)

## One-Class Support Vector Machine (OCSVM)

We use the hyperparameters from Berger et al. (2019), i.e. a linear kernel and nu = 0.01. Note: the training cell below currently passes nu=0.001; confirm which value is intended.

### Training

In [18]:
# Fit the one-class SVM on the scaled ambient (benign-only) data and time the fit
# NOTE(review): nu=0.001 is used here, whereas the markdown above this cell and the
# commented-out SGD alternative below both state nu=0.01 — confirm which is intended.
feature_cols = ["aid"] + [f"data{k}" for k in range(8)]

start_time = timer()

ocsvm_model = OneClassSVM(kernel="linear", nu=0.001)
ocsvm_model.fit(df_ambient[feature_cols])
# ocsvm_model = SGDOneClassSVM(nu=0.01, shuffle=False, random_state=2).fit(df_ambient[['aid', 'data0', 'data1', 'data2', 'data3', 'data4', 'data5', 'data6', 'data7']])

elapsed_time = timer() - start_time
print("Training time =", elapsed_time)

Training time = 2.352029700006824


### Testing

In [19]:
# One result frame per attack capture: the ground-truth labels plus a placeholder
# prediction column (0) that the testing cell overwrites
df_alg8_results = [
    pd.DataFrame({"actual_attack": capture["actual_attack"], "predicted_attack": 0})
    for capture in df_attack_aggr
]

# Empty table that collects the prediction time of each capture
df_alg8_testing_time = pd.DataFrame(columns=["attack_name", "testing_time"])

In [20]:
# Testing with all attack datasets
feature_cols = ["aid"] + ["data" + str(i) for i in range(8)]

for count, (attack_name, df) in enumerate(zip(attack_names, df_attack_aggr)):

    # Print name of attack
    print(count, attack_name, end=" ")
    start_time = timer()

    # Predict using trained model (raw model output: +1 = normal, -1 = anomaly)
    df_alg8_results[count]["predicted_attack"] = ocsvm_model.predict(df[feature_cols])

    # Elapsed time
    elapsed_time = timer() - start_time
    print(elapsed_time)

    # Save elapsed time
    df_alg8_testing_time.loc[count] = [attack_name, elapsed_time]

# Written relative to the working directory instead of a hard-coded absolute user path
df_alg8_testing_time.to_csv("alg8_testing_time.csv", index=False)


0 correlated_signal_attack_1 0.31666820000100415
1 correlated_signal_attack_1_masquerade 0.2929805000021588
2 correlated_signal_attack_2 0.2297236000013072
3 correlated_signal_attack_2_masquerade 0.21822669998800848
4 correlated_signal_attack_3 0.1403515999991214
5 correlated_signal_attack_3_masquerade 0.13077289999637287
6 fuzzing_attack_1 0.16464620000624564
7 fuzzing_attack_2 0.11411160000716336
8 fuzzing_attack_3 0.04448529999353923
9 max_speedometer_attack_1 0.7311516000045231
10 max_speedometer_attack_1_masquerade 0.6949744000012288
11 max_speedometer_attack_2 0.4924897000018973
12 max_speedometer_attack_2_masquerade 0.4723609999928158
13 max_speedometer_attack_3 0.7195215000101598
14 max_speedometer_attack_3_masquerade 0.7016062999900896
15 reverse_light_off_attack_1 0.23606159999326337
16 reverse_light_off_attack_1_masquerade 0.23095800000010058
17 reverse_light_off_attack_2 0.3366958999977214
18 reverse_light_off_attack_2_masquerade 0.32644049999362323
19 reverse_light_off_att

In [21]:
# Convert the +1 and -1 predicted labels to True and False
# +1 = normal, -1 = anomaly
for result in df_alg8_results:
    predicted = result["predicted_attack"]
    # Only convert raw model output. If this cell is executed a second time the column is
    # already boolean, and comparing True/False with -1 would turn every prediction into False.
    if not pd.api.types.is_bool_dtype(predicted):
        result["predicted_attack"] = predicted.eq(-1)

In [22]:
# Spot check: joint counts of actual vs. predicted labels for capture index 9
# (max_speedometer_attack_1 in the loading output above)
display(df_alg8_results[9].value_counts())

actual_attack  predicted_attack
False          False               196785
True           False                 2444
False          True                   877
dtype: int64

In [23]:
# Get metrics for each attack (displays the table and writes alg8_confusion.csv)
saveMetrics(df_alg8_results, 8)

Unnamed: 0,Attack name,TN,FP,FN,TP,Accuracy,Precision,Recall,F1-score,Balanced accuracy,Informedness,Markedness,MCC
0,correlated_signal_attack_1,73710,440,2086,0,0.966866,0.0,0.0,0.0,0.497033,-0.005934,-0.027521,-0.012779
1,correlated_signal_attack_1_masquerade,71624,440,2086,0,0.965934,0.0,0.0,0.0,0.496947,-0.006106,-0.0283,-0.013145
2,correlated_signal_attack_2,63100,158,2140,0,0.964861,0.0,0.0,0.0,0.498751,-0.002498,-0.032802,-0.009052
3,correlated_signal_attack_2_masquerade,60960,158,2140,0,0.963673,0.0,0.0,0.0,0.498707,-0.002585,-0.033914,-0.009363
4,correlated_signal_attack_3,37859,142,1264,0,0.964192,0.0,0.0,0.0,0.498132,-0.003737,-0.032308,-0.010988
5,correlated_signal_attack_3_masquerade,36595,142,1264,0,0.963001,0.0,0.0,0.0,0.498067,-0.003865,-0.033387,-0.01136
6,fuzzing_attack_1,45403,252,591,0,0.981771,0.0,0.0,0.0,0.49724,-0.00552,-0.01285,-0.008422
7,fuzzing_attack_2,29780,184,351,0,0.982352,0.0,0.0,0.0,0.49693,-0.006141,-0.011649,-0.008458
8,fuzzing_attack_3,12277,10,114,0,0.990001,0.0,0.0,0.0,0.499593,-0.000814,-0.0092,-0.002736
9,max_speedometer_attack_1,196785,877,2444,0,0.983404,0.0,0.0,0.0,0.497782,-0.004437,-0.012267,-0.007378


## Isolation Forest
Parameters are taken from Costa Canones (2021)

### Training

In [24]:
# Fit the Isolation Forest on the scaled ambient (benign-only) data and time the fit
# NOTE(review): warm_start=True only matters when fit() is called again on the same
# estimator, which this notebook does not appear to do — confirm it is needed.
feature_cols = ["aid"] + [f"data{k}" for k in range(8)]

start_time = timer()

if_model = IsolationForest(n_estimators=10, warm_start=True, contamination=0.1, random_state=2)
if_model.fit(df_ambient[feature_cols])

elapsed_time = timer() - start_time
print("Training time =", elapsed_time)



Training time = 0.624104400005308


### Testing

In [25]:
# One result frame per attack capture: the ground-truth labels plus a placeholder
# prediction column (0) that the testing cell overwrites
df_alg9_results = [
    pd.DataFrame({"actual_attack": capture["actual_attack"], "predicted_attack": 0})
    for capture in df_attack_aggr
]

# Empty table that collects the prediction time of each capture
df_alg9_testing_time = pd.DataFrame(columns=["attack_name", "testing_time"])

In [26]:
# Testing with all attack datasets
feature_cols = ["aid"] + ["data" + str(i) for i in range(8)]

for count, (attack_name, df) in enumerate(zip(attack_names, df_attack_aggr)):

    # Print name of attack
    print(count, attack_name, end=" ")
    start_time = timer()

    # Predict using trained model (raw model output: +1 = normal, -1 = anomaly)
    df_alg9_results[count]["predicted_attack"] = if_model.predict(df[feature_cols])

    # Elapsed time
    elapsed_time = timer() - start_time
    print(elapsed_time)

    # Save elapsed time
    df_alg9_testing_time.loc[count] = [attack_name, elapsed_time]

# Written relative to the working directory instead of a hard-coded absolute user path
df_alg9_testing_time.to_csv("alg9_testing_time.csv", index=False)


0 correlated_signal_attack_1 0.24905199999921024
1 correlated_signal_attack_1_masquerade 0.22008590000041295
2 correlated_signal_attack_2 0.19852020000689663
3 correlated_signal_attack_2_masquerade 0.1893080000008922
4 correlated_signal_attack_3 0.11666990000230726
5 correlated_signal_attack_3_masquerade 0.1265546000067843
6 fuzzing_attack_1 0.15175879999878816
7 fuzzing_attack_2 0.09492060000775382
8 fuzzing_attack_3 0.04001320000679698
9 max_speedometer_attack_1 0.6038497999979882
10 max_speedometer_attack_1_masquerade 0.6286209999962011
11 max_speedometer_attack_2 0.4605149000126403
12 max_speedometer_attack_2_masquerade 0.4335877999983495
13 max_speedometer_attack_3 0.6489996000018436
14 max_speedometer_attack_3_masquerade 0.5752592000062577
15 reverse_light_off_attack_1 0.19496900000376627
16 reverse_light_off_attack_1_masquerade 0.20701280000503175
17 reverse_light_off_attack_2 0.28659499999776017
18 reverse_light_off_attack_2_masquerade 0.25700089999008924
19 reverse_light_off_a

In [27]:
# Convert the +1 and -1 predicted labels to True and False
# +1 = normal, -1 = anomaly
for result in df_alg9_results:
    predicted = result["predicted_attack"]
    # Only convert raw model output. If this cell is executed a second time the column is
    # already boolean, and comparing True/False with -1 would turn every prediction into False.
    if not pd.api.types.is_bool_dtype(predicted):
        result["predicted_attack"] = predicted.eq(-1)

In [28]:
# Get metrics for each attack (displays the table and writes alg9_confusion.csv)
saveMetrics(df_alg9_results, 9)

Unnamed: 0,Attack name,TN,FP,FN,TP,Accuracy,Precision,Recall,F1-score,Balanced accuracy,Informedness,Markedness,MCC
0,correlated_signal_attack_1,66575,7575,2086,0,0.873275,0.0,0.0,0.0,0.448921,-0.102158,-0.030381,-0.055711
1,correlated_signal_attack_1_masquerade,64746,7318,2086,0,0.873176,0.0,0.0,0.0,0.449226,-0.101549,-0.031213,-0.056299
2,correlated_signal_attack_2,56741,6517,2140,0,0.867626,0.0,0.0,0.0,0.448489,-0.103023,-0.036344,-0.061191
3,correlated_signal_attack_2_masquerade,54979,6139,2140,0,0.869123,0.0,0.0,0.0,0.449777,-0.100445,-0.037466,-0.061345
4,correlated_signal_attack_3,31726,6275,1264,0,0.807997,0.0,0.0,0.0,0.417436,-0.165127,-0.038315,-0.079541
5,correlated_signal_attack_3_masquerade,30756,5981,1264,0,0.809347,0.0,0.0,0.0,0.418597,-0.162806,-0.039475,-0.080167
6,fuzzing_attack_1,41436,4219,0,591,0.90877,0.122869,1.0,0.218848,0.953795,0.90759,0.122869,0.333938
7,fuzzing_attack_2,26050,3914,0,351,0.870889,0.082298,1.0,0.15208,0.934688,0.869377,0.082298,0.267484
8,fuzzing_attack_3,11034,1253,0,114,0.89896,0.083394,1.0,0.15395,0.949011,0.898022,0.083394,0.27366
9,max_speedometer_attack_1,178002,19660,2205,239,0.890733,0.012011,0.097791,0.021394,0.499164,-0.001672,-0.000225,-0.000614
