In [1]:
import os
import sys
sys.path.insert(0, os.getcwd() + "\\code")

import numpy as np
import pandas as pd
import math

import json

import helper_functions
from importlib import reload

from timeit import default_timer as timer
import sklearn.metrics

In [2]:
# # Get ambient data metadata as CSV

# os.chdir(os.getcwd() + "\\road\\ambient")
# print(os.getcwd())

# df = pd.read_json("capture_metadata.json")
# df = df.transpose()
# df = df.rename_axis('name').reset_index()
# df.to_csv("c:\\Users\\sshar\\Desktop\\CAN IDS Benchmarking\\ambient_metadata.csv", index = None)

# Ambient data
We will use only **ambient_dyno_drive_extended_long** which consists of several basic driving activities along with some complex or one-off activities.

The function ```helper_functions.make_can_df``` has been modified so that messages of *all* AIDs are retained. 

In [3]:
# Read in required dataset
# df_ambient = helper_functions.make_can_df("road/ambient/ambient_dyno_drive_extended_long.log")
df_ambient = helper_functions.make_can_df("road/ambient/ambient_dyno_drive_basic_long.log")

df_ambient.head()

Unnamed: 0,time,aid,data
0,0.0,813,0000042758010000
1,0.001019,1694,0440047E1FC01542
2,0.00102,293,9000401F41BE7960
3,0.002916,737,0000000000000004
4,0.003937,852,1FFF40000003B680


In [4]:
#  Split data field into 8 fields 
# start = 0
# stop = 2
# for i in range(8):
#     df_ambient["data"+str(i)] = df_ambient["data"].str[start:stop]
#     start += 2
#     stop += 2
# df_ambient.drop("data", axis=1, inplace=True)
# df_ambient.head()

In [5]:
# Convert data column in hex to binary
df_ambient["data"] = df_ambient["data"].apply(int, base=16)
df_ambient["data"] = df_ambient["data"].apply(bin)
df_ambient["data"] = df_ambient["data"].apply(lambda x: x[2:])
df_ambient["data"] = df_ambient["data"].str.zfill(64)
display(df_ambient.head())

Unnamed: 0,time,aid,data
0,0.0,813,0000000000000000000001000010011101011000000000...
1,0.001019,1694,0000010001000000000001000111111000011111110000...
2,0.00102,293,1001000000000000010000000001111101000001101111...
3,0.002916,737,0000000000000000000000000000000000000000000000...
4,0.003937,852,0001111111111111010000000000000000000000000000...


In [6]:
# Label all messages in ambient dataframe as benign
df_ambient["actual_attack"] = False
df_ambient.head()

Unnamed: 0,time,aid,data,actual_attack
0,0.0,813,0000000000000000000001000010011101011000000000...,False
1,0.001019,1694,0000010001000000000001000111111000011111110000...,False
2,0.00102,293,1001000000000000010000000001111101000001101111...,False
3,0.002916,737,0000000000000000000000000000000000000000000000...,False
4,0.003937,852,0001111111111111010000000000000000000000000000...,False


In [7]:
df_ambient.dtypes

time             float64
aid                int64
data              object
actual_attack       bool
dtype: object

In [8]:
# Total number of messages
print("Total no. of messages =", len(df_ambient))

Total no. of messages = 2802431


In [9]:
# Breakdown by AIDs
print("No. of unique AIDs =", df_ambient["aid"].nunique())
df_temp = df_ambient["aid"].value_counts().rename_axis('aid').reset_index(name='counts')
df_temp = df_temp.sort_values(by=["counts"], ascending = False)
df_temp.to_csv("c:\\Users\\sshar\\Desktop\\CAN IDS Benchmarking\\ambient_aidcounts.csv", index = None)
del df_temp

No. of unique AIDs = 105


# Attack data
We will use data for the fuzzing attacks and the fabrication attacks which include both targeted ID attacks and masquerade attacks. 

In [10]:
# # Get attack data metadata as CSV

# os.chdir(os.path.dirname(os.getcwd()) + "\\attacks")
# print(os.getcwd())

# df = pd.read_json("capture_metadata.json")
# df = df.transpose()
# df = df.rename_axis('name').reset_index()

# # Convert injection_interval column to injection_interval start and injection_interval_end
# df["injection_interval_start"] = df["injection_interval"].apply(lambda x: None if x == None else x[0])
# df["injection_interval_end"] = df["injection_interval"].apply(lambda x: None if x == None else x[1])
# df.drop(columns = ["injection_interval"])
# df.to_csv("c:\\Users\\sshar\\Desktop\\CAN IDS Benchmarking\\attack_metadata.csv", index = None)

In [11]:
# os.chdir(os.path.dirname(os.getcwd()) + "/attacks")
print(os.getcwd())

c:\Users\sshar\Desktop\CAN IDS Benchmarking


In [12]:
# df_attack_aggr = []
# for filename in os.listdir(os.getcwd()+"\\road\\attacks"):
#     if "metadata" not in filename and "accelerator" not in filename and "engine" not in filename:
#         print(filename)
#         df_attack = helper_functions.make_can_df(os.getcwd()+"\\road\\attacks\\"+filename)
#         df_attack_aggr.append(df_attack)

The following cell creates a list of dataframes called ```df_attack_aggr```. The dataframe at a particular index corresponds to the attack listed in ```attack_names```.

The function ```helper_functions.add_actual_attack_col``` has been modified from the original to mark *only the injected messages* as attack messages, and not other messages with the same AID as the injected messages. 

(Another function called ```payload_match``` has been added to ```helper_functions``` to help compare the data of frames with the injected payload.)

In [13]:
reload(helper_functions)
with open("road\\attacks\\capture_metadata.json", "r") as read_file:
    attack_dict = json.load(read_file)

attack_metadata = []
attack_names = []

df_attack_aggr = []
count = 0
for filename in os.listdir(os.getcwd()+"\\road\\attacks"):
    if "metadata" not in filename and "accelerator" not in filename and "engine" not in filename:
        print(count, filename)

        # creating dataframe from log
        df_attack = helper_functions.make_can_df(os.getcwd()+"\\road\\attacks\\"+filename)
        # df_attack_aggr[count] = (df_attack)

        # printing AIDs of injections
        if attack_dict[filename[:-4]]["injection_id"] != "XXX":
            print(attack_dict[filename[:-4]]["injection_id"], int(attack_dict[filename[:-4]]["injection_id"], 16))
        else:
            print(attack_dict[filename[:-4]]["injection_id"])

        # adding injection interval to attack_metadata
        attack_metadata.append([tuple(attack_dict[filename[:-4]]["injection_interval"])])

        # adding attack name to attack_names
        attack_names.append(filename[:-4])

        # creating new column to label attack messages
        if attack_dict[filename[:-4]]["injection_id"] != "XXX":
            df_attack = helper_functions.add_actual_attack_col(df_attack, attack_metadata[count], int(attack_dict[filename[:-4]]["injection_id"], 16), attack_dict[filename[:-4]]["injection_data_str"])                                                      
        else:
            df_attack = helper_functions.add_actual_attack_col(df_attack, attack_metadata[count], "XXX", attack_dict[filename[:-4]]["injection_data_str"])  
        
        # Append the dataframe to the aggregate 
        df_attack_aggr.append(df_attack)

        count += 1
del df_attack

0 correlated_signal_attack_1.log
0x6e0 1760
1 correlated_signal_attack_1_masquerade.log
0x6e0 1760
2 correlated_signal_attack_2.log
0x6e0 1760
3 correlated_signal_attack_2_masquerade.log
0x6e0 1760
4 correlated_signal_attack_3.log
0x6e0 1760
5 correlated_signal_attack_3_masquerade.log
0x6e0 1760
6 fuzzing_attack_1.log
XXX
7 fuzzing_attack_2.log
XXX
8 fuzzing_attack_3.log
XXX
9 max_speedometer_attack_1.log
0xd0 208
10 max_speedometer_attack_1_masquerade.log
0xd0 208
11 max_speedometer_attack_2.log
0xd0 208
12 max_speedometer_attack_2_masquerade.log
0xd0 208
13 max_speedometer_attack_3.log
0xd0 208
14 max_speedometer_attack_3_masquerade.log
0xd0 208
15 reverse_light_off_attack_1.log
0xd0 208
16 reverse_light_off_attack_1_masquerade.log
0xd0 208
17 reverse_light_off_attack_2.log
0xd0 208
18 reverse_light_off_attack_2_masquerade.log
0xd0 208
19 reverse_light_off_attack_3.log
0xd0 208
20 reverse_light_off_attack_3_masquerade.log
0xd0 208
21 reverse_light_on_attack_1.log
0xd0 208
22 reverse_

In [14]:
print(attack_names[0])
display(df_attack_aggr[0])

correlated_signal_attack_1


Unnamed: 0,time,aid,data,actual_attack
0,0.000000,1505,893FC00B0A013880,False
1,0.000001,651,0000000000000000,False
2,0.000003,167,0010FA24D12E00A0,False
3,0.000004,208,4A7704600201F000,False
4,0.000997,51,000698000E4207D0,False
...,...,...,...,...
76231,33.095972,651,0000000000000000,False
76232,33.096941,51,177FA9788DC007D0,False
76233,33.096943,167,00510BA5212BA0A0,False
76234,33.096944,61,0000020000000000,False


In [15]:
# # Splitting the data column into 8 data columns, and then rearranging the columns in each attack dataframe
# for j in range(len(df_attack_aggr)):
#     start = 0
#     stop = 2
#     for i in range(8):
#         df_attack_aggr[j]["data"+str(i)] = df_attack_aggr[j]["data"].str[start:stop]
#         start += 2
#         stop += 2
#     df_attack_aggr[j].drop("data", axis=1, inplace=True)
#     df_attack_aggr[j] = df_attack_aggr[j][['time', 'aid', 'data0', 'data1', 'data2', 'data3', 'data4', 'data5', 'data6', 'data7', 'actual_attack']]
#     df_attack_aggr[j].head()

In [16]:
# Converting data column in hex to binary
for i in range(len(df_attack_aggr)):
    df_attack_aggr[i]["data"] = df_attack_aggr[i]["data"].apply(int, base=16)
    df_attack_aggr[i]["data"] = df_attack_aggr[i]["data"].apply(bin)
    df_attack_aggr[i]["data"] = df_attack_aggr[i]["data"].apply(lambda x: x[2:])
    df_attack_aggr[i]["data"] = df_attack_aggr[i]["data"].str.zfill(64)

In [17]:
df_attack_aggr[0].head()

Unnamed: 0,time,aid,data,actual_attack
0,0.0,1505,1000100100111111110000000000101100001010000000...,False
1,1e-06,651,0000000000000000000000000000000000000000000000...,False
2,3e-06,167,0000000000010000111110100010010011010001001011...,False
3,4e-06,208,0100101001110111000001000110000000000010000000...,False
4,0.000997,51,0000000000000110100110000000000000001110010000...,False


In [18]:
df_attack_aggr[0].dtypes

time             float64
aid                int64
data              object
actual_attack       bool
dtype: object

In [19]:
# Getting class balance
df_class = pd.DataFrame(columns=["attack_name", "total_messages", "benign_messages", "attack_messages", "benign_percent", "attack_percent"])
for i in range(len(df_attack_aggr)):
    df_class.loc[i] = [ attack_names[i], 
                        len(df_attack_aggr[i]), 
                        df_attack_aggr[i]["actual_attack"].value_counts()[False],  
                        df_attack_aggr[i]["actual_attack"].value_counts()[True],
                        df_attack_aggr[i]["actual_attack"].value_counts()[False] / len(df_attack_aggr[i]) * 100,
                        df_attack_aggr[i]["actual_attack"].value_counts()[True] / len(df_attack_aggr[i]) * 100]
display(df_class)
df_class.to_csv("c:\\Users\\sshar\\Desktop\\CAN IDS Benchmarking\\attack_class_breakdown.csv", index = None)
del df_class

Unnamed: 0,attack_name,total_messages,benign_messages,attack_messages,benign_percent,attack_percent
0,correlated_signal_attack_1,76236,74150,2086,97.26376,2.73624
1,correlated_signal_attack_1_masquerade,74150,72064,2086,97.186784,2.813216
2,correlated_signal_attack_2,65398,63258,2140,96.727729,3.272271
3,correlated_signal_attack_2_masquerade,63258,61118,2140,96.617029,3.382971
4,correlated_signal_attack_3,39265,38001,1264,96.780848,3.219152
5,correlated_signal_attack_3_masquerade,38001,36737,1264,96.673772,3.326228
6,fuzzing_attack_1,46246,45655,591,98.722052,1.277948
7,fuzzing_attack_2,30315,29964,351,98.842157,1.157843
8,fuzzing_attack_3,12401,12287,114,99.080719,0.919281
9,max_speedometer_attack_1,200106,197662,2444,98.778647,1.221353


The number of attack messages are very small compared to the number of benign messages, i.e. the classes are heavily imbalanced in all attack samples.

# Evaluation

In [20]:
def saveMetrics(df_results, alg_num):
    new_df = pd.DataFrame(columns=["Attack name", "TN", "FP", "FN", "TP", "Accuracy", "Precision", "Recall", "F1-score", "Balanced accuracy", "Informedness", "Markedness", "MCC"])
    for i in range(len(df_results)):
        tn, fp, fn, tp = sklearn.metrics.confusion_matrix(df_results[i]["actual_attack"], df_results[i]["predicted_attack"]).ravel()    # Confusion Matrix
        acc     = sklearn.metrics.accuracy_score(df_results[i]["actual_attack"], df_results[i]["predicted_attack"])                     # Accuracy
        prec    = sklearn.metrics.precision_score(df_results[i]["actual_attack"], df_results[i]["predicted_attack"])                    # Precision
        recall  = sklearn.metrics.recall_score(df_results[i]["actual_attack"], df_results[i]["predicted_attack"])                       # Recall
        fscore  = sklearn.metrics.f1_score(df_results[i]["actual_attack"], df_results[i]["predicted_attack"])                           # F1-score
        mcc     = sklearn.metrics.matthews_corrcoef(df_results[i]["actual_attack"], df_results[i]["predicted_attack"])                  # Matthews Correlation Coefficient
        bacc    = sklearn.metrics.balanced_accuracy_score(df_results[i]["actual_attack"], df_results[i]["predicted_attack"])            # Balanced Accuracy
        inf     = recall + tn / (fp + tn) - 1                                                                                           # Informedness
        mark    = prec + tn / (fn + tn) - 1                                                                                             # Markedness

        new_df.loc[i] = [attack_names[i], tn, fp, fn, tp, acc, prec, recall, fscore, bacc, inf, mark, mcc]
    display(new_df)
    new_df.to_csv("c:\\Users\\sshar\\Desktop\\CAN IDS Benchmarking\\alg" + str(alg_num) + "_confusion.csv", index = None)

## ID Sequences algorithm (Marchetti & Stabili, 2017)

### Training

In [21]:
# Mark all valid transitions as True in the transition matrix
# We do that by first reducing the training dataset to only unique transitions
# And then creating a transition matrix with all the valid transitions marked

start_time = timer()
# Create a new column called prev_aid 
df_ambient_train = pd.DataFrame()
df_ambient_train["aid"] = df_ambient["aid"]
df_ambient_train["prev_aid"] = df_ambient["aid"].shift(1, fill_value = 0)
df_ambient_train = df_ambient_train.tail(-1)
# print(len(df_ambient_train))

# Remove duplicate transitions
df_ambient_train.drop_duplicates(inplace = True)
# print(len(df_ambient_train))

# Create transition matrix
df_transition_matrix = pd.crosstab(df_ambient_train["prev_aid"], df_ambient_train["aid"]).astype(bool)

end_time = timer()
elapsed_time = end_time - start_time
print("Training time =", elapsed_time)


Training time = 0.4774685000011232


In [22]:
df_transition_matrix.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105 entries, 6 to 1788
Columns: 105 entries, 6 to 1788
dtypes: bool(105)
memory usage: 11.6 KB


### Testing
The unit of analysis for this method is a transition, i.e. the transition from one AID to another AID. Hence, we convert the ```actual_attack``` labels, which are assigned to each *message*, to labels assigned to each *transition*. 

| Prev msg actual_attack | Curr msg actual_attack | Transition actual_attack      |
|------------------------|------------------------|-------------------------------|
| False                  | False                  | False                         |
| True                   | False                  | True                          |
| False                  | True                   | True                          |
| True                   | True                   | True                          |

The transition labels is thus the logical or of the actual_attack labels for prev and curr messages. 

In [23]:
# Convert labels per message to labels per transition and create a list of dataframes to store results
# Each dataframe in the list has two columns, actual_attack and predicted_attack
df_alg1_results = []
count = 0
for df in df_attack_aggr:    
    actual_attack_trans = []
    for i in range(1, len(df)):
        actual_attack_trans.append(df["actual_attack"][i - 1] or df["actual_attack"][i])
    
    df_result = pd.DataFrame()
    df_result["actual_attack"] = actual_attack_trans
    df_result["predicted_attack"] = False
    
    df_alg1_results.append(df_result)

KeyboardInterrupt: 

In [None]:
df_attack_aggr[0]["actual_attack"].head(n = 10)

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: actual_attack, dtype: bool

In [None]:
df_alg1_results[0].value_counts()

actual_attack  predicted_attack
False          False               72063
True           False                4172
dtype: int64

In [None]:
df_attack_aggr[0]["actual_attack"].value_counts()

False    74150
True      2086
Name: actual_attack, dtype: int64

In [None]:
# Getting class balance
df_class = pd.DataFrame(columns = ["attack_name", "total_transitions", "benign_transitions", "attack_transitions", "benign_percent", "attack_percent"])
for i in range(len(df_alg1_results)):
    df_class.loc[i] = [
        attack_names[i],
        len(df_alg1_results[i]),
        df_alg1_results[i]["actual_attack"].value_counts()[False],
        df_alg1_results[i]["actual_attack"].value_counts()[True],
        df_alg1_results[i]["actual_attack"].value_counts()[False] / len(df_alg1_results[i]) * 100,
        df_alg1_results[i]["actual_attack"].value_counts()[True] / len(df_alg1_results[i]) * 100
    ]
display(df_class)
del df_class

Unnamed: 0,attack_name,total_transitions,benign_transitions,attack_transitions,benign_percent,attack_percent
0,correlated_signal_attack_1,76235,72063,4172,94.527448,5.472552
1,correlated_signal_attack_1_masquerade,74149,69977,4172,94.373491,5.626509
2,correlated_signal_attack_2,65397,61117,4280,93.455357,6.544643
3,correlated_signal_attack_2_masquerade,63257,58977,4280,93.23395,6.76605
4,correlated_signal_attack_3,39264,36736,2528,93.561532,6.438468
5,correlated_signal_attack_3_masquerade,38000,35472,2528,93.347368,6.652632
6,fuzzing_attack_1,46245,45063,1182,97.444048,2.555952
7,fuzzing_attack_2,30314,29612,702,97.684238,2.315762
8,fuzzing_attack_3,12400,12172,228,98.16129,1.83871
9,max_speedometer_attack_1,200105,195217,4888,97.557282,2.442718


In [None]:
# Testing with all attack samples
aid_lst = list(df_ambient["aid"].unique())

df_alg1_testing_time = pd.DataFrame(columns=["attack_name", "testing_time"])

for i in range(len(df_attack_aggr)):

    # Print the name of the attack
    print(i, attack_names[i], end=" ")
    start_time = timer()

    # Check all transitions
    for j in range(1, len(df_attack_aggr[i])):

        # If current or previous AID are unknown, mark transition as anomaly
        if (df_attack_aggr[i]["aid"][j] not in aid_lst) or (df_attack_aggr[i]["aid"][j - 1] not in aid_lst):
            df_alg1_results[i]["predicted_attack"][j - 1] = True

        # If the transition previous AID --> current AID is False in the transition matrix, mark transition as anomaly
        elif (df_transition_matrix[df_attack_aggr[i]["aid"][j - 1]][df_attack_aggr[i]["aid"][j]] == False):
            df_alg1_results[i]["predicted_attack"][j - 1] = True

    end_time = timer()
    elapsed_time = end_time - start_time
    print(elapsed_time)
    df_alg1_testing_time.loc[i] = [attack_names[i], elapsed_time]
df_alg1_testing_time.to_csv("c:\\Users\\sshar\\Desktop\\CAN IDS Benchmarking\\alg1_testing_time.csv", index = None)

0 correlated_signal_attack_1 4.972295599989593
1 correlated_signal_attack_1_masquerade 5.020228200010024
2 correlated_signal_attack_2 4.207615700026508
3 correlated_signal_attack_2_masquerade 4.338202699989779
4 correlated_signal_attack_3 2.458905599982245
5 correlated_signal_attack_3_masquerade 2.4744237000122666
6 fuzzing_attack_1 2.913235500018345
7 fuzzing_attack_2 2.1819305999961216
8 fuzzing_attack_3 0.7692088999901898
9 max_speedometer_attack_1 13.068416199996136
10 max_speedometer_attack_1_masquerade 13.116605499992147
11 max_speedometer_attack_2 8.773860599991167
12 max_speedometer_attack_2_masquerade 8.754813000006834
13 max_speedometer_attack_3 14.148219299997436
14 max_speedometer_attack_3_masquerade 14.953742699988652
15 reverse_light_off_attack_1 4.571907300007297
16 reverse_light_off_attack_1_masquerade 4.420353499997873
17 reverse_light_off_attack_2 6.857392899983097
18 reverse_light_off_attack_2_masquerade 7.028722800023388
19 reverse_light_off_attack_3 8.7386730999860

In [None]:
# Get confusion matrix for each attack
saveMetrics(df_alg1_results, 1)

Unnamed: 0,Attack name,TN,FP,FN,TP,Accuracy,Precision,Recall,F1-score,Balanced accuracy,Informedness,Markedness,MCC
0,correlated_signal_attack_1,58677,13386,3280,892,0.781387,0.062474,0.213806,0.096694,0.514026,0.028052,0.009534,0.016354
1,correlated_signal_attack_1_masquerade,55257,14720,3565,607,0.753402,0.039603,0.145494,0.06226,0.467569,-0.064861,-0.021003,-0.036909
2,correlated_signal_attack_2,49271,11846,3614,666,0.763598,0.053229,0.155607,0.079323,0.480891,-0.038217,-0.015108,-0.024029
3,correlated_signal_attack_2_masquerade,46303,12674,3614,666,0.742511,0.049925,0.155607,0.075596,0.470355,-0.05929,-0.022475,-0.036504
4,correlated_signal_attack_3,30175,6561,2288,240,0.774628,0.035289,0.094937,0.051452,0.458169,-0.083662,-0.035191,-0.05426
5,correlated_signal_attack_3_masquerade,28433,7039,2288,240,0.754553,0.032972,0.094937,0.048945,0.448249,-0.103501,-0.041505,-0.065543
6,fuzzing_attack_1,38121,6942,44,1138,0.848935,0.140842,0.962775,0.245735,0.904362,0.808724,0.139689,0.336109
7,fuzzing_attack_2,23361,6251,24,678,0.793,0.09785,0.965812,0.177696,0.877358,0.754715,0.096823,0.270322
8,fuzzing_attack_3,10395,1777,5,223,0.85629,0.1115,0.97807,0.20018,0.91604,0.832079,0.111019,0.303936
9,max_speedometer_attack_1,157035,38182,3563,1325,0.791385,0.033538,0.271072,0.059691,0.537742,0.075485,0.011353,0.029274


## Entropy-based algorithm (Muter & Asaj, 2011; Marchetti et al., 2016)

### Training
The ambient log will be split in two: the first will be used to model the entropy by way of obtaining the average entropy of the CAN bus traffic; and the second will be used to tune the detection method by finding the value of *k*.
The ambient log will be split by timestamp, not number of messages. 

In [None]:
# Split ambient log into two halves - training and tuning - by time
split_timestamp = df_ambient["time"][len(df_ambient) - 1] / 2
df_ambient_train = df_ambient.loc[df_ambient["time"] <= split_timestamp].reset_index()
df_ambient_tune  = df_ambient.loc[df_ambient["time"] >  split_timestamp].reset_index()
print("Ambient Train:", df_ambient_train["time"][0], df_ambient_train["time"][len(df_ambient_train) - 1], len(df_ambient_train))
print("Ambient Tune :", df_ambient_tune["time"][0],  df_ambient_tune["time"][len(df_ambient_tune) - 1],   len(df_ambient_tune))  

Ambient Train: 0.0 625.4715459346771 1401150
Ambient Tune : 625.4726150035858 1250.943526983261 1401281


In [None]:
start_time = timer()

# Time window
t = 0.5

# Use df_ambient_train to obtain the average entropy
entropy_time_windows_train = []
window_start_time = df_ambient_train["time"][0]
while window_start_time + t <= df_ambient_train["time"][len(df_ambient_train) - 1]:
    aid_counts = df_ambient_train.loc[(df_ambient_train["time"] >= window_start_time) & (df_ambient_train["time"] < (window_start_time + t))]["aid"].value_counts(normalize = True)
    entropy = sum([(x * math.log(1/x, 2)) for x in aid_counts])
    entropy_time_windows_train.append(entropy)
    window_start_time += t

# Average entropy 
avg_entropy = sum(entropy_time_windows_train) / len(entropy_time_windows_train)
# Standard deviation
std_entropy = np.std(entropy_time_windows_train)

# Use df_ambient_tune to obtain k
entropy_time_windows_tune = []
window_start_time = df_ambient_tune["time"][0]
while window_start_time + t <= df_ambient_tune["time"][len(df_ambient_tune) - 1]:
    aid_counts = df_ambient_tune.loc[(df_ambient_tune["time"] >= window_start_time) & (df_ambient_tune["time"] < (window_start_time + t))]["aid"].value_counts(normalize = True)
    entropy = sum([(x * math.log(1/x, 2)) for x in aid_counts])
    entropy_time_windows_tune.append(entropy)
    window_start_time += t
    
k = 1
while (k <= 10):
    print(k, avg_entropy - k * std_entropy, avg_entropy + k * std_entropy)
    if all((e >= (avg_entropy - k * std_entropy) and e <= (avg_entropy + k * std_entropy)) for e in entropy_time_windows_tune):
        break
    k += 1

end_time = timer()
elapsed_time = end_time - start_time
print("Training time =", elapsed_time)

1 5.596389424070583 5.682461483323953
2 5.553353394443898 5.725497512950638
Training time = 9.434831299993675


In [None]:
print("df_ambient_train stats")
print("======================")
print("Number of time windows =", len(entropy_time_windows_train))
print("Minimum entropy        =", min(entropy_time_windows_train))
print("Maximum entropy        =", max(entropy_time_windows_train))
print()
print("Average entropy score  =", avg_entropy)
print("Standard deviation     =", std_entropy)
print()
print("df_ambient_tune stats")
print("======================")
print("Number of time windows =", len(entropy_time_windows_tune))
print("Minimum entropy        =", min(entropy_time_windows_tune))
print("Maximum entropy        =", max(entropy_time_windows_tune))
print()
print("k =", k)
print("t =", t)

df_ambient_train stats
Number of time windows = 1250
Minimum entropy        = 5.579786475424514
Maximum entropy        = 5.709323766850912

Average entropy score  = 5.639425453697268
Standard deviation     = 0.04303602962668497

df_ambient_tune stats
Number of time windows = 1250
Minimum entropy        = 5.589599914171682
Maximum entropy        = 5.70547067898237

k = 2
t = 0.5


In [None]:
del entropy_time_windows_train
del entropy_time_windows_tune
del df_ambient_train
del df_ambient_tune

### Testing
The unit of analysis in this method is the time window. Thus, for each attack sample, we need to convert labels per message to labels per time window of size ```t```.

In [None]:
# Convert labels per message to labels per time window, and create a list of dataframes to store the results
# Each dataframe in the list has two columns, actual_attack and predicted_attack
df_alg2_results = []
count = 0
for df in df_attack_aggr:
    print(count, attack_names[count])
    actual_attack_entro = []

    window_start_time = df["time"][0]
    while (window_start_time + t) <= df["time"][len(df) - 1]:
        actual_attack_entro.append(df.loc[(df["time"] >= window_start_time) & (df["time"] < window_start_time + t)]["actual_attack"].any())
        window_start_time += t
        
    df_result = pd.DataFrame()
    df_result["actual_attack"] = actual_attack_entro
    df_result["predicted_attack"] = False
    
    df_alg2_results.append(df_result)

    count += 1

0 correlated_signal_attack_1
1 correlated_signal_attack_1_masquerade
2 correlated_signal_attack_2
3 correlated_signal_attack_2_masquerade
4 correlated_signal_attack_3
5 correlated_signal_attack_3_masquerade
6 fuzzing_attack_1
7 fuzzing_attack_2
8 fuzzing_attack_3
9 max_speedometer_attack_1
10 max_speedometer_attack_1_masquerade
11 max_speedometer_attack_2
12 max_speedometer_attack_2_masquerade
13 max_speedometer_attack_3
14 max_speedometer_attack_3_masquerade
15 reverse_light_off_attack_1
16 reverse_light_off_attack_1_masquerade
17 reverse_light_off_attack_2
18 reverse_light_off_attack_2_masquerade
19 reverse_light_off_attack_3
20 reverse_light_off_attack_3_masquerade
21 reverse_light_on_attack_1
22 reverse_light_on_attack_1_masquerade
23 reverse_light_on_attack_2
24 reverse_light_on_attack_2_masquerade
25 reverse_light_on_attack_3
26 reverse_light_on_attack_3_masquerade


In [None]:
df_alg2_results[0].value_counts() 
# This matches the expected number of time windows as per the attack duration given in metadata

actual_attack  predicted_attack
True           False               43
False          False               23
dtype: int64

In [None]:
# Getting class balance
df_class = pd.DataFrame(columns = ["attack_name", "total_windows", "benign_windows", "attack_windows", "benign_percent", "attack_percent"])
for i in range(len(df_alg2_results)):
    df_class.loc[i] = [
        attack_names[i],
        len(df_alg2_results[i]),
        df_alg2_results[i]["actual_attack"].value_counts()[False],
        df_alg2_results[i]["actual_attack"].value_counts()[True],
        df_alg2_results[i]["actual_attack"].value_counts()[False] / len(df_alg2_results[i]) * 100,
        df_alg2_results[i]["actual_attack"].value_counts()[True] / len(df_alg2_results[i]) * 100
    ]
display(df_class)
del df_class

Unnamed: 0,attack_name,total_windows,benign_windows,attack_windows,benign_percent,attack_percent
0,correlated_signal_attack_1,66,23,43,34.848485,65.151515
1,correlated_signal_attack_1_masquerade,66,23,43,34.848485,65.151515
2,correlated_signal_attack_2,56,13,43,23.214286,76.785714
3,correlated_signal_attack_2_masquerade,56,13,43,23.214286,76.785714
4,correlated_signal_attack_3,33,8,25,24.242424,75.757576
5,correlated_signal_attack_3_masquerade,33,8,25,24.242424,75.757576
6,fuzzing_attack_1,40,33,7,82.5,17.5
7,fuzzing_attack_2,26,22,4,84.615385,15.384615
8,fuzzing_attack_3,10,9,1,90.0,10.0
9,max_speedometer_attack_1,176,127,49,72.159091,27.840909


In [None]:
# Test intrusion detection algorithm with each attack sample
df_alg2_testing_time = pd.DataFrame(columns=["attack_name", "testing_time"])

count = 0
for df in df_attack_aggr:

    # Print the name of the attack
    print(count, attack_names[count], end = " ")
    start_time = timer()

    window_count = 0
    window_start_time = df["time"][0]
    window_aids = []

    
    for i in range(len(df)):                        # For each message of the attack sample
        if df["time"][i] < window_start_time + t:  # It is a message within current time window
            window_aids.append(df["aid"][i])

        else:                                       # It is a message in the next time window, i.e. a time window has just ended
            # Calculate entropy and decide attack/benign for time window that has just elapsed
            aid_counts = pd.Series(window_aids).value_counts(normalize = True)
            entropy = sum([(x * math.log(1/x, 2)) for x in aid_counts])
            # if not (entropy >= (avg_entropy - k * std_entropy) and entropy <= (avg_entropy + k * std_entropy)):
            if entropy < (avg_entropy - k * std_entropy) or entropy > (avg_entropy + k * std_entropy):
                df_alg2_results[count]["predicted_attack"][window_count] = True

            # Reset aids and update window_count and window_start_time
            window_count += 1
            window_start_time += t
            window_aids = [df["aid"][i]]

    end_time = timer()
    elapsed_time = end_time - start_time
    print(elapsed_time)
    df_alg2_testing_time.loc[count] = [attack_names[count], elapsed_time]
    
    count += 1
df_alg2_testing_time.to_csv("c:\\Users\\sshar\\Desktop\\CAN IDS Benchmarking\\alg2_testing_time.csv", index = None)

0 correlated_signal_attack_1 0.7715549000131432
1 correlated_signal_attack_1_masquerade 0.7405688999861013
2 correlated_signal_attack_2 0.7068234000180382
3 correlated_signal_attack_2_masquerade 0.6961682000255678
4 correlated_signal_attack_3 0.4127458999864757
5 correlated_signal_attack_3_masquerade 0.4094586999854073
6 fuzzing_attack_1 0.5208183999930043
7 fuzzing_attack_2 0.3375682999903802
8 fuzzing_attack_3 0.1302637000044342
9 max_speedometer_attack_1 2.0663803000061307
10 max_speedometer_attack_1_masquerade 2.00238290001289
11 max_speedometer_attack_2 1.3884086000034586
12 max_speedometer_attack_2_masquerade 1.3866364999848884
13 max_speedometer_attack_3 2.2724175000039395
14 max_speedometer_attack_3_masquerade 1.9789600000076462
15 reverse_light_off_attack_1 0.6716733000066597
16 reverse_light_off_attack_1_masquerade 0.6611769000010099
17 reverse_light_off_attack_2 0.9745507999905385
18 reverse_light_off_attack_2_masquerade 1.042757100018207
19 reverse_light_off_attack_3 1.4503

In [None]:
# Get metrics for each attack
saveMetrics(df_alg2_results, 2)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Attack name,TN,FP,FN,TP,Accuracy,Precision,Recall,F1-score,Balanced accuracy,Informedness,Markedness,MCC
0,correlated_signal_attack_1,23,0,29,14,0.560606,1.0,0.325581,0.491228,0.662791,0.325581,0.442308,0.379483
1,correlated_signal_attack_1_masquerade,23,0,43,0,0.348485,0.0,0.0,0.0,0.5,0.0,-0.651515,0.0
2,correlated_signal_attack_2,13,0,24,19,0.571429,1.0,0.44186,0.612903,0.72093,0.44186,0.351351,0.394016
3,correlated_signal_attack_2_masquerade,13,0,43,0,0.232143,0.0,0.0,0.0,0.5,0.0,-0.767857,0.0
4,correlated_signal_attack_3,8,0,13,12,0.606061,1.0,0.48,0.648649,0.74,0.48,0.380952,0.427618
5,correlated_signal_attack_3_masquerade,8,0,25,0,0.242424,0.0,0.0,0.0,0.5,0.0,-0.757576,0.0
6,fuzzing_attack_1,33,0,0,7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,fuzzing_attack_2,22,0,0,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,fuzzing_attack_3,9,0,0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,max_speedometer_attack_1,127,0,35,14,0.801136,1.0,0.285714,0.444444,0.642857,0.285714,0.783951,0.473271


## Hamming distance (Stabili et al., 2017)

### Training
In the training phase, Hamming distances are calculated between messages of each AID, and the maximum and minimum Hamming distance are recorded for each AID. 

In [None]:
# # Make a copy of df_ambient
# df_ambient_copy = df_ambient.copy()
# display(df_ambient_copy.head())

# # Concatenate all data fields into one data column
# df_ambient_copy["data"] = df_ambient_copy["data0"]
# df_ambient_copy.drop(columns=["data0"], inplace=True)
# for i in range(1, 8):
#     df_ambient_copy["data"] = df_ambient_copy["data"] + df_ambient_copy["data"+str(i)]
#     df_ambient_copy.drop(columns=["data"+str(i)], inplace=True)

# # Convert data column in hex to 64-digit binary
# df_ambient_copy["data"] = df_ambient_copy["data"].apply(int, base = 16)
# df_ambient_copy["data"] = df_ambient_copy["data"].apply(bin)
# df_ambient_copy["data"] = df_ambient_copy["data"].apply(lambda x: x[2:])
# df_ambient_copy["data"] = df_ambient_copy["data"].str.zfill(64)
# display(df_ambient_copy.head())

In [None]:
# Function to calculate Hamming distance between two binary strings
def calcHamming(binStr1, binStr2):
    result = -1
    if len(binStr1) == len(binStr2):
        result = 0
        for i in range(len(binStr1)):
            result += (binStr1[i] != binStr2[i])
    return result

In [None]:
# start_time = timer()

# # Get maximum and minimum Hamming distance for each AID
# aid_lst = list(df_ambient_copy["aid"].unique())
# ham_range_aid = pd.DataFrame({
#     "min" : [1000 for x in aid_lst],
#     "max" : [-1 for x in aid_lst]
# }, index = aid_lst)

# for aid in aid_lst:
#     df_amb_subset = df_ambient_copy.loc[df_ambient_copy["aid"] == aid]
#     df_amb_subset.reset_index(drop=True, inplace=True)
#     ham_dists = []
#     for i in range(1, len(df_amb_subset)):
#         # Calculate Hamming distance between current and previous message
#         ham_dist = calcHamming(df_amb_subset["data"][i - 1], df_amb_subset["data"][i])
#         ham_dists.append(ham_dist)

#     # Find max and min Hamming distances for this AID
#     ham_range_aid["max"][aid] = max(ham_dists)
#     ham_range_aid["min"][aid] = min(ham_dists)

# end_time = timer()
# elapsed_time = end_time - start_time
# print("Training time =", elapsed_time)
# display(ham_range_aid.head())

In [None]:
start_time = timer()

# Get maximum and minimum Hamming distance for each AID
aid_lst = list(df_ambient["aid"].unique())
ham_range_aid = pd.DataFrame({
    "min" : [1000 for x in aid_lst],
    "max" : [-1 for x in aid_lst]
}, index = aid_lst)

for aid in aid_lst:
    df_amb_subset = df_ambient.loc[df_ambient["aid"] == aid]
    df_amb_subset.reset_index(drop=True, inplace=True)
    
    # df_amb_subset["prev_data"] = df_amb_subset["data"].shift(1, fill_value = "0000000000000000000000000000000000000000000000000000000000000000")
    # ham_dists = list(df_amb_subset.apply(lambda x: calcHamming(x.prev_data, x.data), axis=1))[1:]

    ham_dists = []
    for i in range(1, len(df_amb_subset)):
        # Calculate Hamming distance between current and previous message
        ham_dist = calcHamming(df_amb_subset["data"][i - 1], df_amb_subset["data"][i])
        ham_dists.append(ham_dist)

    # Find max and min Hamming distances for this AID
    ham_range_aid["max"][aid] = max(ham_dists)
    ham_range_aid["min"][aid] = min(ham_dists)

end_time = timer()
elapsed_time = end_time - start_time
print("Training time =", elapsed_time)
display(ham_range_aid.head())

  df_amb_subset.prev_data = df_amb_subset["data"].shift(1, fill_value = "0000000000000000000000000000000000000000000000000000000000000000")


AttributeError: 'Series' object has no attribute 'prev_data'

### Testing

In [None]:
# # Preprocess all the attack samples to convert them to binary
# df_attack_aggr_copy = [df.copy() for df in df_attack_aggr]
# display(df_attack_aggr_copy[0].head())

# for i in range(len(df_attack_aggr_copy)):
#     # Concatenate all data fields into one column
#     df_attack_aggr_copy[i]["data"] = df_attack_aggr_copy[i]["data0"]
#     df_attack_aggr_copy[i].drop(columns=["data0"], inplace=True)
#     for j in range(1, 8):
#         df_attack_aggr_copy[i]["data"] = df_attack_aggr_copy[i]["data"] + df_attack_aggr_copy[i]["data"+str(j)]
#         df_attack_aggr_copy[i].drop(columns=["data"+str(j)], inplace=True)

#     # Convert data column in hex to 64-digit binary
#     df_attack_aggr_copy[i]["data"] = df_attack_aggr_copy[i]["data"].apply(int, base = 16)
#     df_attack_aggr_copy[i]["data"] = df_attack_aggr_copy[i]["data"].apply(bin)
#     df_attack_aggr_copy[i]["data"] = df_attack_aggr_copy[i]["data"].apply(lambda x: x[2:])
#     df_attack_aggr_copy[i]["data"] = df_attack_aggr_copy[i]["data"].str.zfill(64)

In [None]:
# #  Prepare list of dataframes to store actual_attack and predicted_attack for each attack sample
# df_alg3_results = []
# for df in df_attack_aggr_copy:
#     new_df = pd.DataFrame()
#     new_df["actual_attack"] = df["actual_attack"]
#     new_df["predicted_attack"] = False

#     df_alg3_results.append(new_df)

In [None]:
#  Prepare list of dataframes to store actual_attack and predicted_attack for each attack sample
df_alg3_results = []
for df in df_attack_aggr:
    new_df = pd.DataFrame()
    new_df["actual_attack"] = df["actual_attack"]
    new_df["predicted_attack"] = False

    df_alg3_results.append(new_df)

In [None]:
# Testing with all attack samples

df_alg3_testing_time = pd.DataFrame(columns=["attack_name", "testing_time"])

count = 0
for df in df_attack_aggr:

    # Print name of attack
    print(count, attack_names[count], end=" ")
    start_time = timer()

    aid_last_payload = {}

    for i in range(len(df)):
        if df["aid"][i] not in aid_lst:             # if current AID is an unknown AID, mark as anomaly
            df_alg3_results[count]["predicted_attack"][i] = True

        elif df["aid"][i] not in aid_last_payload:  # if current AID is known but not seen before (1st message)
            aid_last_payload[df["aid"][i]] = df["data"][i]

        else:                                       # current AID is known and is not the first message of that AID - we can calculate Hamming distance
            ham_dist = calcHamming(aid_last_payload[df["aid"][i]], df["data"][i])
            
            if ham_dist < ham_range_aid["min"][df["aid"][i]] or ham_dist > ham_range_aid["max"][df["aid"][i]]:
                df_alg3_results[count]["predicted_attack"][i] = True

            aid_last_payload[df["aid"][i]] = df["data"][i]

    end_time = timer()
    elapsed_time = end_time - start_time
    print(elapsed_time)
    df_alg3_testing_time.loc[count] = [attack_names[count], elapsed_time]

    count += 1
df_alg3_testing_time.to_csv("c:\\Users\\sshar\\Desktop\\CAN IDS Benchmarking\\alg3_testing_time.csv", index = None)

0 correlated_signal_attack_1 4.580995200027246
1 correlated_signal_attack_1_masquerade 4.573010899999645
2 correlated_signal_attack_2 3.608640299993567
3 correlated_signal_attack_2_masquerade 3.5488376000139397
4 correlated_signal_attack_3 2.148653000011109
5 correlated_signal_attack_3_masquerade 2.1957597000000533
6 fuzzing_attack_1 2.6710448999947403
7 fuzzing_attack_2 1.7176032999996096
8 fuzzing_attack_3 0.9535859000170603
9 max_speedometer_attack_1 11.156147300003795
10 max_speedometer_attack_1_masquerade 

KeyboardInterrupt: 

In [None]:
# Get metrics for each attack
saveMetrics(df_alg3_results, 3)

In [None]:
# del df_attack_aggr_copy
del aid_last_payload
del df_amb_subset
# del df_ambient_copy

## Frequency-based (Young et al., 2019)

### Training
The ambient dataset is used to obtain the expected frequencies for each AID. Frequency (f) is given by dividing the number of messages (m) by a set time interval (t). The paper uses t = 1 second.

In [None]:
start_time = timer()

# Calculate expected frequency for each AID
expected_freq_aid = df_ambient["aid"].value_counts().sort_index()
expected_freq_aid = expected_freq_aid / (df_ambient["time"][len(df_ambient) - 1] - df_ambient["time"][0])
# print(expected_freq_aid[:10])

end_time = timer()
elapsed_time = end_time - start_time
print("Training time =", elapsed_time)


### Testing

In [None]:
# Convert attack labels from per message to per window

# Window is 1 sec
t = 1

df_alg4_results = []

for df in df_attack_aggr:

    actual_attack = []

    window_start_time = df["time"][0]
    while (window_start_time + t) <= df["time"][len(df) - 1]:
        actual_attack.append(df.loc[(df["time"] >= window_start_time) & (df["time"] < window_start_time + t)]["actual_attack"].any())
        window_start_time += t

    new_df = pd.DataFrame()
    new_df["actual_attack"] = actual_attack
    new_df["predicted_attack"] = False

    df_alg4_results.append(new_df)

In [None]:
df_alg4_results[0].value_counts()

In [None]:
# Getting class balance
df_class = pd.DataFrame(columns = ["attack_name", "total_windows", "benign_windows", "attack_windows", "benign_percent", "attack_percent"])
for i in range(len(df_alg4_results)):
    df_class.loc[i] = [
        attack_names[i],
        len(df_alg4_results[i]),
        df_alg4_results[i]["actual_attack"].value_counts()[False],
        df_alg4_results[i]["actual_attack"].value_counts()[True],
        df_alg4_results[i]["actual_attack"].value_counts()[False] / len(df_alg4_results[i]) * 100,
        df_alg4_results[i]["actual_attack"].value_counts()[True] / len(df_alg4_results[i]) * 100
    ]
display(df_class)
del df_class

In [None]:
# Testing with all attack samples

df_alg4_testing_time = pd.DataFrame(columns=["attack_name", "testing_time"])

count = 0
for df in df_attack_aggr:

    # Print the name of the attack
    print(count, attack_names[count], end = " ")
    start_time = timer()

    window_count = 0
    window_start_time = df["time"][0]
    
    aid_freq_windows = dict(zip(aid_lst, [0 for aid in aid_lst]))

    for i in range(len(df)):
        if df["time"][i] < window_start_time + t:  # message is within current window
            if df["aid"][i] in aid_freq_windows:        # AID is known
                aid_freq_windows[df["aid"][i]] += 1
            # else:                                       # AID is unknown => definitely anomaly
            #     df_alg4_results[count]["predicted_attack"][window_count] = True
                
        
        else:                                       # message is in the next window, i.e. a window has just elapsed
            # Compute frequencies for all messages in window just elapsed
            for aid in aid_freq_windows:
                aid_freq_windows[aid] = aid_freq_windows[aid] / t
                if aid_freq_windows[aid] > (2 * expected_freq_aid[aid]):  # If we find any one AID for which frequency is too high, no need to check further AIDs
                    df_alg4_results[count]["predicted_attack"][window_count] = True

                    # print(window_count, aid)
                    break    

            # Reset and update
            window_count += 1
            window_start_time += t  
            aid_freq_windows = dict(zip(aid_lst, [0 for aid in aid_lst])) 
            if df["aid"][i] in aid_freq_windows:        # AID is known
                aid_freq_windows[df["aid"][i]] += 1
            # else:                                       # AID is unknown => definitely anomaly
            #     df_alg4_results[count]["predicted_attack"][window_count] = True

    end_time = timer()
    elapsed_time = end_time - start_time
    print(elapsed_time)   
    df_alg4_testing_time.loc[count] = [attack_names[count], elapsed_time]

    count += 1

df_alg4_testing_time.to_csv("c:\\Users\\sshar\\Desktop\\CAN IDS Benchmarking\\alg4_testing_time.csv", index = None)
                

In [None]:
# Get metrics 
saveMetrics(df_alg4_results, 4)