# Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset paths defined further down
# are built on top of this mount point.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Dependencies

In [None]:
# Install library
!pip install scapy



# Libraries

In [None]:
# Libraries

import gc # garbage collector
import random

import pandas as pd
import numpy as np
from scapy.all import *

from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import datasets, layers, models
from tensorflow.keras import regularizers

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Filepaths

In [None]:
# Folder on the mounted Google Drive that holds every .pcap capture used below.
drive_root_path = "/content/drive/MyDrive/AVTP dataset"

# Indoors original
# NOTE(review): presumably the indoors captures without injected traffic — confirm.
indoors_01_original_fp = f"{drive_root_path}/indoors_01_original.pcap"
indoors_02_original_fp = f"{drive_root_path}/indoors_02_original.pcap"

# Indoors injected
indoors_01_injected_fp = f"{drive_root_path}/indoors_01_injected.pcap"
indoors_02_injected_fp = f"{drive_root_path}/indoors_02_injected.pcap"

# Driving
driving_01_injected_fp = f"{drive_root_path}/driving_01_injected.pcap"
driving_02_injected_fp = f"{drive_root_path}/driving_02_injected.pcap"

# Injected only
# Packets from this capture are the reference set used to label packets as injected.
single_MPEG_frame_fp = f"{drive_root_path}/single-MPEG-frame.pcap"

# Helper functions

In [None]:
# Helper functions
def __read_raw_packets(pcap_filepath):
  """Read a .pcap file and return the raw bytes of its 438-byte packets.

  Args:
    pcap_filepath: path of the capture file handed to scapy's `rdpcap`.

  Returns:
    A list with one `raw(packet)` bytes object per packet whose length is
    exactly 438; packets of any other length are skipped.
  """
  # The length of AVTP packets is 438 bytes
  avtp_packet_length = 438

  return [
      raw(captured)
      for captured in rdpcap(pcap_filepath)
      if len(captured) == avtp_packet_length
  ]


def __convert_raw_packets(raw_packets_list):
  """Turn a list of raw packet byte strings into a uint8 matrix.

  Args:
    raw_packets_list: iterable of bytes-like objects, one per packet.

  Returns:
    A uint8 numpy array with one row per packet and one column per byte
    (an empty 1-D array when the input list is empty).
  """
  byte_rows = [np.frombuffer(payload, dtype='uint8') for payload in raw_packets_list]

  return np.array(byte_rows, dtype='uint8')


def __is_array_in_list_of_arrays(array_to_check, list_np_arrays):
  """Tell whether `array_to_check` equals at least one row of `list_np_arrays`.

  Args:
    array_to_check: 1-D array to look for.
    list_np_arrays: 2-D collection of candidate rows, compared element-wise
      against `array_to_check` through broadcasting.

  Returns:
    A truthy numpy boolean when some row matches in every position.
  """
  # Reference:
  # https://stackoverflow.com/questions/23979146/check-if-numpy-array-is-in-list-of-numpy-arrays
  elementwise_equal = array_to_check == list_np_arrays
  row_fully_equal = np.all(elementwise_equal, axis=1)

  return np.any(row_fully_equal)


def __generate_labels(packets_list, injected_packets):
  """Label every packet as injected (1) or benign (0).

  A packet is labelled 1 when its bytes are identical to one of the rows of
  `injected_packets`. Membership is checked through a set of byte strings, so
  each packet costs one hash lookup instead of a full comparison against every
  injected packet.

  Args:
    packets_list: 2-D array (or iterable of 1-D arrays), one packet per row.
    injected_packets: 2-D array with one known injected packet per row. May be
      empty, in which case every packet is labelled 0.

  Returns:
    A list of Python ints (0 or 1), one per entry of `packets_list`.
  """
  injected = np.asarray(injected_packets)

  # Nothing to match against: every packet is benign.
  if injected.size == 0:
    return [0] * len(packets_list)

  injected_signatures = {row.tobytes() for row in injected}

  # Packets are cast to the dtype of the injected rows so equal values always
  # serialise to equal byte strings, whatever dtype the caller used.
  reference_dtype = injected.dtype

  return [
      1 if np.asarray(packet, dtype=reference_dtype).tobytes() in injected_signatures else 0
      for packet in packets_list
  ]


def __select_packets_bytes(packets_list, first_byte=0, last_byte=58):
  """Keep only the byte columns `first_byte` (inclusive) to `last_byte` (exclusive).

  Args:
    packets_list: 2-D numpy array, one packet per row.
    first_byte: index of the first column to keep.
    last_byte: index one past the last column to keep.

  Returns:
    A new uint8 array holding the selected columns of every row.
  """
  byte_window = slice(first_byte, last_byte)

  return np.array(packets_list[:, byte_window], dtype='uint8')


def __calculate_difference_module(selected_packets):
  """Return the row-to-row byte differences, reduced modulo 256.

  Output row k is `(row[k + 1] - row[k]) mod 256`, so the result has one row
  fewer than the input.

  The subtraction is done in int16 instead of the input dtype: applying
  `np.mod(..., 256)` directly to a uint8 array raises OverflowError on
  NumPy >= 2 (256 is not representable as uint8), and on older versions the
  result dtype depended on value-based promotion.

  Args:
    selected_packets: 2-D array of integer byte values, one packet per row.

  Returns:
    A uint8 array of shape (rows - 1, columns) with values in 0..255.
  """
  # int16 holds every difference of two byte values (-255..255) without wrapping.
  widened = np.asarray(selected_packets).astype(np.int16)
  difference_array = np.diff(widened, axis=0)
  difference_module = np.mod(difference_array, 256)

  return difference_module.astype(np.uint8)


def __split_byte_into_nibbles(byte):
  """Split a byte value into its two 4-bit halves.

  Args:
    byte: integer value to split.

  Returns:
    A `(high_nibble, low_nibble)` tuple: bits 4-7 and bits 0-3 of `byte`,
    each masked to the range 0..15.
  """
  nibble_mask = 0xf

  return (byte >> 4) & nibble_mask, byte & nibble_mask


def __create_nibbles_matrix(difference_module):
  """Expand every byte of a matrix into two nibble columns (high, then low).

  For an input with n rows and p columns the result has n rows and 2 * p
  columns: column 2 * j holds the high nibble of input column j and column
  2 * j + 1 holds its low nibble. The split is done with vectorised numpy
  operations rather than a Python loop over every element.

  Args:
    difference_module: 2-D integer array (n rows, p columns).

  Returns:
    A uint8 array of shape (n, 2 * p). An empty 1-D input yields an empty
    1-D uint8 array.
  """
  byte_matrix = np.asarray(difference_module)

  # An empty list/1-D array has no columns to expand.
  if byte_matrix.ndim == 1 and byte_matrix.size == 0:
    return np.array([], dtype='uint8')

  n_rows, n_columns = byte_matrix.shape

  high_nibbles = (byte_matrix >> 4) & 0xf
  low_nibbles = byte_matrix & 0xf

  # Stacking on a new last axis and flattening it interleaves hi/low per byte.
  interleaved = np.stack((high_nibbles, low_nibbles), axis=-1)

  # The target shape is spelled out because reshape(-1) cannot be inferred
  # when there are zero rows.
  return interleaved.reshape(n_rows, 2 * n_columns).astype('uint8')


def aggregate_based_on_window_size(x_data, y_data, window_size=44, window_slide=44):
    """Group consecutive feature rows into fixed-size windows with one label each.

    Window i covers rows `i * window_slide` to `i * window_slide + window_size`
    (exclusive). Only complete windows are produced, including the one that
    ends exactly on the last row of `x_data`.

    Labelling depends on `window_slide`:
      * `window_slide == 1`: the label is `y_data[end_ix]`, the entry at the
        index right after the window's last row. In this notebook `x_data`
        holds packet-to-packet differences and `y_data` has one entry per
        packet, so that entry is the last packet contributing to the window.
        Windows whose `end_ix` falls outside `y_data` are not produced.
      * otherwise: the label is 1 if any of `y_data[start_ix:end_ix]` is 1,
        else 0.

    Args:
        x_data: 2-D numpy array of features, one row per step.
        y_data: sequence of labels indexed like (or one longer than) `x_data`.
        window_size: number of rows per window.
        window_slide: number of rows between the starts of consecutive windows.

    Returns:
        `(x_array, y_array)`: uint8 arrays with the stacked windows
        (n_windows, window_size, n_features) and their labels (n_windows,).
    """
    n_rows = x_data.shape[0]
    n_labels = len(y_data)
    windows, labels = [], []

    for i in range(n_rows):
        start_ix = i * window_slide
        end_ix = start_ix + window_size

        # Stop at the first window that would run past the data. A window
        # ending exactly at n_rows is complete and must be kept.
        if end_ix > n_rows:
            break

        if window_slide == 1:
            # The label sits one position after the window; stop when it does not exist.
            if end_ix >= n_labels:
                break
            window_label = y_data[end_ix]
        else:
            # If the sequence contains an attack, the label is considered as attack
            window_label = 1 if 1 in y_data[start_ix:end_ix] else 0

        windows.append(x_data[start_ix:end_ix])
        labels.append(window_label)

    x_array = np.array(windows, dtype='uint8')
    y_array = np.array(labels, dtype='uint8')

    return x_array, y_array


def __map_model_predictions(predictions, threshold=0.5):
  """Convert predicted probabilities into hard 0/1 class labels.

  Values strictly greater than `threshold` become 1, values less than or
  equal to it become 0. The input is left untouched: the result is a new
  array, so the caller still has the original probabilities afterwards.

  Args:
    predictions: array-like of predicted scores.
    threshold: decision threshold.

  Returns:
    A new numpy array with the same shape and dtype as `predictions`,
    holding 0/1 values.
  """
  mapped = np.array(predictions, copy=True)

  # Both masks are taken before writing so the second comparison is not
  # affected by the values assigned by the first.
  above_threshold = mapped > threshold
  at_or_below_threshold = mapped <= threshold

  mapped[above_threshold] = 1
  mapped[at_or_below_threshold] = 0

  return mapped


def __create_model():
  """Build and compile the CNN used as binary classifier.

  The network takes inputs of shape (44, 116, 1), applies two
  Conv2D -> BatchNormalization -> MaxPooling2D stages (32 and 64 filters),
  then Flatten, Dropout, a 64-unit dense layer, Dropout and a single sigmoid
  output unit. It is compiled with binary cross-entropy, Adam
  (learning rate 0.001) and a binary-accuracy metric.

  Returns:
    The compiled Keras Sequential model.
  """
  def feature_extraction_stage(filters):
    # One convolutional stage; both stages share every setting except the filter count.
    return [
        layers.Conv2D(filters, (5, 5), strides=(1,1), padding="same", activation="relu", kernel_regularizer='l2'),
        layers.BatchNormalization(momentum=0.99, epsilon=0.001),
        layers.MaxPooling2D(pool_size=(2, 2)),
    ]

  input_layers = [keras.Input(shape=(44, 116, 1))]

  # Binary classification head
  classification_layers = [
      layers.Flatten(),
      layers.Dropout(0.3),
      layers.Dense(units=64, activation="relu"),
      layers.Dropout(0.3),
      layers.Dense(units=1, activation="sigmoid"),
  ]

  model = models.Sequential(
      input_layers
      + feature_extraction_stage(32)
      + feature_extraction_stage(64)
      + classification_layers
  )

  # Optimizer and compile
  model.compile(
      loss="binary_crossentropy",
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
      metrics=[tf.keras.metrics.BinaryAccuracy(name="binary_accuracy")],
  )

  return model

# Feature generation and labeled dataset construction

## Load raw packets

In [None]:
# Injected
# Read the two indoors captures with injected traffic; only 438-byte packets are kept.
raw_indoors_01_injected_packets_list = __read_raw_packets(indoors_01_injected_fp)
raw_indoors_02_injected_packets_list = __read_raw_packets(indoors_02_injected_fp)

# Original
# NOTE(review): loading of the original captures is disabled; none of the visible
# cells below reads these variables.
# raw_indoors_01_original_packets_list = __read_raw_packets(indoors_01_original_fp)
# raw_indoors_02_original_packets_list = __read_raw_packets(indoors_02_original_fp)

# Injected (36)
# Reference packets used to decide which packets of the captures are injected.
# NOTE(review): "(36)" presumably is the number of packets in this capture — confirm.
raw_injected_packets_list = __read_raw_packets(single_MPEG_frame_fp)

# Convert and apply feature generator

In [None]:
# Get raw packets from .pcap files
# The first step is quite expensive

# Convert packets
## Injected
converted_indoors_01_injected_packets_list = __convert_raw_packets(raw_indoors_01_injected_packets_list)
converted_indoors_02_injected_packets_list = __convert_raw_packets(raw_indoors_02_injected_packets_list)

# Merged converted packets
# merged_indoors_injected_packets = np.concatenate((converted_indoors_01_injected_packets_list, converted_indoors_02_injected_packets_list), axis=0)

## Injected(36)
converted_injected_packets = __convert_raw_packets(raw_injected_packets_list)

# Generate labels
# One label per packet: 1 when the full packet equals one of the reference injected packets.
y_indoors_01_injected = __generate_labels(converted_indoors_01_injected_packets_list, converted_injected_packets)
y_indoors_02_injected = __generate_labels(converted_indoors_02_injected_packets_list, converted_injected_packets)

# Select first 58 bytes
selected_indoor_01_injected_packets = __select_packets_bytes(converted_indoors_01_injected_packets_list)
selected_indoor_02_injected_packets = __select_packets_bytes(converted_indoors_02_injected_packets_list)

# Calculate difference and module between rows
# The result has one row fewer than the packet matrix (difference of consecutive rows).
diff_module_indoor_01_injected_packets = __calculate_difference_module(selected_indoor_01_injected_packets)
diff_module_indoor_02_injected_packets = __calculate_difference_module(selected_indoor_02_injected_packets)

# Split difference into two nibbles
# 58 bytes per row become 116 nibble columns.
nibbles_indoors_01_injected_packets = __create_nibbles_matrix(diff_module_indoor_01_injected_packets)
nibbles_indoors_02_injected_packets = __create_nibbles_matrix(diff_module_indoor_02_injected_packets)

# Aggregate features and labels based on window size
# Each capture is windowed separately so no window mixes rows from both files.
X_indoors_01_injected_agg, y_indoors_01_injected_agg = aggregate_based_on_window_size(nibbles_indoors_01_injected_packets, y_indoors_01_injected, window_size=44, window_slide=1)
X_indoors_02_injected_agg, y_indoors_02_injected_agg = aggregate_based_on_window_size(nibbles_indoors_02_injected_packets, y_indoors_02_injected, window_size=44, window_slide=1)

# Stack the windows and labels of both captures into the final dataset.
X_indoors_injected_agg = np.concatenate((X_indoors_01_injected_agg, X_indoors_02_injected_agg), axis=0)
y_indoors_injected_agg = np.concatenate((y_indoors_01_injected_agg, y_indoors_02_injected_agg), axis=0)

In [None]:
# Delete unused variables
# Drops every intermediate of the feature pipeline; the concatenated
# X_indoors_injected_agg / y_indoors_injected_agg are not deleted here.

# Raw packet bytes
del raw_indoors_01_injected_packets_list, raw_indoors_02_injected_packets_list
del raw_injected_packets_list

# Converted uint8 matrices
del converted_indoors_01_injected_packets_list, converted_indoors_02_injected_packets_list
del converted_injected_packets

# Per-packet labels
del y_indoors_01_injected, y_indoors_02_injected

# Selected bytes, differences and nibbles
del selected_indoor_01_injected_packets, selected_indoor_02_injected_packets
del diff_module_indoor_01_injected_packets, diff_module_indoor_02_injected_packets
del nibbles_indoors_01_injected_packets, nibbles_indoors_02_injected_packets

# Per-capture windows and labels
del X_indoors_01_injected_agg, y_indoors_01_injected_agg
del X_indoors_02_injected_agg, y_indoors_02_injected_agg

In [None]:
# Checking if dataset was properly labeled

# np.unique with return_counts=True gives the sorted distinct label values and
# how many windows carry each of them.
indoors_unique, indoors_counts = np.unique(np.array(y_indoors_injected_agg), return_counts=True)

# Dindoors has 446,372 benign Xis and 196,894 injected Xis [Paper information]
# NOTE(review): indexing counts[0] / counts[1] assumes both labels 0 and 1 occur.
print(f"Dindoors has {indoors_counts[0]} bening Xis and {indoors_counts[1]} injected Xis")

Dindoors has 446372 bening Xis and 196892 injected Xis


# Create model

In [None]:
# Shape of the aggregated dataset: (number of windows, rows per window, nibbles per row).
X_indoors_injected_agg.shape

(643264, 44, 116)

In [None]:
def reset_seeds():
    """Reset the numpy, `random` and TensorFlow seeds to fixed values.

    Called before each model is created so every fold starts from the same
    random state. The TensorFlow seed function is selected by checking which
    API is available rather than by parsing the version string:
    `tf.random.set_seed` when it exists, otherwise the older
    `tf.set_random_seed`.
    """
    np.random.seed(1)
    random.seed(2)

    set_tf_seed = getattr(tf.random, "set_seed", None)
    if set_tf_seed is None:
        set_tf_seed = tf.set_random_seed
    set_tf_seed(3)

    print("RANDOM SEEDS RESET")

In [None]:
# In each cross-validation, 80% and 20% of the samples
# are randomly selected as the training and validation sets, respectively.
# We randomly assigned 80% of Xis to the training set
# and 20% of Xis to the test set for each Dindoors
# (5 stratified folds: every fold trains on 4/5 of the windows and validates on 1/5.)
skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

metrics_list = []
fold_number = 1

for train_index, val_index in skf.split(X_indoors_injected_agg, y_indoors_injected_agg):
  print(f"Fold atual = {fold_number}")
  X_train, X_val = X_indoors_injected_agg[train_index], X_indoors_injected_agg[val_index]
  y_train, y_val = y_indoors_injected_agg[train_index], y_indoors_injected_agg[val_index]

  # Same seeds for every fold so the folds differ only in their data split.
  reset_seeds()
  model = __create_model()
  # summary() prints the table itself and returns None, so it is not wrapped in print().
  model.summary()

  # Keep the weights of the epoch with the best training binary accuracy on Drive.
  checkpoint = tf.keras.callbacks.ModelCheckpoint(
      f"{drive_root_path}/model_{fold_number}.h5",
      monitor='binary_accuracy', verbose=0,
      save_best_only=True, mode='max')
  # Stop when the training loss has not improved for 5 epochs.
  early_stopping = tf.keras.callbacks.EarlyStopping(monitor="loss",
                                                    patience=5,
                                                    verbose=1)
  callbacks_list = [early_stopping,
                    checkpoint]

  model.fit(X_train, y_train,
            batch_size=64,
            epochs=30,
            callbacks=callbacks_list)

  # Run predictions on validation set
  probability_predictions = model.predict(X_val)

  # ROC AUC needs the continuous scores, not the thresholded labels. It is
  # computed before the mapping below because that mapping may overwrite the
  # probability array it receives.
  roc_auc = roc_auc_score(y_val, probability_predictions.ravel())

  y_pred = __map_model_predictions(probability_predictions)

  # Calculate metrics
  acc = accuracy_score(y_val, y_pred)
  prec = precision_score(y_val, y_pred)
  recall = recall_score(y_val, y_pred)
  f1 = f1_score(y_val, y_pred)

  # Append metrics on list
  # The CSV is rewritten after every fold so finished folds survive an interrupted run.
  metrics_list.append([fold_number, acc, prec, recall, f1, roc_auc])
  metrics_df = pd.DataFrame(metrics_list, columns=["fold", "acc", "prec", "recall", "f1", "roc_auc"])
  metrics_df.to_csv(f"{drive_root_path}/models_metrics.csv")

  fold_number = fold_number + 1

  # Release the fold's model and data before the next fold is built.
  del model
  del probability_predictions
  del y_pred
  del X_train, X_val
  del y_train, y_val
  gc.collect()
  tf.keras.backend.clear_session()

Fold atual = 1
RANDOM SEEDS RESET
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 44, 116, 32)       832       
                                                                 
 batch_normalization (BatchN  (None, 44, 116, 32)      128       
 ormalization)                                                   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 22, 58, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 22, 58, 64)        51264     
                                                                 
 batch_normalization_1 (Batc  (None, 22, 58, 64)       256       
 hNormalization)                                                 
                      

In [None]:
# Per-fold metrics; a bare last expression uses the notebook's rich table display.
metrics_df

   fold       acc      prec    recall        f1   roc_auc
0     1  0.993284  0.995421  0.982579  0.988958  0.990293
1     2  0.993883  0.982689  0.997587  0.990082  0.994918
2     3  0.994917  0.987610  0.995886  0.991731  0.995188
3     4  0.993945  0.984462  0.995937  0.990166  0.994502
4     5  0.995546  0.992212  0.993245  0.992728  0.994903
