# Bluetooth DoS Classifier

In [14]:
import os

import numpy as np
import pandas as pd

# ensure that the working directory is the same as the notebook
notebook_path = os.path.dirname(os.path.abspath("__file__"))
os.chdir(notebook_path)

# create directories
if not os.path.exists("data"):
    raise Exception("Data directory not found.")
if not os.path.exists("models"):
    os.makedirs("models")

In [15]:
# data paths
ATTACK_TRAIN = 'data/dos_train.csv'
BENIGN_TRAIN = 'data/benign_train.csv'
ATTACK_TEST = 'data/dos_test.csv'
BENIGN_TEST = 'data/benign_test.csv'

# preprocessed data paths
PREPROCESSED_TRAIN = 'data/preprocessed_train.csv'
PREPROCESSED_TEST = 'data/preprocessed_test.csv'
LABELS_TRAIN = 'data/labels_train.csv'
LABELS_TEST = 'data/labels_test.csv'

# features paths
FEATURES_TEST = 'data/features_test.csv'
FEATURES_TRAIN = 'data/features_train.csv'

# models paths
VECTORIZER_MODEL = 'models/vectorizer.joblib'
ENCODER_MODEL = 'models/encoder.joblib'
SCALER_MODEL = 'models/scaler.joblib'
GBM_MODEL = 'models/gbm.joblib'  # gradient boosting machine

## Dataset Preprocessing

The dataset is obtained from the following link:
https://www.unb.ca/cic/datasets/iomt-dataset-2024.html

The specific dataset used is the "Bluetooth" dataset. The dataset is in `.pcap` format. The dataset is first converted to `.csv` format using the `tshark` command line tool (WireShark can also be used). The resulting dataset files are:

- `benign_test.csv`
- `benign_train.csv`
- `dos_test.csv`
- `dos_train.csv`

The original dataset are compressed in their `.pcap` format at `data/bl_dataset.zip`.

In [16]:
# read datasets
attack_train = pd.read_csv(ATTACK_TRAIN)
benign_train = pd.read_csv(BENIGN_TRAIN)
attack_test = pd.read_csv(ATTACK_TEST)
benign_test = pd.read_csv(BENIGN_TEST)

# add type column indicating attack or benign
attack_train["Type"] = 1
attack_test["Type"] = 1
benign_train["Type"] = 0
benign_test["Type"] = 0

# combine datasets
train_dataset = pd.concat([attack_train, benign_train], ignore_index=True)
test_dataset = pd.concat([attack_test, benign_test], ignore_index=True)

# shuffle datasets
train_dataset = train_dataset.sample(frac=1).reset_index(drop=True)
test_dataset = test_dataset.sample(frac=1).reset_index(drop=True)

# generate labels
train_labels = train_dataset["Type"]
train_dataset.drop(columns=["Type"], inplace=True)
test_labels = test_dataset["Type"]
test_dataset.drop(columns=["Type"], inplace=True)

# summary statistics
print("Summarizing datasets...")
print(f"Training data:\n{attack_train.describe()}\n")
print(f"Testing data:\n{attack_test.describe()}")

# write modified dataset to files
train_dataset.to_csv(PREPROCESSED_TRAIN, index=False)
test_dataset.to_csv(PREPROCESSED_TEST, index=False)
np.save(LABELS_TRAIN, train_labels)
np.save(LABELS_TEST, test_labels)

Summarizing datasets...
Training data:
                 No.          Time         Length      Type
count  998391.000000  9.983910e+05  998391.000000  998391.0
mean   499196.000000  4.621494e+05      20.064474       1.0
std    288210.800641  4.679402e+05      12.352341       0.0
min         1.000000  0.000000e+00       4.000000       1.0
25%    249598.500000  2.276077e+03       8.000000       1.0
50%    499196.000000  6.853190e+05      16.000000       1.0
75%    748793.500000  8.540422e+05      32.000000       1.0
max    998391.000000  1.198804e+06     255.000000       1.0

Testing data:
                 No.           Time         Length      Type
count  251708.000000  251708.000000  251708.000000  251708.0
mean   125854.500000    1071.256077      20.006591       1.0
std     72661.985116     638.578923      12.006976       0.0
min         1.000000       0.000000       5.000000       1.0
25%     62927.750000     601.485238       8.000000       1.0
50%    125854.500000    1050.763037     