# Bluetooth DoS Classifier

## Setup

In [24]:
import os

import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from joblib import dump, load
from scipy.sparse import csr_matrix, hstack, load_npz, save_npz
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# `__file__` is not defined inside a notebook kernel, so the notebook's own
# location cannot be derived from it. All relative paths below are resolved
# against the kernel's current working directory; start Jupyter from the
# folder that contains this notebook.
notebook_path = os.getcwd()

# the raw CSV inputs must already be present; fail early with a specific
# error type instead of a bare Exception
if not os.path.isdir("data"):
    raise FileNotFoundError("Data directory not found.")

# create the output directory for fitted models (no-op when it already exists)
os.makedirs("models", exist_ok=True)

In [25]:
# raw data paths (CSV exports of the original .pcap captures)
ATTACK_TRAIN = "data/dos_train.csv"
BENIGN_TRAIN = "data/benign_train.csv"
ATTACK_TEST = "data/dos_test.csv"
BENIGN_TEST = "data/benign_test.csv"

# preprocessed data paths
PREPROCESSED_TRAIN = "data/preprocessed_train.csv"
PREPROCESSED_TEST = "data/preprocessed_test.csv"
# Labels are written with np.save and read back with np.load. np.save appends
# ".npy" to a file name that lacks it, while np.load opens the name verbatim,
# so the extension has to be spelled out for the round trip to find the file.
LABELS_TRAIN = "data/labels_train.npy"
LABELS_TEST = "data/labels_test.npy"

# features paths
# Same rule as above for scipy's save_npz / load_npz and the ".npz" extension.
FEATURES_TEST = "data/features_test.npz"
FEATURES_TRAIN = "data/features_train.npz"

# models paths
VECTORIZER_MODEL = "models/vectorizer.joblib"
ENCODER_MODEL = "models/encoder.joblib"
SCALER_MODEL = "models/scaler.joblib"
GBM_MODEL = "models/gbm.joblib"  # gradient boosting machine

## Dataset Preprocessing

The dataset is obtained from the following link:
https://www.unb.ca/cic/datasets/iomt-dataset-2024.html

The specific dataset used is the "Bluetooth" dataset, which is distributed in `.pcap` format. It is first converted to `.csv` format using the `tshark` command-line tool (Wireshark can also be used). The resulting dataset files are:

- `data/benign_test.csv`
- `data/benign_train.csv`
- `data/dos_test.csv`
- `data/dos_train.csv`

The original captures are kept, compressed and in their `.pcap` format, at `data/bl_dataset.zip`.

In [26]:
# read datasets
attack_train = pd.read_csv(ATTACK_TRAIN)
benign_train = pd.read_csv(BENIGN_TRAIN)
attack_test = pd.read_csv(ATTACK_TEST)
benign_test = pd.read_csv(BENIGN_TEST)

# add type column indicating attack (1) or benign (0)
attack_train["Type"] = 1
attack_test["Type"] = 1
benign_train["Type"] = 0
benign_test["Type"] = 0

# combine attack and benign rows, then shuffle them.
# A fixed seed keeps the row order (and therefore the files written from these
# frames) identical across kernel restarts.
SHUFFLE_SEED = 42
train_dataset = (
    pd.concat([attack_train, benign_train], ignore_index=True)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
test_dataset = (
    pd.concat([attack_test, benign_test], ignore_index=True)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# generate labels: pop removes the "Type" column from the frame and returns it,
# so the feature frames no longer contain the target
train_labels = train_dataset.pop("Type")
test_labels = test_dataset.pop("Type")

In [27]:
# summary statistics
# These frames hold only the attack (DoS) captures, not the combined and
# shuffled datasets written below, so the headings say so explicitly.
print(f"Attack training data:\n{attack_train.describe()}\n")
print(f"Attack testing data:\n{attack_test.describe()}")

# write modified dataset to files
train_dataset.to_csv(PREPROCESSED_TRAIN, index=False)
test_dataset.to_csv(PREPROCESSED_TEST, index=False)
# np.save appends ".npy" when the target name does not already end with it
np.save(LABELS_TRAIN, train_labels)
np.save(LABELS_TEST, test_labels)

Training data:
                 No.          Time         Length      Type
count  998391.000000  9.983910e+05  998391.000000  998391.0
mean   499196.000000  4.621494e+05      20.064474       1.0
std    288210.800641  4.679402e+05      12.352341       0.0
min         1.000000  0.000000e+00       4.000000       1.0
25%    249598.500000  2.276077e+03       8.000000       1.0
50%    499196.000000  6.853190e+05      16.000000       1.0
75%    748793.500000  8.540422e+05      32.000000       1.0
max    998391.000000  1.198804e+06     255.000000       1.0

Testing data:
                 No.           Time         Length      Type
count  251708.000000  251708.000000  251708.000000  251708.0
mean   125854.500000    1071.256077      20.006591       1.0
std     72661.985116     638.578923      12.006976       0.0
min         1.000000       0.000000       5.000000       1.0
25%     62927.750000     601.485238       8.000000       1.0
50%    125854.500000    1050.763037      16.000000       1.0
75%

## Feature Extraction

In [28]:
def apply_feature_hashing(dataset, column, n_features=20):
    """Hash one column of ``dataset`` into a fixed-width feature matrix.

    Every value in ``dataset[column]`` is converted with ``str`` and handed to
    a ``FeatureHasher`` as a sample consisting of that single string.

    Args:
        dataset: Table-like object; ``dataset[column]`` must support ``apply``.
        column: Name of the column to hash.
        n_features: Number of output columns produced by the hasher.

    Returns:
        The matrix returned by ``FeatureHasher.transform``.
    """

    def as_single_token(value):
        # the hasher expects each sample to be an iterable of strings
        return [str(value)]

    samples = dataset[column].apply(as_single_token)
    return FeatureHasher(n_features=n_features, input_type="string").transform(
        samples
    )

In [29]:
def _combine_features(dataset, source, destination, protocol, length, info):
    """Stack the per-column feature matrices side by side.

    Using one helper for both splits guarantees that train and test features
    share the same column order.
    """
    return hstack(
        [
            csr_matrix(dataset[["Time"]]),
            source,
            destination,
            protocol,
            csr_matrix(length),
            info,
        ]
    )


# apply tf-idf vectorization to Info column
# (fitted on the training split only, then reused for the test split)
print("Applying TF-IDF vectorization...")
vectorizer = TfidfVectorizer()
train_info = vectorizer.fit_transform(train_dataset["Info"])
test_info = vectorizer.transform(test_dataset["Info"])

# apply one-hot encoding to Protocol column
# handle_unknown="ignore": a protocol value that never occurred in the training
# split is encoded as an all-zero row instead of raising during transform
print("Applying one-hot encoding...")
encoder = OneHotEncoder(handle_unknown="ignore")
train_protocol = encoder.fit_transform(train_dataset[["Protocol"]])
test_protocol = encoder.transform(test_dataset[["Protocol"]])

# apply standard scaling to Length column
print("Applying standard scaling...")
scaler = StandardScaler()
train_length = scaler.fit_transform(train_dataset[["Length"]])
test_length = scaler.transform(test_dataset[["Length"]])

# apply feature hashing to Source and Destination columns
print("Applying feature hashing...")
train_source = apply_feature_hashing(train_dataset, "Source")
test_source = apply_feature_hashing(test_dataset, "Source")
train_destination = apply_feature_hashing(train_dataset, "Destination")
test_destination = apply_feature_hashing(test_dataset, "Destination")

# combine features
# NOTE(review): the raw "Time" column is passed through unscaled as a feature;
# confirm its values are comparable between the train and test captures.
train_features = _combine_features(
    train_dataset,
    train_source,
    train_destination,
    train_protocol,
    train_length,
    train_info,
)
test_features = _combine_features(
    test_dataset,
    test_source,
    test_destination,
    test_protocol,
    test_length,
    test_info,
)

Applying TF-IDF vectorization...
Applying one-hot encoding...
Applying standard scaling...
Applying feature hashing...


In [33]:
# summarise what each fitted transformer produced
feature_report = (
    f"TF-IDF Vocabulary size: {len(vectorizer.vocabulary_)}",
    f"One-Hot Encoding unique categories: {len(encoder.categories_[0])}",  # type: ignore
    f"Standard Scaling mean: {scaler.mean_[0]:.4f}",  # type: ignore
    f"Standard Scaling std: {scaler.scale_[0]:.4f}",  # type: ignore
    f"Feature Hashing features count: {train_source.shape[1]}",
    f"Total number of features: {train_features.shape[1]}",
)
for report_line in feature_report:
    print(report_line)

# persist the feature matrices so later cells can reload them from disk
for matrix_path, matrix in (
    (FEATURES_TRAIN, train_features),
    (FEATURES_TEST, test_features),
):
    save_npz(matrix_path, matrix)

# persist the fitted transformers alongside the model artifacts
for fitted_transformer, artifact_path in (
    (vectorizer, VECTORIZER_MODEL),
    (encoder, ENCODER_MODEL),
    (scaler, SCALER_MODEL),
):
    dump(fitted_transformer, artifact_path)

TF-IDF Vocabulary size: 609
One-Hot Encoding unique categories: 6
Standard Scaling mean: 22.7672
Standard Scaling std: 14.1469
Feature Hashing features count: 20
Total number of features: 657


## Model Training

In [35]:
# load features and labels from disk so this cell can run without re-running
# the feature extraction cells
train_features = load_npz(FEATURES_TRAIN)
train_labels = np.load(LABELS_TRAIN)

# train model
# a fixed random_state makes repeated fits produce the same model
model = GradientBoostingClassifier(verbose=1, random_state=42)
model.fit(train_features, train_labels)  # type: ignore

# write model to file
dump(model, GBM_MODEL);

      Iter       Train Loss   Remaining Time 
         1           0.7654            2.87m
         2           0.6565            2.83m
         3           0.5758            2.81m
         4           0.5121            2.75m
         5           0.4581            2.69m
         6           0.4140            2.64m
         7           0.3766            2.64m
         8           0.3440            2.59m
         9           0.3160            2.56m
        10           0.2917            2.54m
        20           0.1581            2.35m
        30           0.0744            2.09m
        40           0.0528            1.84m
        50           0.0317            1.52m
        60           0.0204            1.21m
        70           0.0173           54.89s
        80           0.0140           36.53s
        90           0.0126           18.23s
       100           0.0086            0.00s


## Evaluation

In [37]:
# load model and features (to prevent forced training to define variables)
model = load(GBM_MODEL)
test_features = load_npz(FEATURES_TEST)
test_labels = np.load(LABELS_TEST)

# evaluate model on the held-out test split
predictions = model.predict(test_features)  # type: ignore
accuracy = metrics.accuracy_score(test_labels, predictions)
# this score is computed from the test features/labels, so label it as such
print(f"Test accuracy: {accuracy}\n")
conf_matrix = metrics.confusion_matrix(test_labels, predictions)
print(f"Confusion matrix:\n{conf_matrix}\n")
report = metrics.classification_report(test_labels, predictions)
print(f"Classification report:\n{report}")

Train accuracy: 0.8341019057652395

Confusion matrix:
[[ 63467   1863]
 [ 50733 200975]]

Classification report:
              precision    recall  f1-score   support

           0       0.56      0.97      0.71     65330
           1       0.99      0.80      0.88    251708

    accuracy                           0.83    317038
   macro avg       0.77      0.88      0.80    317038
weighted avg       0.90      0.83      0.85    317038

