# Bluetooth DoS Classifier

## Setup

In [24]:
import logging
import os
import sys
from contextlib import contextmanager

import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from joblib import dump, load
from scipy.sparse import csr_matrix, hstack, load_npz, save_npz
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

### Experiment Setup

In [26]:
EXPERIMENTS = "experiments"
if not os.path.exists(EXPERIMENTS):
    os.makedirs("experiments")

# get number of experiments (max + 1 is the next experiment number)
experiment_files = [f for f in os.listdir(EXPERIMENTS) if f.endswith(".log")]
experiment = (
    max([int(f.split(".")[0]) for f in experiment_files])
    if experiment_files
    else 0
) + 1

# create an experiments logger
logger = logging.getLogger("experiment_logger")
logger.setLevel(logging.INFO)
logger.handlers.clear()

# create file handler
file_handler = logging.FileHandler(f"experiments/{experiment}.log")
file_handler.setLevel(logging.INFO)
file_formatter = logging.Formatter("[%(asctime)s] %(message)s")
file_handler.setFormatter(file_formatter)
logger.addHandler(file_handler)

# create console handler
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
stream_formatter = logging.Formatter("%(message)s")
stream_handler.setFormatter(stream_formatter)
logger.addHandler(stream_handler)


@contextmanager  # redirect stdout to the logger
def log_redirector(logger):
    class StreamToLogger(object):
        """Fake file-like stream object that redirects writes to a logger instance."""

        def __init__(self, logger, level):
            self.logger = logger
            self.level = level

        def write(self, buffer):
            for line in buffer.rstrip().splitlines():
                self.logger.log(self.level, line.rstrip())

        def flush(self):
            pass

    old_stdout = sys.stdout
    sys.stdout = StreamToLogger(logger, logging.INFO)
    try:
        yield
    finally:
        sys.stdout = old_stdout

### Constants

In [25]:
DATA = "data"
MODELS = "models"

# data paths
ATTACK_TRAIN = os.path.join(DATA, "dos_train.csv")
BENIGN_TRAIN = os.path.join(DATA, "benign_train.csv")
ATTACK_TEST = os.path.join(DATA, "dos_test.csv")
BENIGN_TEST = os.path.join(DATA, "benign_test.csv")

# preprocessed data paths
PREPROCESSED_TRAIN = os.path.join(DATA, "preprocessed_train.csv")
PREPROCESSED_TEST = os.path.join(DATA, "preprocessed_test.csv")
LABELS_TRAIN = os.path.join(DATA, "labels_train.npy")
LABELS_TEST = os.path.join(DATA, "labels_test.npy")

# features paths
FEATURES_TEST = os.path.join(DATA, f"features_test_{experiment}.npz")
FEATURES_TRAIN = os.path.join(DATA, f"features_train_{experiment}.npz")

# models paths
VECTORIZER_MODEL = os.path.join(MODELS, f"vectorizer_{experiment}.joblib")
ENCODER_MODEL = os.path.join(MODELS, f"encoder_{experiment}.joblib")
SCALER_MODEL = os.path.join(MODELS, f"scaler_{experiment}.joblib")
GBM_MODEL = os.path.join(
    MODELS, f"gbm_{experiment}.joblib"
)  # gradient boosting machine

# ensure that the working directory is the same as the notebook
notebook_path = os.path.dirname(os.path.abspath("__file__"))
os.chdir(notebook_path)

# create directories
if not os.path.exists(DATA):
    raise Exception("Data directory not found.")
if not os.path.exists(MODELS):
    os.makedirs("models")

## Dataset Preprocessing

The dataset is obtained from the following link:
https://www.unb.ca/cic/datasets/iomt-dataset-2024.html

The specific dataset used is the "Bluetooth" dataset. The dataset is in `.pcap` format. The dataset is first converted to `.csv` format using the `tshark` command line tool (WireShark can also be used). The resulting dataset files are:

- `data/benign_test.csv`
- `data/benign_train.csv`
- `data/dos_test.csv`
- `data/dos_train.csv`

The original dataset are compressed in their `.pcap` format at `data/bl_dataset.zip`.

In [27]:
# read datasets
attack_train = pd.read_csv(ATTACK_TRAIN)
benign_train = pd.read_csv(BENIGN_TRAIN)
attack_test = pd.read_csv(ATTACK_TEST)
benign_test = pd.read_csv(BENIGN_TEST)

# add type column indicating attack or benign
attack_train["Type"] = 1
attack_test["Type"] = 1
benign_train["Type"] = 0
benign_test["Type"] = 0

# combine datasets
train_dataset = pd.concat([attack_train, benign_train], ignore_index=True)
test_dataset = pd.concat([attack_test, benign_test], ignore_index=True)

# shuffle datasets
train_dataset = train_dataset.sample(frac=1).reset_index(drop=True)
test_dataset = test_dataset.sample(frac=1).reset_index(drop=True)

# generate labels
train_labels = train_dataset["Type"]
train_dataset.drop(columns=["Type"], inplace=True)
test_labels = test_dataset["Type"]
test_dataset.drop(columns=["Type"], inplace=True)

In [28]:
# summary statistics
logger.info(f"Training data:\n{attack_train.describe()}\n")
logger.info(f"Testing data:\n{attack_test.describe()}\n")

# write modified dataset to files
train_dataset.to_csv(PREPROCESSED_TRAIN, index=False)
test_dataset.to_csv(PREPROCESSED_TEST, index=False)
np.save(LABELS_TRAIN, train_labels)
np.save(LABELS_TEST, test_labels)

Training data:
                 No.          Time         Length      Type
count  998391.000000  9.983910e+05  998391.000000  998391.0
mean   499196.000000  4.621494e+05      20.064474       1.0
std    288210.800641  4.679402e+05      12.352341       0.0
min         1.000000  0.000000e+00       4.000000       1.0
25%    249598.500000  2.276077e+03       8.000000       1.0
50%    499196.000000  6.853190e+05      16.000000       1.0
75%    748793.500000  8.540422e+05      32.000000       1.0
max    998391.000000  1.198804e+06     255.000000       1.0

Testing data:
                 No.           Time         Length      Type
count  251708.000000  251708.000000  251708.000000  251708.0
mean   125854.500000    1071.256077      20.006591       1.0
std     72661.985116     638.578923      12.006976       0.0
min         1.000000       0.000000       5.000000       1.0
25%     62927.750000     601.485238       8.000000       1.0
50%    125854.500000    1050.763037      16.000000       1.0
75%

## Feature Extraction

In [29]:
def apply_feature_hashing(dataset, column, n_features=20):
    """Applies feature hashing to a specified column of the dataset."""
    hasher = FeatureHasher(n_features=n_features, input_type="string")
    hashed_features = hasher.transform(
        dataset[column].apply(lambda x: [str(x)])
    )
    return hashed_features

In [30]:
# apply tf-idf vectorization to Info column
print("Applying TF-IDF vectorization...")
vectorizer = TfidfVectorizer()
train_info = vectorizer.fit_transform(train_dataset["Info"])
test_info = vectorizer.transform(test_dataset["Info"])

# apply one-hot encoding to Protocol column
print("Applying one-hot encoding...")
encoder = OneHotEncoder()
train_protocol = encoder.fit_transform(train_dataset[["Protocol"]])
test_protocol = encoder.transform(test_dataset[["Protocol"]])

# apply standard scaling to Length column
print("Applying standard scaling...")
scaler = StandardScaler()
train_length = scaler.fit_transform(train_dataset[["Length"]])
test_length = scaler.transform(test_dataset[["Length"]])

# apply feature hashing to Source and Destination columns
print("Applying feature hashing...")
train_source = apply_feature_hashing(train_dataset, "Source")
test_source = apply_feature_hashing(test_dataset, "Source")
train_destination = apply_feature_hashing(train_dataset, "Destination")
test_destination = apply_feature_hashing(test_dataset, "Destination")

# combine features
train_features = hstack(
    [
        csr_matrix(train_dataset[["Time"]]),
        train_source,
        train_destination,
        train_protocol,
        csr_matrix(train_length),
        train_info,
    ]
)
test_features = hstack(
    [
        csr_matrix(test_dataset[["Time"]]),
        test_source,
        test_destination,
        test_protocol,
        csr_matrix(test_length),
        test_info,
    ]
)

Applying TF-IDF vectorization...
Applying one-hot encoding...
Applying standard scaling...
Applying feature hashing...


In [31]:
# report feature extraction results
logger.info(f"TF-IDF Vocabulary size: {len(vectorizer.vocabulary_)}")
logger.info(f"One-Hot Encoding unique categories: {len(encoder.categories_[0])}")  # type: ignore
logger.info(f"Standard Scaling mean: {scaler.mean_[0]:.4f}")  # type: ignore
logger.info(f"Standard Scaling std: {scaler.scale_[0]:.4f}")  # type: ignore
logger.info(f"Feature Hashing features count: {train_source.shape[1]}")
logger.info(f"Total number of features: {train_features.shape[1]}\n")

# write features and models to files
save_npz(FEATURES_TRAIN, train_features)
save_npz(FEATURES_TEST, test_features)
_ = dump(vectorizer, VECTORIZER_MODEL)
_ = dump(encoder, ENCODER_MODEL)
_ = dump(scaler, SCALER_MODEL)

TF-IDF Vocabulary size: 609
One-Hot Encoding unique categories: 6
Standard Scaling mean: 22.7672
Standard Scaling std: 14.1469
Feature Hashing features count: 20
Total number of features: 657



## Model Training

In [32]:
# load features (to prevent forced extraction to define variables)
train_features = load_npz(FEATURES_TRAIN)
train_labels = np.load(LABELS_TRAIN)

# train model
model = GradientBoostingClassifier(verbose=1)
with log_redirector(logger):
    model.fit(train_features, train_labels)  # type: ignore
logger.info("Model trained complete.\n")

# write model to file
_ = dump(model, GBM_MODEL)

      Iter       Train Loss   Remaining Time
         1           0.7654            3.36m
         2           0.6565            3.13m
         3           0.5758            3.07m
         4           0.5121            3.06m
         5           0.4581            2.94m
         6           0.4140            2.86m
         7           0.3766            2.82m
         8           0.3440            2.80m
         9           0.3160            2.74m
        10           0.2917            2.79m
        20           0.1581            2.46m
        30           0.0744            2.12m
        40           0.0528            1.82m
        50           0.0317            1.54m
        60           0.0204            1.23m
        70           0.0173           55.84s


KeyboardInterrupt: 

## Evaluation

In [None]:
# load model and features (to prevent forced training to define variables)
model = load(GBM_MODEL)
test_features = load_npz(FEATURES_TEST)
test_labels = np.load(LABELS_TEST)

# evaluate model
predictions = model.predict(test_features)  # type: ignore
accuracy = metrics.accuracy_score(test_labels, predictions)
logger.info(f"Train accuracy: {accuracy}")
conf_matrix = metrics.confusion_matrix(test_labels, predictions)
logger.info(f"Confusion matrix:\n{conf_matrix}")
report = metrics.classification_report(test_labels, predictions)
logger.info(f"Classification report:\n{report}")

Train accuracy: 0.8133819920640428

Confusion matrix:
[[ 56898   8432]
 [ 50733 200975]]

Classification report:
              precision    recall  f1-score   support

           0       0.53      0.87      0.66     65330
           1       0.96      0.80      0.87    251708

    accuracy                           0.81    317038
   macro avg       0.74      0.83      0.76    317038
weighted avg       0.87      0.81      0.83    317038

