# Bluetooth DoS Classifier

Initial experiment using default parameters for the Gradient Boosting Classifier from `scikit-learn`. Preprocessing and feature extraction are kept to a minimum to create a baseline.

## Setup

In [36]:
import os

import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from joblib import dump, load
from scipy.sparse import csr_matrix, hstack, load_npz, save_npz
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# ensure that the working directory is the same as the notebook
notebook_path = os.path.dirname(os.path.abspath("__file__"))
os.chdir(notebook_path)

### Constants

In [37]:
DATA = "data"
MODELS = "models"

# data paths
ATTACK_TRAIN = os.path.join(DATA, "dos_train.csv")
BENIGN_TRAIN = os.path.join(DATA, "benign_train.csv")
ATTACK_TEST = os.path.join(DATA, "dos_test.csv")
BENIGN_TEST = os.path.join(DATA, "benign_test.csv")

# preprocessed data paths
PREPROCESSED_TRAIN = os.path.join(DATA, "preprocessed_train.csv")
PREPROCESSED_TEST = os.path.join(DATA, "preprocessed_test.csv")
LABELS_TRAIN = os.path.join(DATA, f"labels_train.npy")
LABELS_TEST = os.path.join(DATA, f"labels_test.npy")

# features paths
FEATURES_TEST = os.path.join(DATA, f"features_test.npz")
FEATURES_TRAIN = os.path.join(DATA, f"features_train.npz")

# models paths
VECTORIZER_MODEL = os.path.join(MODELS, f"vectorizer.joblib")
ENCODER_MODEL = os.path.join(MODELS, f"encoder.joblib")
SCALER_MODEL = os.path.join(MODELS, f"scaler.joblib")
GBM_MODEL = os.path.join(MODELS, f"gbm.joblib")  # gradient boosting machine

# create directories
if not os.path.exists(DATA):
    raise Exception("Data directory not found.")
if not os.path.exists(MODELS):
    os.makedirs("models")

# check that data files exist
if not all(
    map(
        os.path.exists,
        [ATTACK_TRAIN, BENIGN_TRAIN, ATTACK_TEST, BENIGN_TEST],
    )
):
    raise Exception("Data files not found.")

## Dataset Preprocessing

In [38]:
# read datasets
attack_train = pd.read_csv(ATTACK_TRAIN)
benign_train = pd.read_csv(BENIGN_TRAIN)
attack_test = pd.read_csv(ATTACK_TEST)
benign_test = pd.read_csv(BENIGN_TEST)

In [39]:
# add type column indicating attack or benign
attack_train["Type"] = 1
attack_test["Type"] = 1
benign_train["Type"] = 0
benign_test["Type"] = 0

# combine datasets
train_dataset = pd.concat([attack_train, benign_train], ignore_index=True)
test_dataset = pd.concat([attack_test, benign_test], ignore_index=True)

# shuffle datasets
train_dataset = train_dataset.sample(frac=1).reset_index(drop=True)
test_dataset = test_dataset.sample(frac=1).reset_index(drop=True)

# generate labels
train_labels = train_dataset["Type"]
train_dataset.drop(columns=["Type"], inplace=True)
test_labels = test_dataset["Type"]
test_dataset.drop(columns=["Type"], inplace=True)

In [40]:
# summary statistics
print(f"Training data:\n{attack_train.describe()}\n")
print(f"Testing data:\n{attack_test.describe()}")

Training data:
                 No.          Time         Length      Type
count  998391.000000  9.983910e+05  998391.000000  998391.0
mean   499196.000000  4.621494e+05      20.064474       1.0
std    288210.800641  4.679402e+05      12.352341       0.0
min         1.000000  0.000000e+00       4.000000       1.0
25%    249598.500000  2.276077e+03       8.000000       1.0
50%    499196.000000  6.853190e+05      16.000000       1.0
75%    748793.500000  8.540422e+05      32.000000       1.0
max    998391.000000  1.198804e+06     255.000000       1.0

Testing data:
                 No.           Time         Length      Type
count  251708.000000  251708.000000  251708.000000  251708.0
mean   125854.500000    1071.256077      20.006591       1.0
std     72661.985116     638.578923      12.006976       0.0
min         1.000000       0.000000       5.000000       1.0
25%     62927.750000     601.485238       8.000000       1.0
50%    125854.500000    1050.763037      16.000000       1.0
75%

In [41]:
# write preprocessed dataset to files
train_dataset.to_csv(PREPROCESSED_TRAIN, index=False)
test_dataset.to_csv(PREPROCESSED_TEST, index=False)
np.save(LABELS_TRAIN, train_labels)
np.save(LABELS_TEST, test_labels)

## Feature Extraction

In [42]:
# load preprocessed datasets
train_dataset = pd.read_csv(PREPROCESSED_TRAIN)
test_dataset = pd.read_csv(PREPROCESSED_TEST)

In [43]:
def apply_feature_hashing(dataset, column, n_features=20):
    """Applies feature hashing to a specified column of the dataset."""
    hasher = FeatureHasher(n_features=n_features, input_type="string")
    hashed_features = hasher.transform(
        dataset[column].apply(lambda x: [str(x)])
    )
    return hashed_features

In [44]:
# apply tf-idf vectorization to Info column
vectorizer = TfidfVectorizer()
train_info = vectorizer.fit_transform(train_dataset["Info"])
test_info = vectorizer.transform(test_dataset["Info"])

# apply one-hot encoding to Protocol column
encoder = OneHotEncoder()
train_protocol = encoder.fit_transform(train_dataset[["Protocol"]])
test_protocol = encoder.transform(test_dataset[["Protocol"]])

# apply standard scaling to Length column
scaler = StandardScaler()
train_length = scaler.fit_transform(train_dataset[["Length"]])
test_length = scaler.transform(test_dataset[["Length"]])

# apply feature hashing to Source and Destination columns
train_source = apply_feature_hashing(train_dataset, "Source")
test_source = apply_feature_hashing(test_dataset, "Source")
train_destination = apply_feature_hashing(train_dataset, "Destination")
test_destination = apply_feature_hashing(test_dataset, "Destination")

# combine features
train_features = hstack(
    [
        csr_matrix(train_dataset[["Time"]]),
        train_source,
        train_destination,
        train_protocol,
        csr_matrix(train_length),
        train_info,
    ]
)
test_features = hstack(
    [
        csr_matrix(test_dataset[["Time"]]),
        test_source,
        test_destination,
        test_protocol,
        csr_matrix(test_length),
        test_info,
    ]
)

In [45]:
# report feature extraction results
print(f"TF-IDF Vocabulary size: {len(vectorizer.vocabulary_)}")
print(f"One-Hot Encoding unique categories: {len(encoder.categories_[0])}")  # type: ignore
print(f"Standard Scaling mean: {scaler.mean_[0]:.4f}")  # type: ignore
print(f"Standard Scaling std: {scaler.scale_[0]:.4f}")  # type: ignore
print(f"Feature Hashing features count: {train_source.shape[1]}")
print(f"Total number of features: {train_features.shape[1]}")

TF-IDF Vocabulary size: 609
One-Hot Encoding unique categories: 6
Standard Scaling mean: 22.7672
Standard Scaling std: 14.1469
Feature Hashing features count: 20
Total number of features: 657


In [46]:
# write features and models to files
save_npz(FEATURES_TRAIN, train_features)
save_npz(FEATURES_TEST, test_features)
_ = dump(vectorizer, VECTORIZER_MODEL)
_ = dump(encoder, ENCODER_MODEL)
_ = dump(scaler, SCALER_MODEL)

## Model Training

In [47]:
# load features and labels
train_features = load_npz(FEATURES_TRAIN)
train_labels = np.load(LABELS_TRAIN)

In [48]:
# train model
model = GradientBoostingClassifier(verbose=1)
_ = model.fit(train_features, train_labels)  # type: ignore

      Iter       Train Loss   Remaining Time 
         1           0.7654            3.78m
         2           0.6565            3.24m
         3           0.5758            2.99m
         4           0.5121            2.85m
         5           0.4581            2.76m
         6           0.4140            2.70m
         7           0.3766            2.66m
         8           0.3440            2.63m
         9           0.3160            2.58m
        10           0.2917            2.55m
        20           0.1581            2.37m
        30           0.0744            2.12m
        40           0.0528            1.79m
        50           0.0317            1.50m
        60           0.0204            1.19m
        70           0.0173           53.52s
        80           0.0140           35.68s
        90           0.0126           17.69s
       100           0.0086            0.00s


In [49]:
# write model to file
_ = dump(model, GBM_MODEL)

## Evaluation

In [50]:
# load model and features
model = load(GBM_MODEL)
test_features = load_npz(FEATURES_TEST)
test_labels = np.load(LABELS_TEST)

In [51]:
# evaluate model
predictions = model.predict(test_features)  # type: ignore
accuracy = metrics.accuracy_score(test_labels, predictions)
print(f"Train accuracy: {accuracy}\n")
conf_matrix = metrics.confusion_matrix(test_labels, predictions)
print(f"Confusion matrix:\n{conf_matrix}\n")
report = metrics.classification_report(test_labels, predictions)
print(f"Classification report:\n{report}")

Train accuracy: 0.8341019057652395

Confusion matrix:
[[ 63467   1863]
 [ 50733 200975]]

Classification report:
              precision    recall  f1-score   support

           0       0.56      0.97      0.71     65330
           1       0.99      0.80      0.88    251708

    accuracy                           0.83    317038
   macro avg       0.77      0.88      0.80    317038
weighted avg       0.90      0.83      0.85    317038

