# Machine Learning for Networking - Group Project 5 - DDoS attacks detection and characterization - Section 2

# Preliminary operations 

## Import needed libraries

In [1]:
# import needed python libraries

%matplotlib inline

from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


ModuleNotFoundError: No module named 'matplotlib'

## Data preparation & standardization

In [None]:
# Feature columns excluded from the working dataset (dropped at the end of this cell).
columns_to_remove = [
    'Active Mean', 'Active Std', 'Average Packet Size', 'Avg Bwd Segment Size', 'Avg Fwd Segment Size',
    'Bwd IAT Max', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd Packet Length Mean', 'Bwd Packet Length Std',
    'Flow Duration', 'Flow IAT Max', 'Flow IAT Mean', 'Flow IAT Min', 'Flow IAT Std', 'Flow Packets/s',
    'Fwd Header Length', 'Fwd IAT Max', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Total',
    'Fwd PSH Flags', 'Fwd Packet Length Mean', 'Fwd Packet Length Min', 'Idle Max', 'Idle Mean',
    'Min Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
    'Protocol', 'Subflow Bwd Bytes', 'Subflow Bwd Packets', 'Subflow Fwd Bytes', 'Subflow Fwd Packets',
    'Total Backward Packets', 'Total Length of Bwd Packets', 'Total Length of Fwd Packets',
]

# Load the raw flows and strip surrounding whitespace from the CSV header names.
ddos_data = pd.read_csv("ddos_dataset.csv", sep=",").rename(columns=lambda name: name.strip())

# Clean 'SimillarHTTP': zeros are turned into the string "0" so the .str accessor keeps them,
# string values are stripped, and entries .str.strip() yields NaN for are replaced by 0.
similar_http = ddos_data['SimillarHTTP'].apply(lambda value: str(value) if value == 0 else value)
ddos_data['SimillarHTTP'] = similar_http.str.strip().fillna(0)

# errors='ignore' tolerates listed names that are absent from the CSV.
ddos_data = ddos_data.drop(columns=columns_to_remove, errors='ignore')

# 1. Perform a split to segment the dataset into training and test dataset, in a stratified way with respect to the labels

In [None]:
# Build the feature matrix (x) and the label column (y), then split them 70% / 30%
# into training and test sets, stratified on the label.
ddos_data_copy = (
    ddos_data
    .assign(label=pd.Categorical(ddos_data['label']).codes)  # class names -> integer codes
    .drop(columns=["Unnamed: 0", "Flow ID", "Destination IP", "Source IP", "Timestamp", "SimillarHTTP"])
)

x = ddos_data_copy.drop(columns=['label'])
y = ddos_data_copy[['label']]

# Stratified training/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, stratify=y, random_state=15)

# y is a single-column DataFrame; flatten the label splits to 1-D arrays for the estimators.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

# Standardize: the scaler is fitted on the training split only and reused for the test split.
scaler = StandardScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)

# PCA: keep as many components as needed to explain 95% of the variance
# (fitted on the training split only; adjust n_components as needed).
pca = PCA(n_components=0.95)
x_train_s = pca.fit_transform(x_train_s)
x_test_s = pca.transform(x_test_s)

In [None]:
# Preview only the first rows of the feature matrix instead of displaying the whole frame.
x.head()

# 2. Choose at least 3 ML methods, and perform the model training, with default parameter configuration, evaluating the performance on both training and test set. Output the confusion matrix and classification report. Do you observe overfitting or under-fitting? 

## 2 Models Training

In [None]:
# Baseline classifiers with default hyper-parameters (seeded for reproducibility), trained on the
# standardized + PCA-reduced training data. fit() returns the estimator itself.
decision_tree = DecisionTreeClassifier(random_state=42).fit(x_train_s, y_train)
random_forest = RandomForestClassifier(random_state=42).fit(x_train_s, y_train)
svm_classifier = SVC(random_state=42).fit(x_train_s, y_train)

# For each model, predict on the training split first and on the test split second.
y_train_pred_dt, y_test_pred_dt = (decision_tree.predict(split) for split in (x_train_s, x_test_s))
y_train_pred_rf, y_test_pred_rf = (random_forest.predict(split) for split in (x_train_s, x_test_s))
y_train_pred_svm, y_test_pred_svm = (svm_classifier.predict(split) for split in (x_train_s, x_test_s))



## 2 Output the confusion matrix and classification report

In [None]:
def plot_confusion_matrix_and_report(y_true, y_pred, model_name, dataset_type):
    """Show the confusion matrix as an annotated heatmap and print the classification report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    model_name : str
        Model name shown in the figure title.
    dataset_type : str
        Name of the evaluated split, shown in the figure title.
    """
    counts = confusion_matrix(y_true, y_pred)

    figure, ax = plt.subplots(figsize=(7, 6))
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', cbar_kws={'label': 'Occurrences'}, ax=ax)
    ax.set_xlabel('Prediction')
    ax.set_ylabel('True')
    ax.set_title(f'{model_name} - {dataset_type} - Confusion matrix')
    plt.show()

    print(classification_report(y_true, y_pred))


# Report every model on the training split first, then on the test split
# (Decision Tree, Random Forest, SVM within each split).
for dataset_type, y_true, split_predictions in (
    ('Training Set', y_train, (y_train_pred_dt, y_train_pred_rf, y_train_pred_svm)),
    ('Test Set', y_test, (y_test_pred_dt, y_test_pred_rf, y_test_pred_svm)),
):
    for model_name, y_pred in zip(('Decision Tree', 'Random Forest', 'SVM'), split_predictions):
        plot_confusion_matrix_and_report(y_true, y_pred, model_name, dataset_type)

The models show signs of overfitting: their performance on the training set is higher than on the test set.

# 3. Tune the hyper-parameters of the models through cross-validation. How does performance vary? Which model generates the best performance?

## Data preparation

In [None]:
# Carve the validation set out of the TRAINING portion only (x_train / y_train), so the held-out
# test set never takes part in hyper-parameter selection. Splitting the full x / y here would put
# test rows into the tuning data.
# 5/7 of the 70% training portion -> 50% of all data for tuning-train, 20% for validation.
x_train_htuning, x_val_htuning, y_train_htuning, y_val_htuning = train_test_split(
    x_train, y_train, stratify=y_train, train_size=5 / 7, random_state=42)

# Standardize and apply PCA with transformers fitted on the tuning-train split only.
# Dedicated names (scaler_h / pca_h) keep the scaler and PCA fitted for the main
# train/test split from being overwritten.
scaler_h = StandardScaler()
pca_h = PCA(n_components=0.95)  # Adjust n_components as needed
x_train_htuning_s = pca_h.fit_transform(scaler_h.fit_transform(x_train_htuning))
x_val_htuning_s = pca_h.transform(scaler_h.transform(x_val_htuning))

# Make sure the labels are 1-D arrays (a no-op if they already are).
y_train_htuning, y_val_htuning = np.ravel(y_train_htuning), np.ravel(y_val_htuning)

In [None]:
def randomized_search(model, param_dist, n_iter=50, cv=2, n_jobs=4):
    """Tune ``model`` with a randomized hyper-parameter search and score it on the validation split.

    The search is fitted on the notebook-level ``x_train_htuning_s`` / ``y_train_htuning`` and the
    refitted best estimator is scored on ``x_val_htuning_s`` / ``y_val_htuning``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    param_dist : dict
        Mapping of parameter names to the candidate values to sample from.
    n_iter : int, default=50
        Number of parameter settings sampled.
    cv : int or cross-validation splitter, default=2
        Cross-validation strategy forwarded to ``RandomizedSearchCV`` (for example the
        notebook's ``stratified_kfold`` object). The default keeps the previous behaviour.
    n_jobs : int, default=4
        Number of parallel jobs used by the search.

    Returns
    -------
    tuple
        ``(best_model, accuracy, best_params)``: the refitted best estimator, its ``score`` on
        the validation split, and the selected parameter dictionary.
    """
    print("Randomized Search for model:", model.__class__.__name__)
    # The search object has its own name so it no longer shadows this function inside its body.
    search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=n_iter, cv=cv,
                                n_jobs=n_jobs, verbose=2, random_state=42,
                                pre_dispatch='2*n_jobs')
    search.fit(x_train_htuning_s, y_train_htuning)
    best_model = search.best_estimator_
    accuracy = best_model.score(x_val_htuning_s, y_val_htuning)
    best_params = search.best_params_  # Best parameter combination found by the search
    return best_model, accuracy, best_params

In [None]:
# Untuned base estimators (seeded for reproducibility) handed to the randomized search
dt_clf, rf_clf, svm_clf = (
    estimator_cls(random_state=42)
    for estimator_cls in (DecisionTreeClassifier, RandomForestClassifier, SVC)
)

# Shuffled, seeded 3-fold stratified splitter.
# NOTE(review): no other visible cell references this object - confirm whether it was meant
# to be used as the cross-validation strategy of the search.
stratified_kfold = StratifiedKFold(shuffle=True, random_state=42, n_splits=3)

In [None]:
# Search space for the decision tree
dt_params = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Search space for the random forest
rf_params = dict(
    n_estimators=[10, 50, 100],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Search space for the SVM
svm_params = dict(
    C=[0.01, 0.1, 1, 10, 100],
    gamma=['scale', 'auto', 0.01, 0.1, 1],
    kernel=['linear', 'rbf'],
)

In [None]:
# Tune the SVM (number of sampled settings left at the function default).
svm_model, svm_accuracy, svm_best_params = randomized_search(model=svm_clf, param_dist=svm_params)

In [None]:
# Tune the decision tree (number of sampled settings left at the function default).
dt_model, dt_accuracy, dt_best_params = randomized_search(model=dt_clf, param_dist=dt_params)

In [None]:
# Tune the random forest (number of sampled settings left at the function default).
rf_model, rf_accuracy, rf_best_params = randomized_search(model=rf_clf, param_dist=rf_params)

In [None]:
# One row per tuned model: display name, validation accuracy, base estimator, selected hyper-parameters.
tuned_models = (
    ("Decision Tree", dt_accuracy, dt_clf, dt_best_params),
    ("Random Forest", rf_accuracy, rf_clf, rf_best_params),
    ("SVM", svm_accuracy, svm_clf, svm_best_params),
)

# Print the accuracies of the tuned models, then their hyper-parameters.
for model_label, model_accuracy, _estimator, _params in tuned_models:
    print(f"{model_label} accuracy: {model_accuracy}")

for row in (1, 0, 2):  # Random Forest, Decision Tree, SVM
    print(f"{tuned_models[row][0]} Best Parameters:", tuned_models[row][3])

# Determine the best overall model. max() returns the first row with the highest accuracy,
# so ties resolve in the order Decision Tree, Random Forest, SVM.
best_label, best_accuracy, best_model, best_params = max(tuned_models, key=lambda entry: entry[1])

print(f"Best model: {best_model.__class__.__name__} with accuracy {best_accuracy} and Hyperparameters {best_params}")

# 4. Investigate the False Positive and False Negative. Can you draw considerations about the misclassification in terms of features? Report your analysis and findings for the ones you consider the most notable samples. 

What needs to be done here: given the previous models, add a column with the predicted label to the DataFrame and then create two subsets. Then draw the appropriate considerations.

In [None]:
# Configure the selected estimator with its tuned hyper-parameters and refit it on the
# standardized + PCA-reduced training split (set_params returns the estimator itself).
best_model.set_params(**best_params).fit(x_train_s, y_train)
predicted_classes = best_model.predict(x_train_s)

# Attach predicted and true labels to a copy of the original (unscaled) training features.
# Both arrays are assigned positionally, so each needs one entry per row of x_train.
df_withPred = x_train.assign(**{
    'Predicted Class': predicted_classes,
    'Real Class': y_train,
})

In [None]:
# Flag the rows whose predicted class differs from the real class.
df_withPred['misclassified'] = df_withPred['Predicted Class'].ne(df_withPred['Real Class'])

In [None]:
# Save the features together with the predicted/real classes and the misclassification flag;
# index=False leaves the DataFrame index out of the file.
df_withPred.to_csv("./pred_Section2_Output.csv", index=False)

In [None]:
# Count correctly classified (False) vs misclassified (True) samples;
# the bare name on the last line displays the result as the cell output.
value_counts = df_withPred['misclassified'].value_counts()
value_counts

In [None]:
# Set the style of the visualizations
sns.set_style("whitegrid")

# Prepare data for plotting: compare the distribution of 'Fwd Packet Length Max'
# for misclassified vs correctly classified samples.
# The "misclassified" series is restricted to the rows flagged as misclassified; taking the
# column from the whole frame would plot every sample under the "Misclassified" label.
correctly_classified_df = df_withPred[~df_withPred['misclassified']]
misclassified_fwd_pkt_len_max = df_withPred.loc[df_withPred['misclassified'], 'Fwd Packet Length Max']
correctly_classified_fwd_pkt_len_max = correctly_classified_df['Fwd Packet Length Max']

# Plot distributions
plt.figure(figsize=(12, 6))
sns.histplot(correctly_classified_fwd_pkt_len_max, color="green", label="Correctly Classified", kde=True, stat="density", bins=30)
if misclassified_fwd_pkt_len_max.empty:
    # Nothing to draw when the model made no mistakes on this split.
    print("No misclassified samples to plot.")
else:
    sns.histplot(misclassified_fwd_pkt_len_max, color="red", label="Misclassified", kde=True, stat="density", bins=30)
plt.xlabel('Forward Packet Length Max')
plt.ylabel('Density')
plt.title('Distribution of Forward Packet Length Max for Misclassified vs Correctly Classified Samples')
plt.legend()
plt.show()

The plot illustrates the distribution of 'Forward Packet Length Max' for both misclassified and correctly classified samples. It reveals that while there's some overlap between the two distributions, misclassified samples have a more varied distribution with significant density at both lower and higher values of 'Forward Packet Length Max'. This suggests that extreme values in this feature might contribute to misclassification, potentially due to their atypical nature compared to the majority of samples in their actual class.

In [None]:
# Series for the distribution of 'Bwd Packet Length Max', misclassified vs correctly classified samples.
# The "misclassified" series is restricted to the rows flagged as misclassified; taking the
# column from the whole frame would include every sample.
misclassified_bwd_pkt_len_max = df_withPred.loc[df_withPred['misclassified'], 'Bwd Packet Length Max']
correctly_classified_bwd_pkt_len_max = correctly_classified_df['Bwd Packet Length Max']

In [None]:
# Overlay the two density histograms (with KDE curves) on a single set of axes.
figure, ax = plt.subplots(figsize=(12, 6))
shared_hist_options = dict(kde=True, stat="density", bins=30, ax=ax)
sns.histplot(correctly_classified_bwd_pkt_len_max, color="green", label="Correctly Classified", **shared_hist_options)
sns.histplot(misclassified_bwd_pkt_len_max, color="red", label="Misclassified", **shared_hist_options)
ax.set_xlabel('Backward Packet Length Max')
ax.set_ylabel('Density')
ax.set_title('Distribution of Backward Packet Length Max for Misclassified vs Correctly Classified Samples')
ax.legend()
plt.show()

This plot illustrates that while there is overlap between the two groups, misclassified samples exhibit a broader spread across the range of values, particularly towards higher packet sizes. This suggests that larger backward packet sizes might be a factor contributing to misclassification, indicating the model's sensitivity to variations in this feature. 