In [1]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the multiclass attack dataset; the first CSV column becomes the row index.
df_attacks_multiclass = pd.read_csv(
    'dataset/attacks-multiclass.csv',
    index_col=0,
)

In [None]:
# Preview the first 10 rows of the loaded frame.
df_attacks_multiclass.head(n=10)

#### Label Encoding for Multiclass Classification Labels 

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoder that maps each distinct label value to an integer code.
label_encoder = LabelEncoder()

# Learn the label vocabulary and swap the 'label' column for its integer codes.
# NOTE: the column is overwritten in place, so re-running this cell on its own
# would fit the encoder on the already-encoded values; re-run from the loading
# cell instead.
encoded_labels = label_encoder.fit_transform(df_attacks_multiclass['label'])
df_attacks_multiclass['label'] = encoded_labels

In [None]:
# Preview the first 10 rows again, now that 'label' holds the encoder output.
df_attacks_multiclass.head(n=10)

#### Data Preprocessing and Train-Validation-Test Splitting for Multiclass Classification

In [None]:
from sklearn.model_selection import train_test_split

# Target is the 'label' column; features are every other column.
y = df_attacks_multiclass['label']
X = df_attacks_multiclass.drop(columns=['label'])

# First cut: 70% training, 30% held out, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    random_state=4,
    stratify=y,
)

# Second cut: halve the held-out part into test and validation sets,
# again stratified, giving 70% / 15% / 15% overall.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=4,
    stratify=y_temp,
)

# Report the shape of every split: features first, then labels.
split_shapes = {
    "X_train": X_train.shape,
    "X_val": X_val.shape,
    "X_test": X_test.shape,
    "y_train": y_train.shape,
    "y_val": y_val.shape,
    "y_test": y_test.shape,
}
for split_name, split_shape in split_shapes.items():
    print(f"Shape of {split_name}:", split_shape)


#### Normalization

Min-Max Scaling: Scales each feature so that its values fall within a defined range (here, between 0 and 1). The formula is:
$$\frac{x - \min(x)}{\max(x) - \min(x)}$$ 

It adjusts the numerical values in the dataset to a common scale without distorting differences in the ranges of values.

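Scaling is applied after splitting the data: the scaler is fitted on the training set only and then applied to the validation and test sets, so that no information from the held-out sets leaks into training.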

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature minimum and maximum from the training split only.
scaler = MinMaxScaler()
scaler.fit(X_train)

# Apply the training-derived scaling to all three splits.
X_train_scaled = scaler.transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(split) for split in (X_val, X_test))

### Models Implementation

#### XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# XGBoost configuration for multiclass classification.
params = {
    'objective': 'multi:softmax',  # SoftMax objective; predict() returns class ids
    'num_class': df_attacks_multiclass['label'].nunique(),  # Number of distinct encoded classes
    'eval_metric': 'merror'  # Evaluation metric: multiclass classification error rate
}

# Wrap the scaled splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dval = xgb.DMatrix(X_val_scaled, label=y_val)

# Train for a fixed number of boosting rounds.
# `eval_metric` is only computed for datasets listed in `evals`, so both splits
# are passed as a watchlist; the metric is logged every 10 rounds to keep the
# cell output short. No early stopping is requested, so the fitted model is the
# same as without the watchlist.
num_rounds = 100
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=[(dtrain, 'train'), (dval, 'validation')],
    verbose_eval=10,
)

# Predict on the validation set. Cast the predicted class ids to int so they
# share a dtype with the integer-encoded labels in y_val.
y_val_pred = model.predict(dval).astype(int)

# Fraction of validation rows whose predicted class matches the true class.
accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", accuracy)

In [None]:
# Pin the matrix to the encoder's full class list so that its rows/columns
# always line up with the tick labels below, even if some class is absent from
# both the validation labels and the predictions.
class_ids = np.arange(len(label_encoder.classes_))

# Cast predictions to int in case the model returned the class ids as floats.
val_pred_ids = np.asarray(y_val_pred).astype(int)

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_val, val_pred_ids, labels=class_ids)

# Plot the confusion matrix as an annotated heatmap with the original class names.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax,
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()