In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from HierarchyModel.HierarchyModel import HierarchyModel
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [3]:
data = pd.read_csv('Datasets/Complete/Dataset.csv')

X = data.drop('AttackType', axis=1)
y = data['AttackType']

target_classes = {
    "BruteForce": 50000,
    "XSS": 50000,
    "SQLInjection": 50000
}

smote = SMOTE(sampling_strategy=target_classes, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [4]:
y_resampled_lvl1 = y_resampled.apply(lambda x: 'MALIGN' if x != 'BENIGN' else x)
X_resampled_lvl1 = X_resampled.copy()

y_resampled_lvl2 = y_resampled[y_resampled_lvl1 == 'MALIGN']
y_resampled_lvl2 = y_resampled_lvl2.apply(lambda x: 'WebAttack' if x in ['XSS', 'SQLInjection', 'BruteForce'] else x)
X_resampled_lvl2 = X_resampled[y_resampled_lvl1 == 'MALIGN']

y_resampled_lvl3 = y_resampled[y_resampled.isin(['XSS', 'SQLInjection', 'BruteForce'])]
X_resampled_lvl3 = X_resampled[y_resampled.isin(['XSS', 'SQLInjection', 'BruteForce'])]

print(y_resampled_lvl3.value_counts())

AttackType
BruteForce      50000
XSS             50000
SQLInjection    50000
Name: count, dtype: int64


In [5]:
# Crear una tabla de equivalencia
equivalence_table = pd.DataFrame({'index_lvl1': X_resampled_lvl1.index})

# Añadir índices de nivel 2 (solo filas clasificadas como MALIGN en nivel 1)
equivalence_table['index_lvl2'] = None
equivalence_table.loc[y_resampled_lvl1 == 'MALIGN', 'index_lvl2'] = X_resampled_lvl2.index

# Añadir índices de nivel 3 (solo filas clasificadas como WebAttack en nivel 2)
web_attack_mask = (y_resampled_lvl1 == 'MALIGN') & (y_resampled_lvl2 == 'WebAttack')
equivalence_table['index_lvl3'] = None
equivalence_table.loc[web_attack_mask, 'index_lvl3'] = X_resampled_lvl3.index

# Eliminar índices no relevantes para reducir el espacio

equivalence_table.head()


Unnamed: 0,index_lvl1,index_lvl2,index_lvl3
0,0,0,0
1,1,1,1
2,2,2,2
3,3,3,3
4,4,4,4


In [6]:
# Encode labels for level 2
label_encoder_lvl2 = LabelEncoder()
y_resampled_lvl2_encoded = label_encoder_lvl2.fit_transform(y_resampled_lvl2)

# Encode labels for level 3
label_encoder_lvl3 = LabelEncoder()
y_resampled_lvl3_encoded = label_encoder_lvl3.fit_transform(y_resampled_lvl3)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

#  Split data for level 1 model
X_train_lvl1, X_test_lvl1, y_train_lvl1, y_test_lvl1 = train_test_split(X_resampled_lvl1, y_resampled_lvl1, test_size=0.2, random_state=42)

# Split data for level 2 model
X_train_lvl2, X_test_lvl2, y_train_lvl2, y_test_lvl2 = train_test_split(X_resampled_lvl2, y_resampled_lvl2_encoded, test_size=0.2, random_state=42)

# Split data for level 3 model
X_train_lvl3, X_test_lvl3, y_train_lvl3, y_test_lvl3 = train_test_split(X_resampled_lvl3, y_resampled_lvl3_encoded, test_size=0.2, random_state=42)

In [15]:
model_lvl1 = RandomForestClassifier(random_state=42)
model_lvl1.fit(X_train_lvl1, y_train_lvl1)

y_pred_lvl1 = model_lvl1.predict(X_test_lvl1)

              precision    recall  f1-score   support

      BENIGN       0.99      1.00      1.00     78452
      MALIGN       1.00      1.00      1.00     87520

    accuracy                           1.00    165972
   macro avg       1.00      1.00      1.00    165972
weighted avg       1.00      1.00      1.00    165972



In [16]:
lvl1_mask = y_pred_lvl1 == 'MALIGN'

In [19]:
model_lvl2 = MLPClassifier(random_state=42, max_iter=300)
model_lvl2.fit(X_train_lvl2, y_train_lvl2)

In [33]:
X_test_lvl2 = X_test_lvl1[lvl1_mask]

y_pred_lvl2 = model_lvl2.predict(X_test_lvl2)

y_pred_lvl2 = label_encoder_lvl2.inverse_transform(y_pred_lvl2)

y_pred_lvl1[lvl1_mask] = y_pred_lvl2

lvl2_mask = y_pred_lvl1 == 'WebAttack'



In [26]:
model_lvl3 = XGBClassifier(random_state=42)
model_lvl3.fit(X_train_lvl3, y_train_lvl3)

In [39]:
X_test_lvl3 = X_test_lvl1[lvl2_mask]

y_pred_lvl3 = model_lvl3.predict(X_test_lvl3)

y_pred_lvl3 = label_encoder_lvl3.inverse_transform(y_pred_lvl3)

y_pred_lvl1[lvl2_mask] = y_pred_lvl3

y_pred_final = y_pred_lvl1



print(classification_report(y_test, y_pred_final))


              precision    recall  f1-score   support

      BENIGN       0.99      1.00      1.00     78452
  BruteForce       0.95      0.87      0.91     10037
        DDoS       1.00      1.00      1.00     25788
    PortScan       1.00      1.00      1.00     31733
SQLInjection       0.97      0.99      0.98     10015
         XSS       0.92      0.96      0.94      9947

    accuracy                           0.99    165972
   macro avg       0.97      0.97      0.97    165972
weighted avg       0.99      0.99      0.99    165972

