# In this file, we will import several models of specific cyber attack classifier (based on DecisionTreeClassifier, best classfier)
### We need to import
- Classification
- All specific attack models

#### Libraries importation

In [8]:
# Libraries
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier, Perceptron, RidgeClassifierCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import joblib
import gc

# Path to datasets
DATASET_DIRECTORY = ".\Files\\"

# Classification training (if you already have classification model in your Models repository, you can go to the next section)
### Table of content
- Importing DataSet
- Scaling
- Classification

#### Importing DataSet and scaling 

In [9]:
# Get all the datasets in the directory and sort them
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()

# Only use a part of all datasets (100 files)
df_sets = df_sets[0:10]

# Set 80% of the datasets as training sets and 20% as test sets
training_sets = df_sets[:int(len(df_sets)*.8)]
test_sets = df_sets[int(len(df_sets)*.8):]

# Define each column of the dataset and the target column
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count', 
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight', 
]
y_column = 'label'

# Define the scaler method
scaler = StandardScaler()

# For each training set
for train_set in tqdm(training_sets):
    # Fit the scaler on the training sets
    scaler.fit(pd.read_csv(DATASET_DIRECTORY + train_set)[X_columns])

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:25<00:00,  3.18s/it]


#### Classification

In [14]:
# Turn variable to True if you want to save the model
save_model = True

# Turn variable to True if you want to load the model

# Definition of the models to use and their names
ML_model = DecisionTreeClassifier()
ML_neam = "DecisionTreeClassifier"

# For each dataset of the training set
for train_set in tqdm(training_sets):
    # Load the dataset
    d = pd.read_csv(DATASET_DIRECTORY + train_set)

    # Normalize the dataset
    d[X_columns] = scaler.transform(d[X_columns])

    # # For each model
    # for model in (ML_models):
    # Train the model
    ML_model.fit(d[X_columns], d[y_column])

    # Delete the dataset from the memory
    del d

# Initialize the list of true labels
y_test = []

# Initialize the list of predictions
preds = []

# For each dataset of the test set
for test_set in tqdm(test_sets):
    # Load the dataset
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)

    # Normalize the dataset
    d_test[X_columns] = scaler.transform(d_test[X_columns])

    # Add the true labels to the list
    y_test += list(d_test[y_column].values)
    
    # For each model
    # for i in (range(len(ML_model))):
        # Select the model
    model = ML_model

    # Predict the labels
    y_pred = list(model.predict(d_test[X_columns]))

    # Add the predictions to the list
    preds = preds + y_pred

    prediction_result = {"y_test":y_test, "y_pred":y_pred}

    # Delete the dataset from the memory
    del d_test

# For each prediction
print("Prediction results:")
print()

# If the user wants to save the models
if save_model:
    # Save the model
    joblib.dump(ML_model, f".\Models\model_{ML_neam}_34_classes.pkl")

y_pred = preds
#print('y_pred: ', len(y_pred))
#print('y_test: ', len(y_test))
print(f"##### {ML_neam} (34 classes) #####")
print('accuracy_score: ', accuracy_score(y_pred, y_test))
print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
print()

# Flush the memory
# gc.collect()

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [02:38<00:00, 19.78s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.27s/it]


Prediction results:

##### DecisionTreeClassifier (34 classes) #####
accuracy_score:  0.9919125295657917
recall_score:  0.802321408357807
precision_score:  0.7979221157762059
f1_score:  0.7966748460187614



#### Dictionnary for 7 classes

In [15]:
dict_7classes = {}
dict_7classes['DDoS-RSTFINFlood'] = 'DDoS'
dict_7classes['DDoS-PSHACK_Flood'] = 'DDoS'
dict_7classes['DDoS-SYN_Flood'] = 'DDoS'
dict_7classes['DDoS-UDP_Flood'] = 'DDoS'
dict_7classes['DDoS-TCP_Flood'] = 'DDoS'
dict_7classes['DDoS-ICMP_Flood'] = 'DDoS'
dict_7classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
dict_7classes['DDoS-ACK_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-UDP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-SlowLoris'] = 'DDoS'
dict_7classes['DDoS-HTTP_Flood'] = 'DDoS'

dict_7classes['DoS-UDP_Flood'] = 'DoS'
dict_7classes['DoS-SYN_Flood'] = 'DoS'
dict_7classes['DoS-TCP_Flood'] = 'DoS'
dict_7classes['DoS-HTTP_Flood'] = 'DoS'


dict_7classes['Mirai-greeth_flood'] = 'Mirai'
dict_7classes['Mirai-greip_flood'] = 'Mirai'
dict_7classes['Mirai-udpplain'] = 'Mirai'

dict_7classes['Recon-PingSweep'] = 'Recon'
dict_7classes['Recon-OSScan'] = 'Recon'
dict_7classes['Recon-PortScan'] = 'Recon'
dict_7classes['VulnerabilityScan'] = 'Recon'
dict_7classes['Recon-HostDiscovery'] = 'Recon'

dict_7classes['DNS_Spoofing'] = 'Spoofing'
dict_7classes['MITM-ArpSpoofing'] = 'Spoofing'

dict_7classes['BenignTraffic'] = 'Benign'

dict_7classes['BrowserHijacking'] = 'Web'
dict_7classes['Backdoor_Malware'] = 'Web'
dict_7classes['XSS'] = 'Web'
dict_7classes['Uploading_Attack'] = 'Web'
dict_7classes['SqlInjection'] = 'Web'
dict_7classes['CommandInjection'] = 'Web'


dict_7classes['DictionaryBruteForce'] = 'BruteForce'

#### Save of classification prediction

In [19]:
# Créez un dictionnaire pour stocker les prédictions classées par catégorie
classified_predictions = {category: [] for category in dict_7classes.values()}


model_name = ML_neam

# Itérez sur les prédictions et les lignes correspondantes
for index, prediction in enumerate(y_pred):
    # Obtenez la catégorie correspondant à la prédiction
    category = dict_7classes.get(prediction, None)
    
    if category:
        # Obtenez le vrai label correspondant à l'index actuel depuis y_test
        true_label = y_test[index]
        
        # Ajoutez la prédiction, le modèle, le vrai label et l'index de ligne associés à la catégorie correspondante
        classified_predictions[category].append({
            'model_name': model_name,
            'prediction': prediction,
            'true_label': true_label,
            'row_index': index
        })

print(classified_predictions['Mirai'][:5])

# # Chargement des modèles spécifiques par catégorie (à adapter selon votre structure de sauvegarde)
# def load_models_by_category():
#     models_by_category = {}
#     for category in dict_7classes.values():
#         # Chargez le modèle spécifique à la catégorie depuis le fichier local
#         model_filename = f".\Models\model_{category}.pkl"
#         model = joblib.load(model_filename)
#         models_by_category[category] = model
#     return models_by_category

# # Fonction pour prédire une ligne avec le modèle spécifique à la catégorie
# def predict_with_category_model(row, category, models_by_category):
#     if category in models_by_category:
#         model = models_by_category[category]
#         # Prédisez la ligne avec le modèle spécifique à la catégorie
#         prediction = model.predict([row])  # Assurez-vous que "row" est un tableau 2D avec les caractéristiques attendues
#         return prediction[0]  # Retournez la prédiction (assumant que le modèle renvoie un tableau de prédictions)
#     else:
#         return None

# # Chargez les modèles spécifiques par catégorie
# models_by_category = load_models_by_category()

# # Exemple d'utilisation pour prédire une ligne dans une catégorie donnée
# row_to_predict = [0.1, 0.2, 0.3, 0.4]  # Remplacez par les caractéristiques de la ligne à prédire
# category_to_predict = 'DDoS'  # Remplacez par la catégorie souhaitée
# prediction = predict_with_category_model(row_to_predict, category_to_predict, models_by_category)
# if prediction is not None:
#     print(f"Prediction for category '{category_to_predict}': {prediction}")
# else:
#     print(f"No model found for category '{category_to_predict}'")


[{'model_name': 'DecisionTreeClassifier', 'prediction': 'Mirai-udpplain', 'true_label': 'Mirai-udpplain', 'row_index': 4}, {'model_name': 'DecisionTreeClassifier', 'prediction': 'Mirai-udpplain', 'true_label': 'Mirai-udpplain', 'row_index': 10}, {'model_name': 'DecisionTreeClassifier', 'prediction': 'Mirai-greip_flood', 'true_label': 'Mirai-greip_flood', 'row_index': 19}, {'model_name': 'DecisionTreeClassifier', 'prediction': 'Mirai-greip_flood', 'true_label': 'Mirai-greip_flood', 'row_index': 22}, {'model_name': 'DecisionTreeClassifier', 'prediction': 'Mirai-greeth_flood', 'true_label': 'Mirai-greeth_flood', 'row_index': 44}]


#### Some tests

In [None]:
def classify_data(data, model, X_column, dict_7classes):
    # Appliquer les prédictions du modèle aux données
    predictions = model.predict(data[X_column])
    
    # Créer un dictionnaire pour stocker les lignes classées
    classified_data = {category: [] for category in dict_7classes.values()}
    
    # Itérer à travers les données et les prédictions
    for index, row in data.iterrows():
        prediction = predictions[index]
        # Obtenir la catégorie correspondant à la prédiction
        category = dict_7classes.get(prediction, None)
        if category:
            # Ajouter la ligne aux données classées correspondantes
            classified_data[category].append(row)
    
    return classified_data

# Utilisation de la fonction pour classer les données
classified_data = classify_data(data, model, X_column, dict_7classes)


In [7]:
gc.collect()

2203