# In this file, we will import several models of specific cyber attack classifier (based on DecisionTreeClassifier, best classfier)

##### Import modules

In [32]:
# Libraries
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier, Perceptron, RidgeClassifierCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import gc

# Path to datasets
DATASET_DIRECTORY = ".\Files\\"

##### Import datasets

In [33]:
# Get all the datasets in the directory and sort them
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()

# Only use a part of all datasets (10 files)
# df_sets = df_sets[:2]

# Set 80% of the datasets as models sets and 20% as test sets for federated model
df_sets = df_sets[:int(len(df_sets)*.8)]
federated_sets = df_sets[int(len(df_sets)*.8):]

# Define each column of the dataset and the target column
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count', 
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight', 
]
y_column = 'label'

##### Classification models for each files

In [34]:
# Define the results
results_list = [] 

# Define models list
models = []

# Define the scaler method
scaler = StandardScaler()

# For each dataset
for set in tqdm(df_sets):
    # Definition of the model to use
    ML_model = DecisionTreeClassifier()

    # Define the name of the model
    ML_neam = "DecisionTreeClassifier"

    # Load the dataset
    d = pd.read_csv(DATASET_DIRECTORY + set)

    # Séparez les caractéristiques (X) et les étiquettes (y)
    X = d.drop(columns=y_column)
    y = d[y_column]

    # Divisez les données en ensembles de formation (80%) et de test (20%)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler to the training data
    scaler.fit(X_train)

    # Normalize the dataset
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    # print(X_train[:5])
    # print(y_train.head())

    # Train the model
    ML_model.fit(X_train, y_train)

    # Delete the dataset from the memory
    del d  

    # Save the model and add it to the list of models
    temp_model = '.\Models\FEDERATED\\' + set + '.joblib'
    joblib.dump(ML_model, temp_model)
    models.append(temp_model)


    # Predict the labels
    preds = ML_model.predict(X_test)

    # Results of the model
    result = {
        'Model': set,
        'Accuracy': accuracy_score(y_test, preds),
        'Precision': precision_score(y_test, preds, average='weighted'),
        'Recall': recall_score(y_test, preds, average='weighted'),
        'F1': f1_score(y_test, preds, average='weighted')
    }

    # Append the result to the temporary list
    results_list.append(result)

# Create a DataFrame from the list of results
results = pd.DataFrame(results_list)

# Do the average of all the results
final_result = {
    'Model': 'Average',
    'Accuracy': results['Accuracy'].mean(),
    'Precision': results['Precision'].mean(),
    'Recall': results['Recall'].mean(),
    'F1': results['F1'].mean()
}

print(final_result)

100%|██████████| 135/135 [08:07<00:00,  3.61s/it]

{'Model': 'Average', 'Accuracy': 0.9919186996812641, 'Precision': 0.992022464355332, 'Recall': 0.9919186996812641, 'F1': 0.991936740398441}





#### Federated model

In [64]:
def create_final_model(models):
  """
  Crée un modèle final qui prend les paramètres moyens de tous les autres modèles.

  Args:
    models: Une liste de modèles fédérés.

  Returns:
    Un nouveau modèle avec les paramètres moyens.
  """

  # Obtenir les paramètres de tous les modèles
  params = {}
  for model in models:
    params.update(model.get_params())

  # Créer un nouveau modèle avec les paramètres moyens
  final_model = DecisionTreeClassifier()
  final_model.set_params(**params)

  # Nouveau modèle
  return final_model

# Charger les modèles fédérés
temp_models = []
for model_path in models:
  model = joblib.load(model_path)
  temp_models.append(model)

train_sets = federated_sets[:int(len(federated_sets)*.8)]
test_sets = federated_sets[int(len(federated_sets)*.8):]
print('train_sets: ', len(train_sets))
print('test_sets: ', len(test_sets))

# Créer le modèle final
final_model = create_final_model(temp_models)

# Train the final model
# Define the scaler method
scaler = StandardScaler()

# For each training set
print("Training the final model...")
for train_set in (train_sets):
    # Fit the scaler on the training sets
    scaler.fit(pd.read_csv(DATASET_DIRECTORY + train_set)[X_columns])

# For each dataset of the training set
for train_set in tqdm(train_sets):
    # Load the dataset
    d = pd.read_csv(DATASET_DIRECTORY + train_set)

    # Normalize the dataset
    d[X_columns] = scaler.transform(d[X_columns])

    # Train the model
    final_model.fit(d[X_columns], d[y_column])

    # Delete the dataset from the memory
    del d  

# Test the final model
# Initialize the list of true labels
y_test = []

# Initialize the list of predictions
y_pred = []

# For each dataset of the test set
print("Testing the final model...")
for test_set in (test_sets):
    # Load the dataset
    d_test_current = pd.read_csv(DATASET_DIRECTORY + test_set)

    # Normalize the dataset
    d_test_current[X_columns] = scaler.transform(d_test_current[X_columns])

    # Add the true labels to the list
    y_test += list(d_test_current[y_column].values)

    # Predict the labels
    y_pred += list(final_model.predict(d_test_current[X_columns]))

    # Delete the dataset from the memory
    del d_test_current

# For each prediction
print("Prediction results:")

print('y_pred: ', len(y_pred))
print('y_test: ', len(y_test))
print(f"##### {final_model} (34 classes) #####")
print('accuracy_score: ', accuracy_score(y_pred, y_test))
print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
print()


# # Sauvegarder le modèle central fédéré
# joblib.dump(central_federated_model, '.\\Models\\FEDERATED\\central_federated_model.joblib')


train_sets:  21
test_sets:  6
Training the final model...


100%|██████████| 21/21 [01:18<00:00,  3.75s/it]


Testing the final model...
Prediction results:
y_pred:  1654928
y_test:  1654928
##### DecisionTreeClassifier() (34 classes) #####
accuracy_score:  0.9921404435721675
recall_score:  0.8312569709662008
precision_score:  0.8174385984988672
f1_score:  0.8228106995305607

