# In this file, we will do a federated learning approach to create a model that can predict the cyber attacks.

### Table of Contents :
- Hyperparameters approach
- Bagging approach
- Voting approach
- Learned parameters approach

## Import modules

In [16]:
# Libraries
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier, Perceptron, RidgeClassifierCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
import joblib
import gc
from collections import Counter
import ast
import re

import functions_ml as fml

# Path to datasets
dataset_directory = ".\Files\\"
hyperparameters_directory = ".\Federated_learning\\Hyperparameters_approach\\"
voting_directory = ".\Federated_learning\\Voting_approach\\"
bagging_directory = ".\Federated_learning\\Bagging_approach\\"

## Import datasets

In [7]:
# Get all the datasets in the directory and sort them
df_sets = [k for k in os.listdir(dataset_directory) if k.endswith('.csv')]
df_sets.sort()

# Only use a part of all datasets (10 files)
# df_sets = df_sets[:2]

# Set 80% of the datasets as models sets and 20% as test sets for federated model
df_sets = df_sets[:int(len(df_sets)*.8)]
federated_sets = df_sets[int(len(df_sets)*.8):]

# Define each column of the dataset and the target column
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count', 
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight', 
]
y_column = 'label'

## Hyperparameters approach

### Build sub-models and search for best hyperparameters

In [8]:
sub_models_datasets, final_models_datasets = fml.get_train_and_test_files()
print(len(sub_models_datasets), len(final_models_datasets))

X_columns = fml.x_columns(fml.read_csv_file(sub_models_datasets[0]))
y_column = 'label'

135 34


In [9]:
# sub_models_datasets = sub_models_datasets[:1]

In [10]:
niter = 50
ncv = 8

# Define performance dataframe
df_performance = pd.DataFrame(columns=['file', 'model', 'accuracy', 'precision', 'recall', 'f1', 'params'])

for file in tqdm(sub_models_datasets):
    # Chargement des données
    df = fml.read_csv_file(file, dataset_directory)
    X = df.drop(columns=y_column)
    y = df[y_column]

    # Division en ensembles d'entraînement et de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Création du modèle et recherche par grille
    model = DecisionTreeClassifier(random_state=42)
    search_spaces = {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 10, 100, 1000],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt', 'log2', None],
        'max_leaf_nodes': [None, 10, 100, 1000],
        'random_state': [42],
    }
    search = BayesSearchCV(model, search_spaces, n_iter=niter, cv=ncv, n_jobs=-1, random_state=42)
    search.fit(X_train, y_train)
    best_params = search.best_params_

    model = DecisionTreeClassifier(**best_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # conf_matrix = confusion_matrix(y_test, y_pred)
    accuracy=accuracy_score(y_test, y_pred)
    # class_report = classification_report(y_test, y_pred, output_dict=True)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Ajout des résultats au dataframe
    df_performance = df_performance.append({'file': file, 'model': 'DecisionTreeClassifier', 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'params': best_params}, ignore_index=True)

    # Sauvegarde du modèle
    joblib.dump(model, f'{hyperparameters_directory}DT_{file[:-4]}.joblib')

    # Libération de la mémoire
    del model, search, best_params, y_pred, accuracy, precision, recall, f1

    # Garbage collector
    gc.collect()

# Sauvegarde du dataframe
df_performance.to_csv(f'{hyperparameters_directory}DT_performance.csv', index=False)

100%|██████████| 135/135 [8:15:41<00:00, 220.30s/it]  


### Build main model with the most common hyperparameters of sub-models

In [12]:
# Most common hyperparameters
# Load performance dataframe
df_performance = pd.read_csv(f'{hyperparameters_directory}DT_performance.csv')

def parse_hyperparameters(hyperparam_str):
    # Remove 'OrderedDict' and surrounding brackets
    clean_str = hyperparam_str.replace('OrderedDict([', '').replace('])', '')

    # Initialize an empty dictionary to store hyperparameters
    hyperparams = {}

    # Regular expression pattern to match key-value pairs
    pattern = re.compile(r"\('([^']*)', (.*?)\)")
    matches = pattern.findall(clean_str)

    for key, value in matches:
        # Strip quotes and leading/trailing whitespace from key and value
        key = key.strip()
        value = value.strip().strip("'")

        # Handle None, numeric, and other string values
        if value == 'None':
            value = None
        elif value.isdigit():
            value = int(value)
        elif value.replace('.', '', 1).isdigit():
            value = float(value)

        hyperparams[key] = value

    return hyperparams

# Apply the parsing function to each row in the 'params' column
df_performance['params'] = df_performance['params'].apply(parse_hyperparameters)

# Determine the most common hyperparameters
common_hyperparams = {}
for param in df_performance['params'].iloc[0].keys():
    values = df_performance['params'].apply(lambda x: x[param])
    most_common = Counter(values).most_common(1)[0][0]
    common_hyperparams[param] = most_common

print(common_hyperparams)

# Ensuring max_depth is an integer
if 'max_depth' in common_hyperparams and common_hyperparams['max_depth'] is not None:
    common_hyperparams['max_depth'] = int(common_hyperparams['max_depth'])

# Ensuring max_features is an integer
if 'max_features' in common_hyperparams and common_hyperparams['max_features'] is not None:
    common_hyperparams['max_features'] = int(common_hyperparams['max_features'])

# Ensuring max_leaf_nodes is an integer
if 'max_leaf_nodes' in common_hyperparams and common_hyperparams['max_leaf_nodes'] is not None:
    common_hyperparams['max_leaf_nodes'] = int(common_hyperparams['max_leaf_nodes'])

# Main performance dataframe
model = DecisionTreeClassifier(**common_hyperparams)

# If the main performance dataframe already exists, load it
if os.path.exists(f'{hyperparameters_directory}DT_main_performance.csv'):
    df_main_performance = pd.read_csv(f'{hyperparameters_directory}DT_main_performance.csv')
else:
    df_main_performance = pd.DataFrame(columns=['model', 'nb_sub_files', 'accuracy', 'precision', 'recall', 'f1'])

# Get training and test datasets
train_set, test_set = final_models_datasets[:int(len(final_models_datasets)*.8)], final_models_datasets[int(len(final_models_datasets)*.8):]
print(len(train_set), len(test_set))

for set in tqdm(train_set):
    # Load dataset
    df = fml.read_csv_file(set, dataset_directory)
    X = df.drop(columns=y_column)
    y = df[y_column]

    # Train the model
    model.fit(X, y)

# Test the model
y_true = []
y_pred = []
for set in tqdm(test_set):
    # Load dataset
    df = fml.read_csv_file(set, dataset_directory)
    X = df.drop(columns=y_column)
    y = df[y_column]

    # Predict labels
    y_true.extend(y)
    y_pred.extend(model.predict(X))

# Compute performance metrics
accuracy, precision, recall, f1 = accuracy_score(y_true, y_pred), precision_score(y_true, y_pred, average='macro'), recall_score(y_true, y_pred, average='macro'), f1_score(y_true, y_pred, average='macro')

# Save performance and model
df_main_performance = df_main_performance.append({'model': model, 'nb_sub_files': len(sub_models_datasets), 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}, ignore_index=True)
joblib.dump(model, f'{hyperparameters_directory}DT_main.joblib')
df_main_performance.to_csv(f'{hyperparameters_directory}DT_main_performance.csv', index=False)

{'criterion': 'entropy', 'max_depth': 100.0, 'max_features': None, 'max_leaf_nodes': 1000.0, 'min_samples_leaf': 4, 'min_samples_split': 10, 'random_state': 42, 'splitter': 'best'}
27 7


100%|██████████| 27/27 [02:47<00:00,  6.20s/it]
100%|██████████| 7/7 [00:09<00:00,  1.29s/it]


In [13]:
pd.read_csv(f'{hyperparameters_directory}DT_main_performance.csv').head()

Unnamed: 0,model,nb_sub_files,accuracy,precision,recall,f1
0,"DecisionTreeClassifier(max_depth=1000, max_lea...",10,0.992416,0.856889,0.809058,0.825847
1,"DecisionTreeClassifier(criterion='entropy', ma...",135,0.992448,0.822656,0.796229,0.806956


## Voting approach

### Voting approach with hyperparameted sub-models

In [14]:
sub_models_datasets, final_models_datasets = fml.get_train_and_test_files()
print(len(sub_models_datasets), len(final_models_datasets))

X_columns = fml.x_columns(fml.read_csv_file(sub_models_datasets[0]))
y_column = 'label'

135 34


In [None]:
# sub_models_datasets = sub_models_datasets[:1]

In [17]:
# Load all sub models from the directory
sub_models = []
try:
    for file in tqdm(sub_models_datasets):
        sub_models.append(joblib.load(f'{hyperparameters_directory}DT_{file[:-4]}.joblib'))
except:
    raise Exception('Error : no sub models found, please run hyperparameters approach first')

# Build the federated model
models = []
models.append({'model' : VotingClassifier(estimators=[('model_'+str(i), sub_models[i]) for i in range(len(sub_models))], voting='hard'), 'nb_sub_files' : len(sub_models), 'type' : 'hard'})
models.append({'model' : VotingClassifier(estimators=[('model_'+str(i), sub_models[i]) for i in range(len(sub_models))], voting='soft'), 'nb_sub_files' : len(sub_models), 'type' : 'soft'})

# If the main performance dataframe already exists, load it
if os.path.exists(f'{voting_directory}DT_main_performance.csv'):
    df_main_performance = pd.read_csv(f'{voting_directory}DT_main_performance.csv')
else:
    df_main_performance = pd.DataFrame(columns=['model', 'nb_sub_files', 'accuracy', 'precision', 'recall', 'f1'])

# Get training and test datasets
train_set, test_set = final_models_datasets[:int(len(final_models_datasets)*.8)], final_models_datasets[int(len(final_models_datasets)*.8):]
print(len(train_set), len(test_set))

# For each model
for model in tqdm(models):
    # Train the model
    for set in tqdm(train_set):
        # Load dataset
        df = fml.read_csv_file(set, dataset_directory)
        X = df.drop(columns=y_column)
        y = df[y_column]

        # Train the model
        model['model'].fit(X, y)

    # Test the model
    y_true = []
    y_pred = []
    for set in tqdm(test_set):
        # Load dataset
        df = fml.read_csv_file(set, dataset_directory)
        X = df.drop(columns=y_column)
        y = df[y_column]

        # Predict labels
        y_true.extend(y)
        y_pred.extend(model['model'].predict(X))

    # Compute performance metrics
    accuracy, precision, recall, f1 = accuracy_score(y_true, y_pred), precision_score(y_true, y_pred, average='macro'), recall_score(y_true, y_pred, average='macro'), f1_score(y_true, y_pred, average='macro')

    # Save performance and model
    df_main_performance = df_main_performance.append({'model': model['model'], 'nb_sub_files': model['nb_sub_files'], 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'type' : model['type']}, ignore_index=True)
    joblib.dump(model['model'], f'{voting_directory}DT_main_{model["type"]}.joblib')
    df_main_performance.to_csv(f'{voting_directory}DT_main_performance.csv', index=False)

100%|██████████| 135/135 [00:00<00:00, 1299.99it/s]


27 7


100%|██████████| 27/27 [4:23:19<00:00, 585.16s/it]
100%|██████████| 7/7 [01:57<00:00, 16.72s/it]
  4%|▎         | 1/27 [13:39<5:55:12, 819.70s/it]it]
 50%|█████     | 1/2 [4:39:43<4:39:43, 16783.09s/it]


KeyboardInterrupt: 

## Other

##### Classification models for each files

In [None]:
# Define the results
results_list = [] 

# Define models list
models = []

# Define the scaler method
scaler = StandardScaler()

# For each dataset
for set in tqdm(df_sets):
    # Definition of the model to use
    ML_model = DecisionTreeClassifier()

    # Define the name of the model
    ML_neam = "DecisionTreeClassifier"

    # Load the dataset
    d = pd.read_csv(dataset_directory + set)

    # Séparez les caractéristiques (X) et les étiquettes (y)
    X = d.drop(columns=y_column)
    y = d[y_column]

    # Divisez les données en ensembles de formation (80%) et de test (20%)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler to the training data
    scaler.fit(X_train)

    # Normalize the dataset
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    # print(X_train[:5])
    # print(y_train.head())

    # Train the model
    ML_model.fit(X_train, y_train)

    # Delete the dataset from the memory
    del d  

    # Save the model and add it to the list of models
    temp_model = '.\Models\FEDERATED\\' + set + '.joblib'
    joblib.dump(ML_model, temp_model)
    models.append(temp_model)


    # Predict the labels
    preds = ML_model.predict(X_test)

    # Results of the model
    result = {
        'Model': set,
        'Accuracy': accuracy_score(y_test, preds),
        'Precision': precision_score(y_test, preds, average='weighted'),
        'Recall': recall_score(y_test, preds, average='weighted'),
        'F1': f1_score(y_test, preds, average='weighted')
    }

    # Append the result to the temporary list
    results_list.append(result)

# Create a DataFrame from the list of results
results = pd.DataFrame(results_list)

# Do the average of all the results
final_result = {
    'Model': 'Average',
    'Accuracy': results['Accuracy'].mean(),
    'Precision': results['Precision'].mean(),
    'Recall': results['Recall'].mean(),
    'F1': results['F1'].mean()
}

print(final_result)

100%|██████████| 135/135 [08:07<00:00,  3.61s/it]

{'Model': 'Average', 'Accuracy': 0.9919186996812641, 'Precision': 0.992022464355332, 'Recall': 0.9919186996812641, 'F1': 0.991936740398441}





#### Main model

In [None]:
def create_final_model(models):
  """
  Crée un modèle final qui prend les paramètres moyens de tous les autres modèles.

  Args:
    models: Une liste de modèles fédérés.

  Returns:
    Un nouveau modèle avec les paramètres moyens.
  """

  # Obtenir les paramètres de tous les modèles et faire la moyenne
  params = {}
  for model in models:
    params[model] = model.get_params()

  for key in params[models[0]].keys():
    params[models[0]][key] = np.mean([params[model][key] for model in models])

  # Créer un nouveau modèle avec les paramètres moyens
  final_model = models[0].__class__(**params[models[0]])

  # Nouveau modèle
  return final_model

# Charger les modèles fédérés
temp_models = []
for model_path in models:
  model = joblib.load(model_path)
  temp_models.append(model)

train_sets = federated_sets[:int(len(federated_sets)*.8)]
test_sets = federated_sets[int(len(federated_sets)*.8):]
print('train_sets: ', len(train_sets))
print('test_sets: ', len(test_sets))

# Créer le modèle final
final_model = create_final_model(temp_models)

# Train the final model
# Define the scaler method
scaler = StandardScaler()

# For each training set
print("Training the final model...")
for train_set in (train_sets):
    # Fit the scaler on the training sets
    scaler.fit(pd.read_csv(dataset_directory + train_set)[X_columns])

# For each dataset of the training set
for train_set in tqdm(train_sets):
    # Load the dataset
    d = pd.read_csv(dataset_directory + train_set)

    # Normalize the dataset
    d[X_columns] = scaler.transform(d[X_columns])

    # Train the model
    final_model.fit(d[X_columns], d[y_column])

    # Delete the dataset from the memory
    del d  

# Test the final model
# Initialize the list of true labels
y_test = []

# Initialize the list of predictions
y_pred = []

# For each dataset of the test set
print("Testing the final model...")
for test_set in (test_sets):
    # Load the dataset
    d_test_current = pd.read_csv(dataset_directory + test_set)

    # Normalize the dataset
    d_test_current[X_columns] = scaler.transform(d_test_current[X_columns])

    # Add the true labels to the list
    y_test += list(d_test_current[y_column].values)

    # Predict the labels
    y_pred += list(final_model.predict(d_test_current[X_columns]))

    # Delete the dataset from the memory
    del d_test_current

# For each prediction
print("Prediction results:")

print('y_pred: ', len(y_pred))
print('y_test: ', len(y_test))
print(f"##### {final_model} (34 classes) #####")
print('accuracy_score: ', accuracy_score(y_pred, y_test))
print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
print()


# # Sauvegarder le modèle central fédéré
# joblib.dump(central_federated_model, '.\\Models\\FEDERATED\\central_federated_model.joblib')


train_sets:  21
test_sets:  6
Training the final model...


100%|██████████| 21/21 [01:18<00:00,  3.75s/it]


Testing the final model...
Prediction results:
y_pred:  1654928
y_test:  1654928
##### DecisionTreeClassifier() (34 classes) #####
accuracy_score:  0.9921404435721675
recall_score:  0.8312569709662008
precision_score:  0.8174385984988672
f1_score:  0.8228106995305607

