In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import time
import warnings
from joblib import Parallel, delayed
import tracemalloc


In [2]:


# Convert a memory size into a readable string with an appropriate unit
def format_memory_usage(mem_usage):
    """Format a byte count as a two-decimal string in GB or MB.

    Args:
        mem_usage: Memory size in bytes.

    Returns:
        ``"<value> GB"`` when the size is at least 1024 ** 3 bytes, otherwise
        ``"<value> MB"`` (sizes below one megabyte are still shown in MB).
    """
    bytes_per_gb = 1024 ** 3
    bytes_per_mb = 1024 ** 2

    if mem_usage >= bytes_per_gb:
        divisor, unit = bytes_per_gb, "GB"
    else:
        divisor, unit = bytes_per_mb, "MB"
    return f"{mem_usage / divisor:.2f} {unit}"

# Run a callable while tracking its peak memory allocation
def monitor_memory_usage(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` under ``tracemalloc`` and report the peak.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, peak)`` tuple: the return value of ``func`` and the peak
        traced memory in bytes as reported by ``tracemalloc.get_traced_memory()``.

    Raises:
        Whatever ``func`` raises; tracing started here is stopped first.
    """
    # Only start (and later stop) tracing if nobody else is already tracing,
    # so an outer measurement is not silently terminated by this helper.
    # NOTE: when tracing was already active, the reported peak may include
    # allocations made before this call.
    started_here = not tracemalloc.is_tracing()
    if started_here:
        tracemalloc.start()
    try:
        result = func(*args, **kwargs)
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Guarantee tracing is switched off even when func raises.
        if started_here:
            tracemalloc.stop()
    return result, peak

# Train and evaluate one fold while recording time and peak memory
def train_and_evaluate(train_idx, test_idx, X, y, n_estimators):
    """Fit a random forest on one split and evaluate it on the held-out part.

    Args:
        train_idx: Indices selecting the training rows of ``X`` / ``y``.
        test_idx: Indices selecting the test rows of ``X`` / ``y``.
        X: Feature container indexed as ``X[train_idx]`` (e.g. a NumPy array).
        y: Target container indexed as ``y[train_idx]``.
        n_estimators: Number of trees passed to ``RandomForestClassifier``.

    Returns:
        A 6-tuple ``(report, training_time, training_mem_usage,
        evaluating_time, eval_mem_usage, total_mem_usage)`` where ``report`` is
        the ``classification_report`` dictionary, the times are in seconds, the
        memory figures are the peaks returned by ``monitor_memory_usage`` and
        ``total_mem_usage`` is the sum of the training and evaluation peaks.

    Note:
        The timed sections run inside ``monitor_memory_usage``, so the reported
        durations include the overhead of memory tracing.
    """
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)

    # perf_counter is monotonic and high-resolution, unlike the wall clock
    # returned by time.time(), which can jump when the system clock is adjusted.
    start_train = time.perf_counter()
    _, training_mem_usage = monitor_memory_usage(model.fit, X_train, y_train)
    training_time = time.perf_counter() - start_train

    start_eval = time.perf_counter()
    y_pred, eval_mem_usage = monitor_memory_usage(model.predict, X_test)
    evaluating_time = time.perf_counter() - start_eval

    report = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)

    # Sum of the two separately measured peaks (not a single overall peak)
    total_mem_usage = training_mem_usage + eval_mem_usage

    return report, training_time, training_mem_usage, evaluating_time, eval_mem_usage, total_mem_usage


In [3]:

# Dataset location and the per-attack CSV files that are merged into one frame
DATA_PATH = "/users/mik2002/som/dataset-processed CIC-IDS-2018/"
FILES = [
    "processed_bot_dataset.csv",
    "processed_dos_dataset.csv",
    "processed_Ddos_dataset.csv",
    "processed_bruteforce_dataset.csv",
]

# Read every CSV and stack them row-wise with a fresh 0..n-1 index
df_dataset = pd.concat(
    [pd.read_csv(f"{DATA_PATH}{csv_name}") for csv_name in FILES],
    ignore_index=True,
)

# Absolute number of rows per class label
display(df_dataset["Label"].value_counts())
print("Distribution:")


Label
Benign        4961029
DDos           576191
Dos            507715
Bot            282310
Bruteforce     156668
Name: count, dtype: int64

Distribution:


In [4]:
# Relative frequency of each class label
display(df_dataset.loc[:, "Label"].value_counts(normalize=True))

# Random Forest parameters
n_estimators = 100
sampling_fractions = [0.01, 0.03]  # sampling fractions to evaluate
results = list()


Label
Benign        0.765129
DDos          0.088865
Dos           0.078304
Bot           0.043540
Bruteforce    0.024163
Name: proportion, dtype: float64

In [5]:

# Wrapper function used as the unit of work for the parallel run
def parallel_train_and_evaluate(train_idx, test_idx, data, target, n_estimators):
    """Run ``train_and_evaluate`` for one fold and pass its six results through.

    Returns:
        The tuple ``(report, training_time, training_mem_usage,
        evaluating_time, eval_mem_usage, total_mem_usage)`` exactly as produced
        by ``train_and_evaluate``.
    """
    fold_outcome = train_and_evaluate(train_idx, test_idx, data, target, n_estimators)
    # Unpacking keeps the "exactly six values" contract explicit
    fold_report, fit_seconds, fit_peak, predict_seconds, predict_peak, combined_peak = fold_outcome
    return (fold_report, fit_seconds, fit_peak, predict_seconds, predict_peak, combined_peak)

def display_report_and_times(all_reports, all_training_times, all_training_mem_usages, all_evaluating_times, all_eval_mem_usages, all_total_mem_usages):
    """Print the classification report, timings and memory usage of every fold.

    The six arguments are parallel sequences with one entry per fold; they are
    walked together with ``zip``, so iteration stops at the shortest one.
    Report entries that are dictionaries are printed metric by metric under a
    ``Label`` heading; any other entry is printed as a single number.
    """
    print("\nAll Reports and Times for this Model:")

    per_fold = zip(
        all_reports,
        all_training_times,
        all_training_mem_usages,
        all_evaluating_times,
        all_eval_mem_usages,
        all_total_mem_usages,
    )
    for fold_number, fold_values in enumerate(per_fold, start=1):
        fold_report, fit_seconds, fit_peak, predict_seconds, predict_peak, combined_peak = fold_values

        print(f"\nReport for Fold {fold_number}:")
        for label, metrics in fold_report.items():
            if not isinstance(metrics, dict):
                # Scalar entry: print it on its own line
                print(f"\n{label.capitalize()}: {metrics:.4f}")
                continue
            print(f"\nLabel {label}:")
            for metric_name, metric_value in metrics.items():
                print(f"  {metric_name.capitalize()}: {metric_value:.4f}")

        print(f"\nTraining Time: {fit_seconds:.4f} seconds")
        print(f"Training Memory Usage: {format_memory_usage(fit_peak)}")
        print(f"Evaluating Time: {predict_seconds:.4f} seconds")
        print(f"Evaluating Memory Usage: {format_memory_usage(predict_peak)}")
        print(f"Total Memory Usage: {format_memory_usage(combined_peak)}")

# Initialise K-Fold cross-validation (5 shuffled splits, fixed seed)
# NOTE(review): KFold does not preserve class proportions per fold; with the
# class imbalance in this dataset a stratified splitter may be preferable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Integer encoding of the class labels; independent of the sampling fraction,
# so it is defined once outside the loop.
label_to_id = {"Benign": 0, "DDos": 1, "Dos": 2, "Bot": 3, "Bruteforce": 4}

# Start from an empty list so that re-running this cell does not append
# duplicate entries to the results of a previous run.
results.clear()

# Loop over the different sampling fractions
for frac in sampling_fractions:
    print(f"Evaluating for sampling fraction: {frac}")

    # Stratified random sampling: draw the same fraction from every class.
    # Sampling each group explicitly keeps the "Label" column in the result
    # without relying on groupby.apply operating on the grouping column
    # (deprecated in pandas, which is why the original silenced a warning).
    # Each group is sampled with the same seed, as before.
    sampled_df = pd.concat(
        [
            class_rows.sample(frac=frac, random_state=42)
            for _, class_rows in df_dataset.groupby("Label")
        ]
    )
    sampled_df["Label"] = sampled_df["Label"].map(label_to_id)
    if sampled_df["Label"].isna().any():
        # .map() turns unknown labels into NaN; fail early with a clear message
        raise ValueError("Found labels without an integer encoding in label_to_id")

    # NOTE(review): the scaler is fitted on the whole sample before the folds
    # are split, so test-fold minima/maxima influence the training features.
    scaler = MinMaxScaler()
    # All columns except the last are features; the last column is the target
    data = scaler.fit_transform(sampled_df.iloc[:, :-1])
    target = sampled_df.iloc[:, -1].values

    # Evaluate every K-Fold split, one joblib task per fold
    parallel_results = Parallel(n_jobs=-1)(
        delayed(parallel_train_and_evaluate)(train_idx, test_idx, data, target, n_estimators)
        for train_idx, test_idx in kf.split(data)
    )

    # Transpose the list of per-fold tuples into one tuple per measurement
    reports, training_times, training_mem_usages, evaluating_times, eval_mem_usages, total_mem_usages = zip(*parallel_results)

    # Collect per-class and averaged metrics from the reports ("support" is skipped)
    class_metrics = {}
    for report in reports:
        for key, metrics in report.items():
            if not (key.isdigit() or key in ("macro avg", "weighted avg")):
                continue
            per_key = class_metrics.setdefault(key, {})
            for metric, value in metrics.items():
                if metric != "support":
                    per_key.setdefault(metric, []).append(value)

    # Mean of every metric across the folds
    averages = {
        key: {metric: np.mean(values) for metric, values in metrics.items()}
        for key, metrics in class_metrics.items()
    }

    # Mean training/evaluation times and memory usage across the folds
    avg_training_time = np.mean(training_times)
    avg_training_mem_usage = np.mean(training_mem_usages)
    avg_evaluating_time = np.mean(evaluating_times)
    avg_eval_mem_usage = np.mean(eval_mem_usages)
    avg_total_mem_usage = np.mean(total_mem_usages)

    results.append((frac, averages, avg_training_time, avg_training_mem_usage, avg_evaluating_time, avg_eval_mem_usage, avg_total_mem_usage, reports, training_times, training_mem_usages, evaluating_times, eval_mem_usages, total_mem_usages))

# Show the results
print("\nAverage Metrics Across All Folds:")
for result in results:
    # Entry layout: fraction, averaged metrics, five averaged measurements,
    # then six per-fold sequences.
    frac, metrics = result[0], result[1]
    (avg_training_time, avg_training_mem_usage, avg_evaluating_time,
     avg_eval_mem_usage, avg_total_mem_usage) = result[2:7]
    (all_reports, all_training_times, all_training_mem_usages,
     all_evaluating_times, all_eval_mem_usages, all_total_mem_usages) = result[7:]

    print("****************************")
    print("\nSampling Fraction: {}".format(frac))
    for class_id, class_metrics in metrics.items():
        print("\nClass {}:".format(class_id))
        for metric, value in class_metrics.items():
            print("{}: {:.4f}".format(metric, value))

    # Show the average training/evaluation times (split into minutes and
    # seconds) and the average memory usage
    minutes_train, seconds_train = avg_training_time // 60, avg_training_time % 60
    minutes_eval, seconds_eval = avg_evaluating_time // 60, avg_evaluating_time % 60
    print("Average Training Time: {} minutes and {:.2f} seconds".format(int(minutes_train), seconds_train))
    print("Average Training Memory Usage: {}".format(format_memory_usage(avg_training_mem_usage)))
    print("Average Evaluating Time: {} minutes and {:.2f} seconds".format(int(minutes_eval), seconds_eval))
    print("Average Evaluating Memory Usage: {}".format(format_memory_usage(avg_eval_mem_usage)))
    print("Average Total Memory Usage: {}".format(format_memory_usage(avg_total_mem_usage)))
    print("---------------------------------------")
    display_report_and_times(all_reports, all_training_times, all_training_mem_usages, all_evaluating_times, all_eval_mem_usages, all_total_mem_usages)


Evaluating for sampling fraction: 0.01
Evaluating for sampling fraction: 0.03

Average Metrics Across All Folds:
****************************

Sampling Fraction: 0.01

Class 0:
precision: 0.9991
recall: 0.9988
f1-score: 0.9990

Class 1:
precision: 0.9972
recall: 0.9953
f1-score: 0.9963

Class 2:
precision: 0.9969
recall: 0.9644
f1-score: 0.9804

Class 3:
precision: 0.9880
recall: 0.9954
f1-score: 0.9917

Class 4:
precision: 0.8959
recall: 0.9958
f1-score: 0.9431

Class macro avg:
precision: 0.9754
recall: 0.9899
f1-score: 0.9821

Class weighted avg:
precision: 0.9958
recall: 0.9955
f1-score: 0.9956
Average Training Time: 0 minutes and 8.23 seconds
Average Training Memory Usage: 17.24 MB
Average Evaluating Time: 0 minutes and 0.07 seconds
Average Evaluating Memory Usage: 4.32 MB
Average Total Memory Usage: 21.55 MB
---------------------------------------

All Reports and Times for this Model:

Report for Fold 1:

Label 0:
  Precision: 0.9988
  Recall: 0.9987
  F1-score: 0.9987
  Support