In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from minisom import MiniSom
import numpy as np
import time
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import warnings
import tracemalloc
from collections import Counter

In [2]:

# Convert a memory size in bytes into a readable string with a suitable unit
def format_memory_usage(mem_usage):
    """Format a byte count as text with two decimals.

    Sizes of at least 1024**3 bytes are expressed in "GB"; every smaller
    size (including values below one megabyte) is expressed in "MB".
    """
    gigabyte = 1024 ** 3
    megabyte = 1024 ** 2
    divisor, unit = (gigabyte, "GB") if mem_usage >= gigabyte else (megabyte, "MB")
    return f"{mem_usage / divisor:.2f} {unit}"

# Classify samples with a SOM using the labels attached to their winning neurons
def classify2(som, data, training_data, training_labels):
    """Predict a label for every sample in ``data``.

    ``som.labels_map(training_data, training_labels)`` supplies, for each
    neuron that won at least one training sample, a counter of the labels it
    received. A sample is assigned the most frequent label of its winning
    neuron (``som.winner(sample)``). When that neuron has no entry in the
    map, the most frequent label of ``training_labels`` is used instead.

    Returns a list with one predicted label per sample, in input order.
    """
    neuron_labels = som.labels_map(training_data, training_labels)
    fallback_label = Counter(training_labels).most_common(1)[0][0]

    def predict_one(sample):
        best_unit = som.winner(sample)
        if best_unit not in neuron_labels:
            return fallback_label
        return neuron_labels[best_unit].most_common(1)[0][0]

    return [predict_one(sample) for sample in data]

# Run a callable while tracing the peak memory allocated during the call
def monitor_memory_usage(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` under ``tracemalloc`` tracing.

    Parameters:
        func: the callable to execute.
        *args, **kwargs: forwarded unchanged to ``func``.

    Returns:
        A ``(result, peak)`` tuple: the value returned by ``func`` and the
        peak size reported by ``tracemalloc.get_traced_memory()`` (bytes).

    Raises:
        Whatever ``func`` raises. Tracing is stopped in that case too, so a
        failing call does not leave ``tracemalloc`` enabled afterwards.
    """
    tracemalloc.start()
    try:
        result = func(*args, **kwargs)
        # Only the peak is reported; the current size is not needed.
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Always switch tracing off again, even if func raised.
        tracemalloc.stop()
    return result, peak

# Train and evaluate a SOM on one fold, recording time and peak memory
def train_and_evaluate(train_idx, test_idx, X, y, som_shape, sigma, learning_rate, random_seed=None):
    """Train a MiniSom on the training split and score it on the test split.

    Parameters:
        train_idx, test_idx: index arrays used to select rows of ``X`` / ``y``.
        X: feature array; ``X.shape[1]`` of the training split is the SOM input length.
        y: label array aligned with ``X``.
        som_shape: pair whose two items are the SOM grid dimensions.
        sigma, learning_rate: forwarded to the ``MiniSom`` constructor.
        random_seed: optional seed forwarded to ``MiniSom``; the default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        ``(report, training_time, training_mem_usage, evaluating_time,
        eval_mem_usage, total_mem_usage)`` where ``report`` is the dict from
        ``classification_report(..., output_dict=True)``, times are seconds
        and memory figures are peak bytes from ``monitor_memory_usage``.
        ``total_mem_usage`` is the sum of the two separate peaks.
    """
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    som = MiniSom(
        som_shape[0],
        som_shape[1],
        X_train.shape[1],
        sigma=sigma,
        learning_rate=learning_rate,
        random_seed=random_seed,
    )

    # perf_counter is monotonic, so the durations cannot be distorted by
    # system clock adjustments. Memory tracing is active while timing, so the
    # measured times include its overhead.
    start_train = time.perf_counter()
    # One training iteration per training sample.
    _, training_mem_usage = monitor_memory_usage(som.train_random, X_train, X_train.shape[0])
    training_time = time.perf_counter() - start_train

    start_eval = time.perf_counter()
    y_pred, eval_mem_usage = monitor_memory_usage(classify2, som, X_test, X_train, y_train)
    evaluating_time = time.perf_counter() - start_eval

    report = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)

    total_mem_usage = training_mem_usage + eval_mem_usage

    return report, training_time, training_mem_usage, evaluating_time, eval_mem_usage, total_mem_usage


In [3]:

# Dataset location and the CSV files that make up the full dataset
DATA_PATH = "/users/mik2002/som/dataset-processed CIC-IDS-2018/"
FILES = [
    "processed_bot_dataset.csv",
    "processed_dos_dataset.csv",
    "processed_Ddos_dataset.csv",
    "processed_bruteforce_dataset.csv",
]

# Read every file and stack them into one frame with a fresh 0..n-1 index
csv_paths = [DATA_PATH + file_name for file_name in FILES]
df_dataset = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

# Show the absolute and the relative class distribution
display(df_dataset["Label"].value_counts())
print("Distribution:")
display(df_dataset["Label"].value_counts(normalize=True))


Label
Benign        4961029
DDos           576191
Dos            507715
Bot            282310
Bruteforce     156668
Name: count, dtype: int64

Distribution:


Label
Benign        0.765129
DDos          0.088865
Dos           0.078304
Bot           0.043540
Bruteforce    0.024163
Name: proportion, dtype: float64

In [4]:
# SOM hyper-parameters and experiment settings
sigma = 1.5
learning_rate = 1.2
som_sizes = [
    (20, 20),
]
# Fractions of every class drawn from the full dataset, one experiment per value
sampling_fractions = [
    0.01,
    0.03,
]
# Collects one summary tuple per (sampling fraction, SOM size) experiment
results = list()


In [5]:

# Adapter used by the parallel K-fold run
def parallel_train_and_evaluate(train_idx, test_idx, som_shape, data, target, sigma, learning_rate):
    """Run ``train_and_evaluate`` for one fold.

    Only the argument order differs: this wrapper takes ``som_shape`` before
    ``data`` / ``target`` and passes them on in the order expected by
    ``train_and_evaluate``. The six-item result tuple (report, training time,
    training memory, evaluation time, evaluation memory, total memory) is
    returned unchanged.
    """
    return train_and_evaluate(
        train_idx,
        test_idx,
        data,
        target,
        som_shape,
        sigma,
        learning_rate,
    )

def display_report_and_times(all_reports, all_training_times, all_training_mem_usages, all_evaluating_times, all_eval_mem_usages, all_total_mem_usages):
    """Print the per-fold classification report, timings and memory usage.

    The six arguments are parallel sequences with one entry per fold. Each
    report is a dict: entries whose value is itself a dict are printed metric
    by metric, any other entry is printed as a single number. Nothing is
    returned.
    """
    print("\nAll Reports and Times for this SOM Shape:")

    per_fold = zip(
        all_reports,
        all_training_times,
        all_training_mem_usages,
        all_evaluating_times,
        all_eval_mem_usages,
        all_total_mem_usages,
    )
    for fold_number, (report, train_time, train_mem, eval_time, eval_mem, total_mem) in enumerate(per_fold, start=1):
        print(f"\nReport for Fold {fold_number}:")
        for label, metrics in report.items():
            if not isinstance(metrics, dict):
                # Scalar entry of the report: print it on a single line.
                print(f"\n{label.capitalize()}: {metrics:.4f}")
                continue
            print(f"\nLabel {label}:")
            for metric_name, metric_value in metrics.items():
                print(f"  {metric_name.capitalize()}: {metric_value:.4f}")
        print(f"\nTraining Time: {train_time:.4f} seconds")
        print(f"Training Memory Usage: {format_memory_usage(train_mem)}")
        print(f"Evaluating Time: {eval_time:.4f} seconds")
        print(f"Evaluating Memory Usage: {format_memory_usage(eval_mem)}")
        print(f"Total Memory Usage: {format_memory_usage(total_mem)}")

# K-fold splitter shared by all experiments; shuffling uses a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Start from an empty list so that re-running this cell does not append a
# second copy of every experiment to results left over from a previous run.
results = []

# Loop over different sampling fractions
for frac in sampling_fractions:
    print(f"Evaluating for sampling fraction: {frac}")

    # Stratified random sampling: draw the same fraction from every class.
    # The groups are sampled explicitly instead of through groupby().apply(),
    # which is deprecated when the applied function operates on the grouping
    # column; this form keeps the "Label" column and needs no warning filter.
    sampled_df = pd.concat(
        [group.sample(frac=frac, random_state=42) for _, group in df_dataset.groupby("Label")]
    )
    sampled_df["Label"] = sampled_df["Label"].map({"Benign": 0, "DDos": 1, "Dos": 2, "Bot": 3, "Bruteforce": 4})

    # Select features and target by column name rather than by position.
    # NOTE(review): the scaler is fitted on the whole sample before the fold
    # split, so the test folds contribute to the min/max used for scaling.
    scaler = MinMaxScaler()
    data = scaler.fit_transform(sampled_df.drop(columns="Label"))
    target = sampled_df["Label"].to_numpy()

    # Loop over different SOM sizes
    for som_shape in som_sizes:
        print(f"Evaluating for som_shape: {som_shape}")
        # One job per fold, using all available workers
        parallel_results = Parallel(n_jobs=-1)(
            delayed(parallel_train_and_evaluate)(train_idx, test_idx, som_shape, data, target, sigma, learning_rate)
            for train_idx, test_idx in kf.split(data)
        )

        # Turn the list of per-fold tuples into one tuple per measured quantity
        reports, training_times, training_mem_usages, evaluating_times, eval_mem_usages, total_mem_usages = zip(*parallel_results)

        # Collect, per class and per average row, the metric values of all folds.
        # Scalar report entries and the 'support' field are left out.
        class_metrics = {}
        for report in reports:
            for key, metrics in report.items():
                if key.isdigit() or key in ['macro avg', 'weighted avg']:
                    if key not in class_metrics:
                        class_metrics[key] = {m: [] for m in metrics.keys() if m != 'support'}
                    for metric, value in metrics.items():
                        if metric != 'support':
                            class_metrics[key][metric].append(value)

        # Average every collected metric over the folds
        averages = {key: {metric: np.mean(values) for metric, values in metrics.items()} for key, metrics in class_metrics.items()}

        # Average training/evaluation times and memory usage over the folds
        avg_training_time = np.mean(training_times)
        avg_training_mem_usage = np.mean(training_mem_usages)
        avg_evaluating_time = np.mean(evaluating_times)
        avg_eval_mem_usage = np.mean(eval_mem_usages)
        avg_total_mem_usage = np.mean(total_mem_usages)

        results.append((frac, som_shape, averages, avg_training_time, avg_training_mem_usage, avg_evaluating_time, avg_eval_mem_usage, avg_total_mem_usage, reports, training_times, training_mem_usages, evaluating_times, eval_mem_usages, total_mem_usages))

# Print the fold-averaged summary of every experiment, followed by its per-fold details
print("\nAverage Metrics Across All Folds:")
for (
    frac, shape, metrics,
    avg_training_time, avg_training_mem_usage,
    avg_evaluating_time, avg_eval_mem_usage, avg_total_mem_usage,
    all_reports, all_training_times, all_training_mem_usages,
    all_evaluating_times, all_eval_mem_usages, all_total_mem_usages,
) in results:
    print("****************************")
    print(f"\nSampling Fraction: {frac}")
    print(f"SOM Shape: {shape}".upper())

    # Averaged metrics, one section per class / average row
    for class_id, scores in metrics.items():
        print(f"\nClass {class_id}:")
        for metric, value in scores.items():
            print(f"{metric}: {value:.4f}")

    # Average times are shown as whole minutes plus remaining seconds
    minutes_train, seconds_train = divmod(avg_training_time, 60)
    print(f"Average Training Time: {int(minutes_train)} minutes and {seconds_train:.2f} seconds")
    print(f"Average Training Memory Usage: {format_memory_usage(avg_training_mem_usage)}")

    minutes_eval, seconds_eval = divmod(avg_evaluating_time, 60)
    print(f"Average Evaluating Time: {int(minutes_eval)} minutes and {seconds_eval:.2f} seconds")
    print(f"Average Evaluating Memory Usage: {format_memory_usage(avg_eval_mem_usage)}")

    print(f"Average Total Memory Usage: {format_memory_usage(avg_total_mem_usage)}")
    print("---------------------------------------")
    display_report_and_times(
        all_reports,
        all_training_times,
        all_training_mem_usages,
        all_evaluating_times,
        all_eval_mem_usages,
        all_total_mem_usages,
    )


Evaluating for sampling fraction: 0.01
Evaluating for som_shape: (20, 20)
Evaluating for sampling fraction: 0.03
Evaluating for som_shape: (20, 20)

Average Metrics Across All Folds:
****************************

Sampling Fraction: 0.01
SOM SHAPE: (20, 20)

Class 0:
precision: 0.9943
recall: 0.9896
f1-score: 0.9919

Class 1:
precision: 0.9796
recall: 0.9862
f1-score: 0.9829

Class 2:
precision: 0.9921
recall: 0.9377
f1-score: 0.9641

Class 3:
precision: 0.8732
recall: 0.9282
f1-score: 0.8995

Class 4:
precision: 0.8412
recall: 0.9994
f1-score: 0.9135

Class macro avg:
precision: 0.9361
recall: 0.9682
f1-score: 0.9504

Class weighted avg:
precision: 0.9839
recall: 0.9828
f1-score: 0.9830
Average Training Time: 0 minutes and 15.29 seconds
Average Training Memory Usage: 0.87 MB
Average Evaluating Time: 0 minutes and 7.74 seconds
Average Evaluating Memory Usage: 2.13 MB
Average Total Memory Usage: 3.01 MB
---------------------------------------

All Reports and Times for this SOM Shape:

R