In [None]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from minisom import MiniSom
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed

In [None]:
# Classify samples with a trained SOM
def classify2(som, data, training_data, training_labels):
    """Assign a label to every sample in ``data`` using a trained SOM.

    Each sample receives the most frequent training label of its winning
    neuron. When no training sample was mapped to that neuron, the most
    frequent label of the whole training set is used instead.

    Args:
        som: Object exposing ``labels_map(data, labels)`` and ``winner(sample)``.
        data: Iterable of samples to classify.
        training_data: Samples used to build the neuron -> label-counts map.
        training_labels: Labels matching ``training_data``.

    Returns:
        list: One predicted label per sample, in the order of ``data``.
    """
    from collections import Counter

    neuron_votes = som.labels_map(training_data, training_labels)
    fallback_label = Counter(training_labels).most_common(1)[0][0]

    def _label_for(sample):
        # Look up the winning neuron and return its majority training label.
        bmu = som.winner(sample)
        if bmu not in neuron_votes:
            return fallback_label
        return neuron_votes[bmu].most_common(1)[0][0]

    return [_label_for(sample) for sample in data]

In [None]:
# Train and evaluate the SOM on one train/test split
def train_and_evaluate(train_idx, test_idx, X, y, som_shape, sigma, learning_rate, random_seed=None):
    """Train a SOM on one fold and evaluate it on the held-out part.

    Args:
        train_idx: Indices selecting the training rows of ``X`` / ``y``.
        test_idx: Indices selecting the test rows of ``X`` / ``y``.
        X: Feature matrix, indexed with ``train_idx`` / ``test_idx``.
        y: Label array aligned with ``X``.
        som_shape: Pair ``(rows, cols)`` giving the SOM grid size.
        sigma: Neighbourhood radius passed to ``MiniSom``.
        learning_rate: Learning rate passed to ``MiniSom``.
        random_seed: Optional seed forwarded to ``MiniSom`` so a run can be
            reproduced. ``None`` (default) keeps the unseeded behaviour.

    Returns:
        tuple: ``(report, training_time, evaluating_time)`` where ``report``
        is the dict produced by ``classification_report(output_dict=True)``
        and both times are in seconds.
    """
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    som = MiniSom(
        som_shape[0],
        som_shape[1],
        X_train.shape[1],
        sigma=sigma,
        learning_rate=learning_rate,
        random_seed=random_seed,
    )

    # perf_counter is a monotonic clock, so intervals cannot be distorted by
    # system clock adjustments.
    start_train = time.perf_counter()
    # One training iteration per training sample.
    som.train_random(X_train, X_train.shape[0])
    training_time = time.perf_counter() - start_train

    # The evaluation timer must also cover the prediction step; timing only
    # classification_report would leave out the actual classification work.
    start_eval = time.perf_counter()
    y_pred = classify2(som, X_test, X_train, y_train)
    report = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)
    evaluating_time = time.perf_counter() - start_eval

    return report, training_time, evaluating_time

In [None]:
# Dataset location and the processed CSV files that make up the dataset
DATA_PATH = "/users/mik2002/som/dataset-processed/"
FILES = [
    "processed_bot_dataset.csv",
    "processed_dos_dataset.csv",
    "processed_Ddos_dataset.csv",
    "processed_bruteforce_dataset.csv",
]

# Read every file and stack the rows into one frame with a fresh 0..n-1 index
df_dataset = pd.concat(
    [pd.read_csv(f"{DATA_PATH}{file}") for file in FILES],
    ignore_index=True,
)

In [None]:
# Absolute number of rows per class, then each class's share of the dataset
display(df_dataset["Label"].value_counts(normalize=False))
print("Distribution:")
display(df_dataset["Label"].value_counts(normalize=True))

In [None]:
# Class proportions of the original (unsampled) dataset
class_proportions = df_dataset["Label"].value_counts(normalize=True)

# Stratified random sample: keep 1% of the rows of every class, so the class
# proportions are preserved.
# DataFrameGroupBy.sample keeps the "Label" column in the result, whereas
# groupby(...).apply(...) is deprecated for frames that still contain the
# grouping column and would lose "Label" once that column is excluded.
df_dataset = df_dataset.groupby("Label").sample(frac=0.01, random_state=42)

# Encode the class names as integers. Series.map silently yields NaN for any
# value missing from the mapping (including when this cell is re-run on
# already-encoded labels), so fail loudly instead.
label_to_id = {"Benign": 0, "DDos": 1, "Dos": 2, "Bot": 3, "Bruteforce": 4}
unmapped_labels = set(df_dataset["Label"].unique()) - set(label_to_id)
if unmapped_labels:
    raise ValueError(
        f"Labels without an integer encoding: {sorted(unmapped_labels, key=str)}. "
        "Reload the dataset before re-running this cell, or extend the mapping."
    )
df_dataset["Label"] = df_dataset["Label"].map(label_to_id)

In [None]:
# Split features and target by column name so the result does not depend on
# "Label" being the last column of the frame.
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on the whole sample before the K-Fold
# split, so the min/max of the test folds influence the scaling of the
# training folds. Fit it per fold if a leakage-free estimate is required.
data = scaler.fit_transform(df_dataset.drop(columns="Label"))
target = df_dataset["Label"].to_numpy()

In [None]:
# SOM hyperparameters: the square grid sizes to compare, plus the
# neighbourhood radius and learning rate shared by every run
som_sizes = [(side, side) for side in (15, 20, 25, 30)]
sigma = 1.5
learning_rate = 1.2

In [None]:
# Accumulator for the results
results = list()

# Wrapper used as the unit of work for parallel execution
def parallel_train_and_evaluate(train_idx, test_idx, som_shape, data, target, sigma, learning_rate):
    """Run ``train_and_evaluate`` for one fold, adapting the argument order.

    Args:
        train_idx: Indices of the training rows.
        test_idx: Indices of the test rows.
        som_shape: SOM grid size forwarded to ``train_and_evaluate``.
        data: Feature matrix (forwarded as ``X``).
        target: Label array (forwarded as ``y``).
        sigma: Forwarded unchanged.
        learning_rate: Forwarded unchanged.

    Returns:
        tuple: ``(report, training_time, evaluating_time)`` as returned by
        ``train_and_evaluate``.
    """
    return train_and_evaluate(
        train_idx,
        test_idx,
        X=data,
        y=target,
        som_shape=som_shape,
        sigma=sigma,
        learning_rate=learning_rate,
    )

def display_report_and_times(all_reports, all_training_times, all_evaluating_times):
    """Print each fold's classification report followed by its timings.

    Args:
        all_reports: Sequence of report dicts, one per fold. Dict-valued
            entries are printed as a labelled group of metrics, any other
            entry is printed as a single number.
        all_training_times: Training time in seconds, one per fold.
        all_evaluating_times: Evaluation time in seconds, one per fold.
    """
    print("\nAll Reports and Times for this SOM Shape:")

    per_fold = zip(all_reports, all_training_times, all_evaluating_times)
    for fold_number, (fold_report, fit_seconds, eval_seconds) in enumerate(per_fold, start=1):
        print(f"\nReport for Fold {fold_number}:")
        for label, entry in fold_report.items():
            if not isinstance(entry, dict):
                # Scalar entry: a single formatted line.
                print(f"\n{label.capitalize()}: {entry:.4f}")
                continue
            print(f"\nLabel {label}:")
            for metric_name, metric_value in entry.items():
                print(f"  {metric_name.capitalize()}: {metric_value:.4f}")
        print(f"\nTraining Time: {fit_seconds:.4f} seconds")
        print(f"Evaluating Time: {eval_seconds:.4f} seconds")

# K-Fold setup: 5 shuffled folds with a fixed seed, so every SOM shape is
# evaluated on the same splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Loop over the SOM grid sizes
for som_shape in som_sizes:
    # Train and evaluate the folds in parallel (n_jobs=-1: all available CPUs)
    parallel_results = Parallel(n_jobs=-1)(
        delayed(parallel_train_and_evaluate)(
            train_idx, test_idx, som_shape, data, target, sigma, learning_rate
        )
        for train_idx, test_idx in kf.split(data)
    )

    reports, training_times, evaluating_times = zip(*parallel_results)

    # Gather the per-fold values of every metric except 'support'.
    # Dict-valued report entries are the per-class rows and the average rows;
    # scalar entries such as 'accuracy' are skipped. Selecting rows by type
    # rather than by key.isdigit() keeps the per-class rows even when the
    # label keys are not pure digit strings (e.g. '0.0' or class names).
    class_metrics = {}
    for report in reports:
        for key, metrics in report.items():
            if not isinstance(metrics, dict):
                continue
            per_key = class_metrics.setdefault(key, {})
            for metric, value in metrics.items():
                if metric != 'support':
                    per_key.setdefault(metric, []).append(value)

    # Mean of each metric over the folds in which the row was reported
    averages = {
        key: {metric: np.mean(values) for metric, values in metric_lists.items()}
        for key, metric_lists in class_metrics.items()
    }

    # Mean training and evaluation time over the folds
    avg_training_time = np.mean(training_times)
    avg_evaluating_time = np.mean(evaluating_times)

    results.append((som_shape, averages, avg_training_time, avg_evaluating_time, reports, training_times, evaluating_times))

# Print the fold-averaged metrics and timings of every SOM shape, followed by
# the individual fold reports
print("\nAverage Metrics Across All Folds:")
for result in results:
    (shape, metrics, avg_training_time, avg_evaluating_time,
     all_reports, all_training_times, all_evaluating_times) = result
    print("****************************")
    print(f"\nSOM Shape: {shape}".upper())
    # The loop variable is deliberately not called `class_metrics`, so it does
    # not overwrite the aggregation dict of that name built during training.
    for class_id, averaged_metrics in metrics.items():
        print(f"\nClass {class_id}:")
        for metric, value in averaged_metrics.items():
            print(f"{metric}: {value:.4f}")

    # Average training and evaluation times, split into minutes and seconds
    minutes_train, seconds_train = divmod(avg_training_time, 60)
    minutes_eval, seconds_eval = divmod(avg_evaluating_time, 60)
    print(f"Average Training Time: {int(minutes_train)} minutes and {seconds_train:.2f} seconds")
    print(f"Average Evaluating Time: {int(minutes_eval)} minutes and {seconds_eval:.2f} seconds")
    print("---------------------------------------")
    display_report_and_times(all_reports, all_training_times, all_evaluating_times)