In [None]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from minisom import MiniSom
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Classify samples with a trained SOM
def classify2(som, data, training_data, training_labels):
    """Predict one label per sample in ``data`` using a trained SOM.

    The training set is first turned into a lookup table via
    ``som.labels_map(training_data, training_labels)``. Each sample is then
    assigned the most common label stored under its winning neuron
    (``som.winner(sample)``). When the winning neuron has no entry in the
    table, the most common label of ``training_labels`` is used instead.

    Args:
        som: Object exposing ``labels_map(data, labels)`` and ``winner(sample)``.
        data: Iterable of samples to classify.
        training_data: Samples used to build the neuron -> labels table.
        training_labels: Labels matching ``training_data``.

    Returns:
        list: Predicted labels, in the same order as ``data``.
    """
    from collections import Counter

    neuron_votes = som.labels_map(training_data, training_labels)
    # Overall majority label, used for neurons missing from the table.
    fallback_label = Counter(training_labels).most_common(1)[0][0]

    def predict_one(sample):
        bmu = som.winner(sample)
        if bmu not in neuron_votes:
            return fallback_label
        return neuron_votes[bmu].most_common(1)[0][0]

    return [predict_one(sample) for sample in data]

In [None]:
# Train a SOM on one fold and evaluate it on the held-out rows
def train_and_evaluate(train_idx, test_idx, X, y, som_shape, sigma, learning_rate, random_seed=None):
    """Train a MiniSom on the training rows of one split and score it on the test rows.

    Prints the text classification report and the topographic error measured
    on the training rows, then returns the same report as a dictionary.

    Args:
        train_idx: Indices selecting the training rows of ``X`` and ``y``.
        test_idx: Indices selecting the test rows of ``X`` and ``y``.
        X: Feature array, indexed as ``X[idx]``; ``X.shape[1]`` is used as the
            SOM input length.
        y: Label array, indexed as ``y[idx]``.
        som_shape: Pair ``(rows, cols)`` giving the SOM grid size.
        sigma: ``sigma`` passed to ``MiniSom``.
        learning_rate: ``learning_rate`` passed to ``MiniSom``.
        random_seed: Optional seed forwarded to ``MiniSom`` for reproducible
            runs. The default ``None`` keeps the previous unseeded behaviour.

    Returns:
        dict: ``classification_report(..., output_dict=True)`` for the test rows.
    """
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    som = MiniSom(
        som_shape[0],
        som_shape[1],
        X_train.shape[1],
        sigma=sigma,
        learning_rate=learning_rate,
        random_seed=random_seed,
    )
    # The number of training iterations equals the number of training rows.
    som.train_random(X_train, X_train.shape[0], verbose=True)

    y_pred = classify2(som, X_test, X_train, y_train)
    print(classification_report(y_test, y_pred, digits=4, zero_division=0))
    print('Topographic error:', som.topographic_error(X_train))
    # zero_division=0 is set here too, matching the printed report above, so
    # classes with no predicted samples do not trigger undefined-metric warnings.
    report = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)
    return report

In [None]:
# Dataset location and the processed CSV files that are merged into one frame.
# NOTE(review): DATA_PATH is an absolute, machine-specific path.
DATA_PATH = "/users/mik2002/som/dataset-processed/"
FILES = [
    "processed_bot_dataset.csv",
    "processed_dos_dataset.csv",
    "processed_Ddos_dataset.csv",
    "processed_bruteforce_dataset.csv",
]
# Read every file and stack the rows, renumbering the index from 0.
df_dataset = pd.concat(
    (pd.read_csv(DATA_PATH + csv_name) for csv_name in FILES),
    ignore_index=True,
)

In [None]:
# Rows per class, first as absolute counts and then as fractions of the total.
label_counts = df_dataset["Label"].value_counts()
label_shares = df_dataset["Label"].value_counts(normalize=True)
display(label_counts)
print("Distribution:")
display(label_shares)

In [None]:
# Class proportions in the dataset before any down-sampling.
class_proportions = df_dataset["Label"].value_counts(normalize=True)

# Stratified random down-sampling: the same fraction (50%) is drawn from every
# class, so the class proportions are kept. random_state is fixed so the same
# rows are drawn on every run.
df_sampled = df_dataset.groupby("Label", group_keys=False).apply(lambda x: x.sample(frac=0.5, random_state=42))

# Encode the string labels as integers. Series.map yields NaN for any label that
# is missing from the mapping, so fail loudly instead of passing NaN targets on.
# Running this cell a second time (labels already encoded as integers) also ends
# here, before df_dataset is reassigned.
label_codes = df_sampled["Label"].map({"Benign": 0, "DDos": 1, "Dos": 2, "Bot": 3, "Bruteforce": 4})
unmapped_labels = df_sampled.loc[label_codes.isna(), "Label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Labels without an integer code in column 'Label': {list(unmapped_labels)}"
    )

df_sampled["Label"] = label_codes
df_dataset = df_sampled

In [None]:
# Min-max scale the features and separate them from the target.
# Columns are picked by name, so the split does not depend on "Label" being the
# last column of the frame.
# NOTE(review): the scaler is fitted on all rows, i.e. before the K-Fold split
# performed later in this notebook, so held-out rows contribute to the min/max
# used for scaling.
scaler = MinMaxScaler()
feature_columns = df_dataset.columns.drop("Label")
data = scaler.fit_transform(df_dataset[feature_columns])
target = df_dataset["Label"].values

In [None]:
# Cross-validation scheme: 5 shuffled folds with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# SOM hyper-parameters: square grids of increasing side length, plus the
# sigma and learning rate shared by every grid size
som_sizes = [(side, side) for side in (15, 20, 25, 30)]
sigma = 1.5
learning_rate = 1.2

In [None]:
# Averaged metrics for every SOM shape, stored as (som_shape, averages) tuples
results = []

# Evaluate every SOM grid size with K-Fold cross-validation
for som_shape in som_sizes:
    # class_metrics[report_row][metric] -> list holding one value per fold
    class_metrics = {}
    error_metrics = {}  # NOTE(review): never populated in this cell
    for train_idx, test_idx in kf.split(data):
        print(f"\nTraining SOM with shape: {som_shape}")
        # perf_counter is a monotonic clock meant for measuring elapsed time
        start_time = time.perf_counter()
        report = train_and_evaluate(train_idx, test_idx, data, target, som_shape, sigma, learning_rate)
        elapsed_time = time.perf_counter() - start_time
        minutes, seconds = divmod(elapsed_time, 60)
        print(f"Training Time + Evaluating Time: {int(minutes)} minutes and {seconds:.2f} seconds")
        for key, metrics in report.items():
            # Keep the per-class rows (all-digit keys) and the two averaged rows;
            # every other report entry is ignored.
            if not (key.isdigit() or key in ('macro avg', 'weighted avg')):
                continue
            fold_values = class_metrics.setdefault(key, {})
            for metric, value in metrics.items():
                if metric != 'support':
                    fold_values.setdefault(metric, []).append(value)

    # Mean of every metric over the folds, per report row
    averages = {
        row: {metric: np.mean(values) for metric, values in per_metric.items()}
        for row, per_metric in class_metrics.items()
    }

    results.append((som_shape, averages))

# Print the averaged metrics. The loop variables use their own names so the
# per-fold accumulator `class_metrics` is not overwritten here.
print("\nAverage Metrics Across All Folds:")
for shape, shape_averages in results:
    print(f"\nSOM Shape: {shape}")
    for class_id, averaged in shape_averages.items():
        print(f"\nClass {class_id}:")
        for metric, value in averaged.items():
            print(f"{metric}: {value:.4f}")