In [None]:
%%capture
import pandas as pd
import numpy as np
import warnings
from google.colab import drive
import ipaddress
import random

from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader


# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Mount Google Drive so the Drive paths used below are reachable.
drive.mount('/content/drive')

# CSV file that collects one result row per training epoch.
FILEPATH = "/content/drive/MyDrive/data analytics/reti_2.0/results_0.csv"

# Creazione df result

In [None]:
def create_empty_df(filepath):
  """
  Write a header-only results CSV to ``filepath``.

  The header lists the run settings followed by the training
  hyper-parameters. An existing file at ``filepath`` is overwritten.
  """
  # Settings that describe how the data was prepared
  run_settings = (
      'ds', 'random', 'outlier', 'dim_reduction', 'pca_threshold', 'scaler', 'target count',
  )
  # Hyper-parameters of the network and its optimisation, plus a free-text note
  training_settings = (
      'batch_size', 'hidden_size', 'batch_norm', 'dropout', 'depth', 'epoch',
      'learning_rate', 'gamma', 'step_size', 'weight_decay', 'info',
  )
  header = [*run_settings, *training_settings]

  # An empty frame serialises to a single header line
  pd.DataFrame(columns=header).to_csv(filepath, index=False)

  print(f"DataFrame creato e salvato in {filepath}")

# Start from a header-only results file; this overwrites whatever is already stored at FILEPATH.
create_empty_df(FILEPATH)

DataFrame creato e salvato in /content/drive/MyDrive/data analytics/reti_2.0/results_0.csv


# Data Cleaning
Scelta colonne, cast delle colonne e gestione dei valori nulli



In [None]:
# Funzione per determinare il tipo di dato di una colonna
def type_data(column):
    """Classify a column into a coarse data-type label.

    Placeholder entries (NaN and '-') are dropped before the checks run.

    Args:
        column: pandas Series to classify.

    Returns:
        'Binario', 'Numerico Discreto', 'Numerico Continuo', 'Categorico'
        or 'Unknown'.
    """
    placeholders = [np.nan, '-']
    column = column[~column.isin(placeholders)]
    # The binary check comes first: boolean columns also count as numeric in pandas.
    if is_binary_dtype(column):
        return 'Binario'
    if is_numeric_dtype(column):
        return 'Numerico Discreto' if pd.api.types.is_integer_dtype(column) else 'Numerico Continuo'
    if is_category_dtype(column):
        return 'Categorico'
    return 'Unknown'

# Funzioni ausiliarie per verificare il tipo di dato
def is_numeric_dtype(column):
    """Return True when pandas reports a numeric dtype for ``column``."""
    numeric = pd.api.types.is_numeric_dtype(column)
    return numeric

def is_binary_dtype(column):
    """Return True when the distinct values of ``column`` are exactly True and False.

    Set equality is used, so values that compare and hash equal to the
    booleans (such as 1 and 0) satisfy the check as well.
    """
    observed = set(column.unique())
    return observed == {False, True}

def is_category_dtype(column):
    """Return True when ``column`` holds object or pandas-categorical data.

    The categorical test is an ``isinstance`` check on the dtype, which
    replaces the deprecated ``pd.api.types.is_categorical_dtype`` helper.

    Args:
        column: a pandas Series (or a dtype object).
    """
    # A Series exposes its dtype; a bare dtype is used as-is.
    dtype = getattr(column, "dtype", column)
    return pd.api.types.is_object_dtype(column) or isinstance(dtype, pd.CategoricalDtype)

In [None]:
def clean_service_columns(data):
    """Mark service-specific columns as not applicable ('/') where the service is absent.

    For each prefix in ('dns', 'http', 'ssl'), every non-numeric column whose
    name starts with that prefix is set to '/' on the rows where the boolean
    flag column ``service_<prefix>`` is False. A prefix without a flag column
    is skipped. ``data`` is modified in place and returned.

    Args:
        data: DataFrame with string column names.

    Returns:
        The same DataFrame object, updated.
    """
    # These columns are never overwritten; a missing one is simply ignored.
    excluded = ['dns_qclass', 'dns_qtype', 'http_version', 'http_orig_mime_types', 'http_resp_mime_types']
    candidate_columns = data.select_dtypes(exclude=np.number).columns.drop(excluded, errors='ignore')

    for prefix in ('dns', 'http', 'ssl'):
        flag_column = f"service_{prefix}"
        if flag_column not in data.columns:
            continue
        service_absent = ~data[flag_column]
        for col in candidate_columns:
            if col.startswith(prefix):
                data.loc[service_absent, col] = '/'
    return data

def boolean_mapping(value, def_val=None):
    """Translate the flags 'T' and 'F' into booleans.

    Values already equal to True or False are returned unchanged. Anything
    else yields ``def_val`` when one is supplied, otherwise the input value.
    """
    if value in {False, True}:
        return value
    for token, flag in (('T', True), ('F', False)):
        if value == token:
            return flag
    return value if def_val is None else def_val

def categorize_ports(df, port_columns):
    """Replace numeric port columns with port-range categories.

    Ranges: 0-1023 "Well-Known", 1024-49151 "Registered",
    49152-65535 "Dynamic". Values outside 0-65535 become NaN.
    ``df`` is modified in place and returned.

    Args:
        df: DataFrame containing the port columns.
        port_columns: names of the columns to convert.
    """
    port_bins = [0, 1023, 49151, 65535]
    port_labels = ["Well-Known", "Registered", "Dynamic"]
    for col in port_columns:
        # include_lowest keeps port 0 in the first bin; without it the
        # half-open interval (0, 1023] turns port 0 into NaN.
        df[col] = pd.cut(df[col], bins=port_bins, labels=port_labels,
                         right=True, include_lowest=True)
    return df


def categorize_ip(ip):
    """Map an IP address to a coarse address-class label.

    The narrower classes are tested before ``is_private``: in the standard
    library some link-local and reserved ranges (e.g. 169.254.0.0/16 and
    240.0.0.0/4) also report ``is_private``, so testing "private" first
    would make those labels unreachable.

    Args:
        ip: address in any form accepted by ``ipaddress.ip_address``.

    Returns:
        "Loopback", "Link-Local", "Multicast", "Reserved", "Private" or
        "Public"; "Invalid" when the value cannot be parsed as an address.
    """
    try:
        ip_obj = ipaddress.ip_address(ip)
    except ValueError:
        return "Invalid"

    # Ordered from most specific to most general; the first match wins.
    checks = (
        ("Loopback", ip_obj.is_loopback),
        ("Link-Local", ip_obj.is_link_local),
        ("Multicast", ip_obj.is_multicast),
        ("Reserved", ip_obj.is_reserved),
        ("Private", ip_obj.is_private),
    )
    for label, matches in checks:
        if matches:
            return label
    return "Public"

def df_mapping(df):
  """Normalise raw columns into model-friendly categorical values.

  Per column, when present:
    * flag columns: 'T'/'F' become booleans, then the column is cast to str;
    * 'http_status_code', 'weird_addl', 'http_trans_depth': cast to str;
    * 'dns_qclass', 'dns_qtype', 'dns_rcode': numeric codes become labels
      (codes without a label become None);
    * 'src_ip', 'dst_ip': replaced by the address class from categorize_ip;
    * 'src_bytes': rows holding the stray value '0.0.0.0' are dropped and
      the column is cast to int.
  Finally 'src_port' and 'dst_port' are binned with categorize_ports.

  Args:
      df: raw DataFrame; it must contain 'src_port' and 'dst_port'.

  Returns:
      The mapped DataFrame. It is a new object when 'src_bytes' is present,
      because the row filter creates one.
  """
  code_labels = {
      'dns_rcode': {0: 'No Error', 2: 'ServerFailure', 3: 'NameError', 5: 'Refuse'},
      'dns_qclass': {0: '-', 1: 'IN', 32769: 'CH'},
      'dns_qtype': {0: '-', 1: 'A', 2: 'NS', 5: 'CNAME', 28: 'AAAA', 255: 'ANY'},
  }
  # NOTE(review): 'http_trans_depth' is listed among the flag columns although
  # it is also cast to str below - confirm it is meant to be here.
  flag_columns = ('dns_RD', 'dns_AA', 'dns_rejected', 'http_trans_depth', 'ssl_established', 'ssl_resumed')
  string_columns = ('http_status_code', 'weird_addl', 'http_trans_depth')
  ip_columns = ('src_ip', 'dst_ip')

  for col in df.columns:
    if col in flag_columns:
      df[col] = df[col].map(lambda x: boolean_mapping(x)).astype(str)
    if col in string_columns:
      df[col] = df[col].astype(str)
    if col in code_labels:
      labels = code_labels[col]
      df[col] = df[col].apply(lambda x: labels.get(x, None))
    if col in ip_columns:
      df[col] = df[col].apply(categorize_ip)
    if col == 'src_bytes':
      # Explicit copy: assigning a column on a filtered slice is ambiguous
      # and only "works" while the SettingWithCopy warning is silenced.
      df = df[df['src_bytes'] != '0.0.0.0'].copy()
      df['src_bytes'] = df['src_bytes'].astype(int)
  df = categorize_ports(df, ['src_port', 'dst_port'])
  return df

def data_cleaning(df):
    """Build per-service flag columns, drop unused columns and apply the column mappings.

    The 'service' column holds ';'-separated service names. One boolean
    column ``service_<name>`` is added per distinct name. Columns are added
    to and dropped from ``df`` in place; the frame returned by the mapping
    helpers is the function result.
    """
    # Every distinct service token found in the dataset
    unique_services = df['service'].str.split(';').explode().unique()
    for service in unique_services:
        flag_name = f'service_{service}'
        df[flag_name] = df['service'].apply(lambda raw: service in raw.split(';'))

    # Source column, placeholder flag, free-text / identifier fields and the 'label' column
    discarded = [
        'http_referrer', 'service', 'service_-',
        'ts', 'ssl_subject', 'ssl_issuer', 'dns_query', 'http_uri',
        'http_user_agent', 'weird_name', 'label',
    ]
    df.drop(columns=discarded, inplace=True, errors='ignore')

    return clean_service_columns(df_mapping(df))

In [None]:
def replace_default_new(df, info):
    """Replace placeholder values with the per-column mode and persist the modes.

    With ``info`` equal to 'mode', every '-' in a categorical or binary column
    is replaced by that column's mode (computed ignoring '-' and '/'). With
    'mode_all', '/' is replaced as well. Any other ``info`` leaves ``df``
    untouched. The mode dictionary (possibly empty) is always dumped to
    "mode.pk". ``df`` is modified in place and returned.
    """
    mode_values = {}
    fill_slash = info == 'mode_all'
    columns_to_scan = df.columns if (info == 'mode' or fill_slash) else []

    for col in columns_to_scan:
        series = df[col]
        if not (is_category_dtype(series) or is_binary_dtype(series)):
            continue

        # Mode over the real values only; fall back to '-' when there are none
        valid_values = series[(series != '/') & (series != '-')]
        if valid_values.empty:
            mode_value = '-'
        else:
            mode_value = valid_values.mode()[0]
        mode_values[col] = mode_value

        placeholders = ['-', '/'] if fill_slash else ['-']
        for placeholder in placeholders:
            df[col] = df[col].replace(placeholder, mode_value)

    # Persist the modes so the same values can be applied to other data
    joblib.dump(mode_values, "mode.pk")

    return df


def apply_saved_modes(val, info):
    """Apply the modes stored in "mode.pk" to ``val``.

    Only active when ``info`` is 'mode' or 'mode_all'. For each stored column
    present in ``val``, '-' is replaced by the stored mode; with 'mode_all'
    the '/' placeholder is replaced too. ``val`` is modified in place and
    returned.
    """
    if not (info == 'mode' or info == 'mode_all'):
        return val

    saved_modes = joblib.load("mode.pk")
    for col, mode_value in saved_modes.items():
        if col not in val.columns:
            continue
        val[col] = val[col].replace('-', mode_value)
        if info == 'mode_all':
            val[col] = val[col].replace('/', mode_value)
    return val

In [None]:
# Load the raw training CSV from Drive and run the cleaning steps defined above.
df=pd.read_csv('/content/drive/MyDrive/data analytics/train_dataset.csv')
df = data_cleaning(df)

# Divisione val e train

In [None]:
from sklearn.model_selection import train_test_split

# Define the features (X) and the target (y)
X = df.drop('type', axis=1)  # 'type' is the target column ('label' is dropped in data_cleaning)
y = df['type']

# Hold out 10% of the rows as a validation set, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=19)

# Re-join the training features and target
train_df = pd.concat([X_train, y_train], axis=1)

# Re-join the validation features and target (the variable is named test_df)
test_df = pd.concat([X_val, y_val], axis=1)

# Pipeline

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, LabelEncoder, Normalizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
import joblib
from imblearn.over_sampling import SMOTE,  BorderlineSMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [None]:
#rimozione outlier per classe
def _keep_within(frame, col, lower_bound, upper_bound):
    """Return the rows of ``frame`` whose ``col`` lies in [lower_bound, upper_bound]."""
    return frame[(frame[col] >= lower_bound) & (frame[col] <= upper_bound)]


def _filter_class_outliers(class_df, numeric_cols, out):
    """Apply the per-class outlier rule ``out`` to the rows of one class."""
    if out == 'isolation_forest':
        from sklearn.ensemble import IsolationForest
        iso = IsolationForest(contamination=0.05, random_state=19)
        # fit_predict labels inliers with 1; filtering on the mask avoids
        # writing a temporary column into a slice.
        inlier_mask = iso.fit_predict(class_df[numeric_cols]) == 1
        return class_df[inlier_mask]

    # Column filters are applied one after another, so the bounds of each
    # column are computed on the rows that survived the previous columns.
    for col in numeric_cols:
        if out == 'iqr':
            q1 = class_df[col].quantile(0.25)
            q3 = class_df[col].quantile(0.75)
            iqr = q3 - q1
            lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        elif out == 'percentile':
            lower_bound = class_df[col].quantile(0.01)
            upper_bound = class_df[col].quantile(0.99)
        else:  # 'dynamic_threshold': mean +/- 3 standard deviations
            mean = class_df[col].mean()
            std = class_df[col].std()
            lower_bound, upper_bound = mean - 3 * std, mean + 3 * std
        class_df = _keep_within(class_df, col, lower_bound, upper_bound)
    return class_df


def remove_outliers(x, y, out):
    """Remove outliers from the training data.

    Args:
        x: feature DataFrame.
        y: target Series; it must be named 'type'.
        out: strategy name:
            'no'                - return ``x`` and ``y`` untouched;
            'base'              - drop rows above fixed, hand-picked limits;
            'iqr'               - per class, keep values within 1.5 * IQR;
            'percentile'        - per class, keep the 1st-99th percentile;
            'isolation_forest'  - per class, IsolationForest (5% contamination);
            'dynamic_threshold' - per class, keep mean +/- 3 std.

    Returns:
        Tuple ``(x_filtered, y_filtered)``. For the per-class strategies the
        rows come back grouped by class.

    Raises:
        ValueError: if ``out`` is not one of the strategies above.
    """
    # Nothing to do: skip the concatenation below entirely.
    if out == 'no':
        return x, y

    per_class_methods = ('iqr', 'percentile', 'isolation_forest', 'dynamic_threshold')
    if out != 'base' and out not in per_class_methods:
        raise ValueError(f"Metodo di rimozione outlier '{out}' non supportato.")

    df = pd.concat([x, y], axis=1)
    numeric_cols = x.select_dtypes(include=np.number).columns

    if out == 'base':
        # Removal of the most extreme values only
        upper_limits = {
            'duration': 1000,
            'src_bytes': 100000000,
            'dst_bytes': 100000000,
            'missed_bytes': 100000000,
            'src_pkts': 20000,
            'dst_pkts': 20000,
            'src_ip_bytes': 1000000,
            'dst_ip_bytes': 1000000,
        }
        before = df.shape[0]
        for col, limit in upper_limits.items():
            df = df[df[col] < limit]
        print('  Rimosse ',before-df.shape[0],' istanze')
        return df.drop('type', axis=1), df['type']

    # Filter each class independently, then stitch the classes back together.
    filtered_data = [
        _filter_class_outliers(df[df['type'] == cls], numeric_cols, out)
        for cls in df['type'].unique()
    ]
    filtered_df = pd.concat(filtered_data)

    return filtered_df.drop('type', axis=1), filtered_df['type']

In [None]:
# scaling and normalization
def scale_train_data(x_train, y_train, scaling_method):
    """Fit a scaler on the training features and store it in "scaler.pkl".

    Supported methods: 'none', 'standard', 'minmax', 'quantile', 'l1', 'l2'.
    'quantile' first sorts the rows by 'src_bytes' (features and target
    together). 'l1' and 'l2' normalise the whole frame row-wise and return an
    ndarray; the other scalers transform only the numeric columns.

    Returns:
        Tuple ``(scaled_features, y_train)``.

    Raises:
        ValueError: for an unsupported ``scaling_method``.
    """
    scaled_df = x_train.copy()
    numeric_columns = x_train.select_dtypes(include=np.number).columns

    if not len(numeric_columns):
        print("  Warning: No numeric columns to scale. Returning original DataFrame.")
        return scaled_df, y_train
    if scaling_method == 'none':
        print("No scaling applied.")
        return scaled_df, y_train

    # Factories instead of instances, so only the requested scaler is created
    scaler_factories = {
        'standard': lambda: StandardScaler(),
        'minmax': lambda: MinMaxScaler(),
        'quantile': lambda: QuantileTransformer(output_distribution='uniform', random_state=19),
        'l1': lambda: Normalizer(norm='l1'),
        'l2': lambda: Normalizer(norm='l2'),
    }
    if scaling_method not in scaler_factories:
        raise ValueError(f"Metodo di scaling '{scaling_method}' non supportato.")

    if scaling_method == 'quantile':
        # Sort features and target together so they stay aligned
        ordered = pd.concat([scaled_df, y_train], axis=1).sort_values(by='src_bytes')
        y_train = ordered['type']
        scaled_df = ordered.drop('type', axis=1)
    scaler = scaler_factories[scaling_method]()

    if len(scaled_df[numeric_columns]) == 0:
        print("  Warning: Not enough samples to fit the scaler. Returning original DataFrame.")
        return scaled_df, y_train

    row_wise = scaling_method in ('l1', 'l2')
    if row_wise:
        scaled_df = scaler.fit_transform(scaled_df)
    else:
        scaled_df[numeric_columns] = scaler.fit_transform(scaled_df[numeric_columns])
    joblib.dump(scaler, "scaler.pkl")
    return scaled_df, y_train

# carica scaler e effettua scaling
def scale_validation_data(x_val, y_val, scaling_method):
    """Scale validation features with the scaler stored in "scaler.pkl".

    The frame passed by the caller is never modified: scaling is done on a
    copy. This matters because the same validation frame is scaled once per
    balanced dataset, and in-place scaling would be applied cumulatively.

    Args:
        x_val: validation features.
        y_val: validation target (named 'type' when 'quantile' is used).
        scaling_method: the method name that was used for the training data.

    Returns:
        Tuple ``(scaled_features, y_val)``. For 'l1'/'l2' the features are an
        ndarray; for 'quantile' both outputs are sorted by 'src_bytes'.
    """
    if scaling_method == 'quantile':
        # Same ordering as the training side; concat builds a new frame.
        ordered = pd.concat([x_val, y_val], axis=1).sort_values(by='src_bytes')
        y_val = ordered['type']
        x_val = ordered.drop('type', axis=1)

    if scaling_method == 'none':
        print("No scaling applied to validation data.")
        return x_val, y_val

    numeric_columns = x_val.select_dtypes(include=np.number).columns
    if len(numeric_columns) == 0:
        # The training side stores no scaler in this case, so there is
        # nothing to load or apply.
        return x_val, y_val

    scaler = joblib.load("scaler.pkl")

    if scaling_method == 'l1' or scaling_method == 'l2':
        return scaler.transform(x_val), y_val

    # Copy before the column assignment so the caller's frame stays raw.
    x_val = x_val.copy()
    x_val[numeric_columns] = scaler.transform(x_val[numeric_columns])
    return x_val, y_val

In [None]:
# ENCODING
def encode_categorical_train_data(x_train):
    """One-hot encode the object/category columns of the training features.

    The fitted encoder is saved to "onehot_encoder.pkl". When there are no
    categorical columns the input is returned as-is and nothing is saved.
    """
    categorical_columns = x_train.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) == 0:
        return x_train

    # Categories unseen at fit time are encoded as all-zero rows later on
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(x_train[categorical_columns])
    joblib.dump(encoder, "onehot_encoder.pkl")

    one_hot = pd.DataFrame(
        encoder.transform(x_train[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
        index=x_train.index,
    )
    remaining = x_train.drop(columns=categorical_columns)
    return pd.concat([remaining, one_hot], axis=1)

def encode_categorical_validation_data(x_val):
    """One-hot encode validation features with the encoder saved during training.

    "onehot_encoder.pkl" is loaded only when ``x_val`` actually has
    object/category columns. The training step writes no encoder when it
    finds none, so loading unconditionally would fail or pick up a stale file.

    Args:
        x_val: validation features.

    Returns:
        The features with categorical columns replaced by one-hot columns,
        or ``x_val`` unchanged when there is nothing to encode.
    """
    categorical_columns = x_val.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) == 0:
        return x_val

    encoder = joblib.load("onehot_encoder.pkl")
    x_val_encoded = encoder.transform(x_val[categorical_columns])
    encoded_feature_names = encoder.get_feature_names_out(categorical_columns)
    x_val_encoded_df = pd.DataFrame(x_val_encoded, columns=encoded_feature_names, index=x_val.index)
    x_val = x_val.drop(columns=categorical_columns)
    return pd.concat([x_val, x_val_encoded_df], axis=1)

In [None]:
# BILANCIAMENTO
def balance_data(x_train, y_train, target_count, num_datasets, random_seed):
    """Build ``num_datasets`` class-balanced versions of the training set.

    Steps:
      1. classes with fewer than ``target_count / 2`` samples are raised to
         ``target_count`` with SMOTE;
      2. classes still below ``target_count`` are raised to it with random
         over-sampling;
      3. for each requested dataset, every class is randomly under-sampled to
         ``target_count`` (seed ``random_seed + i``) and the result is shuffled.

    Args:
        x_train: numeric training features.
        y_train: training target.
        target_count: number of samples per class in each output dataset.
        num_datasets: how many balanced datasets to produce.
        random_seed: base seed for the samplers and the shuffle.

    Returns:
        List of ``(x_resampled, y_resampled)`` tuples.
    """
    class_counts = pd.Series(y_train).value_counts()

    smote_strategy = {cls: target_count for cls, count in class_counts.items()
                      if count < target_count / 2}
    if smote_strategy:
        smote = SMOTE(sampling_strategy=smote_strategy, random_state=random_seed)
        x_train, y_train = smote.fit_resample(x_train, y_train)
        class_counts = pd.Series(y_train).value_counts()

    over_strategy = {cls: target_count for cls, count in class_counts.items()
                     if count < target_count}
    if over_strategy:
        oversampler = RandomOverSampler(sampling_strategy=over_strategy, random_state=random_seed)
        x_train, y_train = oversampler.fit_resample(x_train, y_train)

    # The class set no longer changes, so the strategy is built once.
    under_strategy = {cls: target_count for cls in pd.Series(y_train).value_counts().index}

    datasets = []
    for i in range(num_datasets):
        undersampler = RandomUnderSampler(sampling_strategy=under_strategy, random_state=random_seed + i)
        x_resampled, y_resampled = undersampler.fit_resample(x_train, y_train)
        x_resampled, y_resampled = shuffle(x_resampled, y_resampled, random_state=random_seed + i)
        datasets.append((x_resampled, y_resampled))

    return datasets

In [None]:
# PCA
def apply_pca_train(x_train, random_state, pca_threshold=0.99):
    """Fit PCA on the training features, keeping enough components to reach a variance threshold.

    A full PCA is fitted first to read the cumulative explained variance.
    The smallest number of components reaching ``pca_threshold`` is then
    used for the final model, which is saved to "pca_model.pkl". If the
    threshold is never reached (for example 1.0 with rounding error), all
    components are kept; a plain ``argmax`` on the all-False mask would
    return 0 and silently keep a single component.

    Args:
        x_train: numeric training features.
        random_state: seed forwarded to PCA.
        pca_threshold: cumulative explained-variance ratio to reach.

    Returns:
        DataFrame of float32 components named PC1..PCn (fresh default index).
    """
    probe = PCA(random_state=random_state)
    probe.fit(x_train)
    cumulative_variance = probe.explained_variance_ratio_.cumsum()

    reached = cumulative_variance >= pca_threshold
    if reached.any():
        n_components = int(reached.argmax()) + 1
    else:
        n_components = int(len(cumulative_variance))

    pca = PCA(n_components=n_components, random_state=random_state)
    transformed_data = pca.fit_transform(x_train)
    transformed_data = transformed_data.astype(np.float32)

    print(f"  Numero di colonne selezionate (componenti principali): {n_components}")
    joblib.dump(pca, "pca_model.pkl")
    return pd.DataFrame(transformed_data, columns=[f"PC{i+1}" for i in range(n_components)])

def apply_pca_validation(x_val):
    """Project ``x_val`` with the PCA model stored in "pca_model.pkl" and return float32 data."""
    fitted_pca = joblib.load("pca_model.pkl")
    projected = fitted_pca.transform(x_val)
    return projected.astype(np.float32)

# LDA
def apply_lda_train(x_train, y_train, lda_components=None):
    """Fit LDA on the training data and return the projected features.

    The fitted model is saved to "lda_model.pkl". The result is a float32
    DataFrame with columns LD1..LDn and a fresh default index.
    """
    lda = LDA(n_components=lda_components)
    lda.fit(x_train, y_train)
    projected = lda.transform(x_train).astype(np.float32)

    n_components = projected.shape[1]
    print(f"  Numero di colonne selezionate (componenti discriminanti): {n_components}")
    joblib.dump(lda, "lda_model.pkl")

    component_names = [f"LD{position}" for position in range(1, n_components + 1)]
    return pd.DataFrame(projected, columns=component_names)

def apply_lda_validation(x_val):
    """Project ``x_val`` with the LDA model stored in "lda_model.pkl" and return float32 data."""
    fitted_lda = joblib.load("lda_model.pkl")
    projected = fitted_lda.transform(x_val)
    return projected.astype(np.float32)

In [None]:
def preprocessing_pipeline(x_train, y_train, x_validation, y_validation, scaling_method, use_pca, pca_threshold, target_count=20000, num_datasets=1, random_seed=19):
    """Encode, balance, scale and optionally reduce the training and validation data.

    Each balanced training set gets its own scaler (and PCA/LDA model when
    requested), and the validation data is transformed with those same
    fitted objects. The fitted objects are saved to fixed file names, so
    after the call the files on disk belong to the last balanced dataset.

    Args:
        x_train, y_train: training features and target.
        x_validation, y_validation: validation features and target.
        scaling_method: name understood by ``scale_train_data``.
        use_pca: 'PCA', 'LDA', or any other value for no reduction.
        pca_threshold: explained-variance threshold used when ``use_pca`` is 'PCA'.
        target_count: samples per class after balancing.
        num_datasets: number of balanced datasets to build.
        random_seed: base random seed.

    Returns:
        Tuple ``(data, validation)``: two lists of ``(x, y)`` pairs with the
        same length, one pair per balanced dataset.
    """
    # Feature encoding
    x_train = encode_categorical_train_data(x_train)
    x_validation = encode_categorical_validation_data(x_validation)

    # Balancing
    datasets = balance_data(x_train, y_train, target_count, num_datasets, random_seed)
    validation = []
    data = []

    for position, (x_balanced, y_balanced) in enumerate(datasets, start=1):
        print(f"  Dataset bilanciato {position}:")

        # Scaling. The validation frame is copied for every dataset: the
        # scaler may write into the frame it receives, and re-using one
        # frame would scale it again on each iteration.
        x_balanced, y_balanced = scale_train_data(x_balanced, y_balanced, scaling_method)
        x_val, y_val = scale_validation_data(x_validation.copy(), y_validation, scaling_method)

        # Optional dimensionality reduction
        if use_pca == 'PCA':
            x_balanced = apply_pca_train(x_balanced, random_state=random_seed, pca_threshold=pca_threshold)
            x_val = apply_pca_validation(x_val)
        elif use_pca == 'LDA':
            x_balanced = apply_lda_train(x_balanced, y_balanced)
            x_val = apply_lda_validation(x_val)

        data.append((x_balanced, y_balanced))
        validation.append((x_val, y_val))
    return data, validation


# Train

In [None]:
def fix_random(seed: int) -> None:
    """Seed NumPy, ``random`` and PyTorch (CPU and CUDA) and request deterministic cuDNN."""
    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True  # slower

# Verifica se una combinazione è già presente nel file CSV.
def is_combination_tested(filepath, new_row, num_epochs, enabled=False):
  """Check whether a configuration already has a completed run in the results CSV.

  The check was previously short-circuited by an unconditional
  ``return False``. That default is preserved (``enabled=False``), so
  existing callers behave exactly as before; pass ``enabled=True`` to
  actually perform the lookup.

  A configuration counts as tested when the CSV holds a row whose 'epoch'
  equals ``num_epochs`` and whose comparison columns all match ``new_row``.
  A key missing from ``new_row`` is treated as NaN, and NaN matches NaN.
  ``new_row`` is not modified.

  Args:
      filepath: path of the results CSV.
      new_row: dict describing the configuration about to be trained.
      num_epochs: number of epochs the configuration will run for.
      enabled: perform the lookup when True; otherwise always return False.

  Returns:
      True if a matching completed run exists, False otherwise (also when
      the file is missing or empty).
  """
  if not enabled:
    return False

  try:
    existing_results = pd.read_csv(filepath)
  except (FileNotFoundError, pd.errors.EmptyDataError):
    return False
  if 'epoch' not in existing_results.columns:
    return False

  # Columns that identify a configuration
  comparison_columns = [
      'ds', 'random', 'outlier', 'dim_reduction', 'pca_threshold', 'scaler', 'target count',
      'batch_size', 'hidden_size', 'batch_norm', 'dropout', 'depth',
      'learning_rate', 'gamma', 'step_size', 'weight_decay'
  ]

  # Only rows written at the final epoch represent a finished run
  candidates = existing_results[existing_results['epoch'] == num_epochs]
  for col in comparison_columns:
    if candidates.empty:
      return False
    expected = new_row.get(col, np.nan)
    if col not in candidates.columns:
      # A column absent from the CSV can only match a missing value
      if pd.isna(expected):
        continue
      return False
    stored = candidates[col]
    if pd.isna(expected):
      candidates = candidates[stored.isna()]
    else:
      candidates = candidates[stored == expected]

  if candidates.empty:
    return False
  print("  Configurazione già testata, salto...")
  return True

def append_and_save_results(filepath, new_row):
  """Append ``new_row`` (a dict) to the CSV at ``filepath`` and rewrite the file.

  The file is re-read and fully rewritten, so keys of ``new_row`` that are
  not yet columns are added to the header.
  """
  stored = pd.read_csv(filepath)
  addition = pd.DataFrame([new_row])
  pd.concat([stored, addition], ignore_index=True).to_csv(filepath, index=False)

In [None]:
class MyDataset(Dataset):
    """In-memory dataset of float32 features and int64 class labels.

    Attributes:
        X: tensor of shape (n_samples, n_features), dtype float32.
        y: tensor of shape (n_samples,), dtype int64.
        num_features: number of feature columns.
        num_classes: number of distinct labels present in ``y``.
    """

    def __init__(self, X, y):
        """Build the tensors from DataFrames, Series or array-likes.

        Features are converted to float32 explicitly. A DataFrame that mixes
        boolean and float columns yields an object-dtype array, which
        ``torch.FloatTensor`` cannot convert.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.values
        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.values
        features = np.asarray(X, dtype=np.float32)
        labels = np.asarray(y, dtype=np.int64)

        # torch.tensor copies, so the dataset does not alias the caller's arrays.
        self.X = torch.tensor(features)
        self.y = torch.tensor(labels)

        self.num_features = features.shape[1]
        self.num_classes = len(np.unique(labels))

    def __len__(self):
        """Number of samples."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.X[idx, :], self.y[idx]

class FeedForwardPlus(nn.Module):
    """Fully connected classifier with a configurable number of hidden blocks.

    Layout: an input block (Linear, optional BatchNorm1d, ReLU), then
    ``depth`` hidden blocks, then a linear output layer producing logits.
    Each hidden block is: optional Dropout, Linear, optional BatchNorm1d, ReLU.

    Every hidden block gets its own layer instances. Appending one
    pre-built block list ``depth`` times would register the same modules
    repeatedly and tie the weights of all hidden layers together.

    Args:
        input_size: number of input features.
        num_classes: number of output logits.
        hidden_size: width of every hidden layer.
        depth: number of hidden blocks after the input block.
        batch_norm: add BatchNorm1d after each Linear layer.
        drop: dropout probability before each hidden Linear layer; 0 disables it.
    """

    def __init__(self, input_size, num_classes, hidden_size, depth=1, batch_norm=False, drop=0):
        super(FeedForwardPlus, self).__init__()

        layers = [nn.Linear(input_size, hidden_size)]
        if batch_norm:
            layers.append(nn.BatchNorm1d(hidden_size))
        layers.append(nn.ReLU())

        for _ in range(depth):
            # Fresh modules on every iteration: independent parameters per block.
            if drop > 0:
                layers.append(nn.Dropout(drop))
            layers.append(nn.Linear(hidden_size, hidden_size))
            if batch_norm:
                layers.append(nn.BatchNorm1d(hidden_size))
            layers.append(nn.ReLU())

        self.model = nn.Sequential(*layers)
        self.output = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the class logits for a batch ``x`` of shape (batch, input_size)."""
        h = self.model(x)
        out = self.output(h)
        return out

In [None]:
def test_model(model, data_loader, device):
    model = model.to(device)
    model.eval()  # Imposta il modello in modalità di valutazione

    y_pred = []
    y_test = []

    with torch.no_grad():
      for data, targets in data_loader:
          # Sposta i dati e i target sulla GPU (o CPU se non disponibile)
          data, targets = data.to(device), targets.to(device)

          # Calcola le predizioni
          output = model(data)
          y_pred.append(output)
          y_test.append(targets)

    # Unisci tutte le predizioni e i target in un unico tensor
    y_test = torch.cat(y_test).squeeze()
    y_pred = torch.cat(y_pred).squeeze()
    # Determina le classi previste (indice della classe con probabilità massima)
    y_pred_c = y_pred.argmax(dim=1, keepdim=True).squeeze()
    return y_test, y_pred_c, y_pred


def train_model(model, criterion, optimizer, epoch, scheduler, train_loader, val_loader, device, new_row):
    """Train ``model`` for ``epoch`` epochs, logging metrics to the results CSV after each one.

    After every epoch the model is evaluated on the validation and training
    loaders, the metrics are written into ``new_row`` (mutated in place) and
    the row is appended to ``FILEPATH``. The model with the highest
    validation accuracy is kept as an independent snapshot.

    Args:
        model: network to train (already on ``device``).
        criterion: loss function taking ``(logits, targets)``.
        optimizer: optimizer bound to ``model``'s parameters.
        epoch: number of epochs to run.
        scheduler: learning-rate scheduler, stepped once per epoch.
        train_loader, val_loader: data loaders.
        device: device used for training and evaluation.
        new_row: dict of run metadata, extended with the per-epoch metrics.

    Returns:
        Tuple ``(best_model, best_accuracy, summary)``. ``best_model`` is a
        deep copy taken at the best epoch (None if no epoch ran).
        ``summary`` holds the last epoch's 'validation_score', 'train_score',
        'validation_loss', 'train_loss' and 'classification_report', plus
        'best_classification_report' for the best epoch.
    """
    import copy  # stdlib; imported here so the cell does not depend on another cell's imports

    num_epochs = epoch  # keep the total separate from the loop counter

    best_model = None
    best_accuracy = 0
    best_report = None

    # Defined up front so the return statement is valid even with zero epochs.
    val_accuracy = t_accuracy = None
    last_val_loss = last_train_loss = None
    report = None

    for epoch_idx in range(num_epochs):
        new_row['epoch'] = epoch_idx + 1
        model.train()
        for data, targets in train_loader:
            data, targets = data.to(device), targets.to(device)
            optimizer.zero_grad()  # reset gradients
            y_pred = model(data)  # forward pass
            loss = criterion(y_pred, targets)  # compute loss
            loss.backward()  # backward pass
            optimizer.step()

        # Evaluation on the validation set. scikit-learn needs host data,
        # so tensors are moved to the CPU first (required when training on GPU).
        labels, y_pred_c, y_pred = test_model(model, val_loader, device)
        loss_val = criterion(y_pred, labels)
        labels_np, preds_np = labels.cpu().numpy(), y_pred_c.cpu().numpy()
        val_accuracy = accuracy_score(labels_np, preds_np)
        report = classification_report(labels_np, preds_np, output_dict=True)
        print(report)

        # Evaluation on the training set
        labels_t, y_pred_c_t, y_pred_t = test_model(model, train_loader, device)
        loss_t = criterion(y_pred_t, labels_t)
        labels_t_np, preds_t_np = labels_t.cpu().numpy(), y_pred_c_t.cpu().numpy()
        t_accuracy = accuracy_score(labels_t_np, preds_t_np)
        t_report = classification_report(labels_t_np, preds_t_np, output_dict=True)

        last_val_loss = loss_val.item()
        last_train_loss = loss_t.item()

        new_row.update({
            'val_accuracy': report['accuracy'],
            'val_precision': report['weighted avg']['precision'],
            'val_recall': report['weighted avg']['recall'],
            'val_f1': report['weighted avg']['f1-score'],
            'validation_loss': last_val_loss,
            'train_accuracy': t_report['accuracy'],
            'train_precision': t_report['weighted avg']['precision'],
            'train_recall': t_report['weighted avg']['recall'],
            'train_f1': t_report['weighted avg']['f1-score'],
            'train_loss': last_train_loss
        })

        # Persist the results of this epoch
        append_and_save_results(FILEPATH, new_row)
        if best_model is None or report['accuracy'] > best_accuracy:
            best_accuracy = report['accuracy']
            # Snapshot: a plain reference would keep changing with later epochs.
            best_model = copy.deepcopy(model)
            best_report = report
        scheduler.step()
        print(f"{new_row['epoch']}. Validation score: {val_accuracy} - Train score {t_accuracy} -  Validation loss: {last_val_loss} - Train Loss: {last_train_loss} ")

    return best_model, best_accuracy, {
        'validation_score': val_accuracy,
        'train_score': t_accuracy,
        'validation_loss': last_val_loss,
        'train_loss': last_train_loss,
        'classification_report': report,
        'best_classification_report': best_report,
    }

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from itertools import product
import time
from sklearn.preprocessing import LabelEncoder

def dnn_with_grid(x_train, y_train, x_val, y_val, param_grid, metadata, random_state = 19, scoring='accuracy'):
  """Grid-search the feed-forward network over ``param_grid``.

  Labels are integer-encoded with a LabelEncoder (saved to
  "label_encoder.pkl"). For each parameter combination the random seeds are
  reset, a new FeedForwardPlus model is trained with Adam and StepLR, and
  the per-epoch metrics are logged by ``train_model``.

  Args:
      x_train, y_train: processed training features and labels.
      x_val, y_val: processed validation features and labels.
      param_grid: dict mapping parameter name to a list of values. Required
          keys: 'batch_size', 'hidden_size', 'depth', 'batch_norm',
          'learning_rate', 'step_size', 'gamma', 'num_epochs',
          'weight_decay'. 'dropout' is optional (default 0).
      metadata: dict of run-level fields copied into every result row.
      random_state: seed applied before each configuration.
      scoring: unused; kept for interface compatibility.

  Returns:
      Tuple ``(best_model, best_score, best_report)`` for the configuration
      with the highest validation accuracy (``None`` entries if nothing ran).
  """
  keys, values = zip(*param_grid.items())

  best_score = -float('inf')
  best_model = None
  best_report = None

  le = LabelEncoder()
  y_train = le.fit_transform(y_train)  # categories -> integers
  y_val = le.transform(y_val)          # same encoding for the validation set
  joblib.dump(le, "label_encoder.pkl")

  # Create the datasets once; only the loaders depend on the batch size
  train_dataset = MyDataset(x_train,y_train)
  val_dataset = MyDataset(x_val,y_val)

  # Iterate lazily: a large grid is never materialised as a list in memory.
  for combination in product(*values):
      params = dict(zip(keys, combination))
      fix_random(random_state)
      print(f"Valutando configurazione: {params}")

      # Extract the parameters of the current configuration
      batch_size = params['batch_size']
      hidden_size = params['hidden_size']
      depth = params['depth']
      batch_norm = params['batch_norm']
      learning_rate = params['learning_rate']
      step_size = params['step_size']
      gamma = params['gamma']
      num_epochs = params['num_epochs']
      weight_decay = params['weight_decay']
      # The grid records a dropout value, so the model has to receive it.
      dropout = params.get('dropout', 0)

      # Result row: run metadata plus this configuration
      new_row = metadata.copy()
      new_row.update(params)

      # Skip configurations that were already evaluated
      if is_combination_tested(FILEPATH, new_row, num_epochs):
          continue

      start = time.time()

      train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
      val_loader = DataLoader(val_dataset, batch_size=batch_size)

      # Architecture, loss and optimizer
      model = FeedForwardPlus(train_dataset.num_features, train_dataset.num_classes, hidden_size, depth,
                              batch_norm=batch_norm, drop=dropout)
      model.to(device)

      criterion = torch.nn.CrossEntropyLoss()
      optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
      scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
      model, accuracy, results = train_model(model, criterion, optimizer, num_epochs, scheduler, train_loader, val_loader, device, new_row)

      print("time elapsed:", time.time() - start)

      if accuracy > best_score:
          print('interessante')
          best_score = accuracy
          best_model = model
          # Prefer the report of the best epoch when train_model provides it.
          best_report = results.get('best_classification_report') or results['classification_report']

  return best_model, best_score, best_report

In [None]:
def apply(x_train, y_train, x_val, y_val, scaling_methods, param_grid,
  dim_reduction=['no'], pca_threshold=0.99, target_count=20000, num_datasets=1,
  random_seed=19, outs=['no'], info=''):
  """
  Run the whole pipeline (preprocessing, training, evaluation) for every
  combination of dimensionality reduction, scaling method and outlier strategy.

  Args:
      x_train, y_train: raw training features and target.
      x_val, y_val: raw validation features and target.
      scaling_methods: scaling method names to try.
      param_grid: hyper-parameter grid forwarded to ``dnn_with_grid``.
      dim_reduction: reduction methods to try ('PCA', 'LDA', or anything else for none).
      pca_threshold: explained-variance threshold, used for 'PCA' only.
      target_count: samples per class after balancing.
      num_datasets: balanced datasets to build per combination.
      random_seed: base random seed.
      outs: outlier strategies to try.
      info: placeholder-replacement mode ('mode', 'mode_all', anything else for none).

  Returns:
      Dict keyed by "<dim_reduction>_<scaler>_<outlier>_dataset_<n>", each
      value holding 'best_model', 'best_score' and 'classification_report'.
      The overall best model is also dumped to "best_model.pkl".
  """
  results = {}
  x_train_fix = x_train
  x_val_fix = x_val
  bestbest = 0

  for dim_redx in dim_reduction:
    print(f"\n=== Testing Dimensionality Reduction: {dim_redx} ===")
    # Per-iteration value: overwriting the parameter itself would leave it
    # at None for a 'PCA' entry that comes after a non-PCA one.
    active_threshold = pca_threshold if dim_redx == 'PCA' else None
    for scaling_method in scaling_methods:
      for out in outs:
        print(f"\n=== Testing Scaling Method: {scaling_method}, Outlier: {out} ===")

        # Work on copies: the replacement helpers modify their input in place
        x_train = replace_default_new(x_train_fix.copy(), info)
        x_val = apply_saved_modes(x_val_fix.copy(), info)

        # Outlier removal
        x_train_filtered, y_train_filtered = remove_outliers(x_train, y_train, out)

        # Preprocessing
        datasets, validation = preprocessing_pipeline(x_train_filtered, y_train_filtered, x_val, y_val,
            scaling_method, dim_redx, active_threshold, target_count, num_datasets, random_seed
        )

        for i, (x_train_processed, y_train_processed) in enumerate(datasets):
          x_val_processed, y_val_processed = validation[i]
          ds_name = f"dataset_{i+1}"
          metadata = {
              'ds': ds_name,
              'random': random_seed,
              'outlier': out,
              'dim_reduction': dim_redx,
              'pca_threshold': active_threshold,
              'scaler': scaling_method,
              'target count': target_count,
              'info': info
          }

          # Each dataset is trained once; the grid search used to be run a
          # second time by a copy-pasted block.
          print(f"--- Training Dataset {i+1}/{len(datasets)} ---")
          best_model, best_score, best_report = dnn_with_grid(
              x_train_processed, y_train_processed,
              x_val_processed, y_val_processed,
              param_grid, metadata, random_seed
          )
          if best_model is None:
            continue

          # Only a new overall best replaces the model saved on disk
          if best_score > bestbest:
            bestbest = best_score
            joblib.dump(best_model, "best_model.pkl")
            print(best_score)
            print(best_report)

          # The key carries the whole combination so entries do not overwrite each other
          results[f"{dim_redx}_{scaling_method}_{out}_dataset_{i+1}"] = {
              'best_model': best_model,
              'best_score': best_score,
              'classification_report': best_report
          }
  return results

# Run

In [None]:
# Search space of the experiment; every list is iterated in full by apply().
scaling_methods = ['l1', 'l2','minmax',]
# Outlier-removal strategies understood by remove_outliers()
out = ['no','base','isolation_forest', 'percentile',  'dynamic_threshold']
# Placeholder-replacement modes, passed to apply() as `info`
replace = ['no', 'mode', 'mode_all']
pca_threshold=0.99
dim_reduction = ['LDA', 'PCA', 'no']
# Hyper-parameter grid: dnn_with_grid() trains every combination of these values.
# NOTE(review): the full cartesian product is very large - confirm the whole grid is meant to run.
param_grid = {
    "batch_size": [16, 32, 64, 128],
    "hidden_size": [16, 32, 64, 128, 192, 256, 512],
    "batch_norm": [False, True],
    "dropout": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    "depth": [2, 3, 4, 5, 6, 7, 8, 9, 10],
    "weight_decay": [1e-06, 1e-05, 0.001, 0.1],
    "learning_rate": [1.0e-05, 1.0e-04, 1.0e-03, 1.0e-02, 2.0e-03, 2.0e-05,
                      5.0e-04, 5.0e-05, 5.0e-03, 9.0e-04, 1.2e-03, 1.0e-01],
    "gamma": [0.1, 0.3, 0.4, 0.5, 0.8, 0.85, 0.9, 1.0],
    # NOTE(review): these values are passed to StepLR as step_size; fractional entries (7.5, 12.5) should be confirmed.
    "step_size": [7.5, 10, 12, 12.5, 15, 20, 22, 25, 50],
    "num_epochs": [80],
}

In [None]:
# Select the compute device: CUDA when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('mps')
print("Device: {}".format(device))

Device: cpu


In [None]:
# Run the experiment once per placeholder-replacement mode
for r in replace:
  print(f"\n=== Testing Replace Value: {r} ===")
  results = apply(
      X_train.copy(), y_train,
      X_val, y_val,
      scaling_methods, param_grid,
      dim_reduction=dim_reduction, pca_threshold=pca_threshold,
      target_count=20000, num_datasets=5, random_seed=19,
      outs = out,
      info = r
  )

  # Print the results returned for this replacement mode
  for key, value in results.items():
      print(f"\n=== Results for {key} ===")
      print(f"Best Model: {value['best_model']}")
      print("Classification Report:")
      print(value['classification_report'])

In [None]:
# Load a previously saved model and print its architecture.
# NOTE(review): apply() writes "best_model.pkl"; nothing in this notebook writes
# "best_model_temp.pkl" - confirm which file is meant to be loaded here.
model = joblib.load("best_model_temp.pkl")
print(model)