In [40]:
%%capture
!pip install pytorch-tabnet
# TODO: delete this cell (marked for removal by the original author)

In [41]:
%%capture
import pandas as pd
import numpy as np
import warnings
from google.colab import drive
import ipaddress
import random
import os

from sklearn import datasets
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.tensorboard import SummaryWriter


# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Mount Google Drive so that paths under /content/drive are reachable.
drive.mount('/content/drive')

# CSV file used to store the experiment results.
FILEPATH = "/content/drive/MyDrive/data analytics/tabnet/results_0.csv"

# Creazione df result

In [42]:
def create_empty_df(filepath):
  """
  Write a header-only results table to `filepath` as CSV.

  The table has the experiment-description columns and no rows; an existing
  file at that path is replaced. A confirmation message is printed.
  """
  preprocessing_cols = ['ds', 'random', 'outlier', 'dim_reduction', 'pca_threshold', 'scaler', 'target count']
  training_cols = ['batch_size', 'dim_embedding', 'num_heads', 'num_layers', 'learning_rate', 'epoch',
                   'gamma', 'step_size', 'weight_decay']
  header = preprocessing_cols + training_cols + ['info']

  # An empty frame with only column labels serialises to a single header line.
  pd.DataFrame(columns=header).to_csv(filepath, index=False)

  print(f"DataFrame creato e salvato in {filepath}")

# NOTE: this rewrites FILEPATH as an empty table, replacing whatever was stored there before.
create_empty_df(FILEPATH)

DataFrame creato e salvato in /content/drive/MyDrive/data analytics/tabnet/results_0.csv


# Data Cleaning
Scelta colonne, cast delle colonne e gestione dei valori nulli



In [43]:
# Funzione per determinare il tipo di dato di una colonna
def type_data(column):
    """
    Classify a column as binary, discrete numeric, continuous numeric or categorical.

    Placeholder entries (NaN and '-') are dropped before the checks, so they do
    not influence the result.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect.

    Returns
    -------
    str
        One of 'Binario', 'Numerico Discreto', 'Numerico Continuo',
        'Categorico' or 'Unknown'.
    """
    default_val = [np.nan, '-']
    column = column[~column.isin(default_val)]
    # The binary test comes first: a column whose values are exactly
    # {True, False} would otherwise be reported as numeric.
    if is_binary_dtype(column):
        return 'Binario'
    if is_numeric_dtype(column):
        return 'Numerico Discreto' if pd.api.types.is_integer_dtype(column) else 'Numerico Continuo'
    if is_category_dtype(column):
        return 'Categorico'
    return 'Unknown'

# Funzioni ausiliarie per verificare il tipo di dato
def is_numeric_dtype(column):
    """Return whether pandas considers `column` to have a numeric dtype."""
    numeric_check = pd.api.types.is_numeric_dtype
    return numeric_check(column)

def is_binary_dtype(column):
    """
    Return True when the distinct values of `column` are exactly {True, False}.

    Set equality follows Python semantics, so a column holding only 0 and 1
    also qualifies (0 == False and 1 == True).
    """
    observed = set(column.unique())
    return observed == {False, True}

def is_category_dtype(column):
    """
    Return True when `column` has object or categorical dtype.

    The categorical test is done with ``isinstance(dtype, pd.CategoricalDtype)``
    because ``pd.api.types.is_categorical_dtype`` is deprecated in recent pandas
    releases.

    Parameters
    ----------
    column : pandas.Series, array-like or dtype
        Object whose dtype is inspected.

    Returns
    -------
    bool
    """
    # Series/arrays expose `.dtype`; a bare dtype (or dtype string) is used as is.
    dtype = getattr(column, 'dtype', column)
    is_categorical = isinstance(dtype, pd.CategoricalDtype) or (
        isinstance(dtype, str) and dtype == 'category'
    )
    return pd.api.types.is_object_dtype(column) or is_categorical

In [44]:
def clean_service_columns(data):
    """
    Mark service-specific columns as not applicable ('/') on rows of other services.

    For each prefix in ('dns', 'http', 'ssl'), every non-numeric column whose
    name starts with that prefix is set to '/' on the rows where the boolean
    flag column ``service_<prefix>`` is False. Prefixes without a flag column
    are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to update. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    # Columns deliberately left out of the '/' marking.
    excluded = ['dns_qclass', 'dns_qtype', 'http_version', 'http_orig_mime_types', 'http_resp_mime_types']
    # errors='ignore': an excluded column that is absent (or numeric) must not abort the cleaning.
    categorical_columns = data.select_dtypes(exclude=np.number).columns.drop(excluded, errors='ignore')

    for prefix in ('dns', 'http', 'ssl'):
        flag_column = f"service_{prefix}"
        if flag_column not in data.columns:
            continue
        inactive_rows = ~data[flag_column]
        for col in categorical_columns:
            # categorical_columns already holds only non-numeric columns, so no
            # further dtype check is needed here.
            if col.startswith(prefix):
                data.loc[inactive_rows, col] = '/'
    return data

def boolean_mapping(value, def_val=None):
    """
    Translate the flags 'T' / 'F' into booleans.

    Values already equal to True or False are returned as they are. Any other
    value is replaced by `def_val` when one is given, otherwise returned
    unchanged.
    """
    if value in {True, False}:
        return value
    for flag, mapped in (('T', True), ('F', False)):
        if value == flag:
            return mapped
    if def_val is None:
        return value
    return def_val

def categorize_ports(df, port_columns):
    """
    Replace numeric port columns with their port-range category.

    Ranges: 0-1023 -> "Well-Known", 1024-49151 -> "Registered",
    49152-65535 -> "Dynamic". Values outside 0-65535 become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the port columns. It is modified in place.
    port_columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns turned into categoricals.
    """
    port_bins = [0, 1023, 49151, 65535]
    port_labels = ["Well-Known", "Registered", "Dynamic"]
    for col in port_columns:
        # include_lowest=True closes the first interval on the left, so that
        # port 0 is labelled "Well-Known" instead of falling outside every bin.
        df[col] = pd.cut(df[col], bins=port_bins, labels=port_labels, right=True, include_lowest=True)
    return df


def categorize_ip(ip):
    """
    Map an IP address to a coarse address-class label.

    Parameters
    ----------
    ip : str or int
        Anything accepted by ``ipaddress.ip_address``.

    Returns
    -------
    str
        "Loopback", "Link-Local", "Multicast", "Reserved", "Private" or
        "Public"; "Invalid" when the value cannot be parsed as an IP address.
    """
    try:
        ip_obj = ipaddress.ip_address(ip)
    except ValueError:
        return "Invalid"

    # The narrow classes are tested before `is_private`: link-local addresses
    # (and the IPv4 reserved block) also report is_private=True, so checking
    # `is_private` first would make those labels unreachable.
    if ip_obj.is_loopback:
        return "Loopback"
    if ip_obj.is_link_local:
        return "Link-Local"
    if ip_obj.is_multicast:
        return "Multicast"
    if ip_obj.is_reserved:
        return "Reserved"
    if ip_obj.is_private:
        return "Private"
    return "Public"

def df_mapping(df):
  """
  Convert raw columns into the categorical / typed representation used downstream.

  Per column (only when the column is present):
  - DNS / SSL flag columns: 'T'/'F' mapped to booleans, anything else to False,
    then stored as strings.
  - 'http_status_code', 'weird_addl', 'http_trans_depth': cast to string.
  - 'dns_qclass', 'dns_qtype', 'dns_rcode': numeric codes replaced by labels
    (codes without a label become None).
  - 'src_ip', 'dst_ip': replaced by the label returned by ``categorize_ip``.
  - 'src_bytes': rows whose value is the string '0.0.0.0' are dropped and the
    column is cast to int.
  Finally 'src_port' and 'dst_port' (both required) are binned by ``categorize_ports``.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw frame. Columns handled before 'src_bytes' are overwritten on this
      object; from the 'src_bytes' filter onwards the work continues on a copy.

  Returns
  -------
  pandas.DataFrame
      The mapped frame (use the return value, not the argument).
  """
  rcode_mapping = {0: 'No Error', 2: 'ServerFailure', 3: 'NameError', 5: 'Refuse'}
  qclass_mapping = {0: '-', 1: 'IN', 32769: 'CH'}
  qtype_mapping = {0: '-', 1: 'A', 2: 'NS', 5: 'CNAME', 28: 'AAAA', 255: 'ANY'}

  for col in df.columns:
    # NOTE(review): 'http_trans_depth' is in this flag list, so every value
    # that is not True/False/'T'/'F' is turned into False here - confirm this
    # is intended for a depth column.
    if col in ['dns_RD', 'dns_AA', 'dns_rejected', 'http_trans_depth','ssl_established','ssl_resumed']:
      df[col] = df[col].map(lambda x: boolean_mapping(x,  def_val=False)).astype(str)
    if col in ['http_status_code', 'weird_addl', 'http_trans_depth']:
      df[col] = df[col].astype(str)
    if col == 'dns_qclass':
      df[col] = df[col].apply(lambda x: qclass_mapping.get(x, None))
    if col == 'dns_qtype':
      df[col] = df[col].apply(lambda x: qtype_mapping.get(x, None))
    if col == 'dns_rcode':
      df[col] = df[col].apply(lambda x: rcode_mapping.get(x, None))
    if col in ['src_ip', 'dst_ip']:
      df[col] = df[col].apply(categorize_ip)
    if col == 'src_bytes':
      # Explicit copy: the assignment below must target an independent frame,
      # not a slice of the original (avoids chained-assignment ambiguity).
      df = df[df['src_bytes'] != '0.0.0.0'].copy()
      df['src_bytes'] = df['src_bytes'].astype(int)
  df = categorize_ports(df, ['src_port', 'dst_port'])
  return df

def data_cleaning(df):
    """
    Run the full cleaning stage on the raw dataset.

    Steps: expand the ';'-separated 'service' column into one boolean
    ``service_<name>`` column per service, drop the columns that are not used
    as features, then apply ``df_mapping`` and ``clean_service_columns``.

    The input frame is modified in place by the first two steps; use the
    returned frame as the result.
    """
    # One indicator column per distinct service name
    unique_services = df['service'].str.split(';').explode().unique()
    for svc in unique_services:
        df[f'service_{svc}'] = df['service'].apply(lambda raw: svc in raw.split(';'))

    unused_columns = [
        'http_referrer', 'service', 'service_-',
        'ts', 'ssl_subject', 'ssl_issuer', 'dns_query', 'http_uri', 'http_user_agent', 'weird_name', 'label',
    ]
    # errors='ignore': columns missing from the frame are skipped
    df.drop(columns=unused_columns, inplace=True, errors='ignore')

    mapped = df_mapping(df)
    return clean_service_columns(mapped)

In [45]:
def replace_default_new(df, info):
    """
    Replace placeholder values in categorical/binary columns with the column mode.

    When ``info`` is 'mode', every '-' is replaced by the most frequent value
    among the entries that are neither '-' nor '/'. When ``info`` is
    'mode_all', '/' is replaced as well. For any other ``info`` the frame is
    left unchanged. The computed modes (an empty dict when nothing was
    replaced) are always written to "mode.pk" so they can be re-applied later.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; modified in place.
    info : str
        Replacement policy ('mode', 'mode_all', or anything else for none).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    mode_values = {}
    if info in ('mode', 'mode_all'):
        for col in df.columns:
            if not (is_category_dtype(df[col]) or is_binary_dtype(df[col])):
                continue
            valid_values = df.loc[(df[col] != '/') & (df[col] != '-'), col]
            # mode() ignores NaN, so it can be empty even when valid_values is
            # not (e.g. a column that is entirely missing); fall back to '-'.
            modes = valid_values.mode()
            mode_value = modes.iloc[0] if not modes.empty else '-'
            mode_values[col] = mode_value

            df[col] = df[col].replace('-', mode_value)
            if info == 'mode_all':
                df[col] = df[col].replace('/', mode_value)

    # Persist the modes with joblib
    joblib.dump(mode_values, "mode.pk")

    return df


def apply_saved_modes(val, info):
    """
    Re-apply the modes stored in "mode.pk" to another frame.

    With ``info == 'mode'`` every '-' is replaced by the saved mode of its
    column; with ``info == 'mode_all'`` '/' is replaced too. For any other
    ``info`` nothing is loaded and ``val`` is returned unchanged. ``val`` is
    modified in place.
    """
    if info not in ('mode', 'mode_all'):
        return val

    saved_modes = joblib.load("mode.pk")
    placeholders = ['-', '/'] if info == 'mode_all' else ['-']
    # Apply the saved modes to the new data
    for col, mode_value in saved_modes.items():
        if col not in val.columns:
            continue
        for placeholder in placeholders:
            val[col] = val[col].replace(placeholder, mode_value)
    return val

In [46]:
# Load the training CSV from Drive and run the cleaning stage on it.
df=pd.read_csv('/content/drive/MyDrive/data analytics/train_dataset.csv')
df = data_cleaning(df)

# Divisione val e train

In [47]:
from sklearn.model_selection import train_test_split

# Define the features (X) and the target (y)
X = df.drop('type', axis=1)  # 'type' is the target column
y = df['type']

# Split the dataset into training and validation sets (10% held out for validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=19)

# Re-join X_train and y_train into a single training frame
train_df = pd.concat([X_train, y_train], axis=1)

# Re-join X_val and y_val into a single validation frame (stored as test_df)
test_df = pd.concat([X_val, y_val], axis=1)

# Pipeline

In [48]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, LabelEncoder, Normalizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
import joblib
from imblearn.over_sampling import SMOTE,  BorderlineSMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [49]:
#rimozione outlier per classe
def _outlier_bounds(values, method):
    """Return the (lower, upper) bounds that `method` keeps for one numeric column."""
    if method == 'iqr':
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        return q1 - 1.5 * iqr, q3 + 1.5 * iqr
    if method == 'percentile':
        return values.quantile(0.01), values.quantile(0.99)
    # 'dynamic_threshold': mean +/- 3 standard deviations
    mean = values.mean()
    std = values.std()
    return mean - 3 * std, mean + 3 * std


def remove_outliers(x, y, out):
    """
    Remove outlier rows from a training set.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix. The 'base' strategy requires the traffic columns named
        in its limit table (duration, src_bytes, dst_bytes, ...).
    y : pandas.Series
        Target aligned with ``x``. It must be named 'type', because the target
        is looked up under that column name after being joined to ``x``.
    out : str
        Strategy to apply:
        - 'no': no filtering, the inputs are returned as they are;
        - 'base': drop rows exceeding fixed limits on a few traffic columns;
        - 'iqr', 'percentile', 'dynamic_threshold': per class and per numeric
          column, keep the rows inside the bounds of the chosen rule (bounds
          are recomputed on the rows that survived the previous columns);
        - 'isolation_forest': per class, keep the rows an IsolationForest
          fitted on the numeric columns labels as inliers.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        Filtered features and target. For the per-class strategies the rows are
        returned grouped by class. The inputs are never modified.

    Raises
    ------
    ValueError
        If ``out`` is not one of the supported strategies.
    """
    supported = ('no', 'base', 'iqr', 'percentile', 'isolation_forest', 'dynamic_threshold')
    if out not in supported:
        # Fail loudly: a misspelled strategy used to be silently treated as "no filtering".
        raise ValueError(f"Metodo di rimozione outlier '{out}' non supportato.")

    # Nothing to do: return before building any intermediate frame.
    if out == 'no':
        return x, y

    # concat builds a new frame, so the caller's x and y stay untouched.
    df = pd.concat([x, y], axis=1)

    if out == 'base':
        # Remove only the most extreme values, using fixed per-column limits.
        limits = {
            'duration': 1000,
            'src_bytes': 100000000,
            'dst_bytes': 100000000,
            'missed_bytes': 100000000,
            'src_pkts': 20000,
            'dst_pkts': 20000,
            'src_ip_bytes': 1000000,
            'dst_ip_bytes': 1000000,
        }
        before = df.shape[0]
        for col, limit in limits.items():
            df = df[df[col] < limit]
        print('  Rimosse ',before-df.shape[0],' istanze')
        return df.drop('type', axis=1), df['type']

    numeric_cols = x.select_dtypes(include=np.number).columns
    if out == 'isolation_forest':
        from sklearn.ensemble import IsolationForest

    filtered_data = []
    # Filter each class separately
    for cls in df['type'].unique():
        class_df = df[df['type'] == cls]

        if out == 'isolation_forest':
            iso = IsolationForest(contamination=0.05, random_state=19)
            # fit_predict returns 1 for inliers and -1 for outliers
            inlier_mask = iso.fit_predict(class_df[numeric_cols]) == 1
            class_df = class_df[inlier_mask]
        else:
            for col in numeric_cols:
                lower_bound, upper_bound = _outlier_bounds(class_df[col], out)
                class_df = class_df[(class_df[col] >= lower_bound) & (class_df[col] <= upper_bound)]

        filtered_data.append(class_df)

    if not filtered_data:
        # Empty input: there is no class to filter and nothing to concatenate.
        return df.drop('type', axis=1), df['type']

    # Combine the filtered rows of every class
    filtered_df = pd.concat(filtered_data)
    return filtered_df.drop('type', axis=1), filtered_df['type']

In [50]:
# scaling and normalization
def scale_train_data(x_train, y_train, scaling_method):
    """
    Fit a scaler on the training features and return the scaled data.

    Supported methods: 'none', 'standard', 'minmax', 'quantile', 'l1', 'l2'.
    The fitted scaler is saved to "scaler.pkl". For 'quantile' the rows (and
    the target) are first sorted by 'src_bytes'. For 'l1'/'l2' the whole frame
    is normalised and a NumPy array is returned; otherwise only the numeric
    columns of a copy of ``x_train`` are replaced.

    Returns a ``(scaled_features, y_train)`` pair. Raises ValueError for an
    unsupported method.
    """
    scaled_df = x_train.copy()
    numeric_columns = x_train.select_dtypes(include=np.number).columns

    if not len(numeric_columns):
        print("  Warning: No numeric columns to scale. Returning original DataFrame.")
        return scaled_df, y_train

    if scaling_method == 'none':
        print("No scaling applied.")
        return scaled_df, y_train

    # Lazily evaluated constructors, one per supported method
    scaler_factories = {
        'standard': lambda: StandardScaler(),
        'minmax': lambda: MinMaxScaler(),
        'quantile': lambda: QuantileTransformer(output_distribution='uniform', random_state=19),
        'l1': lambda: Normalizer(norm='l1'),
        'l2': lambda: Normalizer(norm='l2'),
    }
    if scaling_method not in scaler_factories:
        raise ValueError(f"Metodo di scaling '{scaling_method}' non supportato.")

    if scaling_method == 'quantile':
        # Sort features and target together so they stay aligned
        joined = pd.concat([scaled_df, y_train], axis=1).sort_values(by='src_bytes')
        y_train = joined['type']
        scaled_df = joined.drop('type', axis=1)

    scaler = scaler_factories[scaling_method]()

    if scaled_df[numeric_columns].shape[0] < 1:
        print("  Warning: Not enough samples to fit the scaler. Returning original DataFrame.")
        return scaled_df, y_train

    row_wise_norm = scaling_method in ('l1', 'l2')
    if row_wise_norm:
        scaled_df = scaler.fit_transform(scaled_df)
    else:
        scaled_df[numeric_columns] = scaler.fit_transform(scaled_df[numeric_columns])
    joblib.dump(scaler, "scaler.pkl")
    return scaled_df, y_train

# carica scaler e effettua scaling
def scale_validation_data(x_val, y_val, scaling_method):
    """
    Scale validation features with the scaler saved in "scaler.pkl".

    Parameters
    ----------
    x_val : pandas.DataFrame
        Validation features. Never modified: scaling is done on a copy, so the
        same frame can safely be passed again (e.g. once per balanced dataset)
        without being scaled twice.
    y_val : pandas.Series
        Validation target; must be named 'type' when ``scaling_method`` is
        'quantile', because it is joined to the features for sorting.
    scaling_method : str
        Method that was used for the training data. 'none' skips scaling;
        'quantile' first sorts rows by 'src_bytes'; 'l1'/'l2' transform the
        whole frame and return a NumPy array.

    Returns
    -------
    tuple
        ``(scaled_features, y_val)``.
    """
    if scaling_method == 'quantile':
        # Sort features and target together so they stay aligned.
        joined = pd.concat([x_val, y_val], axis=1).sort_values(by='src_bytes')
        y_val = joined['type']
        x_val = joined.drop('type', axis=1)

    if scaling_method == 'none':
        print("No scaling applied to validation data.")
        return x_val, y_val

    numeric_columns = x_val.select_dtypes(include=np.number).columns
    scaler = joblib.load("scaler.pkl")

    if scaling_method == 'l1' or scaling_method == 'l2':
        return scaler.transform(x_val), y_val

    # Work on a copy: assigning into the caller's frame would leave it scaled,
    # and a later call on the same frame would scale it a second time.
    x_val = x_val.copy()
    x_val[numeric_columns] = scaler.transform(x_val[numeric_columns])
    return x_val, y_val

In [51]:
# ENCODING
def encode_categorical_train_data(x_train):
    """
    One-hot encode the object/category columns of the training features.

    The encoder is fitted on the training data (unknown categories are ignored
    at transform time) and saved to "onehot_encoder.pkl". The encoded columns
    replace the original ones and are appended after the remaining columns.
    A frame without categorical columns is returned as is, and no encoder is
    saved.
    """
    categorical_columns = x_train.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) == 0:
        return x_train

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(x_train[categorical_columns])
    joblib.dump(encoder, "onehot_encoder.pkl")

    # Keep the original index so the encoded block lines up with the other columns
    encoded_block = pd.DataFrame(
        encoder.transform(x_train[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
        index=x_train.index,
    )
    remaining = x_train.drop(columns=categorical_columns)
    return pd.concat([remaining, encoded_block], axis=1)

def encode_categorical_validation_data(x_val):
    """
    One-hot encode validation features with the encoder saved during training.

    Parameters
    ----------
    x_val : pandas.DataFrame
        Validation features; its object/category columns are expected to be
        the ones the saved encoder was fitted on.

    Returns
    -------
    pandas.DataFrame
        A new frame where the categorical columns are replaced by their
        one-hot encoding, or ``x_val`` itself when it has no categorical
        columns.
    """
    categorical_columns = x_val.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) == 0:
        # Nothing to encode: do not touch "onehot_encoder.pkl", which is not
        # written at all when the training data had no categorical columns.
        return x_val

    encoder = joblib.load("onehot_encoder.pkl")
    x_val_encoded = encoder.transform(x_val[categorical_columns])
    encoded_feature_names = encoder.get_feature_names_out(categorical_columns)
    # Keep the original index so the encoded block lines up with the other columns.
    x_val_encoded_df = pd.DataFrame(x_val_encoded, columns=encoded_feature_names, index=x_val.index)
    x_val = x_val.drop(columns=categorical_columns)
    return pd.concat([x_val, x_val_encoded_df], axis=1)

In [52]:
# BILANCIAMENTO
def balance_data(x_train, y_train, target_count, num_datasets, random_seed):
    """
    Build class-balanced training sets with ``target_count`` samples per class.

    Steps:
    1. classes with fewer than ``target_count / 2`` samples are brought up to
       ``target_count`` with BorderlineSMOTE;
    2. classes still below ``target_count`` are filled by random oversampling;
    3. ``num_datasets`` datasets are drawn by randomly undersampling every
       class to ``target_count`` (seed ``random_seed + i`` for the i-th one)
       and shuffling the result.

    Parameters
    ----------
    x_train, y_train
        Training features and target, in a form accepted by the imbalanced-learn
        samplers (features are expected to be numeric at this point).
    target_count : int
        Number of samples per class in every returned dataset.
    num_datasets : int
        How many undersampled datasets to produce.
    random_seed : int
        Base seed for all samplers.

    Returns
    -------
    list of tuple
        ``[(x_resampled, y_resampled), ...]`` with ``num_datasets`` entries.
    """
    class_counts = pd.Series(y_train).value_counts()
    smote_classes = [cls for cls in class_counts.index if class_counts[cls] < target_count / 2]

    if smote_classes:
        smote_strategy = {cls: target_count for cls in smote_classes}
        smote = BorderlineSMOTE(sampling_strategy=smote_strategy, random_state=random_seed)
        x_train, y_train = smote.fit_resample(x_train, y_train)
        class_counts = pd.Series(y_train).value_counts()

    over_classes = [cls for cls in class_counts.index if class_counts[cls] < target_count]
    if over_classes:
        over_strategy = {cls: target_count for cls in over_classes}
        oversampler = RandomOverSampler(sampling_strategy=over_strategy, random_state=random_seed)
        x_train, y_train = oversampler.fit_resample(x_train, y_train)

    # The per-class target is the same for every draw, so compute it once.
    under_strategy = {cls: target_count for cls in pd.Series(y_train).value_counts().index}

    datasets = []
    for i in range(num_datasets):
        # A different seed per draw gives a different subset of the large classes.
        undersampler = RandomUnderSampler(sampling_strategy=dict(under_strategy), random_state=random_seed + i)
        x_resampled, y_resampled = undersampler.fit_resample(x_train, y_train)
        x_resampled, y_resampled = shuffle(x_resampled, y_resampled, random_state=random_seed + i)
        datasets.append((x_resampled, y_resampled))

    return datasets

In [53]:
# PCA
def apply_pca_train(x_train, random_state, pca_threshold=0.99):
    """
    Fit PCA on the training data and keep enough components to reach a variance threshold.

    A full PCA is fitted first to read the cumulative explained-variance
    ratio; the smallest number of components whose cumulative ratio is at
    least ``pca_threshold`` is then used to fit the final model, which is
    saved to "pca_model.pkl".

    Parameters
    ----------
    x_train : array-like
        Numeric training features.
    random_state : int
        Seed passed to both PCA fits.
    pca_threshold : float
        Minimum cumulative explained-variance ratio to retain.

    Returns
    -------
    pandas.DataFrame
        float32 projection with columns PC1..PCn and a fresh default index.
    """
    pca = PCA(random_state=random_state)
    pca.fit(x_train)
    cumulative_variance = pca.explained_variance_ratio_.cumsum()

    reached = cumulative_variance >= pca_threshold
    if reached.any():
        n_components = int(reached.argmax()) + 1
    else:
        # Threshold never reached (e.g. 1.0 against a cumulative sum that
        # rounds to 0.9999...): argmax of an all-False mask is 0, which would
        # wrongly keep a single component. Keep every component instead.
        n_components = len(cumulative_variance)

    pca = PCA(n_components=n_components, random_state=random_state)
    transformed_data = pca.fit_transform(x_train)
    transformed_data = transformed_data.astype(np.float32)

    print(f"  Numero di colonne selezionate (componenti principali): {n_components}")
    joblib.dump(pca, "pca_model.pkl")
    return pd.DataFrame(transformed_data, columns=[f"PC{i+1}" for i in range(n_components)])

def apply_pca_validation(x_val):
    """Project validation data with the PCA saved in "pca_model.pkl"; returns a float32 array."""
    fitted_pca = joblib.load("pca_model.pkl")
    return fitted_pca.transform(x_val).astype(np.float32)

# LDA
def apply_lda_train(x_train, y_train, lda_components=None):
    """
    Fit LDA on the training data and return its projection.

    The fitted model is saved to "lda_model.pkl". The result is a float32
    DataFrame with columns LD1..LDn and a fresh default index.
    """
    lda = LDA(n_components=lda_components)
    projected = lda.fit(x_train, y_train).transform(x_train).astype(np.float32)

    n_components = projected.shape[1]
    print(f"  Numero di colonne selezionate (componenti discriminanti): {n_components}")
    joblib.dump(lda, "lda_model.pkl")

    column_names = [f"LD{idx+1}" for idx in range(n_components)]
    return pd.DataFrame(projected, columns=column_names)

def apply_lda_validation(x_val):
    """Project validation data with the LDA saved in "lda_model.pkl"; returns a float32 array."""
    fitted_lda = joblib.load("lda_model.pkl")
    return fitted_lda.transform(x_val).astype(np.float32)

In [54]:
def preprocessing_pipeline(x_train, y_train, x_validation, y_validation, scaling_method, use_pca, pca_threshold, target_count=20000, num_datasets=1, random_seed=19):
    """
    Encode, balance, scale and optionally reduce the training and validation data.

    The categorical features are one-hot encoded, the training set is balanced
    into ``num_datasets`` datasets, and for each of them the scaler (and the
    PCA or LDA model when ``use_pca`` is 'PCA' or 'LDA') is fitted on the
    training data and applied to the validation data.

    Returns two lists of equal length: the processed ``(x_train, y_train)``
    pairs and the matching ``(x_val, y_val)`` pairs.
    """
    # Feature encoding
    x_train = encode_categorical_train_data(x_train)
    x_validation = encode_categorical_validation_data(x_validation)

    # Balancing
    balanced_sets = balance_data(x_train, y_train, target_count, num_datasets, random_seed)

    data, validation = [], []
    for position, (balanced_x, balanced_y) in enumerate(balanced_sets, start=1):
        print(f"  Dataset bilanciato {position}:")

        # Scaling
        balanced_x, balanced_y = scale_train_data(balanced_x, balanced_y, scaling_method)
        x_val, y_val = scale_validation_data(x_validation, y_validation, scaling_method)

        # Optional dimensionality reduction
        if use_pca == 'PCA':
            balanced_x = apply_pca_train(balanced_x, random_state=random_seed, pca_threshold=pca_threshold)
            x_val = apply_pca_validation(x_val)
        elif use_pca == 'LDA':
            balanced_x = apply_lda_train(balanced_x, balanced_y)
            x_val = apply_lda_validation(x_val)

        data.append((balanced_x, balanced_y))
        validation.append((x_val, y_val))
    return data, validation


# Train

In [55]:
def fix_random(seed: int) -> None:
    """Seed NumPy, `random` and PyTorch (CPU and CUDA) and make cuDNN deterministic."""
    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True  # slower

def is_combination_tested(filepath, new_row, num_epochs=None, enabled=False):
  """
  Check whether a configuration is already recorded in the results CSV.

  The check is opt-in: with the default ``enabled=False`` the function always
  returns False, so every configuration is evaluated.

  Parameters
  ----------
  filepath : str
      Path of the results CSV.
  new_row : dict
      Configuration to look for (column name -> scalar value).
  num_epochs : optional
      Unused; kept so existing three-argument calls keep working.
  enabled : bool
      When True, look for a stored row that matches ``new_row`` on every key
      that is also a column of the file. Two missing values (NaN/None) count
      as equal. If the file has an 'end' column, only rows where it is True
      are considered. Keys of ``new_row`` that are not columns of the file are
      ignored.

  Returns
  -------
  bool
      True if a matching row exists, False otherwise (including when the file
      does not exist or is empty).
  """
  if not enabled:
    return False
  if not os.path.exists(filepath):
    return False

  # Read the stored results
  existing_results = pd.read_csv(filepath)
  if 'end' in existing_results.columns:
    # only completed configurations
    existing_results = existing_results[existing_results['end'] == True]

  comparison_columns = [col for col in new_row if col in existing_results.columns]
  if existing_results.empty or not comparison_columns:
    return False

  matches = pd.Series(True, index=existing_results.index)
  for col in comparison_columns:
    value = new_row[col]
    if pd.isna(value):
      # NaN never compares equal to itself, so missing values need their own test.
      matches &= existing_results[col].isna()
    else:
      matches &= existing_results[col] == value

  if matches.any():
    print("  Configurazione già testata, salto...")
    return True
  return False

def append_and_save_results(filepath, new_row):
  """Append `new_row` (a dict) to the CSV at `filepath` by reading, extending and rewriting the file."""
  stored = pd.read_csv(filepath)
  addition = pd.DataFrame([new_row])
  pd.concat([stored, addition], ignore_index=True).to_csv(filepath, index=False)

In [56]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.pretraining import TabNetPretrainer

class TabNet(torch.nn.Module):
    """
    Thin wrapper around ``TabNetClassifier``.

    The constructor forwards the model hyper-parameters to the classifier and
    keeps the training settings (batch size, number of epochs, optional
    pre-trained model) that are later used by ``fit_model``.
    """
    def __init__(self, n_d, n_a, n_steps, gamma, optimizer_fn, n_independent, n_shared,
                 epsilon, seed, lambda_sparse, clip_value, momentum, optimizer_params,
                 scheduler_params, mask_type, scheduler_fn, device_name, output_dim,
                 batch_size, num_epochs, unsupervised_model, verbose=0):
        super().__init__()

        # Arguments passed straight through to the underlying classifier
        classifier_kwargs = dict(
            n_d=n_d,
            n_a=n_a,
            n_steps=n_steps,
            gamma=gamma,
            optimizer_fn=optimizer_fn,
            n_independent=n_independent,
            n_shared=n_shared,
            epsilon=epsilon,
            seed=seed,
            lambda_sparse=lambda_sparse,
            clip_value=clip_value,
            momentum=momentum,
            optimizer_params=optimizer_params,
            scheduler_params=scheduler_params,
            mask_type=mask_type,
            scheduler_fn=scheduler_fn,
            device_name=device_name,
            output_dim=output_dim,
            verbose=verbose,
        )
        self.network = TabNetClassifier(**classifier_kwargs)

        # Training settings consumed by fit_model
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.unsupervised_model = unsupervised_model

    def fit_model(self, X_train, y_train, X_val, y_val, criterion):
        """Train the classifier, evaluating accuracy on both the training and the validation set."""
        evaluation_sets = [(X_train, y_train), (X_val, y_val)]
        self.network.fit(
            X_train=X_train,
            y_train=y_train,
            eval_set=evaluation_sets,
            eval_metric=['accuracy'],
            patience=10,
            batch_size=self.batch_size,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=True,
            max_epochs=self.num_epochs,
            loss_fn=criterion,
            from_unsupervised=self.unsupervised_model,
        )

    def predict(self, X):
        """Return the classifier's predictions for ``X``."""
        return self.network.predict(X)

    def explain(self, X):
        """Return the result of the classifier's ``explain`` for ``X``."""
        return self.network.explain(X)

    def feature_importances(self):
        """Return the ``feature_importances_`` attribute of the classifier."""
        return self.network.feature_importances_

def get_unsupervised_model(n_d_a,n_step,n_independent,n_shared,gamma,lr):
    """
    Build an (unfitted) ``TabNetPretrainer``.

    ``n_d_a`` is used for both ``n_d`` and ``n_a``; the optimizer is AdamW with
    learning rate ``lr``, and the mask type is "sparsemax".
    """
    return TabNetPretrainer(
        n_d=n_d_a,
        n_a=n_d_a,
        n_steps=n_step,
        gamma=gamma,
        n_independent=n_independent,
        n_shared=n_shared,
        lambda_sparse=1e-3,
        optimizer_fn=torch.optim.AdamW,
        optimizer_params={'lr': lr},
        mask_type="sparsemax",
        verbose=0,
    )

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from itertools import product
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight


def trasf_with_grid(x_train, y_train, x_val, y_val, param_grid, metadata, random_state = 19, scoring='accuracy'):
    """
    Grid-search TabNet hyper-parameters, scoring each configuration on the validation set.

    Every combination of the values in ``param_grid`` is trained once and the
    model with the highest validation accuracy is kept.

    Side effects: the fitted ``LabelEncoder`` is saved to "label_encoder.pkl",
    the class weights and progress messages are printed, and the random seeds
    are reset before each configuration. The function reads the module-level
    ``device`` variable.

    Parameters
    ----------
    x_train, x_val : pandas.DataFrame or numpy.ndarray
        Training / validation features (DataFrames are converted to arrays).
    y_train, y_val : array-like
        Training / validation labels; encoded to integers internally.
    param_grid : dict
        Maps each of 'num_epochs', 'batch_size', 'patience', 'n_d_a',
        'n_shared', 'n_indipendents', 'n_steps', 'gamma', 'epsilon',
        'learning_rate', 'pretraining_ratio', 'momentum' to a list of values.
    metadata : dict
        Description of the dataset/preprocessing, merged with each
        configuration into a result row.
    random_state : int
        Seed used for every configuration.
    scoring : str
        Unused; validation accuracy is always the selection metric.

    Returns
    -------
    tuple
        ``(best_model, best_score)``: the best ``TabNet`` wrapper and its
        validation accuracy, or ``(None, -inf)`` when no configuration was run.
    """
    keys, values = zip(*param_grid.items())
    param_combinations = [dict(zip(keys, v)) for v in product(*values)]

    # Encode the labels as integers and keep the encoder for later decoding.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    joblib.dump(le, "label_encoder.pkl")
    y_val = le.transform(y_val)

    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
    class_weights = dict(enumerate(class_weights))
    print(class_weights)

    # To train without class weights, change the loss HERE.
    criterion = torch.nn.CrossEntropyLoss(
        weight=torch.tensor(list(class_weights.values()), dtype=torch.float32).to(device)
    )

    # Convert to numpy if not already a numpy array
    if not isinstance(x_train, np.ndarray):
        x_train = x_train.to_numpy()
    if not isinstance(x_val, np.ndarray):
        x_val = x_val.to_numpy()

    best_score = -float('inf')
    best_model = None

    for current_iter, params in enumerate(param_combinations):
        fix_random(random_state)
        print(f"Valutando configurazione: {params}")

        # Unpack the current configuration
        num_epochs = params['num_epochs']
        batch_size = params['batch_size']
        patience_ = params['patience']
        n_d = params['n_d_a']  # used for both n_d and n_a
        n_s = params['n_shared']
        n_i = params['n_indipendents']
        n_steps_ = params['n_steps']
        gamma_ = params['gamma']
        epsilon_ = params['epsilon']
        lr = params['learning_rate']
        pretraining_ratio_ = params['pretraining_ratio']
        moment = params['momentum']

        # Result row: dataset metadata + hyper-parameters (+ score, added below)
        new_row = metadata.copy()
        new_row.update(params)

        print(f'Iteration {current_iter+1}')
        print(f'Hyperparameters: num_epochs={num_epochs}, batch_size={batch_size}, patience={patience_}, n_d={n_d}, n_indipendent={n_i}, n_shared={n_s}, n_steps={n_steps_}, gamma={gamma_}, epsilon={epsilon_}, lr={lr}, pretraining_ratio={pretraining_ratio_}, momentum={moment}')

        unsupervised_model = get_unsupervised_model(n_d, n_steps_, n_i, n_s, gamma_, lr)

        unsupervised_model.fit(
            X_train=x_train,
            eval_set=[x_val],
            max_epochs=num_epochs,
            patience=patience_,
            batch_size=batch_size,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            pretraining_ratio=pretraining_ratio_,
        )

        # NOTE(review): the pretrained model above is not handed to the
        # classifier (unsupervised_model=None), so the pretraining does not
        # influence the supervised training - confirm this is intended.
        model = TabNet(n_d=n_d,
                       n_a=n_d,
                       n_steps=n_steps_,
                       gamma=gamma_,
                       optimizer_fn=torch.optim.AdamW,
                       n_independent=n_i,
                       n_shared=n_s,
                       epsilon=epsilon_,
                       seed=random_state,
                       lambda_sparse=1e-4,
                       clip_value=1,
                       momentum=moment,
                       optimizer_params=dict(lr=lr),
                       scheduler_params=dict(step_size=10, gamma=0.9),
                       mask_type='sparsemax',
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       device_name=device,
                       output_dim=len(np.unique(y_train)),
                       batch_size=batch_size,
                       num_epochs=num_epochs,
                       unsupervised_model=None,
                       verbose=0)

        model.fit_model(x_train, y_train, x_val, y_val, criterion)
        y_pred = model.predict(x_val)
        acc = accuracy_score(y_val, y_pred)
        print(acc)
        new_row.update({'val_accuracy': acc})

        # Persisting the row is currently disabled:
        #append_and_save_results(FILEPATH, new_row)

        # Track the best configuration; this is the score that gets returned.
        if acc > best_score:
            best_score = acc
            best_model = model

    return best_model, best_score

In [58]:
def apply(x_train, y_train, x_val, y_val, scaling_methods, param_grid,
  dim_reduction=['no'], pca_threshold=0.99, target_count=20000, num_datasets=1,
  random_seed=19, outs=['no'], info=''):
    """
    Run preprocessing, training and evaluation for every requested combination.

    For each dimensionality-reduction method, scaling method and outlier
    strategy: placeholder values are replaced according to ``info``, outliers
    are removed from the training set, the data goes through
    ``preprocessing_pipeline`` and a hyper-parameter grid search is run on
    every balanced dataset. After each grid search the returned model is
    written to "best_model.pkl" (overwriting the previous one).

    Parameters
    ----------
    x_train, y_train, x_val, y_val
        Raw (cleaned) training / validation features and targets; the feature
        frames are copied before being modified.
    scaling_methods : list of str
        Scaling methods to try.
    param_grid : dict
        Hyper-parameter grid forwarded to ``trasf_with_grid``.
    dim_reduction : list of str
        Any of 'PCA', 'LDA', or another value for no reduction.
    pca_threshold : float
        Explained-variance threshold, used only for 'PCA'.
    target_count, num_datasets, random_seed
        Forwarded to the balancing / training steps.
    outs : list of str
        Outlier-removal strategies to try.
    info : str
        Placeholder-replacement policy ('mode', 'mode_all' or anything else).

    Returns
    -------
    dict
        One entry per trained dataset across all combinations, keyed by
        "<dim_reduction>_<scaler>_<outlier>_dataset_<n>", each holding
        'best_model' and 'best_score'.
    """
    results = {}
    x_train_fix = x_train
    x_val_fix = x_val

    for dim_redx in dim_reduction:
        print(f"\n=== Testing Dimensionality Reduction: {dim_redx} ===")
        # The threshold only applies to PCA. It is kept in a local variable:
        # overwriting the `pca_threshold` argument with None would leave it
        # None for a later 'PCA' entry of `dim_reduction`.
        active_threshold = pca_threshold if dim_redx == 'PCA' else None

        for scaling_method in scaling_methods:
            for out in outs:
                print(f"\n=== Testing Scaling Method: {scaling_method}, Outlier: {out} ===")

                x_train = replace_default_new(x_train_fix.copy(), info)
                x_val = apply_saved_modes(x_val_fix.copy(), info)

                # Outlier removal
                x_train_filtered, y_train_filtered = remove_outliers(x_train, y_train, out)

                # Preprocessing
                datasets, validation = preprocessing_pipeline(
                    x_train_filtered, y_train_filtered, x_val, y_val,
                    scaling_method, dim_redx, active_threshold, target_count, num_datasets, random_seed
                )

                for i, (x_train_processed, y_train_processed) in enumerate(datasets):
                    x_val_processed, y_val_processed = validation[i]
                    ds_name = f"dataset_{i+1}"
                    metadata = {
                        'ds': ds_name,
                        'random': random_seed,
                        'outlier': out,
                        'dim_reduction': dim_redx,
                        'pca_threshold': active_threshold,
                        'scaler': scaling_method,
                        'target count': target_count,
                        'info': info
                    }

                    print(f"--- Training Dataset {i+1}/{len(datasets)} ---")
                    best_model, best_score = trasf_with_grid(
                        x_train_processed, y_train_processed,
                        x_val_processed, y_val_processed,
                        param_grid, metadata, random_seed
                    )
                    if best_model:
                        # The key includes every loop variable so that results of
                        # different combinations do not overwrite each other.
                        results[f"{dim_redx}_{scaling_method}_{out}_dataset_{i+1}"] = {
                            'best_model': best_model,
                            'best_score': best_score,
                        }
                        joblib.dump(best_model, f"best_model.pkl")
                        print(best_score)
    return results

# Run

In [59]:
# Full candidate lists for the search (immediately narrowed by the assignments below).
scaling_methods = ['l1', 'l2']
out = ['no','base','isolation_forest', 'percentile',  'dynamic_threshold']
replace = ['no', 'mode']

# Reduced configuration that is actually in effect once this cell has run.
scaling_methods = ['l1']
out = ['no']
replace = ['mode']

pca_threshold=0.99
dim_reduction = ['LDA', 'PCA']

# Hyper-parameter grid: each key maps to the list of values to try.
param_grid = {
    'num_epochs': [1],
    'batch_size': [512],
    'patience': [20],
    'n_d_a' : [128], # Value used for both n_d and n_a
    'n_shared': [1], # Passed to TabNet as n_shared
    'n_indipendents':[1], # Passed to TabNet as n_independent
    'n_steps' : [5],  # Number of decision steps
    'gamma': [1.0],  # Passed to TabNet as gamma
    'epsilon' : [1e-15],  # Numerical-stability constant
    'learning_rate': [0.001],
    'pretraining_ratio': [0.5],  # Passed to the pretrainer's fit as pretraining_ratio
    'momentum' : [0.90],  # Passed to TabNet as momentum
}

In [60]:
# Experiment configuration: candidate values for each preprocessing choice.
scaling_methods = ['l1']
out = ['no']
replace = ['mode']

pca_threshold=0.99
dim_reduction = ['LDA', 'PCA', 'no']

# Hyper-parameter grid: each key maps to the list of values to try.
param_grid = {
    'num_epochs': [1],
    'batch_size': [512],
    'patience': [20],
    'n_d_a' : [128], # Value used for both n_d and n_a
    'n_shared': [1], # Passed to TabNet as n_shared
    'n_indipendents':[1], # Passed to TabNet as n_independent
    'n_steps' : [5],  # Number of decision steps
    'gamma': [1.0],  # Passed to TabNet as gamma
    'epsilon' : [1e-15],  # Numerical-stability constant
    'learning_rate': [0.001],
    'pretraining_ratio': [0.5],  # Passed to the pretrainer's fit as pretraining_ratio
    'momentum' : [0.90],  # Passed to TabNet as momentum
}

In [61]:
# look for GPU
# Use CUDA when it is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('mps')
print("Device: {}".format(device))

Device: cpu


In [64]:
# Esecuzione dell'esperimento
for r in replace:
  print(f"\n=== Testing Replace Value: {r} ===")
  results = apply(
      X_train.copy(), y_train,
      X_val, y_val,
      scaling_methods, param_grid,
      dim_reduction=dim_reduction, pca_threshold=pca_threshold,
      target_count=20000, num_datasets=5, random_seed=19,
      outs = out,
      info = r
  )
  # Report the results. Each entry holds only 'best_model' and 'best_score';
  # no classification report is stored, so the score is printed instead.
  for key, value in results.items():
      print(f"\n=== Results for {key} ===")
      print(f"Best Model: {value['best_model']}")
      print(f"Best Score: {value['best_score']}")


=== Testing Replace Value: mode ===

=== Testing Dimensionality Reduction: LDA ===

=== Testing Scaling Method: l1, Outlier: no ===
  Dataset bilanciato 1:
  Numero di colonne selezionate (componenti discriminanti): 9
  Dataset bilanciato 2:
  Numero di colonne selezionate (componenti discriminanti): 9
  Dataset bilanciato 3:
  Numero di colonne selezionate (componenti discriminanti): 9
  Dataset bilanciato 4:
  Numero di colonne selezionate (componenti discriminanti): 9
  Dataset bilanciato 5:
  Numero di colonne selezionate (componenti discriminanti): 9
--- Training Dataset 1/5 ---
{0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0, 9: 1.0}
Valutando configurazione: {'num_epochs': 1, 'batch_size': 512, 'patience': 20, 'n_d_a': 128, 'n_shared': 1, 'n_indipendents': 1, 'n_steps': 5, 'gamma': 1.0, 'epsilon': 1e-15, 'learning_rate': 0.001, 'pretraining_ratio': 0.5, 'momentum': 0.9}
Iteration 1
Hyperparameters: num_epochs=1, batch_size=512, patience=20, n_d=128, n_in

TypeError: '>=' not supported between instances of 'float' and 'NoneType'

In [None]:
# Load the model written by `apply`, which saves it as "best_model.pkl".
model = joblib.load("best_model.pkl")
print(model)