In [1]:
%%capture
import pandas as pd
import numpy as np
import warnings
from google.colab import drive
import ipaddress

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Mount Google Drive so that the /content/drive paths used below resolve.
drive.mount('/content/drive')

# CSV file that receives one row per evaluated configuration.
FILEPATH = "/content/drive/MyDrive/data analytics/svm_3.0/results_0.csv"

# Creazione df result

In [2]:
def create_empty_df(filepath):
  """
  Write a header-only results CSV to ``filepath``.

  The file contains no rows, only the column names used to describe an
  experiment configuration. Any file already present at ``filepath`` is
  overwritten. A confirmation message is printed afterwards.
  """
  header = [
      'ds', 'random', 'outlier', 'dim_reduction', 'pca_threshold', 'degree',
      'scaler', 'kernel', 'C', 'gamma', 'target count',
  ]

  # An empty frame with only column labels serialises to a single header line.
  pd.DataFrame(columns=header).to_csv(filepath, index=False)

  print(f"DataFrame creato e salvato in {filepath}")

# NOTE: create_empty_df writes with to_csv, so re-running this cell replaces any
# existing results file at FILEPATH with a header-only CSV.
create_empty_df(FILEPATH)

DataFrame creato e salvato in /content/drive/MyDrive/data analytics/svm_3.0/results_0.csv


# Data Cleaning
Scelta colonne, cast delle colonne e gestione dei valori nulli



In [3]:
# Classify a column as binary, discrete/continuous numeric, or categorical
def type_data(column):
    """Return a label describing the kind of data held in ``column``.

    Placeholder entries (NaN and '-') are removed before the checks run.
    The checks are applied in order: binary, numeric, categorical.

    Returns one of 'Binario', 'Numerico Discreto', 'Numerico Continuo',
    'Categorico' or 'Unknown'.
    """
    placeholders = [np.nan, '-']
    column = column[~column.isin(placeholders)]
    # The binary test comes first so boolean columns are not reported as numeric.
    if is_binary_dtype(column):
        return 'Binario'
    if is_numeric_dtype(column):
        return 'Numerico Discreto' if pd.api.types.is_integer_dtype(column) else 'Numerico Continuo'
    if is_category_dtype(column):
        return 'Categorico'
    return 'Unknown'

# Helper predicates used to check a column's data type
def is_numeric_dtype(column):
    """Return True when pandas reports ``column`` as having a numeric dtype."""
    numeric_check = pd.api.types.is_numeric_dtype
    return numeric_check(column)

def is_binary_dtype(column):
    """Return True when the distinct values of ``column`` are exactly {True, False}.

    The comparison is by equality, so a column whose distinct values are 0 and 1
    also matches (1 == True and 0 == False in Python).
    """
    distinct_values = set(column.unique())
    return distinct_values == {True, False}

def is_category_dtype(column):
    """Return True when ``column`` has object dtype or a pandas categorical dtype.

    ``column`` may be a Series/array or a dtype object. The categorical check
    uses ``isinstance(dtype, pd.CategoricalDtype)`` instead of the deprecated
    ``pd.api.types.is_categorical_dtype``.
    """
    # Accept both array-likes (which carry .dtype) and bare dtype objects.
    dtype = getattr(column, 'dtype', column)
    return pd.api.types.is_object_dtype(column) or isinstance(dtype, pd.CategoricalDtype)

In [4]:
def clean_service_columns(data):
    """Mark service-specific fields as not applicable ('/') on rows without that service.

    For each prefix in ('dns', 'http', 'ssl'), every non-numeric column whose
    name starts with the prefix is set to '/' on the rows where the boolean
    column ``service_<prefix>`` is False. The two mime-type columns are left
    alone, and prefixes without a ``service_<prefix>`` column are skipped.

    ``data`` is modified in place and also returned.
    """
    service_prefixes = ('dns', 'http', 'ssl')

    categorical_columns = data.select_dtypes(exclude=np.number).columns
    # errors='ignore' keeps this working on frames that lack the mime-type columns.
    categorical_columns = categorical_columns.drop(
        ['http_orig_mime_types', 'http_resp_mime_types'], errors='ignore')

    for prefix in service_prefixes:
        flag_column = f"service_{prefix}"
        if flag_column not in data.columns:
            continue
        inactive_rows = ~data[flag_column]
        for col in categorical_columns:
            if col.startswith(prefix):
                data.loc[inactive_rows, col] = '/'
    return data

def boolean_mapping(value, def_val=None):
    """Translate 'T'/'F' flags into booleans.

    Values already equal to True or False are returned untouched. 'T' maps to
    True and 'F' to False. Anything else yields ``def_val`` when it is not
    None, otherwise the original value.
    """
    if value in {True, False}:
        return value
    for token, flag in (('T', True), ('F', False)):
        if value == token:
            return flag
    return value if def_val is None else def_val

def categorize_ports(df, port_columns):
    """Replace numeric port columns with their port-range category.

    Ranges: 0-1023 "Well-Known", 1024-49151 "Registered", 49152-65535 "Dynamic".
    Values outside 0-65535 become NaN. ``df`` is modified in place and returned.
    """
    port_bins = [0, 1023, 49151, 65535]
    port_labels = ["Well-Known", "Registered", "Dynamic"]
    for col in port_columns:
        # include_lowest=True closes the first interval on the left; without it
        # port 0 falls outside (0, 1023] and would be turned into NaN.
        df[col] = pd.cut(df[col], bins=port_bins, labels=port_labels,
                         right=True, include_lowest=True)
    return df


def categorize_ip(ip):
    """Map an IP address to a coarse category.

    Returns one of "Loopback", "Link-Local", "Multicast", "Reserved",
    "Private", "Public", or "Invalid" when ``ip`` cannot be parsed.

    The narrower categories are tested before "Private": the ``ipaddress``
    module reports link-local ranges (and the IPv4 reserved block) as private
    too, so testing ``is_private`` first would make those labels unreachable.
    """
    try:
        ip_obj = ipaddress.ip_address(ip)
    except ValueError:
        return "Invalid"

    ordered_checks = (
        ("Loopback", ip_obj.is_loopback),
        ("Link-Local", ip_obj.is_link_local),
        ("Multicast", ip_obj.is_multicast),
        ("Reserved", ip_obj.is_reserved),
        ("Private", ip_obj.is_private),
    )
    for label, matched in ordered_checks:
        if matched:
            return label
    return "Public"

def df_mapping(df):
  """Recode raw columns into the categorical / numeric form used downstream.

  Per column, when present:
    * flag columns: 'T'/'F' -> True/False (anything else -> False), stored as str;
    * 'http_status_code', 'weird_addl', 'http_trans_depth': cast to str;
    * 'dns_qclass', 'dns_qtype', 'dns_rcode': numeric code -> label, unknown codes -> None;
    * 'src_ip', 'dst_ip': replaced by the category returned by ``categorize_ip``;
    * 'src_bytes': rows equal to '0.0.0.0' are dropped, then the column is cast to int.
  Finally 'src_port' and 'dst_port' are binned with ``categorize_ports``.

  Columns processed before the 'src_bytes' filter are modified on the caller's
  frame; the returned frame is the one to use.
  """
  rcode_mapping = {0: 'No Error', 2: 'ServerFailure', 3: 'NameError', 5: 'Refuse'}
  qclass_mapping = {0: '-', 1: 'IN', 32769: 'CH'}
  qtype_mapping = {0: '-', 1: 'A', 2: 'NS', 5: 'CNAME', 28: 'AAAA', 255: 'ANY'}
  code_mappings = {
      'dns_qclass': qclass_mapping,
      'dns_qtype': qtype_mapping,
      'dns_rcode': rcode_mapping,
  }

  # NOTE(review): 'http_trans_depth' is listed both here and in string_columns.
  # Being in this list means every value other than T/F/True/False becomes
  # 'False' -- confirm this is intended before relying on that feature.
  flag_columns = ['dns_RD', 'dns_RA', 'dns_AA', 'dns_rejected', 'ssl_established',
                  'ssl_resumed', 'weird_notice', 'http_trans_depth']
  string_columns = ['http_status_code', 'weird_addl', 'http_trans_depth']

  for col in df.columns:
    if col in flag_columns:
      df[col] = df[col].map(lambda x: boolean_mapping(x, def_val=False)).astype(str)
    if col in string_columns:
      df[col] = df[col].astype(str)
    if col in code_mappings:
      mapping = code_mappings[col]
      df[col] = df[col].apply(lambda x: mapping.get(x, None))
    if col in ['src_ip', 'dst_ip']:
      df[col] = df[col].apply(categorize_ip)
    if col == 'src_bytes':
      # Take an explicit copy of the filtered rows so the assignments that
      # follow write to an independent frame instead of a slice of the input.
      df = df[df['src_bytes'] != '0.0.0.0'].copy()
      df['src_bytes'] = df['src_bytes'].astype(int)
  df = categorize_ports(df, ['src_port', 'dst_port'])
  return df

def data_cleaning(df):
    """Expand the ';'-separated 'service' field into flags, drop unused columns and recode the rest.

    One boolean ``service_<name>`` column is added per distinct service token.
    The listed free-text / identifier columns are then removed in place
    (missing ones are ignored), and the frame is passed through ``df_mapping``
    and ``clean_service_columns``.
    """
    # Every distinct token that appears in the ';'-separated service field.
    service_tokens = df['service'].str.split(';').explode().unique()
    for token in service_tokens:
        df[f'service_{token}'] = df['service'].apply(
            lambda raw, wanted=token: wanted in raw.split(';'))

    unused_columns = [
        'http_referrer', 'service', 'service_-',
        'ts', 'ssl_subject', 'ssl_issuer', 'dns_query', 'http_uri',
        'http_user_agent', 'weird_name', 'label',
    ]
    df.drop(unused_columns, axis=1, inplace=True, errors='ignore')

    return clean_service_columns(df_mapping(df))

In [5]:
def replace_default_new(df, info):
    """Replace placeholder values in categorical/binary columns with the column mode.

    When ``info`` is 'mode', every '-' is replaced by the most frequent value
    among entries that are neither '-' nor '/'. With 'mode_all', '/' is
    replaced too. Any other ``info`` leaves the frame untouched.

    The per-column modes (an empty dict when nothing was replaced) are always
    saved to "mode.pk". ``df`` is modified in place and returned.
    """
    mode_values = {}
    if info in ('mode', 'mode_all'):
        for col in df.columns:
            series = df[col]
            if not (is_category_dtype(series) or is_binary_dtype(series)):
                continue

            valid_values = series[(series != '/') & (series != '-')]
            # Fall back to '-' when the column holds nothing but placeholders.
            mode_value = '-' if valid_values.empty else valid_values.mode()[0]
            mode_values[col] = mode_value

            updated = series.replace('-', mode_value)
            if info == 'mode_all':
                updated = updated.replace('/', mode_value)
            df[col] = updated

    # Persist the modes so the same substitutions can be applied to new data.
    joblib.dump(mode_values, "mode.pk")

    return df


def apply_saved_modes(val, info):
    """Apply the modes stored in "mode.pk" to ``val``.

    With ``info`` 'mode', '-' is replaced by the stored mode in every stored
    column present in ``val``; with 'mode_all', '/' is replaced as well.
    For any other ``info`` the frame is returned unchanged and the file is not
    read. ``val`` is modified in place and returned.
    """
    if info not in ('mode', 'mode_all'):
        return val

    stored_modes = joblib.load("mode.pk")
    for col, mode_value in stored_modes.items():
        if col not in val.columns:
            continue
        replaced = val[col].replace('-', mode_value)
        if info == 'mode_all':
            replaced = replaced.replace('/', mode_value)
        val[col] = replaced
    return val

In [6]:
# Read the raw training CSV from Drive and run it through the cleaning step.
TRAIN_DATASET_PATH = '/content/drive/MyDrive/data analytics/train_dataset.csv'
df = data_cleaning(pd.read_csv(TRAIN_DATASET_PATH))

# Divisione val e train

In [7]:
from sklearn.model_selection import train_test_split

# Target is the 'type' column; every other column is a feature.
y = df['type']
X = df.drop(columns='type')

# Hold out 10% of the rows as the validation set (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=19)

# Features and target side by side for the training portion...
train_df = pd.concat([X_train, y_train], axis=1)

# ...and for the validation portion.
test_df = pd.concat([X_val, y_val], axis=1)

# Pipeline

In [8]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, LabelEncoder, Normalizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
import joblib
from imblearn.over_sampling import SMOTE,  BorderlineSMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [9]:
# Outlier removal (per class for every method except 'base')
def _outlier_bounds(values, method):
    """Return the (lower, upper) limits for ``values`` under the given method."""
    if method == 'iqr':
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        return q1 - 1.5 * spread, q3 + 1.5 * spread
    if method == 'percentile':
        return values.quantile(0.01), values.quantile(0.99)
    # 'dynamic_threshold': three standard deviations around the mean.
    center = values.mean()
    deviation = values.std()
    return center - 3 * deviation, center + 3 * deviation


def remove_outliers(x, y, out):
    """Drop outlying rows from the training data.

    Parameters
    ----------
    x : DataFrame of features.
    y : Series of labels; it must be named 'type', since that name is used to
        split the combined frame back into features and target.
    out : one of
        'no'                - return ``x`` and ``y`` untouched;
        'base'              - fixed upper limits on a set of traffic columns;
        'iqr'               - per class, keep values within 1.5 * IQR of the quartiles;
        'percentile'        - per class, keep values within the 1st-99th percentile;
        'isolation_forest'  - per class, keep IsolationForest inliers (contamination 0.05);
        'dynamic_threshold' - per class, keep values within mean +/- 3 std.

    Returns
    -------
    (x_filtered, y_filtered). For the per-class methods rows come back grouped
    by class, in order of first appearance.

    Raises
    ------
    ValueError if ``out`` is not one of the methods above.
    """
    # Nothing to do: skip the concat entirely.
    if out == 'no':
        return x, y

    supported = ('base', 'iqr', 'percentile', 'isolation_forest', 'dynamic_threshold')
    if out not in supported:
        raise ValueError(f"Metodo di rimozione outlier '{out}' non supportato.")

    df = pd.concat([x, y], axis=1)
    numeric_cols = x.select_dtypes(include=np.number).columns

    if out == 'base':
        # Remove only the most extreme values, using fixed per-column ceilings.
        before = df.shape[0]
        ceilings = {
            'duration': 1000,
            'src_bytes': 100000000,
            'dst_bytes': 100000000,
            'missed_bytes': 100000000,
            'src_pkts': 20000,
            'dst_pkts': 20000,
            'src_ip_bytes': 1000000,
            'dst_ip_bytes': 1000000,
        }
        for col, ceiling in ceilings.items():
            df = df[df[col] < ceiling]
        print('  Rimosse ',before-df.shape[0],' istanze')
        return df.drop('type', axis=1), df['type']

    filtered_data = []
    for cls in df['type'].unique():
        class_df = df[df['type'] == cls]

        if out == 'isolation_forest':
            from sklearn.ensemble import IsolationForest
            iso = IsolationForest(contamination=0.05, random_state=19)
            # Filter with a boolean mask rather than a temporary column, so no
            # assignment is made on a slice and no feature name can be clobbered.
            inlier_mask = iso.fit_predict(class_df[numeric_cols]) == 1
            class_df = class_df[inlier_mask]
        else:
            # Columns are filtered one after another, so each column's limits
            # are computed on the rows that survived the previous columns.
            for col in numeric_cols:
                lower_bound, upper_bound = _outlier_bounds(class_df[col], out)
                class_df = class_df[(class_df[col] >= lower_bound) & (class_df[col] <= upper_bound)]

        filtered_data.append(class_df)

    filtered_df = pd.concat(filtered_data)
    return filtered_df.drop('type', axis=1), filtered_df['type']

In [10]:
# scaling and normalization
def scale_train_data(x_train, y_train, scaling_method):
    """Fit a scaler on the training features and return the scaled data.

    ``scaling_method`` is one of 'none', 'standard', 'minmax', 'quantile',
    'l1', 'l2'; anything else raises ValueError. For 'quantile' the rows are
    first sorted by 'src_bytes' (``y_train`` must be named 'type'). For
    'l1'/'l2' the Normalizer is applied to the whole frame and its raw output
    is returned; for the other methods only the numeric columns are replaced.

    The fitted scaler is saved to "scaler.pkl". Returns ``(scaled, y_train)``.
    With no numeric columns, or with 'none', an unscaled copy is returned and
    nothing is saved.
    """
    scaled_df = x_train.copy()
    numeric_columns = x_train.select_dtypes(include=np.number).columns

    if len(numeric_columns) == 0:
        print("  Warning: No numeric columns to scale. Returning original DataFrame.")
        return scaled_df, y_train
    if scaling_method == 'none':
        print("No scaling applied.")
        return scaled_df, y_train

    # Factories are lazy so that only the requested scaler is ever constructed.
    scaler_factories = {
        'standard': lambda: StandardScaler(),
        'minmax': lambda: MinMaxScaler(),
        'quantile': lambda: QuantileTransformer(output_distribution='uniform', random_state=19),
        'l1': lambda: Normalizer(norm='l1'),
        'l2': lambda: Normalizer(norm='l2'),
    }
    if scaling_method not in scaler_factories:
        raise ValueError(f"Metodo di scaling '{scaling_method}' non supportato.")

    if scaling_method == 'quantile':
        # Sort features and labels together so they stay aligned.
        ordered = pd.concat([scaled_df, y_train], axis=1).sort_values(by='src_bytes')
        y_train = ordered['type']
        scaled_df = ordered.drop('type', axis=1)

    scaler = scaler_factories[scaling_method]()

    if scaled_df[numeric_columns].shape[0] < 1:
        print("  Warning: Not enough samples to fit the scaler. Returning original DataFrame.")
        return scaled_df, y_train

    if scaling_method in ('l1', 'l2'):
        scaled_df = scaler.fit_transform(scaled_df)
    else:
        scaled_df[numeric_columns] = scaler.fit_transform(scaled_df[numeric_columns])
    joblib.dump(scaler, "scaler.pkl")
    return scaled_df, y_train

# Load the scaler fitted on the training data and apply it to validation data
def scale_validation_data(x_val, y_val, scaling_method):
    """Scale validation features with the scaler saved in "scaler.pkl".

    Mirrors ``scale_train_data``: 'none' returns the inputs unchanged;
    'quantile' first sorts rows by 'src_bytes' (``y_val`` must be named
    'type'); 'l1'/'l2' transform the whole frame and return the transformer's
    raw output; the other methods replace only the numeric columns.

    The caller's ``x_val`` is never modified: scaling is written to a copy.
    This matters because the same validation frame is scaled once per balanced
    training set, and writing in place would compound the scaling on each call.

    Returns ``(scaled_x_val, y_val)``.
    """
    if scaling_method == 'none':
        print("No scaling applied to validation data.")
        return x_val, y_val

    numeric_columns = x_val.select_dtypes(include=np.number).columns
    if len(numeric_columns) == 0:
        # scale_train_data fits and saves nothing in this situation, so there
        # is no matching scaler to load.
        print("  Warning: No numeric columns to scale. Returning original DataFrame.")
        return x_val, y_val

    if scaling_method == 'quantile':
        # Sort features and labels together so they stay aligned.
        ordered = pd.concat([x_val, y_val], axis=1).sort_values(by='src_bytes')
        y_val = ordered['type']
        x_val = ordered.drop('type', axis=1)

    scaler = joblib.load("scaler.pkl")

    if scaling_method == 'l1' or scaling_method == 'l2':
        return scaler.transform(x_val), y_val

    scaled = x_val.copy()
    scaled[numeric_columns] = scaler.transform(scaled[numeric_columns])
    return scaled, y_val

In [11]:
# ENCODING
def encode_categorical_train_data(x_train):
    """One-hot encode the object/category columns of the training features.

    A OneHotEncoder (dense output, unknown categories ignored) is fitted on
    those columns and saved to "onehot_encoder.pkl". The returned frame has the
    original non-categorical columns followed by the encoded ones, on the same
    index. With no categorical columns the input is returned as is and nothing
    is saved.
    """
    categorical_columns = x_train.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) == 0:
        return x_train

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(x_train[categorical_columns])
    joblib.dump(encoder, "onehot_encoder.pkl")

    encoded_block = pd.DataFrame(
        encoder.transform(x_train[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
        index=x_train.index,
    )
    remaining = x_train.drop(columns=categorical_columns)
    return pd.concat([remaining, encoded_block], axis=1)

def encode_categorical_validation_data(x_val):
    """One-hot encode validation features with the encoder saved in "onehot_encoder.pkl".

    The encoder is always loaded. When ``x_val`` has object/category columns
    they are replaced by the encoder's output columns (appended after the
    remaining columns, same index); otherwise ``x_val`` is returned as is.
    """
    categorical_columns = x_val.select_dtypes(include=['object', 'category']).columns
    encoder = joblib.load("onehot_encoder.pkl")

    if len(categorical_columns) == 0:
        return x_val

    encoded_block = pd.DataFrame(
        encoder.transform(x_val[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
        index=x_val.index,
    )
    remaining = x_val.drop(columns=categorical_columns)
    return pd.concat([remaining, encoded_block], axis=1)

In [12]:
# BILANCIAMENTO
def balance_data(x_train, y_train, target_count, num_datasets, random_seed):
    """Build ``num_datasets`` class-balanced training sets of ``target_count`` rows per class.

    Steps:
      1. classes with fewer than ``target_count / 2`` rows are raised to
         ``target_count`` with SMOTE;
      2. classes still below ``target_count`` are raised with random oversampling;
      3. for each requested dataset, every class is randomly undersampled to
         ``target_count`` (seed ``random_seed + i``) and the rows are shuffled.

    Returns a list of ``(x, y)`` tuples.
    """
    class_counts = pd.Series(y_train).value_counts()

    smote_classes = [cls for cls, count in class_counts.items() if count < target_count / 2]
    if smote_classes:
        smote_strategy = {cls: target_count for cls in smote_classes}
        smote = SMOTE(sampling_strategy=smote_strategy, random_state=random_seed)
        x_train, y_train = smote.fit_resample(x_train, y_train)
        class_counts = pd.Series(y_train).value_counts()

    over_classes = [cls for cls, count in class_counts.items() if count < target_count]
    if over_classes:
        over_strategy = {cls: target_count for cls in over_classes}
        oversampler = RandomOverSampler(sampling_strategy=over_strategy, random_state=random_seed)
        x_train, y_train = oversampler.fit_resample(x_train, y_train)

    # The target size per class is the same for every dataset; only the seed varies.
    under_strategy = {cls: target_count for cls in pd.Series(y_train).value_counts().index}

    datasets = []
    for i in range(num_datasets):
        undersampler = RandomUnderSampler(sampling_strategy=under_strategy, random_state=random_seed + i)
        x_resampled, y_resampled = undersampler.fit_resample(x_train, y_train)
        x_resampled, y_resampled = shuffle(x_resampled, y_resampled, random_state=random_seed + i)
        datasets.append((x_resampled, y_resampled))

    return datasets

In [13]:
# PCA
def apply_pca_train(x_train, random_state, pca_threshold=0.99):
    """Fit PCA on the training data, keeping enough components to reach ``pca_threshold``.

    A first PCA with all components is fitted to read the cumulative explained
    variance; the number of components is the first position where it reaches
    the threshold. A second PCA with that many components is fitted, saved to
    "pca_model.pkl", and its float32 projection is returned as a DataFrame with
    columns PC1..PCn (fresh 0-based index).
    """
    probe = PCA(random_state=random_state)
    probe.fit(x_train)
    threshold_reached = probe.explained_variance_ratio_.cumsum() >= pca_threshold
    # argmax on a boolean array gives the position of the first True.
    n_components = threshold_reached.argmax() + 1

    pca = PCA(n_components=n_components, random_state=random_state)
    transformed_data = pca.fit_transform(x_train).astype(np.float32)

    print(f"  Numero di colonne selezionate (componenti principali): {n_components}")
    joblib.dump(pca, "pca_model.pkl")

    component_names = [f"PC{k}" for k in range(1, n_components + 1)]
    return pd.DataFrame(transformed_data, columns=component_names)

def apply_pca_validation(x_val):
    """Project ``x_val`` with the PCA saved in "pca_model.pkl"; returns the transform output cast to float32."""
    fitted_pca = joblib.load("pca_model.pkl")
    projected = fitted_pca.transform(x_val)
    return projected.astype(np.float32)

# LDA
def apply_lda_train(x_train, y_train, lda_components=None):
    """Fit LDA on the training data and return its projection.

    The fitted model is saved to "lda_model.pkl". The float32 projection is
    returned as a DataFrame with columns LD1..LDn (fresh 0-based index), where
    n is the number of columns produced by the transform.
    """
    lda = LDA(n_components=lda_components)
    lda.fit(x_train, y_train)
    transformed_data = lda.transform(x_train).astype(np.float32)

    n_components = transformed_data.shape[1]
    print(f"  Numero di colonne selezionate (componenti discriminanti): {n_components}")
    joblib.dump(lda, "lda_model.pkl")

    discriminant_names = [f"LD{k}" for k in range(1, n_components + 1)]
    return pd.DataFrame(transformed_data, columns=discriminant_names)

def apply_lda_validation(x_val):
    """Project ``x_val`` with the LDA saved in "lda_model.pkl"; returns the transform output cast to float32."""
    fitted_lda = joblib.load("lda_model.pkl")
    projected = fitted_lda.transform(x_val)
    return projected.astype(np.float32)

In [14]:
def preprocessing_pipeline(x_train, y_train, x_validation, y_validation, scaling_method, use_pca, pca_threshold, target_count=20000, num_datasets=1, random_seed=19):
    """Encode, balance, scale and optionally reduce the data.

    The categorical features are one-hot encoded, the training set is turned
    into ``num_datasets`` balanced datasets, and each of them is scaled and
    (when ``use_pca`` is 'PCA' or 'LDA') projected. The validation set is
    transformed with the models fitted on the corresponding balanced dataset.

    Returns ``(data, validation)``: two lists of ``(x, y)`` tuples, one entry
    per balanced dataset, in the same order.

    Note: every iteration saves its scaler / PCA / LDA under the same file
    names, so the files left on disk belong to the last dataset processed.
    """
    # Encode the categorical features
    x_train = encode_categorical_train_data(x_train)
    x_validation = encode_categorical_validation_data(x_validation)

    # Balance the classes
    datasets = balance_data(x_train, y_train, target_count, num_datasets, random_seed)
    validation = []
    data = []

    for position, (x_balanced, y_balanced) in enumerate(datasets, start=1):
      print(f"  Dataset bilanciato {position}:")
      # Scaling
      x_balanced, y_balanced = scale_train_data(x_balanced, y_balanced, scaling_method)
      # Each iteration scales its own copy of the validation features: the
      # same frame is reused for every balanced dataset, and a scaler that
      # writes in place would otherwise be applied on top of earlier scaling.
      x_val, y_val = scale_validation_data(x_validation.copy(), y_validation, scaling_method)

      if use_pca == 'PCA':
          x_balanced = apply_pca_train(x_balanced, random_state=random_seed, pca_threshold=pca_threshold)
          x_val = apply_pca_validation(x_val)
      elif use_pca == 'LDA':
          x_balanced = apply_lda_train(x_balanced, y_balanced)
          x_val = apply_lda_validation(x_val)

      data.append((x_balanced, y_balanced))
      validation.append((x_val, y_val))
    return data, validation


# Train

In [15]:
import os

# Check whether a configuration is already present in the results CSV.
def is_combination_tested(filepath, new_row):
  """Return True when ``new_row`` matches a row already stored in ``filepath``.

  Rows are compared on the experiment-identifying columns (dataset, seed,
  outlier method, reduction, threshold, scaler, SVM hyper-parameters, target
  count, info). Only columns present in both the CSV and ``new_row`` take part.
  Values are compared in a canonical text form so that numbers read back from
  CSV (e.g. 5 stored as 5.0) and missing values (None / NaN / empty) compare
  as expected. ``new_row`` is not modified.

  Returns False for a file with no rows. Prints a notice when a match is found.
  """
  existing_results = pd.read_csv(filepath)

  candidate_columns = ['ds', 'random', 'outlier', 'dim_reduction', 'pca_threshold', 'scaler',
                       'kernel', 'C', 'gamma', 'degree', 'target count', 'info']
  comparison_columns = [col for col in candidate_columns
                        if col in existing_results.columns and col in new_row]
  if existing_results.empty or not comparison_columns:
    return False

  def _canonical(value):
    # None and NaN both mean "no value".
    if value is None or (isinstance(value, float) and value != value):
      return ''
    try:
      return repr(float(value))
    except (TypeError, ValueError):
      return str(value)

  matches = pd.Series(True, index=existing_results.index)
  for col in comparison_columns:
    matches &= existing_results[col].map(_canonical) == _canonical(new_row[col])

  is_tested = bool(matches.any())
  if is_tested:
      print("  Configurazione già testata, salto...")
  return is_tested

def append_and_save_results(filepath, new_row):
  """Append ``new_row`` (a dict) to the CSV at ``filepath``.

  The file is read in full, the row is added at the end (keys that are not yet
  columns become new columns), and the whole file is written back without the
  index.
  """
  history = pd.read_csv(filepath)
  addition = pd.DataFrame([new_row])
  pd.concat([history, addition], ignore_index=True).to_csv(filepath, index=False)

In [16]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from itertools import product


def evaluate_model(model, x_train, y_train, x_val, y_val):
  """Score a fitted model on the validation and training sets.

  Prints both accuracies and returns a dict with keys 'validation_score',
  'train_score' and 'classification_report' (the validation report as a dict).
  """
  # Validation set: accuracy plus the per-class report.
  val_predictions = model.predict(x_val)
  score_val = accuracy_score(y_val, val_predictions)
  report = classification_report(y_val, val_predictions, output_dict=True)

  # Training set: accuracy only.
  train_score = accuracy_score(y_train, model.predict(x_train))

  print(f" Validation score: {score_val} - Train score {train_score}")
  return dict(validation_score=score_val, train_score=train_score, classification_report=report)

import time
import threading
def _fit_with_timeout(model, x_train, y_train, timeout):
  """Fit ``model`` in a worker thread and wait at most ``timeout`` seconds.

  Returns ``('ok', model)``, ``('error', exception)`` or ``('timeout', None)``.
  """
  outcome = {}

  def _worker():
    try:
      model.fit(x_train, y_train)
      outcome['model'] = model
    except Exception as exc:
      outcome['error'] = exc

  # A Python thread cannot be cancelled: after a timeout the fit keeps running
  # in the background. daemon=True at least stops it from blocking interpreter
  # shutdown. Each call has its own `outcome`, so a late finisher cannot write
  # into the result of a later configuration.
  worker = threading.Thread(target=_worker, daemon=True)
  worker.start()
  worker.join(timeout=timeout)

  if worker.is_alive():
    return 'timeout', None
  if 'error' in outcome:
    return 'error', outcome['error']
  return 'ok', outcome['model']


def train_svm_with_grid(x_train, y_train, x_val, y_val, param_grid, metadata, timeout=2700):
  """Train one SVC per combination in ``param_grid`` and log every result.

  Parameters
  ----------
  x_train, y_train, x_val, y_val : training and validation data.
  param_grid : dict mapping SVC parameter name -> list of values; every
      combination is evaluated (an empty dict evaluates the SVC defaults once).
  metadata : dict describing the experiment; it must contain 'random', which
      is used as the SVC ``random_state``. It is copied, not modified.
  timeout : seconds to wait for a single fit (default 2700 = 45 minutes).

  For each combination a row (metadata + parameters + metrics) is appended to
  the results file at FILEPATH; metrics are NaN when the fit failed or timed out.

  Returns ``(best_model, best_score, best_report)`` for the highest validation
  accuracy, or ``(None, -inf, None)`` when no configuration completed.
  """
  keys = list(param_grid)
  param_combinations = [dict(zip(keys, combo)) for combo in product(*(param_grid[k] for k in keys))]

  nan_row = {'accuracy': np.nan, 'precision': np.nan, 'recall': np.nan, 'f1': np.nan, 'train_score': np.nan}

  best_score = -float('inf')
  best_model = None
  best_report = None

  for params in param_combinations:
    print(f"Valutando configurazione: {params}")
    new_row = metadata.copy()
    new_row.update(params)

    model = SVC(**params, random_state=new_row['random'])
    status, payload = _fit_with_timeout(model, x_train, y_train, timeout)

    if status == 'timeout':
      print(f"  Configurazione interrotta: superato il limite di {timeout / 60:g} minuti.")
      new_row.update(nan_row)
    elif status == 'error':
      print(f"  Errore durante l'addestramento per configurazione {params}: {payload}")
      new_row.update(nan_row)
    else:
      results = evaluate_model(payload, x_train, y_train, x_val, y_val)
      report = results['classification_report']
      new_row.update({
          'accuracy': report['accuracy'],
          'precision': report['weighted avg']['precision'],
          'recall': report['weighted avg']['recall'],
          'f1': report['weighted avg']['f1-score'],
          'train_score': results['train_score']
      })

      if results['validation_score'] > best_score:
        best_score = results['validation_score']
        best_model = payload
        best_report = report

    append_and_save_results(FILEPATH, new_row)

  return best_model, best_score, best_report

In [17]:
def apply(x_train, y_train, x_val, y_val, scaling_methods, param_grid,
  dim_reduction=('no',), pca_threshold=0.99, target_count=20000, num_datasets=1,
  random_seed=19, outs=('no',), info=''):
  """
  Run the whole experiment: preprocessing, training and evaluation.

  Every combination of dimensionality reduction x scaling method x outlier
  method is processed. For each one the placeholder replacement selected by
  ``info`` is applied, outliers are removed from the training set, the
  balanced datasets are built and an SVM grid is trained on each of them.

  ``pca_threshold`` is only passed on when the reduction is 'PCA'; for other
  methods None is used (and recorded in the result rows).

  The best model seen so far (validation score above 0) is saved to
  "best_model.pkl".

  Returns a dict keyed by
  "<dim_reduction>_<scaling_method>_<outlier>_dataset_<n>" with the best
  model, its score and its classification report for every dataset that
  produced a model.
  """
  results = {}
  x_train_fix = x_train
  x_val_fix = x_val
  bestbest = 0

  for dim_redx in dim_reduction:
    print(f"\n=== Testing Dimensionality Reduction: {dim_redx} ===")
    # Use a per-iteration value: overwriting the pca_threshold argument would
    # leave it as None for a PCA run that comes after a non-PCA method.
    active_threshold = pca_threshold if dim_redx == 'PCA' else None
    for scaling_method in scaling_methods:
      for out in outs:
          print(f"\n=== Testing Scaling Method: {scaling_method}, Outlier: {out} ===")

          # Work on copies so each combination starts from the original data.
          x_train = replace_default_new(x_train_fix.copy(), info)
          x_val = apply_saved_modes(x_val_fix.copy(), info)

          # Remove outliers (training set only)
          x_train_filtered, y_train_filtered = remove_outliers(x_train, y_train, out)

          # Preprocessing
          datasets, validation = preprocessing_pipeline(x_train_filtered, y_train_filtered, x_val, y_val,
              scaling_method, dim_redx, active_threshold, target_count, num_datasets, random_seed
          )

          for i, (x_train_processed, y_train_processed) in enumerate(datasets):
              x_val_processed, y_val_processed = validation[i]
              ds_name = f"dataset_{i+1}"
              metadata = {
                  'ds': ds_name,
                  'random': random_seed,
                  'outlier': out,
                  'dim_reduction': dim_redx,
                  'pca_threshold': active_threshold,
                  'scaler': scaling_method,
                  'target count': target_count,
                  'info': info
              }

              print(f"--- Training Dataset {i+1}/{len(datasets)} ---")
              best_model, best_score, best_report = train_svm_with_grid(
                  x_train_processed, y_train_processed,
                  x_val_processed, y_val_processed,
                  param_grid, metadata
              )
              if best_score > bestbest:
                bestbest = best_score
                joblib.dump(best_model, "best_model.pkl")
                print(best_score)
                print(best_report)
              if best_model is not None:
                  # The key names the full combination so that results from
                  # different reductions / outlier methods do not overwrite
                  # each other.
                  results[f"{dim_redx}_{scaling_method}_{out}_dataset_{i+1}"] = {
                      'best_model': best_model,
                      'best_score': best_score,
                      'classification_report': best_report
                  }

  return results

# Run

In [18]:
# Full experiment grid: every preprocessing option is enabled.
replace = ['no', 'mode', 'mode_all']
out = ['no', 'base', 'isolation_forest', 'percentile', 'dynamic_threshold']
scaling_methods = ['standard', 'minmax', 'quantile', 'l1', 'l2']
dim_reduction = ['LDA', 'PCA', 'no']
pca_threshold = 0.99

# SVM hyper-parameters: one candidate value per parameter.
param_grid = dict(C=[0.5], kernel=['poly'], gamma=[0.2], degree=[5])

In [19]:
# Reduced grid used for this run. Each list is assigned once; the options that
# are deliberately left out are named in the comments.
# Scalers left out: 'standard', 'minmax'.
scaling_methods = ['quantile', 'l1', 'l2']
# Outlier method left out: 'no'.
out = ['base','isolation_forest', 'percentile',  'dynamic_threshold']
replace = ['no', 'mode', 'mode_all']
pca_threshold=0.99
dim_reduction = ['LDA', 'PCA', 'no']
# SVM hyper-parameters: one candidate value per parameter.
param_grid = {
    'C': [0.5],
    'kernel': ['poly'],
    'gamma': [0.2],
    'degree': [5],
}

In [20]:
# Run the experiment once per placeholder-replacement strategy
for r in replace:
  print(f"\n=== Testing Replace Value: {r} ===")
  run_options = dict(
      dim_reduction=dim_reduction, pca_threshold=pca_threshold,
      target_count=15000, num_datasets=5, random_seed=19,
      outs=out,
      info=r,
  )
  results = apply(X_train.copy(), y_train, X_val, y_val, scaling_methods, param_grid, **run_options)

  # Report the best model found for each entry returned by apply()
  for key, value in results.items():
      print(f"\n=== Results for {key} ===")
      print(f"Best Model: {value['best_model']}")
      print("Classification Report:")
      print(value['classification_report'])

#


=== Testing Replace Value: no ===

=== Testing Dimensionality Reduction: LDA ===

=== Testing Scaling Method: quantile, Outlier: base ===
  Rimosse  927  istanze
  Dataset bilanciato 1:
  Numero di colonne selezionate (componenti discriminanti): 9
  Dataset bilanciato 2:
  Numero di colonne selezionate (componenti discriminanti): 9
  Dataset bilanciato 3:
  Numero di colonne selezionate (componenti discriminanti): 9
  Dataset bilanciato 4:
  Numero di colonne selezionate (componenti discriminanti): 9
  Dataset bilanciato 5:
  Numero di colonne selezionate (componenti discriminanti): 9
--- Training Dataset 1/5 ---
Valutando configurazione: {'C': 0.5, 'kernel': 'poly', 'gamma': 0.2, 'degree': 5}
 Validation score: 0.9734517577270296 - Train score 0.98154
0.9734517577270296
{'backdoor': {'precision': 0.7929399367755532, 'recall': 0.9986728599867286, 'f1-score': 0.8839941262848752, 'support': 1507.0}, 'ddos': {'precision': 0.9967364393427864, 'recall': 0.9682956160489778, 'f1-score': 0.98

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-20-f7eb6e8ba9e0>", line 4, in <cell line: 0>
    results = apply(
              ^^^^^^
  File "<ipython-input-17-d69a0247e225>", line 47, in apply
    best_model, best_score, best_report = train_svm_with_grid(
                                          ^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-16-029838fface6>", line 47, in train_svm_with_grid
    thread.join(timeout=2700)  # 2700 secondi = 45 minuti
    ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/threading.py", line 1123, in join
    self._wait_for_tstate_lock(timeout=max(timeout, 0))
  File "/usr/lib/python3.11/threading.py", line 1139, in _wait_for_tstate_lock
    if lock.acquire(block, timeout):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt

During handling of the above exception, another exception 

TypeError: object of type 'NoneType' has no len()

In [None]:
# Reload the model that apply() saved to "best_model.pkl" and show its parameters.
# joblib.load unpickles the file, so only load files produced by this notebook.
model = joblib.load("best_model.pkl")
print(model)