In [None]:
import pandas as pd

# Folder holding the census-income classification data (relative to the notebook).
dataset_dir = "./datasets/classification/census_income"

# Read the training CSV from that folder and preview its first rows as cell output.
train_csv_path = f"{dataset_dir}/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
import numpy as np

def clean_dataset(dataset: pd.DataFrame,
                  missing_markers=('?', 'nan', 'NaN')) -> pd.DataFrame:
    """
    Replace textual missing-value placeholders in a dataset with NaN.

    A cell is replaced only when its whole value equals one of the markers
    (no substring or regex matching is performed).

    Parameters:
        dataset (pd.DataFrame): The dataset to process. It is not modified.
        missing_markers (str or iterable of str): Placeholder values that
            stand for "missing". Defaults to '?', 'nan' and 'NaN'. A single
            string is treated as one marker, not as a sequence of characters.

    Returns:
        pd.DataFrame: A new dataset in which every cell equal to one of the
        markers has been replaced by NaN.
    """
    # A bare string would otherwise be split into characters by list().
    if isinstance(missing_markers, str):
        markers = [missing_markers]
    else:
        markers = list(missing_markers)

    # Nothing to replace: still hand back a new object, like replace() does.
    if not markers:
        return dataset.copy()

    return dataset.replace(markers, np.nan)

In [None]:
# Convert the textual missing-value placeholders to NaN.
# `df` is rebound here, so the frame exactly as read from CSV is no longer
# reachable under this name after the cell has run.
df = clean_dataset(df)

In [None]:
# https://pycaret.readthedocs.io/en/stable/api/classification.html

from pycaret.classification import ClassificationExperiment
from pycaret.classification import *

# Column roles are declared explicitly and passed to setup() below.
numeric_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

# All setup() options gathered in one place, grouped by concern.
setup_options = dict(
    # --- experiment basics ---
    session_id=123,
    data=df,
    target='salary',
    train_size=0.8,
    numeric_features=numeric_columns,
    categorical_features=categorical_columns,
    profile=True,
    # --- missing values ---
    imputation_type='iterative',
    iterative_imputation_iters=20,  # default is 5
    numeric_iterative_imputer='lightgbm',  # default is 'lightgbm'
    categorical_iterative_imputer='lightgbm',  # default is 'lightgbm'
    # --- outliers ---
    remove_outliers=True,
    outliers_threshold=0.02,  # default is 0.05
    outliers_method='iforest',  # default is 'iforest'
    # --- scaling ---
    normalize=True,
    normalize_method='zscore',  # default is 'zscore'
    # --- methods configured while their switches stay off ---
    # (`transformation=True` and `pca=True` are not passed; presumably these
    # values only matter once those switches are re-enabled -- TODO confirm)
    transformation_method='yeo-johnson',  # default is 'yeo-johnson'
    pca_method='linear',  # default is 'linear'
    pca_components=0.95,
)

# Options kept around but currently disabled:
#   transformation=True, pca=True, use_gpu=True, log_plots=True,
#   log_experiment=True, experiment_name='adult_classification_experiment',
#   log_data=True, log_profile=True

claexp = ClassificationExperiment()
claexp.setup(**setup_options)


In [None]:
# Show, as the cell output, the type of the object exposed by the experiment
# as its transformed dataset.
type(claexp.dataset_transformed)

In [None]:
# Fetch the transformed dataset once and reuse it for both the export and the preview.
transformed_df = claexp.dataset_transformed

# Save the transformed dataset next to the original training file.
transformed_df.to_csv(f"{dataset_dir}/train_transformed.csv", index=False)

# The preview must be the LAST expression of the cell: a notebook only renders
# the value of the final expression, so calling head() before to_csv() (as the
# cell originally did) displayed nothing.
transformed_df.head()

In [None]:
# Index labels that exist in `df` but not in the transformed dataset,
# i.e. the rows that were removed by PyCaret.
removed_rows = df.index.difference(claexp.dataset_transformed.index)

# Look the removed rows up once and flag those holding at least one NaN.
removed_frame = df.loc[removed_rows]
has_nan = removed_frame.isna().any(axis=1)

# Removed rows that contained NaN values ...
rows_with_nan = removed_frame[has_nan]

# ... and the remaining removed rows, regarded as outliers.
rows_outliers = removed_frame[~has_nan]

print(f"Righe rimosse per presenza di valori NaN: {len(rows_with_nan)}")
# print(rows_with_nan)

print(f"\nRighe rimosse come outliers: {len(rows_outliers)}")
# print(len(rows_outliers))

In [None]:
# Display whatever `claexp.models()` returns as the cell output
# (presumably the catalogue of estimators the experiment can train -- see the
# PyCaret API page linked in the setup cell).
claexp.models()

In [None]:
# Compare candidate models and keep three of them (n_select=3). Later cells
# iterate over `top3`, so it is used as a collection of fitted estimators.
# NOTE(review): presumably ranked by the experiment's default metric -- confirm.
top3 = claexp.compare_models(n_select=3)

In [None]:
import os

# Plot types saved for every selected model. The 'learning' plot is left out
# on purpose: it was already disabled (commented out) in this cell.
plot_types = (
    'confusion_matrix',
    'error',
    'manifold',
    'class_report',
    'auc',
    'pr',
    'feature_all',
    'feature',
)

for model in top3:
    model_name = model.__class__.__name__
    print(f"Saving plots of {model_name}")

    # One output folder per model class, created on demand.
    output_dir = f"img/classification/{model_name}"
    os.makedirs(name=output_dir, exist_ok=True)

    for plot_type in plot_types:
        # Each plot is guarded on its own: with a single try block around all
        # of them, the first failing plot would silently skip every remaining
        # plot of the same model. The message also names the plot that failed.
        try:
            claexp.plot_model(model, save=output_dir, plot=plot_type)
        except Exception as e:
            print(f"Errore durante la generazione del plot '{plot_type}' per {model_name}: {e}")

In [None]:
# Report the `learning_rate` attribute of each selected model, or 'N/A' when
# the estimator does not expose one.
print("Learning rate best models:")
for model in top3:
    model_name = model.__class__.__name__
    learning_rate = getattr(model, 'learning_rate', 'N/A')
    print(f"{model_name}: {learning_rate}")

In [None]:
# Call predict_model() for every selected model without passing `data`, and
# collect the results in a list ordered like `top3`.
# NOTE(review): presumably this scores the experiment's hold-out split, as the
# variable name suggests -- confirm against the PyCaret docs.
holdout_pred = [claexp.predict_model(model, raw_score=True) for model in top3]

In [None]:
# Load a second CSV that the next cell scores as "unseen" data.
# NOTE(review): the path is hard-coded rather than built from `dataset_dir`,
# and it points at a `train.csv` under `adult/` while the experiment was set up
# on `census_income/train.csv` -- confirm this is the intended unseen set.
unseen_df = pd.read_csv("datasets/classification/adult/train.csv")
# unseen_df.head()

In [None]:
# Score `unseen_df` with every selected model; results are ordered like `top3`.
# NOTE(review): this rebinds `holdout_pred`, which an earlier cell assigns from
# predict_model() calls made without `data`; despite the name, the list now
# holds predictions for `unseen_df`.
holdout_pred = [claexp.predict_model(estimator=model,
                                    data=unseen_df,
                                    raw_score=True
                                    )
                for model in top3]

In [None]:
# Display whatever `claexp.get_metrics()` returns as the cell output
# (presumably the table of evaluation metrics known to the experiment).
claexp.get_metrics()

In [None]:
# tuned_top3 = [claexp.tune_model(i) for i in top3]

In [None]:
# blender = claexp.blend_models(tuned_top3)
# stacker = claexp.stack_models(tuned_top3, return_train_score=True)
# best_auc_model = claexp.automl(optimize='Recall', return_train_score=True)

# Regressione

In [None]:
from pycaret.regression import RegressionExperiment
# NOTE(review): `df` at this point is still the frame read from
# census_income/train.csv and cleaned above, the same one used for the
# classification experiment with target 'salary'. Confirm that it really has a
# 'writing score' column, or load the dataset intended for this regression
# section before calling setup().
regexp = RegressionExperiment()
regexp.setup(session_id=123,
             data=df,
             target='writing score',
             imputation_type='iterative'
             )

# Fetch the transformed dataset once and reuse it below.
regression_transformed = regexp.dataset_transformed

# A bare `type(...)` expression in the middle of a cell is evaluated and thrown
# away (only the last expression is rendered), so print it explicitly.
print(type(regression_transformed))

# Last expression of the cell: rendered as the preview of the transformed data.
regression_transformed.head()

In [None]:
# Run compare_models() with its default arguments and keep the returned
# estimator as `best` (it is plotted and used for prediction in later cells).
best = regexp.compare_models()

In [None]:
from pycaret.regression import *
# Draw the 'feature' plot for the selected regression model.
regexp.plot_model(best, plot = 'feature')

In [None]:
# Call predict_model() for the selected regression model without passing `data`.
# NOTE(review): `holdout_pred` is rebound here; in the classification section
# the same name holds a list of per-model prediction results.
holdout_pred = regexp.predict_model(best)

In [None]:
# Preview the first rows of the regression predictions as the cell output.
holdout_pred.head()

In [None]:
# Install SDV with the %pip magic rather than a `!pip` shell call: %pip installs
# into the environment of the running kernel, whereas the shell's `pip` may
# belong to a different Python installation.
%pip install sdv

In [None]:
# List the installed packages through the shell and keep only the lines that
# mention "sdv".
# NOTE(review): relies on `grep` being available in the shell -- confirm on the
# target platform.
! pip freeze | grep sdv

In [None]:
from sdv.datasets.demo import download_demo

# Fetch the 'fake_hotel_guests' single-table demo dataset; the call returns the
# data together with its metadata, both of which are used by the cells below.
real_data, metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests')

In [None]:
from sdv.single_table import GaussianCopulaSynthesizer

# Build a GaussianCopulaSynthesizer from the demo metadata and fit it on the
# real demo data.
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(data=real_data)

In [None]:
# Ask the fitted synthesizer for 500 synthetic rows.
synthetic_data = synthesizer.sample(num_rows=500)

In [None]:
from sdv.evaluation.single_table import evaluate_quality

# Compare the synthetic sample against the real data, using the same metadata,
# and keep the resulting report object.
quality_report = evaluate_quality(
    real_data,
    synthetic_data,
    metadata)