In [1]:
# Switch the kernel's working directory; the dataset and image paths used in
# later cells are relative and resolve against it.
# NOTE(review): assumes the project is checked out under ~/src — confirm for other environments.
%cd ~/src

/home/jovyan/src


In [2]:
import pandas as pd

# Folder (relative to the working directory) that holds the Adult CSV files.
dataset_dir = "datasets/classification/adult"

# Read the training split into a DataFrame.
df = pd.read_csv("/".join([dataset_dir, "train.csv"]))

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
import numpy as np

def clean_dataset(dataset: pd.DataFrame, missing_markers=("?", "nan", "NaN")) -> pd.DataFrame:
    """
    Replace placeholder strings that stand for missing data with NaN.

    The input frame is not modified; a new frame is returned. Matching is
    done on whole cell values, so a marker surrounded by whitespace
    (e.g. " ?") is left untouched.

    Parameters:
        dataset (pd.DataFrame): The dataset to process.
        missing_markers: A single string or an iterable of strings that
            should be treated as missing. Defaults to ("?", "nan", "NaN").

    Returns:
        pd.DataFrame: A new frame in which every marker is replaced by NaN.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(missing_markers, str):
        missing_markers = [missing_markers]
    markers = list(missing_markers)

    # Nothing to replace: still return a new object, like the normal path does.
    if not markers:
        return dataset.copy()

    return dataset.replace(markers, np.nan)

In [4]:
# Convert the placeholder strings handled by clean_dataset into NaN.
# Note: this rebinds `df`, so from here on `df` refers to the cleaned frame.
df = clean_dataset(df)

In [5]:
# https://pycaret.readthedocs.io/en/stable/api/classification.html

from pycaret.classification import ClassificationExperiment
from pycaret.classification import *

claexp = ClassificationExperiment()
claexp.setup(
    # -- data, target and split --
    data=df,
    target='salary',
    train_size=0.8,
    session_id=123,
    numeric_features=[
        'age', 'fnlwgt', 'education-num',
        'capital-gain', 'capital-loss', 'hours-per-week',
    ],
    categorical_features=[
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'native-country',
    ],

    # -- missing-value imputation --
    imputation_type='iterative',
    iterative_imputation_iters=20,  # default is 5
    numeric_iterative_imputer='lightgbm',  # default is 'lightgbm'
    categorical_iterative_imputer='lightgbm',  # default is 'lightgbm'

    # -- outlier removal --
    remove_outliers=True,
    outliers_threshold=0.02,  # default is 0.05
    outliers_method='iforest',  # default is 'iforest'

    # -- transformation and scaling --
    # transformation=True,
    transformation_method='yeo-johnson',  # default is 'yeo-johnson'
    normalize=True,
    normalize_method='zscore',  # default is 'zscore'

    # -- dimensionality reduction --
    # NOTE(review): `pca=True` is commented out, so the two pca_* values below
    # presumably have no effect — confirm against the PyCaret setup() docs.
    # pca=True,
    pca_method='linear',  # default is 'linear'
    pca_components=0.95,

    # -- hardware / experiment logging (all currently disabled) --
    # use_gpu=True,
    # log_plots=True,
    # log_experiment=True,
    # experiment_name='adult_classification_experiment',
    # log_data=True,
    # log_profile=True,

    # -- profiling --
    # NOTE(review): the recorded output of this cell reports "Profiler Failed";
    # check whether the profiling dependency is installed or drop this flag.
    profile=True,
)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,salary
2,Target type,Binary
3,Target mapping,"<=50K: 0, >50K: 1"
4,Original data shape,"(32561, 15)"
5,Transformed data shape,"(32040, 65)"
6,Transformed train set shape,"(25527, 65)"
7,Transformed test set shape,"(6513, 65)"
8,Numeric features,6
9,Categorical features,8


Loading profile... Please Wait!
Profiler Failed. No output to show, continue with modeling.


<pycaret.classification.oop.ClassificationExperiment at 0xffff5bf49cd0>

In [6]:
# Check what kind of object the experiment exposes as its transformed dataset
# (the recorded output shows a pandas DataFrame).
type(claexp.dataset_transformed)

pandas.core.frame.DataFrame

In [7]:
# Save the transformed dataset next to the raw training file
# (the index is not written to the CSV).
claexp.dataset_transformed.to_csv(f"{dataset_dir}/train_transformed.csv", index=False)

# Keep the preview as the cell's last expression: a notebook cell only renders
# its final expression, so a head() call placed before to_csv() shows nothing.
claexp.dataset_transformed.head()

In [199]:
# Index labels present in the cleaned input frame but missing from the
# transformed dataset, i.e. rows that PyCaret removed.
removed_rows = df.index.difference(claexp.dataset_transformed.index)

# Look the removed rows up once and build the NaN mask once, then split on it.
removed_df = df.loc[removed_rows]
has_nan = removed_df.isna().any(axis=1)

# Removed rows that contained at least one NaN value.
rows_with_nan = removed_df[has_nan]

# The remaining removed rows, treated here as outliers.
rows_outliers = removed_df[~has_nan]

print(f"Righe rimosse per presenza di valori NaN: {len(rows_with_nan)}")

print(f"\nRighe rimosse come outliers: {len(rows_outliers)}")

In [200]:
# Display the result of the experiment's models() call.
# NOTE(review): presumably the table of estimators available in PyCaret's
# model library — see the API docs linked in the setup cell.
claexp.models()

In [201]:
# Run the model comparison and keep three models (n_select=3); later cells
# iterate over `top3` as a list of fitted estimators.
top3 = claexp.compare_models(n_select=3)

In [None]:
import os

# Plot types exported for every selected model, in the order they are generated.
plot_types = (
    'confusion_matrix',
    'error',
    # 'learning',  # currently disabled
    'manifold',
    'class_report',
    'auc',
    'pr',
    'feature_all',
    'feature',
)

for model in top3:
    model_name = model.__class__.__name__
    print(f"Saving plots of {model_name}")

    # One output folder per estimator class.
    output_dir = f"img/classification/{model_name}"
    os.makedirs(name=output_dir, exist_ok=True)

    for plot_type in plot_types:
        # Guard each plot on its own: a plot that fails for one estimator
        # must not prevent the remaining plots from being generated.
        try:
            claexp.plot_model(model, save=output_dir, plot=plot_type)
        except Exception as e:
            print(f"Errore durante la generazione del plot '{plot_type}' per {model_name}: {e}")

In [None]:
# Report the learning rate of each selected model; estimators that do not
# expose a `learning_rate` attribute are listed as 'N/A'.
print("Learning rate best models:")
for model in top3:
    rate = getattr(model, 'learning_rate', 'N/A')
    print(f"{model.__class__.__name__}: {rate}")

In [205]:
# Call predict_model for each selected model without passing `data`,
# requesting raw scores; the results are collected in a list, one per model.
# NOTE(review): presumably this scores the experiment's hold-out split — confirm in the PyCaret docs.
holdout_pred = [claexp.predict_model(model, raw_score=True) for model in top3]

In [193]:
# NOTE(review): this reloads the *training* CSV, so the models have already
# seen these rows; point this at a separate evaluation file if one exists.
# The frame goes through clean_dataset like the training frame did, so the
# missing-value placeholders reach the pipeline as NaN in both cases.
unseen_df = clean_dataset(pd.read_csv(f"{dataset_dir}/train.csv"))

In [206]:
# Score `unseen_df` with each selected model, requesting raw scores; the
# results are collected in a list, one entry per model.
# NOTE(review): this rebinds `holdout_pred`, discarding the list stored under
# the same name by the earlier predict_model cell.
holdout_pred = [claexp.predict_model(estimator=model,
                                    data=unseen_df,
                                    raw_score=True
                                    )
                for model in top3]

In [151]:
# Display the result of the experiment's get_metrics() call.
# NOTE(review): presumably the table of evaluation metrics registered in the
# experiment — see the PyCaret API docs.
claexp.get_metrics()

In [195]:
# tuned_top3 = [claexp.tune_model(i) for i in top3]

In [196]:
# blender = claexp.blend_models(tuned_top3)
# stacker = claexp.stack_models(tuned_top3, return_train_score=True)
# best_auc_model = claexp.automl(optimize='Recall', return_train_score=True)

# Regressione

In [58]:
from pycaret.regression import RegressionExperiment
regexp = RegressionExperiment()
# NOTE(review): the `df` previewed at the top of this notebook (Adult data)
# has no 'writing score' column, so this cell presumably ran against a
# different frame in an earlier kernel state. Load the intended dataset
# before this cell so the notebook survives Restart & Run All.
regexp.setup(session_id=123,
             data=df,
             target='writing score',
             imputation_type='iterative'
             )
# Preview the transformed data as the cell output. The former
# `type(regexp.dataset_transformed)` line was dropped: it was not the last
# expression of the cell, so its result was never shown.
regexp.dataset_transformed.head()

In [59]:
# Run the regression model comparison and keep its result as `best`
# (used by the plotting and prediction cells that follow).
best = regexp.compare_models()

In [64]:
# NOTE(review): this wildcard import may shadow same-named functions pulled in
# by the earlier `from pycaret.classification import *`; the call below only
# uses the `regexp` object, so the import looks unnecessary — confirm.
from pycaret.regression import *
# Render the 'feature' plot for the selected regression model.
regexp.plot_model(best, plot = 'feature')

In [65]:
# Call predict_model for the selected regression model without passing `data`.
# NOTE: this rebinds `holdout_pred`, which the classification cells above used
# for a list of prediction results.
holdout_pred = regexp.predict_model(best)

In [66]:
# Preview the first rows of the regression prediction result.
holdout_pred.head()

In [67]:
! pip install sdv

In [68]:
# Show the `pip freeze` lines that contain "sdv" to check what was installed.
# NOTE(review): `! pip` runs whichever pip the shell finds — presumably the
# kernel's environment; confirm.
! pip freeze | grep sdv

In [70]:
from sdv.datasets.demo import download_demo

# Fetch the 'fake_hotel_guests' single-table demo dataset; the call returns
# two values, kept here as the data and its metadata.
real_data, metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests')

In [71]:
from sdv.single_table import GaussianCopulaSynthesizer

# Build a Gaussian-copula synthesizer from the demo metadata and fit it on
# the demo data.
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(data=real_data)

In [72]:
# Draw 500 rows from the fitted synthesizer.
synthetic_data = synthesizer.sample(num_rows=500)

In [73]:
from sdv.evaluation.single_table import evaluate_quality

# Compare the synthetic sample against the original demo data, using the
# demo metadata, and keep the resulting report object.
quality_report = evaluate_quality(
    real_data,
    synthetic_data,
    metadata)