In [9]:
# Make /app the working directory; the relative BASE_DIR ('./') used for the data paths resolves against it.
%cd /app

/app



This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.



In [10]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import cohen_kappa_score, accuracy_score, balanced_accuracy_score
from sklearn.utils import shuffle
from plotly import express as px
from joblib import load, dump
import os

from utils import plot_confusion_matrix  # Asegúrate de que utils.py esté en tu directorio de trabajo y correctamente importado


In [11]:
# Filesystem layout: every path is built relative to BASE_DIR
BASE_DIR = './'
PATH_TO_TRAIN = os.path.join(BASE_DIR, "input/petfinder-adoption-prediction/train/train.csv")
PATH_TO_MODELS = os.path.join(BASE_DIR, "UA_MDM_LDI_II/work/models")
PATH_TO_TEMP_FILES = os.path.join(BASE_DIR, "UA_MDM_LDI_II/work/optuna_temp_artifacts")
PATH_TO_OPTUNA_ARTIFACTS = os.path.join(BASE_DIR, "UA_MDM_LDI_II/work/optuna_artifacts")

# Target column and the tabular columns fed to the model
label = 'AdoptionSpeed'
features = [
    'Type', 'Age', 'Breed1', 'Breed2', 'Gender',
    'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
    'Quantity', 'Fee', 'State', 'VideoAmt', 'PhotoAmt',
]

# Load the full training table
dataset = pd.read_csv(PATH_TO_TRAIN)

# 80/20 hold-out split, stratified on the target, with a fixed seed
train, test = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
    stratify=dataset[label],
)

# Feature matrices and target vectors for each side of the split
X_train, y_train = train[features], train[label]
X_test, y_test = test[features], test[label]

In [12]:
# Custom evaluation metric
def lgb_custom_metric_kappa(dy_pred, dy_true):
    """Quadratic-weighted Cohen's kappa usable as a LightGBM ``feval`` metric.

    Parameters
    ----------
    dy_pred : array-like
        Per-class scores for the rows being evaluated. Either a 2-D array of
        shape (n_samples, n_classes), or a flat 1-D array grouped by class
        first and then by row (the layout LightGBM releases before 4.0 pass
        to custom metrics for multiclass objectives).
    dy_true : object with ``get_label()``
        The dataset being evaluated; only ``get_label()`` is called on it.

    Returns
    -------
    tuple
        ``('kappa', value, True)`` where the last item tells LightGBM that a
        higher value is better.
    """
    labels = dy_true.get_label()
    scores = np.asarray(dy_pred)

    # A flat class-major vector holds n_classes consecutive runs of n_samples
    # scores: fold it into (n_classes, n_samples) and transpose so that rows
    # are samples, which is what argmax(axis=1) below requires.
    if scores.ndim == 1:
        scores = scores.reshape(-1, len(labels)).T

    value = cohen_kappa_score(labels, scores.argmax(axis=1), weights='quadratic')
    return 'kappa', value, True

# Optuna objective function
def refined_cv_es_lgb_objective(trial):
    """Optuna objective: stratified 5-fold CV of a LightGBM multiclass model.

    For each fold a booster is trained on the in-fold rows with early stopping
    (patience 10) monitored on the out-of-fold rows, then scored with
    quadratic-weighted kappa on those out-of-fold rows. Each fold's booster
    also predicts the hold-out ``X_test``; the per-class scores are summed
    across folds to form an ensemble prediction.

    Side effects
    ------------
    * Writes the hold-out frame (with a ``pred`` column holding the summed
      per-class scores of each row) and a confusion-matrix image under
      ``PATH_TO_TEMP_FILES`` and uploads both as trial artifacts.
    * Stores the ensemble's hold-out kappa in the trial user attribute
      ``test_score``.

    Relies on module-level names: ``X_train``, ``y_train``, ``X_test``,
    ``y_test``, ``test``, ``PATH_TO_TEMP_FILES`` and ``artifact_store`` (the
    latter must be defined before ``study.optimize`` is called).

    Parameters
    ----------
    trial : optuna trial
        Used to sample hyperparameters and to attach artifacts / attributes.

    Returns
    -------
    float
        Mean out-of-fold quadratic-weighted kappa over the folds.
    """
    n_classes = len(y_train.unique())

    lgb_params = {
        'objective': 'multiclass',
        'verbosity': -1,
        'num_class': n_classes,
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-7, 1e-5, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-7, 1e-5, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 70, 90),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.6),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 0.8),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 3),
        'min_child_samples': trial.suggest_int('min_child_samples', 70, 90),
    }

    n_splits = 5
    scores_ensemble = np.zeros((len(y_test), n_classes))
    score_folds = 0

    skf = StratifiedKFold(n_splits=n_splits)

    for if_index, oof_index in skf.split(X_train, y_train):
        X_if, y_if = X_train.iloc[if_index], y_train.iloc[if_index]
        X_oof, y_oof = X_train.iloc[oof_index], y_train.iloc[oof_index]

        lgb_if_dataset = lgb.Dataset(data=X_if, label=y_if, free_raw_data=False)
        lgb_oof_dataset = lgb.Dataset(data=X_oof, label=y_oof, free_raw_data=False)

        lgb_model = lgb.train(
            lgb_params,
            lgb_if_dataset,
            valid_sets=[lgb_oof_dataset],
            callbacks=[lgb.early_stopping(10, verbose=False)],
            feval=lgb_custom_metric_kappa,
        )

        # Sum (not average) of the fold predictions: the argmax taken later is
        # unaffected by the scale, and the stored artifact keeps the raw sum.
        scores_ensemble += lgb_model.predict(X_test)

        oof_pred = lgb_model.predict(X_oof).argmax(axis=1)
        score_folds += cohen_kappa_score(y_oof, oof_pred, weights='quadratic') / n_splits

    # The temp folder may not exist on a fresh checkout; dump()/write_image()
    # would fail without it.
    os.makedirs(PATH_TO_TEMP_FILES, exist_ok=True)

    test_pred = scores_ensemble.argmax(axis=1)

    # Hold-out rows plus their summed per-class scores, saved as a trial artifact.
    predicted_filename = os.path.join(PATH_TO_TEMP_FILES, f'test_{trial.study.study_name}_{trial.number}.joblib')
    predicted_df = test.copy()
    predicted_df['pred'] = list(scores_ensemble)
    dump(predicted_df, predicted_filename)
    # Keyword arguments: positional use of upload_artifact is deprecated in
    # newer Optuna releases, while the keyword form works on old and new ones.
    upload_artifact(study_or_trial=trial, file_path=predicted_filename, artifact_store=artifact_store)

    # Confusion matrix of the ensemble on the hold-out set, saved as a trial artifact.
    cm_filename = os.path.join(PATH_TO_TEMP_FILES, f'cm_{trial.study.study_name}_{trial.number}.jpg')
    plot_confusion_matrix(y_test, test_pred).write_image(cm_filename)
    upload_artifact(study_or_trial=trial, file_path=cm_filename, artifact_store=artifact_store)

    # Logged for inspection only; the value optimised by Optuna is the CV score.
    test_score = cohen_kappa_score(y_test, test_pred, weights='quadratic')
    trial.set_user_attr("test_score", test_score)

    return score_folds


In [13]:
# The file-system artifact store and the objective both write into these
# folders; create them up front so a fresh checkout does not fail on the
# first trial.
os.makedirs(PATH_TO_OPTUNA_ARTIFACTS, exist_ok=True)
os.makedirs(PATH_TO_TEMP_FILES, exist_ok=True)
artifact_store = FileSystemArtifactStore(base_path=PATH_TO_OPTUNA_ARTIFACTS)

# Resume the study if it already exists in the SQLite storage, otherwise create it.
study = optuna.create_study(
    direction='maximize',
    storage="sqlite:///db.sqlite3",
    study_name="04 - LGB Multiclass CV mejora",
    load_if_exists=True,
    # Fixed sampler seed so that a fresh study proposes the same trials on every run.
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(refined_cv_es_lgb_objective, n_trials=100)  # Increase the number of trials if needed



FileSystemArtifactStore is experimental (supported from v3.3.0). The interface can change in the future.

[I 2024-06-19 19:22:56,947] A new study created in RDB with name: 04 - LGB Multiclass CV mejora

upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.


upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.

[I 2024-06-19 19:23:04,522] Trial 0 finished with value: 0.35821711634973474 and parameters: {'lambda_l1': 3.415509915900981e-06, 'lambda_l2': 9.851500759639637e-07, 'num_leaves': 71, 'feature_fraction': 0.5797037400440351, 'bagging_fraction': 0.7390187686786416, 'bagging_freq': 1, 'min_child_samples': 90}. Best is trial 0 with value: 0.35821711634973474.

upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.


upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.

[I 2024-06-19 19:23:11,153] Trial 1 finishe

In [14]:
# Hyperparameters of the best trial in the study
best_params = study.best_params

# Final training configuration: the fixed multiclass settings first, then the
# tuned values (which take precedence on any shared key)
lgb_params = {
    'objective': 'multiclass',
    'verbosity': -1,
    'num_class': len(y_train.unique()),
    **best_params,
}

In [15]:
# Fit the final booster on the whole training split.
# NOTE(review): unlike the folds inside the Optuna objective, this call uses
# neither early stopping nor an explicit num_boost_round — confirm that is intended.
lgb_train_dataset = lgb.Dataset(data=X_train, label=y_train)
lgb_model = lgb.train(lgb_params, lgb_train_dataset)

# Hard class prediction for each hold-out row: index of the highest per-class score
y_pred = lgb_model.predict(X_test).argmax(axis=1)

# Hold-out metrics of the final model
test_kappa = cohen_kappa_score(y_test, y_pred, weights="quadratic")
test_accuracy = accuracy_score(y_test, y_pred)
test_balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

print(f'Kappa: {test_kappa}')
print(f'Accuracy: {test_accuracy}')
print(f'Balanced Accuracy: {test_balanced_accuracy}')

display(plot_confusion_matrix(y_test, y_pred))


Kappa: 0.3231769099909053
Accuracy: 0.3847949316438813
Balanced Accuracy: 0.3188386862400628
