In [5]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import cohen_kappa_score, accuracy_score,balanced_accuracy_score
from sklearn.utils import shuffle

from plotly import express as px

#from UA_MDM_LDI_II.tutoriales.utils import plot_confusion_matrix
from utils import plot_confusion_matrix

import os

import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact

from joblib import load, dump


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run-wide constants
SEED = 42         # random_state handed to train_test_split
TEST_SIZE = 0.2   # test_size handed to train_test_split
BATCH_SIZE = 50

# Filesystem locations, every one of them built on top of BASE_DIR
BASE_DIR = './'
PATH_TO_TRAIN = os.path.join(BASE_DIR, "input/petfinder-adoption-prediction/train/train.csv")
PATH_TO_MODELS = os.path.join(BASE_DIR, "UA_MDM_LDI_II/work/models")
PATH_TO_TEMP_FILES = os.path.join(BASE_DIR, "UA_MDM_LDI_II/work/optuna_temp_artifacts")
PATH_TO_OPTUNA_ARTIFACTS = os.path.join(BASE_DIR, "UA_MDM_LDI_II/work/optuna_artifacts")

In [None]:
# Tabular data: read the training CSV located at PATH_TO_TRAIN into a DataFrame
dataset = pd.read_csv(PATH_TO_TRAIN)

In [None]:
# Show the column names of the loaded frame
dataset.columns

In [None]:
# Hold-out split stratified on the AdoptionSpeed column, sized by TEST_SIZE and seeded with SEED.
train, test = train_test_split(
    dataset,
    stratify=dataset["AdoptionSpeed"],
    test_size=TEST_SIZE,
    random_state=SEED,
)

In [None]:
# Partition the column names by dtype: object-typed columns vs. everything else.
char_feats = [name for name, dtype in dataset.dtypes.items() if dtype == 'O']
numeric_feats = [name for name, dtype in dataset.dtypes.items() if dtype != 'O']

In [None]:
# Columns selected as model inputs, in the order they are pulled from the frame.
features = [
    'Type', 'Age', 'Breed1', 'Breed2', 'Gender',
    'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
    'Quantity', 'Fee', 'State',
    'VideoAmt', 'PhotoAmt',
]

# Target column
label = 'AdoptionSpeed'

In [None]:
# Feature matrix and target vector for each side of the split.
X_train, y_train = train[features], train[label]
X_test, y_test = test[features], test[label]

In [None]:
# Distinct target values present in the training split
y_train.unique()

In [None]:
# Baseline LightGBM fit: 'multiclass' objective, one class per distinct training label,
# no other parameter specified.
lgb_params = params = dict(objective='multiclass',
                           num_class=len(y_train.unique()))

lgb_train_dataset = lgb.Dataset(X_train, label=y_train)

lgb_model = lgb.train(params=lgb_params, train_set=lgb_train_dataset)

In [None]:
# Highest-scoring class per test row, then quadratic-weighted kappa against the true labels.
y_pred = np.argmax(lgb_model.predict(X_test), axis=1)

cohen_kappa_score(y_test, y_pred, weights='quadratic')

In [None]:
# Confusion matrix of the true test labels against the model's predicted classes
display(plot_confusion_matrix(y_test,y_pred))

In [None]:
# Reference point: quadratic-weighted kappa of the true labels scored against themselves
cohen_kappa_score(y_test,y_test, weights = 'quadratic')

In [None]:
# Confusion matrix of the true labels against themselves (same reference case as the kappa above it)
display(plot_confusion_matrix(y_test,y_test))

In [None]:

# Three synthetic sources of "wrong" predictions, all derived from y_test:
#   y_shuffled - the true labels in shuffled order
#   y_cerca    - ("near") every label replaced by a class one step away
#   y_lejos    - ("far") every label replaced by a class at least two steps away
# The shuffle uses the notebook-wide SEED instead of a separate literal so that all
# randomness in the notebook is controlled from one place.
y_shuffled = shuffle(y_test,
                     random_state = SEED)


dict_map_cerca = {0:1,
                  1:2,
                  2:3,
                  3:4,
                  4:3}

dict_map_lejos = {0:4,
                  1:4,
                  2:0,
                  3:0,
                  4:0}

# Kept as plain lists: later cells index them by position (y_cerca[sample]).
y_cerca = [dict_map_cerca[i] for i in y_test]

y_lejos = [dict_map_lejos[i] for i in y_test]


In [None]:
# One uniform draw per test row, taken from a generator seeded with SEED so the curves are
# reproducible across runs. At step i a row counts as "known" (its true label is used) when
# its draw is below i/100; otherwise the label comes from the corresponding fallback.
random_list = np.random.default_rng(SEED).random(len(y_test))

y_true_values = y_test.to_numpy()
fallback_shuffled = y_shuffled.to_numpy()
fallback_cerca = np.asarray(y_cerca)
fallback_lejos = np.asarray(y_lejos)

# Collect one record per step and build the frame once at the end
# (instead of re-concatenating a growing DataFrame on every iteration).
progression_rows = []

for i in range(101):

    known = random_list < i/100

    y_simulado = np.where(known, y_true_values, fallback_shuffled)

    y_simulado_cerca = np.where(known, y_true_values, fallback_cerca)

    y_simulado_lejos = np.where(known, y_true_values, fallback_lejos)

    progression_rows.append({'Conocidos': i,
                             'kappa': cohen_kappa_score(y_test,
                                                        y_simulado,
                                                        weights = 'quadratic'),
                             'kappa_cerca': cohen_kappa_score(y_test,
                                                              y_simulado_cerca,
                                                              weights = 'quadratic'),
                             'kappa_lejos': cohen_kappa_score(y_test,
                                                              y_simulado_lejos,
                                                              weights = 'quadratic'),
                             'accuracy': accuracy_score(y_test,
                                                        y_simulado),
                             'balanced_accuracy': balanced_accuracy_score(y_test,
                                                                          y_simulado),
                             })

kappa_progression = pd.DataFrame(progression_rows)

In [None]:
# One line per metric, plotted against the 'Conocidos' column of kappa_progression.
px.line(
    kappa_progression,
    x='Conocidos',
    y=['kappa', 'kappa_cerca', 'kappa_lejos', 'accuracy', 'balanced_accuracy'],
)

In [None]:
# Snapshot with a 50/100 threshold on random_list: rows below it keep their true label,
# the rest take the "near" (cerca) or "far" (lejos) replacement. Each confusion matrix is
# titled with its quadratic-weighted kappa.
y_simulado_cerca = [y_test.iloc[pos] if draw<50/100 else y_cerca[pos]
                    for pos, draw in enumerate(random_list)]
title_near = "Kappa " + str(cohen_kappa_score(y_test, y_simulado_cerca, weights = 'quadratic'))

display(plot_confusion_matrix(y_test, y_simulado_cerca, title = title_near))


y_simulado_lejos = [y_test.iloc[pos] if draw<50/100 else y_lejos[pos]
                    for pos, draw in enumerate(random_list)]
title_far = "Kappa " + str(cohen_kappa_score(y_test, y_simulado_lejos, weights = 'quadratic'))

display(plot_confusion_matrix(y_test, y_simulado_lejos, title = title_far))


In [None]:
# Same fit as the baseline but with the 'multiclassova' objective;
# num_class is again the number of distinct training labels.
lgb_params = params = dict(objective='multiclassova',
                           num_class=len(y_train.unique()))

lgb_train_dataset = lgb.Dataset(X_train, label=y_train)

lgb_model = lgb.train(params=lgb_params, train_set=lgb_train_dataset)

In [None]:

# Predicted class = column with the highest score for each test row.
y_pred = np.argmax(lgb_model.predict(X_test), axis=1)

display(plot_confusion_matrix(y_test, y_pred))

# Cell result: summary metrics of the current model on the test split.
dict(kappa=cohen_kappa_score(y_test, y_pred, weights='quadratic'),
     accuracy=accuracy_score(y_test, y_pred),
     balanced_accuracy=balanced_accuracy_score(y_test, y_pred))




In [None]:
def lgb_objective(trial):
    """Optuna objective: train LightGBM with sampled hyperparameters and score it.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        trial: Object whose ``suggest_float`` / ``suggest_int`` methods supply the
            hyperparameter values for this run.

    Returns:
        Quadratic-weighted Cohen's kappa between ``y_test`` and the argmax of the
        model's predictions on ``X_test``.

    NOTE(review): the returned score is computed on the test split, so a study that
    maximizes this objective selects hyperparameters directly against test data.
    """
    # Settings that never vary between trials.
    fixed_params = {
        'objective': 'multiclass',
        'verbosity': -1,
        'num_class': len(y_train.unique()),
    }

    # Search space; the suggestion order is kept stable on purpose.
    sampled_params = {
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    booster = lgb.train({**fixed_params, **sampled_params},
                        lgb.Dataset(data=X_train, label=y_train))

    test_classes = booster.predict(X_test).argmax(axis=1)
    return cohen_kappa_score(y_test, test_classes, weights='quadratic')

In [None]:
# Create the study in the local SQLite file, or reopen it if it already exists,
# then run 100 trials of lgb_objective.
study = optuna.create_study(
    study_name="04 - LGB Multiclass",
    storage="sqlite:///db.sqlite3",  # storage URL
    direction='maximize',
    load_if_exists=True,
)
study.optimize(lgb_objective, n_trials=100)

In [None]:
# Refit using the study's best hyperparameters layered over the fixed settings
# (on a key clash the study's value wins), then show the test confusion matrix.
lgb_params = {
    'objective': 'multiclass',
    'verbosity': -1,
    'num_class': len(y_train.unique()),
    **study.best_params,
}

lgb_train_dataset = lgb.Dataset(X_train, label=y_train)

lgb_model = lgb.train(params=lgb_params, train_set=lgb_train_dataset)

best_test_classes = lgb_model.predict(X_test).argmax(axis=1)
display(plot_confusion_matrix(y_test, best_test_classes))


In [None]:
def lgb_custom_metric_kappa(dy_pred, dy_true):
    """Custom LightGBM evaluation metric: quadratic-weighted Cohen's kappa.

    Args:
        dy_pred: Per-class scores for the evaluated rows. Either a 2-D array of
            shape (n_samples, n_classes), or a flat 1-D sequence grouped by class
            first and row second (the layout older LightGBM releases hand to
            ``feval`` for multiclass tasks).
        dy_true: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        Tuple ``(metric_name, value, is_higher_better)`` as expected by LightGBM's
        ``feval`` hook: ``('kappa', <score>, True)``.
    """
    labels = dy_true.get_label()
    scores = np.asarray(dy_pred)

    if scores.ndim == 1:
        # Flat class-major layout: scores[j * n_samples + i] belongs to row i, class j.
        # Rebuild the (n_samples, n_classes) matrix so argmax over axis 1 is valid.
        scores = scores.reshape(-1, len(labels)).T

    metric_name = 'kappa'
    value = cohen_kappa_score(labels, scores.argmax(axis=1), weights = 'quadratic')
    is_higher_better = True
    return(metric_name, value, is_higher_better)

def cv_es_lgb_objective(trial):
    """Optuna objective: 5-fold stratified CV with early stopping, plus test artifacts.

    For each fold a LightGBM model is trained on the in-fold rows with the out-of-fold
    rows as validation set (early stopping after 10 rounds, custom kappa passed as
    ``feval``). The models' test-set scores are summed into an ensemble; the ensemble's
    predictions and confusion matrix are saved under ``PATH_TO_TEMP_FILES`` and uploaded
    as trial artifacts, and its test kappa is stored in the ``test_score`` user attribute.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``test``
    and ``artifact_store``.

    Args:
        trial: Optuna trial used to sample hyperparameters and to attach artifacts.

    Returns:
        Mean out-of-fold quadratic-weighted kappa over the folds (the optimized value).
    """

    lgb_params = {
                        'objective': 'multiclass',
                        'verbosity':-1,
                        'num_class': len(y_train.unique()),
                        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
                        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
                        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
                        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
                        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
                        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
                        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
                        }

    # Running sum of the per-fold test scores, one column per class.
    scores_ensemble = np.zeros((len(y_test), lgb_params['num_class']))
    score_folds = 0
    n_splits = 5

    skf = StratifiedKFold(n_splits=n_splits)

    for if_index, oof_index in skf.split(X_train, y_train):

        X_oof = X_train.iloc[oof_index]
        y_oof = y_train.iloc[oof_index]

        lgb_if_dataset = lgb.Dataset(data=X_train.iloc[if_index],
                                     label=y_train.iloc[if_index],
                                     free_raw_data=False)

        lgb_oof_dataset = lgb.Dataset(data=X_oof,
                                      label=y_oof,
                                      free_raw_data=False)

        # NOTE(review): no 'metric' is set in lgb_params, so LightGBM presumably also
        # evaluates the objective's default metric next to the custom kappa; early
        # stopping may then be driven by either one - confirm this is intended.
        lgb_model = lgb.train(lgb_params,
                              lgb_if_dataset,
                              valid_sets=lgb_oof_dataset,
                              callbacks=[lgb.early_stopping(10, verbose=False)],
                              feval = lgb_custom_metric_kappa
                              )

        # Accumulate this fold's test-set scores into the ensemble.
        scores_ensemble = scores_ensemble + lgb_model.predict(X_test)

        # Each fold contributes 1/n_splits of its out-of-fold kappa to the mean.
        oof_classes = lgb_model.predict(X_oof).argmax(axis=1)
        score_folds = score_folds + cohen_kappa_score(y_oof, oof_classes, weights = 'quadratic')/n_splits

    # The temp directory is only ever written to here, so create it on demand;
    # otherwise the first dump() fails on a fresh checkout.
    os.makedirs(PATH_TO_TEMP_FILES, exist_ok=True)

    ensemble_classes = scores_ensemble.argmax(axis=1)

    predicted_filename = os.path.join(PATH_TO_TEMP_FILES,f'test_{trial.study.study_name}_{trial.number}.joblib')
    predicted_df = test.copy()
    predicted_df['pred'] = [scores_ensemble[p,:] for p in range(scores_ensemble.shape[0])]
    dump(predicted_df, predicted_filename)
    # Keyword arguments: positional use of upload_artifact is deprecated in Optuna 4.
    upload_artifact(study_or_trial=trial, file_path=predicted_filename, artifact_store=artifact_store)

    cm_filename = os.path.join(PATH_TO_TEMP_FILES,f'cm_{trial.study.study_name}_{trial.number}.jpg')
    plot_confusion_matrix(y_test,ensemble_classes).write_image(cm_filename)
    upload_artifact(study_or_trial=trial, file_path=cm_filename, artifact_store=artifact_store)

    # Recorded for inspection only; the value being optimized is the CV score.
    test_score = cohen_kappa_score(y_test,ensemble_classes,weights = 'quadratic')
    trial.set_user_attr("test_score", test_score)

    return(score_folds)

In [None]:
# Make sure the artifact directory exists before the store is used: uploads are written
# under this base path, and Optuna's documented usage creates the directory first.
os.makedirs(PATH_TO_OPTUNA_ARTIFACTS, exist_ok=True)
artifact_store = FileSystemArtifactStore(base_path=PATH_TO_OPTUNA_ARTIFACTS)

# Create the CV study in the local SQLite file, or reopen it if it already exists.
study = optuna.create_study(direction='maximize',
                            storage="sqlite:///db.sqlite3",  # Specify the storage URL here.
                            study_name="04 - LGB Multiclass CV",
                            load_if_exists = True)

study.optimize(cv_es_lgb_objective, n_trials=100)

In [8]:
!optuna-dashboard sqlite:///db.sqlite3 --artifact-dir ../work/optuna_artifacts/

Traceback (most recent call last):
  File "C:\Users\Usuario\.conda\envs\ldi2\Lib\site-packages\sqlalchemy\engine\base.py", line 1967, in _exec_single_context
    self.dialect.do_execute(
  File "C:\Users\Usuario\.conda\envs\ldi2\Lib\site-packages\sqlalchemy\engine\default.py", line 924, in do_execute
    cursor.execute(statement, parameters)
sqlite3.OperationalError: no such table: version_info

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\Usuario\.conda\envs\ldi2\Lib\site-packages\optuna\storages\_rdb\storage.py", line 72, in _create_scoped_session
    yield session
  File "C:\Users\Usuario\.conda\envs\ldi2\Lib\site-packages\optuna\storages\_rdb\storage.py", line 1042, in _init_version_info_model
    version_info = models.VersionInfoModel.find(session)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Usuario\.conda\envs\ldi2\Lib\site-packages\optuna\storages\_rdb\models.py", line 578,