In [2]:
import pandas as pd
from catboost import CatBoostClassifier

from my_libs import lib_tools as pt

# Run configuration: `run_type` is passed to the data loader and embedded in
# the pickle file names used further down; `resample` toggles between
# recomputing the resampled training set and loading it from disk.
run_type = 'dev'
# run_type = 'prd'
resample = True

# The loader returns six objects, unpacked here as feature/target pairs.
(
    X_train, y_train,
    X_test, y_test,
    X_test_final, y_test_final,
) = pt.get_train_valid_test_data(run_type)

In [3]:
# The resampled training set is cached on disk, one pickle per object and per
# run type. The paths are named once so the save and load branches cannot
# drift apart (the load branch used to read y_train from the X_train file).
X_train_pickle = f'./pickles/X_train_smote_{run_type}.pkl'
y_train_pickle = f'./pickles/y_train_smote_{run_type}.pkl'

if resample:
    # NOTE: this rebinds X_train / y_train, so re-running the cell without
    # re-running the data-loading cell resamples already-resampled data.
    X_train, y_train = pt.get_data_resampled(X=X_train, y=y_train, verbose=1)
    # Save data generated
    X_train.to_pickle(X_train_pickle)
    y_train.to_pickle(y_train_pickle)
else:
    # Load data previously generated
    X_train = pd.read_pickle(X_train_pickle)
    y_train = pd.read_pickle(y_train_pickle)

--- Smote applied in 16.512786149978638 seconds ---
Classes cardinality after resampling :
0    4900
1    4900
Name: grav, dtype: int64
X shape : (6400, 28) -> (9800, 28)
y shape : (6400,) -> (9800,)


In [4]:
import time
import optuna
from sklearn.model_selection import cross_val_score


start_time = time.time()

def objective(trial):
    """Optuna objective: mean 3-fold cross-validated F1 of a CatBoost classifier.

    Parameters
    ----------
    trial
        Optuna trial object used to sample ``iterations`` (int in [50, 300])
        and ``learning_rate`` (float in [0.001, 0.1]).

    Returns
    -------
    float
        Mean of the per-fold F1 scores computed on ``X_train`` / ``y_train``.
    """
    n_iterations = trial.suggest_int('iterations', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)

    # Every column of X_train is declared as a categorical feature.
    classifier = CatBoostClassifier(iterations=n_iterations, learning_rate=learning_rate,
                                    cat_features=list(X_train.columns), verbose=0)
    fold_scores = cross_val_score(classifier, X_train, y_train, cv=3, scoring="f1", verbose=1)

    # scoring="f1": the value returned is a mean F1 score, not an accuracy.
    mean_f1 = fold_scores.mean()

    return mean_f1

# Create an in-memory study that maximises the objective, then run 5 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

# Report the wall-clock time elapsed since `start_time` and the best parameters.
elapsed_seconds = time.time() - start_time
print("--- Optimization with Optuna performed in %s seconds ---" % elapsed_seconds)
print(f"Best params : {study.best_params}")

[32m[I 2023-02-24 18:04:17,660][0m A new study created in memory with name: no-name-c28b470f-091a-4125-8628-9f08d3c51c14[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   27.0s finished
[32m[I 2023-02-24 18:04:44,788][0m Trial 0 finished with value: 0.7308633398267905 and parameters: {'iterations': 95, 'learning_rate': 0.09621447770409826}. Best is trial 0 with value: 0.7308633398267905.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   48.8s finished
[32m[I 2023-02-24 18:05:33,701][0m Trial 1 finished with value: 0.710413553649729 and parameters: {'iterations': 295, 'learning_rate': 0.09836800318120442}. Best is trial 0 with value: 0.7308633398267905.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   39.0s finishe

--- Optimization with Optuna performed in 193.74201822280884 seconds ---
Best params : {'iterations': 95, 'learning_rate': 0.09621447770409826}


In [None]:
from catboost import CatBoostClassifier
# Hyper-parameters for the final fit. The dict literal used to be a bare
# expression (never assigned), which left `params` undefined on a fresh kernel.
# Uncomment the next line instead to reuse the best trial of the study.
# params = study.best_params
params = {'iterations': 1500, 'learning_rate': 0.0007813953195885828}
# Every column of X_train is declared as a categorical feature.
params['cat_features'] = list(X_train.columns)
model = CatBoostClassifier(**params)
model.fit(X_train, y_train, plot=True)
y_pred = model.predict(X_test_final)

In [None]:
from my_libs.model_evaluator import ModelEvaluator

# Fixed CatBoost configuration; every column of X_train is declared categorical.
params = {
    'iterations': 206,
    'learning_rate': 0.0811,
    'random_seed': 123,
    'cat_features': list(X_train.columns),
}

# The evaluator receives the training data plus X_test_final / y_test_final
# as its test set; `evaluate()` returns the object bound to `model`.
evaluator = ModelEvaluator(
    model_type='CatBoostClassifier',
    params=params,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test_final,
)
model = evaluator.evaluate()

In [None]:
from optuna.visualization import plot_optimization_history
# Plot the optimisation history of `study`; as the last expression of the
# cell, the returned object is what the notebook displays.
plot_optimization_history(study)