In [None]:
import pandas as pd
from my_libs import lib_tools as pt

# Run mode: selects which pickle files are loaded below.
run_type = 'dev'
# run_type = 'prd'
# When True, the SMOTEN cell oversamples the training split before modelling.
resample = False

# (train, test) pickle file names for every supported run type.
# NOTE(review): 'prd' points both entries at the same file — presumably the
# loader does the splitting itself; confirm in pt.get_train_valid_test_data.
dataset_files = {
    'dev': ('df-dev-train.pkl', 'df-dev-test.pkl'),
    'prd': ('df-prd.pkl', 'df-prd.pkl'),
}
if run_type not in dataset_files:
    # Fail here with a clear message rather than with a NameError on the load call.
    raise ValueError(f"Unknown run_type {run_type!r}; expected one of {sorted(dataset_files)}")
filename_train, filename_test = dataset_files[run_type]

# classifier_name = 'DecisionTreeClassifier'
# classifier_name = 'RandomForestClassifier'
# classifier_name = 'GradientBoostingClassifier'

# Feature columns requested from the loader.
columns = ['catv', 'agg', 'dep', 'col', 'catr', 'catu', 'trajet', 'locp', 'circ', 'situ', 'lum', 'age_cls']
# The loader returns six objects: a train pair, a test pair and a "final" test pair.
X_train, y_train, X_test, y_test, X_test_final, y_test_final = pt.get_train_valid_test_data(filename_train, filename_test, columns)

In [None]:
from imblearn.over_sampling import SMOTEN
import time

if resample:
    # Fixed seed so the synthetic samples are the same on every run
    # (an unseeded SMOTEN produces a different training set each time).
    sampler = SMOTEN(random_state=123)

    start_time = time.time()
    # NOTE: X_train / y_train are rebound in place; re-running this cell would
    # resample the already-resampled data.
    X_train, y_train = sampler.fit_resample(X_train, y_train)
    print(f"X_train : {X_train.shape} - y_train : {y_train.shape}")

    # Cast 'dep' to int after resampling.
    # NOTE(review): presumably SMOTEN returns this column with another dtype — confirm.
    X_train['dep'] = X_train['dep'].astype('int')

    print(f"--- Smote applied in {time.time() - start_time} seconds ---")

In [None]:
# Persist every split next to the notebook, one pickle file per object.
pickle_targets = [
    (X_train, './X_train_catboost.pkl'),
    (y_train, './y_train_catboost.pkl'),
    (X_test, './X_test_catboost.pkl'),
    (y_test, './y_test_catboost.pkl'),
    (X_test_final, './X_test_final_catboost.pkl'),
    (y_test_final, './y_test_final_catboost.pkl'),
]
for split_data, pickle_path in pickle_targets:
    split_data.to_pickle(pickle_path)

In [None]:
# Show the training features as the cell output (reflects the SMOTEN step when `resample` is True).
X_train

In [None]:
import time
import optuna
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier


# Wall-clock start of the hyperparameter search; the print after study.optimize reports the elapsed seconds.
start_time = time.time()

# 1. Define an objective function to be maximized.
def objective(trial):
    """Score one Optuna trial with the mean 3-fold cross-validated F1.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample 'classifier', 'iterations' and 'learning_rate'.

    Returns
    -------
    float
        Mean of the per-fold F1 scores computed on the notebook-level
        ``X_train`` / ``y_train``.

    Raises
    ------
    ValueError
        If the sampled classifier name has no model branch here.
    """
    # 2. Suggest values for the hyperparameters using a trial object.
    classifier_name = trial.suggest_categorical('classifier', ['CatBoostClassifier'])
    if classifier_name != 'CatBoostClassifier':
        # Explicit failure instead of an UnboundLocalError on the return value.
        raise ValueError(f"Unsupported classifier: {classifier_name!r}")

    n_iterations = trial.suggest_int('iterations', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)

    # Every column of X_train is declared as a categorical feature.
    classifier_obj = CatBoostClassifier(
        iterations=n_iterations,
        learning_rate=learning_rate,
        cat_features=list(X_train.columns),
    )

    fold_scores = cross_val_score(classifier_obj, X_train, y_train, cv=3, scoring="f1", verbose=1)
    # The study maximizes this value: it is an F1 score, not an accuracy.
    return fold_scores.mean()

# 3. Create a study object and optimize the objective function.
# TPESampler is Optuna's default single-objective sampler; seeding it makes the
# sequence of suggested hyperparameters repeatable across kernel restarts.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=123),
)
study.optimize(objective, n_trials=5)

# Elapsed time is measured from the `start_time` set at the top of this cell.
print("--- Optimization with Optuna performed in %s seconds ---" % (time.time() - start_time))

# fig = optuna.visualization.plot_param_importances(study)
# fig.show()

In [None]:
from my_libs.model_evaluator import ModelEvaluator

# Hyperparameters for the final model; all columns are treated as categorical.
# NOTE(review): iterations / learning_rate look like values copied from an earlier Optuna run — confirm.
categorical_columns = list(X_train.columns)
params = {
    'iterations': 206,
    'learning_rate': 0.0811,
    'random_seed': 123,
    'cat_features': categorical_columns,
}

# Train on the training split and evaluate on the "final" test split.
evaluator = ModelEvaluator(
    model_type='CatBoostClassifier',
    params=params,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test_final,
)
model = evaluator.evaluate()

In [None]:
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

In [None]:
# Render Optuna's optimization-history figure for the study run above.
plot_optimization_history(study)

In [None]:
# plot_intermediate_values(study)

In [None]:
# plot_parallel_coordinate(study)

In [None]:
# Render Optuna's contour figure over the hyperparameters sampled in the study.
plot_contour(study)

In [None]:
# plot_contour(study, params=["iterations", "learning_rate"])

In [None]:
# Render Optuna's slice figure for the hyperparameters sampled in the study.
plot_slice(study)

In [None]:
# plot_slice(study, params=["iterations", "learning_rate"])

In [None]:
# Render Optuna's hyperparameter-importance figure with respect to the study objective (mean F1).
plot_param_importances(study)

In [None]:
def trial_duration_seconds(trial):
    """Return the run time of a trial in seconds."""
    return trial.duration.total_seconds()


# Hyperparameter importances computed against trial duration instead of the objective value.
optuna.visualization.plot_param_importances(
    study,
    target=trial_duration_seconds,
    target_name="duration",
)

In [None]:
# Render Optuna's empirical-distribution-function figure for the study.
plot_edf(study)