In [83]:
import pandas as pd
from my_libs import lib_tools as pt

# Dataset variant to run on; must be one of the keys of DATASET_FILES below.
# run_type = 'very-light'
run_type = 'light'
# When True, the encoding cell re-encodes the data and rewrites the cached pickles.
encoding_step = False

# (train pickle, test pickle) for every supported run type.
DATASET_FILES = {
    'very-light': ('df-very-light-train.pkl', 'df-very-light-test.pkl'),
    'light': ('df-light-train.pkl', 'df-light-test.pkl'),
    'full': ('df-full-train.pkl', 'df-full-test.pkl'),
}

# Fail here, with a clear message, instead of hitting a NameError on
# filename_train / filename_test further down when run_type is mistyped.
if run_type not in DATASET_FILES:
    raise ValueError(
        f"Unknown run_type {run_type!r}; expected one of {sorted(DATASET_FILES)}"
    )
filename_train, filename_test = DATASET_FILES[run_type]

# classifier_name = 'DecisionTreeClassifier'
# classifier_name = 'RandomForestClassifier'
# classifier_name = 'GradientBoostingClassifier'

# Feature columns handed to the project loader.
columns = [
    'catv', 'agg', 'dep', 'col', 'catr', 'catu',
    'trajet', 'locp', 'circ', 'situ', 'lum', 'age_cls',
]

# The loader returns six objects; judging by its name they are presumably the
# train / validation / final-test splits -- confirm in my_libs.lib_tools.
(
    X_train, y_train,
    X_test, y_test,
    X_test_final, y_test_final,
) = pt.get_train_valid_test_data(filename_train, filename_test, columns)

In [84]:
import time
from my_libs.encoder_custom import EncoderCustom

if encoding_step:
    # 'dep' is target-encoded; every other column is one-hot encoded.
    cols_target_encoded = ['dep']
    cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)

    encoder = EncoderCustom(
        cols_target_encoded=cols_target_encoded,
        cols_onehot_encoded=cols_onehot_encoded,
    )
    # The third argument tells the encoder which mode to run in ('Train' vs 'Test').
    X_train, y_train = encoder.transform(X_train, y_train, 'Train')
    X_test, y_test = encoder.transform(X_test, y_test, 'Test')
    X_test_final, y_test_final = encoder.transform(X_test_final, y_test_final, 'Test')

    # Cache the encoded splits on disk so the next cell can reload them.
    for pickle_name, dataset in (
        ('X_train', X_train),
        ('y_train', y_train),
        ('X_test', X_test),
        ('y_test', y_test),
        ('X_test_final', X_test_final),
        ('y_test_final', y_test_final),
    ):
        dataset.to_pickle(f'./{pickle_name}.pkl')

In [85]:
import pandas as pd

def _read_cached(stem):
    """Load ``./<stem>.pkl`` from the working directory."""
    return pd.read_pickle(f'./{stem}.pkl')


# These pickles are the ones written by the encoding cell when encoding_step is
# True; with encoding_step False they come from a previous run.
# NOTE(review): nothing ties the cached files to the current run_type/columns --
# confirm they match before trusting the results.
X_train = _read_cached('X_train')
y_train = _read_cached('y_train')
X_test = _read_cached('X_test')
y_test = _read_cached('y_test')
X_test_final = _read_cached('X_test_final')
y_test_final = _read_cached('y_test_final')

In [86]:
import time
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Commented-out per-classifier trial budgets (currently unused):
# if classifier_name == 'DecisionTreeClassifier': n_trials = 100
# if classifier_name == 'RandomForestClassifier': n_trials = 10
# if classifier_name == 'GradientBoostingClassifier': n_trials = 3
# Number of Optuna trials passed to study.optimize() further down in this cell.
n_trials = 50
# Wall-clock reference; the elapsed time is printed once the optimization ends.
start_time = time.time()

# 1. Define an objective function to be maximized.
def objective(trial):
    """Optuna objective: mean cross-validated F1 of a sampled classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the classifier family and its hyperparameters.

    Returns
    -------
    float
        Mean of the ``cross_val_score`` results (``cv=5``, ``scoring="f1"``)
        computed on the notebook-level ``X_train`` / ``y_train``.

    Raises
    ------
    ValueError
        If the sampled classifier name has no matching branch below.
    """
    # 2. Suggest values for the hyperparameters using a trial object.
    # To search across model families, widen the list of choices, e.g.
    # ['DecisionTreeClassifier', 'RandomForestClassifier', 'GradientBoostingClassifier'].
    classifier_name = trial.suggest_categorical('classifier', ['DecisionTreeClassifier'])

    # Parameter names are kept equal to the scikit-learn keyword names because
    # study.best_params is later handed to ModelEvaluator as `params`
    # (presumably forwarded to the estimator constructor -- confirm).
    if classifier_name == 'DecisionTreeClassifier':
        classifier_obj = DecisionTreeClassifier(
            criterion=trial.suggest_categorical('criterion', ['gini', 'entropy']),
            splitter=trial.suggest_categorical('splitter', ['best', 'random']),
            max_depth=trial.suggest_int('max_depth', 2, 300, log=True),
            min_samples_split=trial.suggest_int('min_samples_split', 2, 6),
        )
    elif classifier_name == 'RandomForestClassifier':
        classifier_obj = RandomForestClassifier(
            n_estimators=trial.suggest_int('n_estimators', 5, 50, log=True),
            criterion=trial.suggest_categorical('criterion', ['gini', 'entropy']),
        )
    elif classifier_name == 'GradientBoostingClassifier':
        classifier_obj = GradientBoostingClassifier(
            learning_rate=trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
            n_estimators=trial.suggest_int('n_estimators', 50, 200, log=True),
        )
    else:
        # Without this branch an unrecognised name would leave classifier_obj
        # unbound and fail later with a confusing error.
        raise ValueError(f"Unsupported classifier: {classifier_name!r}")

    # The value being maximised is the F1 score, not accuracy.
    fold_f1_scores = cross_val_score(classifier_obj, X_train, y_train, cv=5, scoring="f1", verbose=1)
    return fold_f1_scores.mean()

# 3. Create a study object and optimize the objective function.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)

# Real f-string interpolation instead of an f-prefixed literal combined with
# %-formatting; the printed text is unchanged.
elapsed_seconds = time.time() - start_time
print(f"--- Optimization with Optuna performed in {elapsed_seconds} seconds ---")
print(f"Best params : {study.best_params}")

# fig = optuna.visualization.plot_param_importances(study)
# fig.show()

[32m[I 2023-02-20 19:52:59,643][0m A new study created in memory with name: no-name-478e1145-6a6c-483d-89ec-a740f8b1efb0[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.4s finished
[32m[I 2023-02-20 19:53:08,176][0m Trial 0 finished with value: 0.7932549781399338 and parameters: {'classifier': 'DecisionTreeClassifier', 'criterion': 'entropy', 'splitter': 'random', 'max_depth': 43, 'min_samples_split': 5}. Best is trial 0 with value: 0.7932549781399338.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.2s finished
[32m[I 2023-02-20 19:53:11,521][0m Trial 1 finished with value: 0.7494593476777887 and parameters: {'classifier': 'DecisionTreeClassifier', 'criterion': 'gini', 'splitter': 'random', 'max_depth': 8, 'min_samples_split': 6}. Best is trial 0 with value: 0.7932549781399338.[0m
[Parallel(n

--- Optimization with Optuna performed in 387.00122022628784 seconds ---
Best params : {'classifier': 'DecisionTreeClassifier', 'criterion': 'entropy', 'splitter': 'best', 'max_depth': 152, 'min_samples_split': 2}


In [87]:
# [I 2023-02-20 19:59:26,641] Trial 49 finished with value: 0.7997973898133507 and parameters: {'classifier': 'DecisionTreeClassifier', 'criterion': 'entropy', 'splitter': 'best', 'max_depth': 108, 'min_samples_split': 2}. Best is trial 21 with value: 0.8011703733889626.
#
# --- Optimization with Optuna performed in 387.00122022628784 seconds ---
# Best params : {'classifier': 'DecisionTreeClassifier', 'criterion': 'entropy', 'splitter': 'best', 'max_depth': 152, 'min_samples_split': 2}

In [88]:
from my_libs.model_evaluator import ModelEvaluator

# Separate the classifier name from its hyperparameters: `params` keeps only
# the hyperparameters once 'classifier' has been popped.
params = study.best_params
model_type = params.pop('classifier')

# Fit on the training split and evaluate on the *_final test split.
evaluator = ModelEvaluator(
    model_type=model_type,
    params=params,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test_final,
)
model = evaluator.evaluate()


--- Model DecisionTreeClassifier fit and trained in 2.079439163208008 seconds ---
--- Params : {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 152, 'min_samples_split': 2}


Classe prédite,0,1
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11420,4901
1,1800,1878



Classification report :
              precision    recall  f1-score   support

           0       0.86      0.70      0.77     16321
           1       0.28      0.51      0.36      3678

    accuracy                           0.66     19999
   macro avg       0.57      0.61      0.57     19999
weighted avg       0.76      0.66      0.70     19999



In [89]:
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

In [90]:
# Optuna's optimization-history figure for the study; the bare call is kept as
# the last expression so the notebook renders the returned figure.
plot_optimization_history(study)

In [91]:
# Contour plots over all study parameters. 'classifier' only ever takes one
# value in this study, which is what triggers the
# "unique value length is less than 2" warnings in the output.
plot_contour(study)

[33m[W 2023-02-20 19:59:28,961][0m Param classifier unique value length is less than 2.[0m
[33m[W 2023-02-20 19:59:28,964][0m Param classifier unique value length is less than 2.[0m
[33m[W 2023-02-20 19:59:28,968][0m Param classifier unique value length is less than 2.[0m
[33m[W 2023-02-20 19:59:28,971][0m Param classifier unique value length is less than 2.[0m
[33m[W 2023-02-20 19:59:28,973][0m Param classifier unique value length is less than 2.[0m
[33m[W 2023-02-20 19:59:28,979][0m Param classifier unique value length is less than 2.[0m
[33m[W 2023-02-20 19:59:28,986][0m Param classifier unique value length is less than 2.[0m
[33m[W 2023-02-20 19:59:28,994][0m Param classifier unique value length is less than 2.[0m


In [92]:
# Contour restricted to the first two hyperparameters of `params` (the best
# parameters with 'classifier' already removed by the evaluation cell).
param_names = list(params.keys())
plot_contour(study, params=[param_names[0], param_names[1]])

In [93]:
# Optuna's slice plot for the study (one panel per parameter).
plot_slice(study)

In [94]:
# Hyperparameter importances with respect to the objective value (mean CV F1).
plot_param_importances(study)

In [95]:
# Hyperparameter importances with respect to trial duration (in seconds)
# instead of the objective value.
plot_param_importances(
    study,
    target=lambda trial: trial.duration.total_seconds(),
    target_name="duration",
)

In [96]:
# Optuna's EDF (empirical distribution function) plot of the study's objective values.
plot_edf(study)