In [1]:
import pandas as pd
from my_libs import lib_tools as pt

# --- Notebook configuration -------------------------------------------------
run_gridSearchCV = True  # True to run hyperparameters optimization with GridSearchCV()
run_optuna = True        # True to run hyperparameters optimization with Optuna

# Dataset size preset; must be one of the keys of `dataset_files` below.
run_type = 'light'

# Pickle file names (train, test) for each supported dataset size preset.
dataset_files = {
    'very-light': ('df-very-light-train.pkl', 'df-very-light-test.pkl'),
    'light':      ('df-light-train.pkl', 'df-light-test.pkl'),
    'full':       ('df-full-train.pkl', 'df-full-test.pkl'),
}

# Fail fast on a mistyped preset instead of hitting a NameError further down
# (or, worse, silently reusing file names left over from a previous run).
if run_type not in dataset_files:
    raise ValueError(f"Unknown run_type {run_type!r}; expected one of {sorted(dataset_files)}")
filename_train, filename_test = dataset_files[run_type]

# Feature columns requested from the data loader.
columns = ['catv', 'agg', 'dep', 'col', 'catr', 'catu', 'trajet', 'locp', 'circ', 'situ', 'lum', 'age_cls']

# The loader returns six objects, unpacked here as three (features, target) pairs.
# NOTE(review): judging by the helper's name these are the train / validation /
# final-test splits - confirm against my_libs.lib_tools.
X_train, y_train, X_test, y_test, X_test_final, y_test_final = pt.get_train_valid_test_data(filename_train, filename_test, columns)

In [2]:
import time
from my_libs.encoder_custom import EncoderCustom

# Feature encoding: 'dep' is target encoded, every other feature column is
# one-hot encoded (the split is passed to the project's EncoderCustom below).
cols_target_encoded = ['dep']
# All remaining columns of X_train, i.e. everything except the target-encoded ones.
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)

encoder = EncoderCustom(cols_target_encoded=cols_target_encoded, cols_onehot_encoded=cols_onehot_encoded)
# The same encoder instance processes the three splits, the training split first.
# NOTE(review): presumably the 'Train' call fits the encoder (the recorded output
# also reports a class resampling and a normalization step) and the 'Test' calls
# only apply it - confirm in my_libs.encoder_custom.
# Caution: the inputs are rebound in place, so re-running this cell without
# re-running the loading cell would encode already-encoded data.
X_train, y_train = encoder.transform(X_train, y_train, 'Train')
X_test, y_test = encoder.transform(X_test,  y_test,  'Test')
X_test_final, y_test_final = encoder.transform(X_test_final, y_test_final, 'Test')



Classes cardinality after resampling :
0    48878
1    48878
Name: grav, dtype: int64
X shape : (97756, 12)
Columns target encoded : ['dep']
Columns one hot encoded : Index(['catv', 'agg', 'col', 'catr', 'catu', 'trajet', 'locp', 'circ', 'situ',
       'lum', 'age_cls'],
      dtype='object')
Features normalized
--- Train set - features encoding performed in 281.00 seconds ---
--- Test set - features encoding performed in 0.41 seconds ---
--- Test set - features encoding performed in 0.48 seconds ---


In [3]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

if run_optuna:

    # Number of Optuna trials; every trial samples one of the three model
    # families together with its hyperparameters.
    n_trials = 50
    # Fixed seed shared by the sampler and the estimators so that a
    # "Restart & Run All" reproduces the same search.
    random_seed = 42
    start_time = time.time()

    # 1. Define an objective function to be maximized.
    def objective(trial):
        """Return the mean 3-fold cross-validated F1 score of one sampled model.

        The trial first picks a classifier family, then the hyperparameters
        specific to that family. The model is scored on the notebook-level
        ``X_train`` / ``y_train``.

        Raises:
            ValueError: if the sampled classifier name has no branch below.
        """
        # 2. Suggest values for the hyperparameters using a trial object.
        classifier_name = trial.suggest_categorical('classifier', ['DecisionTreeClassifier', 'RandomForestClassifier', 'GradientBoostingClassifier'])
        if classifier_name == 'DecisionTreeClassifier':
            max_depth         = trial.suggest_int('max_depth', 2, 50, log=True)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 6)
            classifier_obj = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, random_state=random_seed)
        elif classifier_name == 'RandomForestClassifier':
            n_estimators = trial.suggest_int('n_estimators', 5, 50)
            criterion    = trial.suggest_categorical('criterion', ['gini', 'entropy'])
            classifier_obj = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, random_state=random_seed)
        elif classifier_name == 'GradientBoostingClassifier':
            learning_rate = trial.suggest_float('learning_rate', 0.001, 1, log=True)
            # NOTE(review): the trailing space in 'n_estimators ' is kept on purpose.
            # It looks like a workaround: Optuna appears to reject reusing the name
            # 'n_estimators' with a different `log` setting than the random-forest
            # branch - confirm. Consumers of study.best_params must strip the key.
            n_estimators  = trial.suggest_int('n_estimators ', 1, 30, log=True)
            classifier_obj = GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=n_estimators, random_state=random_seed)
        else:
            # Guards against adding a choice above without adding its branch.
            raise ValueError(f"Unsupported classifier: {classifier_name!r}")

        # The optimised metric is F1 (scoring="f1"), averaged over the 3 folds.
        f1_scores = cross_val_score(classifier_obj, X_train, y_train, cv=3, scoring="f1", verbose=1)
        return f1_scores.mean()

    # 3. Create a study object and optimize the objective function.
    # TPESampler is Optuna's default sampler; it is passed explicitly only to seed it.
    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=random_seed))
    study.optimize(objective, n_trials=n_trials)

    print(f"--- Optimization with Optuna performed in {time.time() - start_time} seconds ---")
    print(f"Best params : {study.best_params}")

    # Optional: visualise hyperparameter importances.
    # fig = optuna.visualization.plot_param_importances(study)
    # fig.show()

[32m[I 2023-02-19 23:23:01,249][0m A new study created in memory with name: no-name-f9bdda33-e78f-4e45-a65e-6fa557ef449d[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.3s finished
[32m[I 2023-02-19 23:23:02,673][0m Trial 0 finished with value: 0.7166658041195375 and parameters: {'classifier': 'DecisionTreeClassifier', 'max_depth': 5, 'min_samples_split': 6}. Best is trial 0 with value: 0.7166658041195375.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   21.2s finished
[32m[I 2023-02-19 23:23:24,024][0m Trial 1 finished with value: 0.7835206354970028 and parameters: {'classifier': 'RandomForestClassifier', 'n_estimators': 42, 'criterion': 'gini'}. Best is trial 1 with value: 0.7835206354970028.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)

--- Optimization with Optuna performed in 849.2120630741119 seconds ---
Best params : {'classifier': 'RandomForestClassifier', 'n_estimators': 42, 'criterion': 'entropy'}


In [4]:
from my_libs.model_evaluator import ModelEvaluator

# Re-train the best Optuna candidate and score it on the final test split.
# Guarded by the same flag as the search, so the cell is skipped - instead of
# failing on an undefined `study` - when the Optuna search is disabled.
if run_optuna:
    # Work on a copy so splitting off the classifier name never touches the
    # dictionary handed out by the study.
    params = dict(study.best_params)
    model_type = params.pop('classifier')
    # The search registers one hyperparameter under a name with a trailing space
    # ('n_estimators '). Normalise the names before handing them over.
    # NOTE(review): presumably ModelEvaluator forwards them as estimator keyword
    # arguments - confirm in my_libs.model_evaluator.
    params = {name.strip(): value for name, value in params.items()}

    evaluator = ModelEvaluator(model_type=model_type, params=params, X_train=X_train, y_train=y_train, X_test=X_test_final, y_test=y_test_final)
    model = evaluator.evaluate()


--- Model RandomForestClassifier fit and trained in 13.853517532348633 seconds ---
--- Params : {'n_estimators': 42, 'criterion': 'entropy'}


Classe prédite,0,1
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,13212,2827
1,1837,1879



Classification report :
              precision    recall  f1-score   support

           0       0.88      0.82      0.85     16039
           1       0.40      0.51      0.45      3716

    accuracy                           0.76     19755
   macro avg       0.64      0.66      0.65     19755
weighted avg       0.79      0.76      0.77     19755



In [5]:
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

if run_gridSearchCV:

    # Model family to tune; must be one of the keys of `search_spaces` below.
    classifier_name = 'GradientBoostingClassifier'

    start_time = time.time()

    # Estimator class and hyperparameter grid for each supported model family.
    search_spaces = {
        'DecisionTreeClassifier': (
            DecisionTreeClassifier,
            {'max_depth' : [2, 10, 30, 50],
             'min_samples_split' : [2,4,6]
             },
        ),
        'RandomForestClassifier': (
            RandomForestClassifier,
            {'n_estimators' : [5,10,25,50],
             'criterion' : ('gini', 'entropy'),
             },
        ),
        'GradientBoostingClassifier': (
            GradientBoostingClassifier,
            {'learning_rate' : [0.001, 0.01, 1],
             'n_estimators' : [2, 15, 30]
             },
        ),
    }

    # Fail loudly on an unknown name. With independent `if` tests a typo would
    # silently reuse whatever `model` / `params` an earlier cell left behind.
    if classifier_name not in search_spaces:
        raise ValueError(f"Unknown classifier_name {classifier_name!r}; expected one of {sorted(search_spaces)}")
    estimator_cls, params = search_spaces[classifier_name]
    model = estimator_cls()

    # Exhaustive search over the grid, 3-fold cross-validation, optimising F1.
    grid = GridSearchCV(estimator=model, param_grid=params, cv = 3, verbose=2, scoring="f1")
    grid.fit(X_train, y_train)

    print(f"\n--- {classifier_name} - Optimization with GridSearchCV performed in {time.time() - start_time} seconds ---")
    print(f"Grid search params : {params}")
    print(f"Best params : {grid.best_params_}")

    # Re-train with the best grid point and score it on the final test split.
    from my_libs.model_evaluator import ModelEvaluator
    evaluator = ModelEvaluator(model_type=classifier_name, params=grid.best_params_, X_train=X_train, y_train=y_train, X_test=X_test_final, y_test=y_test_final)
    model = evaluator.evaluate()

    # Optional: feature importances of the best estimator found by the grid search.
    # feats = {}
    # for feature, importance in zip(X_train.columns, grid.best_estimator_.feature_importances_):
    #     feats[feature] = importance
    #
    # importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
    #
    # # most important features
    # importances.sort_values(by='Gini-importance', ascending=False).head(20)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] END ................learning_rate=0.001, n_estimators=2; total time=   0.5s
[CV] END ................learning_rate=0.001, n_estimators=2; total time=   0.8s
[CV] END ................learning_rate=0.001, n_estimators=2; total time=   0.5s
[CV] END ...............learning_rate=0.001, n_estimators=15; total time=   2.7s
[CV] END ...............learning_rate=0.001, n_estimators=15; total time=   3.8s
[CV] END ...............learning_rate=0.001, n_estimators=15; total time=   4.0s
[CV] END ...............learning_rate=0.001, n_estimators=30; total time=   6.8s
[CV] END ...............learning_rate=0.001, n_estimators=30; total time=   5.6s
[CV] END ...............learning_rate=0.001, n_estimators=30; total time=   6.7s
[CV] END .................learning_rate=0.01, n_estimators=2; total time=   0.4s
[CV] END .................learning_rate=0.01, n_estimators=2; total time=   0.4s
[CV] END .................learning_rate=0.01, n_e

Classe prédite,0,1
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12917,3122
1,1496,2220



Classification report :
              precision    recall  f1-score   support

           0       0.90      0.81      0.85     16039
           1       0.42      0.60      0.49      3716

    accuracy                           0.77     19755
   macro avg       0.66      0.70      0.67     19755
weighted avg       0.81      0.77      0.78     19755

