In [None]:
import pandas as pd

# --- Run switches: each one enables one hyperparameter-search cell further down ---
run_gridSearchCV = True  # exhaustive search with GridSearchCV()
run_optuna = True        # search with Optuna

# --- Input dataset: keep exactly one of these lines active ---
# filename = 'df-very-light.pkl'
filename = 'df-light.pkl'
# filename = 'df-full.pkl'

from sklearn.model_selection import train_test_split

# Feature columns kept for modelling, in the order they are fed to the models.
feature_columns = ['catv', 'agg', 'dep', 'col', 'catr', 'catu', 'trajet', 'locp', 'circ', 'situ', 'lum', 'age_cls']

# NOTE: read_pickle unpickles the file, so only load pickles from a trusted source.
df = pd.read_pickle(f'./{filename}')

# Label column.
target = df['grav']

# Same pipeline as before, written as one chain: skip the first column,
# drop 'an', then keep only the selected feature columns.
data = df.iloc[:, 1:].drop(columns=['an'])[feature_columns]

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=222)

In [None]:
import time
from my_libs.encoder_custom import EncoderCustom

# Split the feature columns between the two encoder arguments:
# 'dep' goes to `cols_target_encoded`, every other column to `cols_onehot_encoded`.
cols_target_encoded = ['dep']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)

# NOTE(review): EncoderCustom is project code that is not visible from this notebook.
# Presumably the 'Train' call fits the encoders and the 'Test' call reuses them — confirm.
encoder = EncoderCustom(cols_target_encoded=cols_target_encoded, cols_onehot_encoded=cols_onehot_encoded)
# These two lines rebind X_train/y_train/X_test/y_test to the encoder's output,
# so re-running this cell feeds already-transformed data back into the encoder.
X_train, y_train = encoder.transform(X_train, y_train, 'Train')
X_test,  y_test  = encoder.transform(X_test,  y_test,  'Test')

In [None]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

if run_optuna:

    # Total number of Optuna trials, shared by the three candidate classifiers.
    n_trials = 50
    start_time = time.time()

    # 1. Define an objective function to be maximized.
    def objective(trial):
        """Score one Optuna trial.

        The trial picks one of three classifiers and its hyperparameters; the
        classifier is then scored with 3-fold cross-validation on the training
        data (X_train / y_train from the notebook scope).

        Args:
            trial: the optuna trial used to suggest the hyperparameters.

        Returns:
            The mean F1 score over the 3 cross-validation folds.

        Raises:
            ValueError: if the suggested classifier name is not handled below.
        """
        # 2. Suggest values for the hyperparameters using a trial object.
        classifier_name = trial.suggest_categorical(
            'classifier',
            ['DecisionTreeClassifier', 'RandomForestClassifier', 'GradientBoostingClassifier'],
        )

        if classifier_name == 'DecisionTreeClassifier':
            dt_max_depth = trial.suggest_int('max_depth', 2, 50, log=True)
            dt_min_samples_split = trial.suggest_int('min_samples_split', 2, 6)
            classifier_obj = DecisionTreeClassifier(max_depth=dt_max_depth, min_samples_split=dt_min_samples_split)
        elif classifier_name == 'RandomForestClassifier':
            rf_n_estimators = trial.suggest_int('n_estimators', 5, 50)
            rf_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
            classifier_obj = RandomForestClassifier(n_estimators=rf_n_estimators, criterion=rf_criterion)
        elif classifier_name == 'GradientBoostingClassifier':
            gb_learning_rate = trial.suggest_float('learning_rate', 0.001, 1, log=True)
            # NOTE(review): this Optuna name ends with a space, which makes it a
            # different parameter from the random-forest 'n_estimators' above.
            # Optuna appears to reject reusing one name with a different `log`
            # setting, which may be why it was written this way — confirm.
            # Any code that forwards study.best_params as keyword arguments
            # must strip this key first.
            gb_n_estimators = trial.suggest_int('n_estimators ', 1, 30, log=True)
            classifier_obj = GradientBoostingClassifier(learning_rate=gb_learning_rate, n_estimators=gb_n_estimators)
        else:
            # Unreachable with the choices above; fail loudly if the list is ever
            # extended without adding a matching branch.
            raise ValueError(f"Unsupported classifier: {classifier_name}")

        fold_scores = cross_val_score(classifier_obj, X_train, y_train, cv=3, scoring="f1", verbose=1)
        return fold_scores.mean()

    # 3. Create a study object and optimize the objective function.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    # The classifier name is local to `objective`, so report the winner from the
    # study instead of reading a notebook-level variable that may not exist yet.
    elapsed_seconds = time.time() - start_time
    best_classifier = study.best_params['classifier']
    print(f"--- Optimization with Optuna performed in {elapsed_seconds} seconds ---")
    print(f"Best classifier : {best_classifier}")
    print(f"Best params : {study.best_params}")

    # Optional: plot the hyperparameter importances of the study.
    # fig = optuna.visualization.plot_param_importances(study)
    # fig.show()

In [None]:
from my_libs.model_evaluator import ModelEvaluator

# `study` only exists when the Optuna search cell actually ran, so this cell is
# skipped (instead of failing with a NameError) when run_optuna is off.
if run_optuna:
    best_trial_params = dict(study.best_params)

    # The 'classifier' entry selects the model type; it is not a hyperparameter.
    model_type = best_trial_params.pop('classifier')

    # The Optuna search registers one name with a trailing space ('n_estimators ').
    # Strip the names so they are valid keyword names for the estimator.
    # NOTE(review): assumes ModelEvaluator forwards `params` to the estimator
    # constructor — confirm against my_libs.model_evaluator.
    params = {name.strip(): value for name, value in best_trial_params.items()}

    evaluator = ModelEvaluator(model_type=model_type, params=params, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
    model = evaluator.evaluate()

In [None]:
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

if run_gridSearchCV:

    from my_libs.model_evaluator import ModelEvaluator

    # Classifier tuned by this cell; must be one of the keys of `search_spaces`.
    classifier_name = 'GradientBoostingClassifier'

    # Estimator class and hyperparameter grid for each supported classifier.
    search_spaces = {
        'DecisionTreeClassifier': (
            DecisionTreeClassifier,
            {'max_depth': [2, 10, 30, 50],
             'min_samples_split': [2, 4, 6]},
        ),
        'RandomForestClassifier': (
            RandomForestClassifier,
            {'n_estimators': [5, 10, 25, 50],
             'criterion': ('gini', 'entropy')},
        ),
        'GradientBoostingClassifier': (
            GradientBoostingClassifier,
            {'learning_rate': [0.001, 0.01, 1],
             'n_estimators': [2, 15, 30]},
        ),
    }

    # Fail loudly on an unknown name instead of silently reusing `model` and
    # `params` left over from a previously executed cell.
    if classifier_name not in search_spaces:
        raise ValueError(f"Unsupported classifier: {classifier_name}")

    start_time = time.time()

    estimator_cls, params = search_spaces[classifier_name]
    model = estimator_cls()

    # Exhaustive search over the grid, scored by F1 with 3-fold cross-validation.
    grid = GridSearchCV(estimator=model, param_grid=params, cv = 3, verbose=2, scoring="f1")
    grid.fit(X_train, y_train)

    print(f"\n--- {classifier_name} - Optimization with GridSearchCV performed in %s seconds ---" % (time.time() - start_time))
    print(f"Grid search params : {params}")
    print(f"Best params : {grid.best_params_}")

    # Evaluate the best grid-search configuration the same way the Optuna result
    # is evaluated; previously the evaluator was built but never run.
    evaluator = ModelEvaluator(model_type=classifier_name, params=grid.best_params_, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
    model = evaluator.evaluate()

    # Optional: feature importances of the best estimator found by the grid search.
    # feats = {}
    # for feature, importance in zip(X_train.columns, grid.best_estimator_.feature_importances_):
    #     feats[feature] = importance
    #
    # importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
    #
    # # most important features
    # importances.sort_values(by='Gini-importance', ascending=False).head(20)