In [6]:
import pandas as pd

quick = True             # work with sampled data to reduce computing time
run_gridSearchCV = False # run or not hyperparameters optimization with GridSearchCV()
run_optuna = True        # run or not hyperparameters optimization with Optuna

filename = 'df-light.pkl' if quick else 'df-full.pkl'

df = pd.read_pickle(f'./{filename}')
data = df.iloc[:, 1:]
target = df['grav']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=222)

In [7]:
import time
from my_libs.encoder_custom import EncoderCustom

start_time = time.time()

cols_target_encoded = []
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)

encoder = EncoderCustom(cols_target_encoded=cols_target_encoded, cols_onehot_encoded=cols_onehot_encoded)
X_train_enc, y_train_enc = encoder.transform(X_train, y_train, 'Train')
X_test_enc,  y_test_enc  = encoder.transform(X_test,  y_test,  'Test')

print("--- Features encoding performed in %s seconds ---" % (time.time() - start_time))


Default parameter min_samples_leaf will change in version 2.6.See https://github.com/scikit-learn-contrib/category_encoders/issues/327


Default parameter smoothing will change in version 2.6.See https://github.com/scikit-learn-contrib/category_encoders/issues/327



--- Features encoding performed in 27.17088222503662 seconds ---


In [13]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

if run_optuna:

    start_time = time.time()
    # classifier_name = 'DecisionTreeClassifier'
    # n_trials = 100

    # classifier_name = 'DecisionTreeClassifier'

    classifier_name = 'GradientBoostingClassifier'
    n_trials = 3


    # 1. Define an objective function to be maximized.
    def objective(trial):

        # 2. Suggest values for the hyperparameters using a trial object.
        # classifier_name = trial.suggest_categorical('classifier', ['DecisionTreeClassifier', 'RandomForestClassifier', 'KNeighborsClassifier'])
        if classifier_name == 'DecisionTreeClassifier':
            dt_max_depth         = trial.suggest_int('max_depth', 2, 50, log=True)
            dt_min_samples_split = trial.suggest_int('min_samples_split', 2, 6)
            # dt_criterion         = trial.suggest_categorical('criterion', ['gini', 'entropy'])
            dt_criterion         = 'gini'
            # dt_max_features      = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])
            dt_max_features      = 'auto'
            classifier_obj = DecisionTreeClassifier(max_depth=dt_max_depth, min_samples_split=dt_min_samples_split, criterion=dt_criterion, max_features=dt_max_features)
        else:
            if classifier_name == 'RandomForestClassifier':
                dt_n_estimators      = trial.suggest_int('n_estimators', 20, 100)
                # dt_criterion         = trial.suggest_categorical('criterion', ['gini', 'entropy'])
                dt_criterion = 'gini'
                dt_max_depth         = trial.suggest_int('max_depth', 10, 50, log=False)
                dt_min_samples_split = trial.suggest_int('min_samples_split', 2, 6)
                classifier_obj = RandomForestClassifier(n_estimators=dt_n_estimators, criterion=dt_criterion, max_depth=dt_max_depth, min_samples_split=dt_min_samples_split)
            else:
                if classifier_name == 'GradientBoostingClassifier':
                    dt_learning_rate = trial.suggest_float('learning_rate', 0.01, 1, log=True)
                    dt_n_estimators  = trial.suggest_int('n_estimators ', 1, 100, log=True)
                    classifier_obj = GradientBoostingClassifier(learning_rate=dt_learning_rate, n_estimators=dt_n_estimators)


        score = cross_val_score(classifier_obj, X_train_enc, y_train_enc, cv=3, scoring="roc_auc", verbose=1)
        accuracy = score.mean()

        return accuracy

    # 3. Create a study object and optimize the objective function.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=10)

    print("--- Optimization with Optuna performed in %s seconds ---" % (time.time() - start_time))

    fig = optuna.visualization.plot_param_importances(study)
    fig.show()

[32m[I 2023-02-19 06:22:50,147][0m A new study created in memory with name: no-name-3dbb0df3-229b-42a5-aaf5-d4974a5df237[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  5.3min finished
[32m[I 2023-02-19 06:28:10,018][0m Trial 0 finished with value: 0.9170250745550588 and parameters: {'learning_rate': 0.660601386343619, 'n_estimators ': 54}. Best is trial 0 with value: 0.9170250745550588.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    8.4s finished
[32m[I 2023-02-19 06:28:18,551][0m Trial 1 finished with value: 0.8116649666445058 and parameters: {'learning_rate': 0.15343537644235813, 'n_estimators ': 1}. Best is trial 0 with value: 0.9170250745550588.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  6.7min fini

--- Optimization with Optuna performed in 1434.9427917003632 seconds ---


In [None]:
# Best model
# params = {'classifier': 'RandomForestClassifier', 'n_estimators': 26, 'criterion': 'entropy', 'max_depth': 27, 'min_samples_split': 3}
# params = {'classifier': 'RandomForestClassifier', 'n_estimators': 26, 'criterion': 'entropy', 'max_depth': 47, 'min_samples_split': 2}
# params = {'classifier': 'DecisionTreeClassifier', 'max_depth': 32, 'criterion': 'entropy', 'max_features': 'log2', 'min_samples_split': 5}


params_dt = {'max_depth': 15, 'criterion': 'entropy', 'max_features': 'auto', 'min_samples_split': 3}
params_rf = {'n_estimators': 34, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 2}

In [None]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.metrics import classification_report_imbalanced

if run_gridSearchCV:
    start_time = time.time()

    model = RandomForestClassifier()

    params = {
        'n_estimators' : [10, 20, 30, 40 ,50],
        'criterion' : ('gini', 'entropy'),
        'max_depth' : [10, 20],
        'min_samples_split' : (2,4)
    }

    grid = GridSearchCV(estimator=model, param_grid=params, cv = 3, verbose=10, scoring="f1")
    grid.fit(X_train_enc, y_train_enc)

    print('Best score  : ', grid.best_score_)
    print('Best params : ', grid.best_params_)

    y_pred = grid.predict(X_test_enc)
    print(classification_report_imbalanced(y_test_enc, y_pred))

    print(f"model  : {model}")
    print(f"params : {params}")
    print("--- Optimization with GridSearchCV performed in %s seconds ---" % (time.time() - start_time))

    feats = {}
    for feature, importance in zip(X_train_enc.columns, grid.best_estimator_.feature_importances_):
        feats[feature] = importance

    importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})

    # 8 variables les plus importantes
    importances.sort_values(by='Gini-importance', ascending=False).head(20)