In [None]:
import pandas as pd

# Dataset switch: True loads the light pickle, False loads the full one.
quick = True

if quick:
    filename = 'df-light.pkl'
else:
    filename = 'df-full.pkl'

# The pickle is read from the current working directory.
df = pd.read_pickle(f'./{filename}')

# Target is the 'grav' column; features are every column except the first one (by position).
# NOTE(review): this only excludes the target if 'grav' is the first column of df — confirm.
target = df['grav']
data = df.iloc[:, 1:]

from sklearn.model_selection import train_test_split

# 80 % / 20 % train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=222,
)

In [None]:
import time
from my_libs.encoder_custom import EncoderCustom

start_time = time.time()

# No column is target-encoded at the moment, so every feature column ends up in the one-hot list.
# Alternative kept for reference: cols_target_encoded = ['dep', 'age']
cols_target_encoded = []
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)

encoder = EncoderCustom(
    cols_target_encoded=cols_target_encoded,
    cols_onehot_encoded=cols_onehot_encoded,
)

# The same encoder instance processes both splits.
# NOTE(review): the third argument presumably tells EncoderCustom which split it is handling — confirm.
X_train_rs, y_train_rs = encoder.transform(X_train, y_train, 'Train')
X_test_rs, y_test_rs = encoder.transform(X_test, y_test, 'Test')

print("--- Features encoding performed in %s seconds ---" % (time.time() - start_time,))

In [None]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Wall-clock start used by the elapsed-time message printed after the Optuna study.
start_time = time.time()

# 1. Define an objective function to be maximized.
def objective(trial):
    """Score one Optuna trial: build the sampled classifier and cross-validate it.

    Args:
        trial: Optuna trial object used to sample the classifier family and
            its hyperparameters.

    Returns:
        Mean of the 3-fold cross-validated "f1" scores of the sampled
        classifier on the module-level ``X_train_rs`` / ``y_train_rs``.

    Raises:
        ValueError: if the sampled classifier name has no branch below.
    """
    # 2. Suggest values for the hyperparameters using a trial object.
    # To widen the search: ['DecisionTreeClassifier', 'RandomForestClassifier', 'SVC']
    classifier_name = trial.suggest_categorical('classifier', ['RandomForestClassifier'])

    if classifier_name == 'DecisionTreeClassifier':
        # Tree parameters carry a 'dt_' prefix: Optuna refuses to reuse a categorical
        # parameter name (here 'criterion') with a different list of choices inside
        # one study, which would clash with the random-forest branch.
        dt_max_depth = trial.suggest_int('dt_max_depth', 10, 50, log=False)
        # Wider alternative: ['gini', 'entropy']
        dt_criterion = trial.suggest_categorical('dt_criterion', ['entropy'])
        # 'auto' was only an alias of 'sqrt' for tree classifiers and is rejected
        # by recent scikit-learn releases, so it is not offered.
        dt_max_features = trial.suggest_categorical('dt_max_features', ['sqrt', 'log2'])
        # suggest_int needs both bounds; low == high pins the value to 3
        # (wider alternative: 2, 6).
        dt_min_samples_split = trial.suggest_int('dt_min_samples_split', 3, 3)
        classifier_obj = DecisionTreeClassifier(
            max_depth=dt_max_depth,
            min_samples_split=dt_min_samples_split,
            criterion=dt_criterion,
            max_features=dt_max_features,
        )
    elif classifier_name == 'RandomForestClassifier':
        rf_n_estimators = trial.suggest_int('n_estimators', 20, 50)
        rf_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        rf_max_depth = trial.suggest_int('max_depth', 10, 50, log=False)
        rf_min_samples_split = trial.suggest_int('min_samples_split', 2, 6)
        classifier_obj = RandomForestClassifier(
            n_estimators=rf_n_estimators,
            criterion=rf_criterion,
            max_depth=rf_max_depth,
            min_samples_split=rf_min_samples_split,
        )
    elif classifier_name == 'SVC':
        svc_C = trial.suggest_float('C', 0.1, 10, log=True)
        svc_gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
        classifier_obj = SVC(C=svc_C, gamma=svc_gamma)
    else:
        # Fail loudly instead of hitting an unbound classifier_obj below.
        raise ValueError(f"Unsupported classifier: {classifier_name!r}")

    # NOTE(review): scoring="f1" is scikit-learn's binary F1 — assumes y_train_rs
    # has exactly two classes; confirm.
    scores = cross_val_score(classifier_obj, X_train_rs, y_train_rs, cv=3, scoring="f1", verbose=2)
    mean_f1 = scores.mean()

    return mean_f1

# 3. Build a study that maximizes the objective's return value, then run 20 trials.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=20)

# Elapsed time is measured from the start_time set at the top of this cell.
print("--- Optimization with Optuna performed in %s seconds ---" % (time.time() - start_time,))

In [None]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.metrics import classification_report_imbalanced

start_time = time.time()

# Base estimator handed to the grid search (default hyperparameters).
model = RandomForestClassifier()

# Reduced grid: 2 x 2 x 2 x 2 = 16 combinations.
# Wider n_estimators alternative kept for reference: [10, 20, 30, 40, 50]
params = {
    'n_estimators': [20, 30],
    'criterion': ('gini', 'entropy'),
    'max_depth': [10, 20],
    'min_samples_split': (2, 4),
}

# Exhaustive search over the grid, 3-fold cross-validation, scored with "f1".
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=3,
    scoring="f1",
    verbose=10,
)
grid.fit(X_train_rs, y_train_rs)

print('Best score  : ', grid.best_score_)
print('Best params : ', grid.best_params_)

# Evaluate the fitted search object on the held-out test split.
y_pred = grid.predict(X_test_rs)
print(classification_report_imbalanced(y_test_rs, y_pred))

# Log the base estimator and the searched grid next to the timing.
print(f"model  : {model}")
print(f"params : {params}")
print("--- Optimization with GridSearchCV performed in %s seconds ---" % (time.time() - start_time,))

In [None]:
# Pair every encoded feature name with its importance in the best estimator of the grid search.
feats = dict(zip(X_train_rs.columns, grid.best_estimator_.feature_importances_))

# One row per feature, with the single value column named 'Gini-importance'.
importances = pd.DataFrame.from_dict(feats, orient='index')
importances = importances.rename(columns={0: 'Gini-importance'})

# Top 20 features by importance (shown as the cell output).
importances.sort_values(by='Gini-importance', ascending=False).head(20)