In [None]:
import pandas as pd
from my_libs import lib_tools as pt

# --- Run configuration -------------------------------------------------------
run_gridSearchCV = True  # True to run hyperparameters optimization with GridSearchCV()
run_optuna = True        # True to run hyperparameters optimization with Optuna

# Selects which pair of pickled train/test files is loaded; keep exactly one
# assignment uncommented.
run_type = 'very-light'
# run_type = 'light'
# run_type = 'full'

# Fail fast on an unsupported value. Without this check a typo would leave
# filename_train / filename_test undefined on a fresh kernel, or silently keep
# the values of a previous run in a live kernel.
supported_run_types = ('very-light', 'light', 'full')
if run_type not in supported_run_types:
    raise ValueError(
        f"Unknown run_type {run_type!r}; expected one of {supported_run_types}"
    )

# All file names follow the same 'df-<run_type>-<split>.pkl' pattern.
filename_train = f'df-{run_type}-train.pkl'
filename_test = f'df-{run_type}-test.pkl'

# Classifier selected for hyperparameter tuning; keep exactly one assignment
# uncommented.
# classifier_name = 'DecisionTreeClassifier'
# classifier_name = 'RandomForestClassifier'
classifier_name = 'GradientBoostingClassifier'

# Columns handed to the data loader.
columns = [
    'catv', 'agg', 'dep', 'col', 'catr', 'catu',
    'trajet', 'locp', 'circ', 'situ', 'lum', 'age_cls',
]

# The loader returns six objects: features/target for the train split and for
# two test splits (the "final" one is kept for the last evaluation).
(
    X_train, y_train,
    X_test, y_test,
    X_test_final, y_test_final,
) = pt.get_train_valid_test_data(filename_train, filename_test, columns)

In [None]:
import time
from my_libs.encoder_custom import EncoderCustom

# Split the feature columns into two groups that are handed to EncoderCustom:
# 'dep' goes in the target-encoded group, every other column of X_train in the
# one-hot-encoded group.
cols_target_encoded = ['dep']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)

encoder = EncoderCustom(cols_target_encoded=cols_target_encoded, cols_onehot_encoded=cols_onehot_encoded)
# The same encoder instance is applied to the three splits, train first.
# NOTE(review): presumably the 'Train' call fits the encoder and the 'Test'
# calls reuse the fitted state - confirm against EncoderCustom.transform.
# NOTE(review): each call overwrites its own inputs, so this cell is not
# idempotent: re-running it without re-running the loading cell would feed
# already-encoded frames back into the encoder.
X_train, y_train = encoder.transform(X_train, y_train, 'Train')
X_test, y_test = encoder.transform(X_test,  y_test,  'Test')
X_test_final, y_test_final = encoder.transform(X_test_final, y_test_final, 'Test')

In [None]:
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from my_libs.model_evaluator import ModelEvaluator

# Hyperparameter search with GridSearchCV for the classifier named by
# `classifier_name`, followed by an evaluation of the best parameters on the
# final test split.
# NOTE(review): the `run_gridSearchCV` flag is not consulted here; the search
# always runs. Later cells read `grid`, so guarding it needs a wider change.
start_time = time.time()

# Pick the estimator and its search grid. The branches are mutually exclusive
# and an unknown name is rejected explicitly, so a typo cannot fall through
# and reuse a stale `model` / `params` left in the kernel by an earlier run.
# NOTE(review): no random_state is passed to the estimators, so results may
# vary between runs.
if classifier_name == 'DecisionTreeClassifier':
    model = DecisionTreeClassifier()
    params = {'max_depth' : [2, 10, 30, 50],
              'min_samples_split' : [2,4,6]
              }
elif classifier_name == 'RandomForestClassifier':
    model = RandomForestClassifier()
    params = {'n_estimators' : [5,10,25,50],
              'criterion' : ('gini', 'entropy'),
    }
elif classifier_name == 'GradientBoostingClassifier':
    model = GradientBoostingClassifier()
    params = {'learning_rate' : [0.001, 0.01, 0.5],
              'n_estimators' : [50, 100, 200]
              }
else:
    raise ValueError(
        f"Unsupported classifier_name {classifier_name!r}; expected "
        "'DecisionTreeClassifier', 'RandomForestClassifier' or "
        "'GradientBoostingClassifier'"
    )

# 3-fold cross-validated exhaustive search, scored with F1.
# NOTE(review): scoring="f1" assumes a binary target - confirm for y_train.
grid = GridSearchCV(estimator=model, param_grid=params, cv = 3, verbose=2, scoring="f1")
grid.fit(X_train, y_train)

# Single f-string instead of mixing an f-string with %-formatting: the mixed
# form breaks if the interpolated name ever contains a '%' character.
elapsed_seconds = time.time() - start_time
print(f"\n--- {classifier_name} - Optimization with GridSearchCV performed in {elapsed_seconds} seconds ---")
print(f"Grid search params : {params}")
print(f"Best params : {grid.best_params_}")

# Evaluate the best parameters on the final test split; `model` is rebound to
# whatever evaluate() returns.
# NOTE(review): the last cell of the notebook repeats this exact evaluation.
evaluator = ModelEvaluator(model_type=classifier_name, params=grid.best_params_, X_train=X_train, y_train=y_train, X_test=X_test_final, y_test=y_test_final)
model = evaluator.evaluate()

In [None]:
# Pair every training column with the importance score reported by the best
# estimator found by the grid search.
feats = dict(zip(X_train.columns, grid.best_estimator_.feature_importances_))

# One row per feature, with the score in a single named column.
importances = (
    pd.DataFrame.from_dict(feats, orient='index')
    .rename(columns={0: 'Gini-importance'})
)

# Most important variables (top 20, highest score first)
(
    importances
    .sort_values(by='Gini-importance', ascending=False)
    .head(20)
)

In [None]:
from my_libs.model_evaluator import ModelEvaluator

# Evaluate the best grid-search parameters: the evaluator receives the
# training data and the final test split, and `model` is rebound to whatever
# evaluate() returns.
evaluator = ModelEvaluator(
    model_type=classifier_name,
    params=grid.best_params_,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test_final,
)
model = evaluator.evaluate()