In [None]:
import pandas as pd
from my_libs import lib_tools as pt

# Run configuration.
# run_type selects which pair of pickled train/test datasets is loaded.
run_type = 'dev'
# run_type = 'prd'
# encoding_step: True re-encodes the data and rewrites the ./X_*.pkl / ./y_*.pkl files;
# False reads those files back instead of encoding again.
encoding_step = False

# Supported run types mapped to their (train, test) pickle file names.
dataset_files = {
    'dev': ('df-dev-train.pkl', 'df-dev-test.pkl'),
    'prd': ('df-prd-train.pkl', 'df-prd-test.pkl'),
}
if run_type not in dataset_files:
    # Fail here with a clear message rather than with a NameError on the
    # file names further down the notebook.
    raise ValueError(f"Unknown run_type {run_type!r}; expected one of {sorted(dataset_files)}")
filename_train, filename_test = dataset_files[run_type]

# classifier_name = 'DecisionTreeClassifier'
# classifier_name = 'RandomForestClassifier'
# classifier_name = 'GradientBoostingClassifier'

# Column names handed to the data loader as its third argument.
columns = [
    'catv', 'agg', 'dep', 'col', 'catr', 'catu',
    'trajet', 'locp', 'circ', 'situ', 'lum', 'age_cls',
]

# The loader returns six objects, unpacked here as three (X, y) pairs:
# train, test and final test.
(
    X_train, y_train,
    X_test, y_test,
    X_test_final, y_test_final,
) = pt.get_train_valid_test_data(filename_train, filename_test, columns)

In [None]:
import time
import pandas as pd
from my_libs.encoder_custom import EncoderCustom

# Names of the six data splits; each one is cached on disk as './<name>.pkl'.
split_names = ['X_train', 'y_train', 'X_test', 'y_test', 'X_test_final', 'y_test_final']

if encoding_step:
    # 'dep' is given to the encoder as the target-encoded column and every
    # other column of X_train as a one-hot-encoded column.
    cols_target_encoded = ['dep']
    cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)

    encoder = EncoderCustom(cols_target_encoded=cols_target_encoded, cols_onehot_encoded=cols_onehot_encoded)
    # NOTE(review): the 'Train' / 'Test' flag presumably switches between
    # fitting and applying the encoders -- confirm in EncoderCustom.
    # NOTE: the inputs are overwritten by their encoded versions, so running
    # this cell twice in a row passes already-encoded frames back into the
    # encoder; re-run the data-loading cell first.
    X_train, y_train = encoder.transform(X_train, y_train, 'Train')
    X_test,  y_test  = encoder.transform(X_test,  y_test,  'Test')
    X_test_final, y_test_final = encoder.transform(X_test_final, y_test_final, 'Test')

    # Cache every encoded split so later runs can skip the encoding step.
    encoded_splits = [X_train, y_train, X_test, y_test, X_test_final, y_test_final]
    for split_name, split_data in zip(split_names, encoded_splits):
        split_data.to_pickle(f'./{split_name}.pkl')
else:
    # Reuse the cached encoded splits. The cache file names do not depend on
    # run_type, so they hold whatever was encoded last.
    X_train, y_train, X_test, y_test, X_test_final, y_test_final = [
        pd.read_pickle(f'./{split_name}.pkl') for split_name in split_names
    ]

In [None]:
# Report the shape of the training features currently bound to X_train.
print(f"Train dataset size : {X_train.shape}")

In [None]:
import time
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Disabled per-classifier trial budgets, kept for reference.
# if classifier_name == 'DecisionTreeClassifier': n_trials = 100
# if classifier_name == 'RandomForestClassifier': n_trials = 10
# if classifier_name == 'GradientBoostingClassifier': n_trials = 3
# Number of trials passed to study.optimize().
n_trials = 10
# Reference timestamp for the elapsed-time message printed after the optimization.
start_time = time.time()

# 1. Define an objective function to be maximized.
def objective(trial):
    """Return the mean 5-fold cross-validated F1 score of a sampled classifier.

    The classifier type and its hyperparameters are drawn from ``trial``; the
    model is scored on the notebook-level ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The mean of the per-fold F1 scores (the value the study maximizes).

    Raises:
        ValueError: if the sampled classifier name has no matching branch.
    """
    # 2. Suggest values for the hyperparameters using a trial object.
    # To let Optuna also pick the model family, widen the choices to
    # ['DecisionTreeClassifier', 'RandomForestClassifier', 'GradientBoostingClassifier'].
    classifier_name = trial.suggest_categorical('classifier', ['RandomForestClassifier'])

    if classifier_name == 'DecisionTreeClassifier':
        dt_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        dt_splitter = trial.suggest_categorical('splitter', ['best', 'random'])
        dt_max_depth = trial.suggest_int('max_depth', 2, 300, log=True)
        dt_min_samples_split = trial.suggest_int('min_samples_split', 2, 6)
        classifier_obj = DecisionTreeClassifier(
            criterion=dt_criterion,
            splitter=dt_splitter,
            max_depth=dt_max_depth,
            min_samples_split=dt_min_samples_split,
        )
    elif classifier_name == 'RandomForestClassifier':
        rf_n_estimators = trial.suggest_int('n_estimators', 50, 150)
        rf_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        rf_max_depth = trial.suggest_int('max_depth', 2, 20, log=True)
        rf_min_samples_split = trial.suggest_int('min_samples_split', 2, 6)
        # Every sampled value must reach the estimator: otherwise the study
        # reports "best" max_depth / min_samples_split values that never
        # influenced the score, and they are later used to train the final model.
        classifier_obj = RandomForestClassifier(
            n_estimators=rf_n_estimators,
            criterion=rf_criterion,
            max_depth=rf_max_depth,
            min_samples_split=rf_min_samples_split,
        )
    elif classifier_name == 'GradientBoostingClassifier':
        gb_learning_rate = trial.suggest_float('learning_rate', 0.001, 0.5, log=True)
        gb_n_estimators = trial.suggest_int('n_estimators', 50, 150)
        classifier_obj = GradientBoostingClassifier(
            learning_rate=gb_learning_rate,
            n_estimators=gb_n_estimators,
        )
    else:
        # Explicit error instead of an UnboundLocalError on classifier_obj below.
        raise ValueError(f"Unsupported classifier: {classifier_name!r}")

    # 5-fold cross-validation on the training data, scored with F1.
    fold_scores = cross_val_score(classifier_obj, X_train, y_train, cv=5, scoring="f1", verbose=1)
    mean_f1 = fold_scores.mean()

    return mean_f1

# 3. Create a study object and optimize the objective function.
# Maximize the value returned by objective() over n_trials trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)

# Wall-clock seconds elapsed since start_time was captured.
elapsed_seconds = time.time() - start_time
print(f"--- Optimization with Optuna performed in {elapsed_seconds} seconds ---")
print(f"Best params : {study.best_params}")

# fig = optuna.visualization.plot_param_importances(study)
# fig.show()

In [None]:
from my_libs.model_evaluator import ModelEvaluator

# Split the best trial's parameters into the classifier name and the
# remaining hyperparameters of that classifier.
params = study.best_params
model_type = params.pop('classifier')

# Evaluate the selected model on the final test split.
evaluator = ModelEvaluator(
    model_type=model_type,
    params=params,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test_final,
)
model = evaluator.evaluate()

In [None]:
# Display the shape of the training features (kept as the cell's last expression so it is shown).
X_train.shape

In [None]:
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

In [None]:
# Optimization history of the study (bare expression so the figure is displayed).
plot_optimization_history(study)

In [None]:
# Contour plot of the study; no `params` argument is given, so no parameter subset is selected here.
plot_contour(study)

In [None]:
# Contour plot restricted to the first two entries of `params` (the best
# hyperparameters left after 'classifier' was removed in the evaluation cell).
# Slicing passes a shorter list instead of raising IndexError when fewer than
# two hyperparameters are available.
plot_contour(study, params=list(params)[:2])

In [None]:
# Slice plot of the study (bare expression so the figure is displayed).
plot_slice(study)

In [None]:
# Hyperparameter importances with the default target.
plot_param_importances(study)

In [None]:
# Hyperparameter importances computed against each trial's duration in seconds
# (via the `target` callable) instead of the default target.
optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)

In [None]:
# EDF plot of the study (bare expression so the figure is displayed).
plot_edf(study)