### Get Train, Valid, Test data

In [1]:
import pandas as pd
from my_libs import lib_tools as pt

run_type = 'dev'
# run_type = 'prd'
resample = True

X_train, y_train, X_valid, y_valid, X_test, y_test = pt.get_train_valid_test_data(run_type)
print("Train, valid and Test data loaded")

Train, valid and Test data loaded


### Resample data with SMOTEN()

In [2]:
if resample:
    X_train, y_train = pt.get_data_resampled(X=X_train, y=y_train, verbose=1)
    # Save data generated
    X_train.to_pickle(f'./pickles/X_train_smote_{run_type}.pkl')
    y_train.to_pickle(f'./pickles/y_train_smote_{run_type}.pkl')
else:
    # Load data previously generated
    X_train = pd.read_pickle(f'./pickles/X_train_smote_{run_type}.pkl')
    y_train = pd.read_pickle(f'./pickles/X_train_smote_{run_type}.pkl')

--- Smote applied in 4.238809108734131 seconds ---
Classes cardinality after resampling :
0    4900
1    4900
Name: grav, dtype: int64
X shape : (6400, 28) -> (9800, 28)
y shape : (6400,) -> (9800,)


### Encode categorical data (target and one hot encoding)

In [3]:
from my_libs.encoder_custom import  EncoderCustom

cols_target_encoded = ['dep']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)

encoder = EncoderCustom(cols_target_encoded=cols_target_encoded, cols_onehot_encoded=cols_onehot_encoded)

X_train, y_train = encoder.transform(X=X_train, y=y_train, datatype='Train')
X_valid, y_valid = encoder.transform(X=X_valid, y=y_valid, datatype='Test')
X_test , y_test  = encoder.transform(X=X_test , y=y_test , datatype='Test')



Columns target encoded : ['dep']
Columns one hot encoded : ['place', 'catu', 'sexe', 'trajet', 'locp', 'actp', 'etatp', 'an', 'mois', 'lum', 'agg', 'int', 'atm', 'col', 'catr', 'circ', 'nbv', 'vosp', 'prof', 'plan', 'surf', 'infra', 'situ', 'senc', 'catv', 'age_cls', 'joursem']
Features normalized
--- Train set - features encoding performed in 1.04 seconds ---
--- Test set - features encoding performed in 0.08 seconds ---
--- Test set - features encoding performed in 0.10 seconds ---


### Find best hyperparameters for model with Optuna

In [14]:
import time
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

n_trials = 10
start_time = time.time()

def objective(trial):

    dt_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    dt_splitter = trial.suggest_categorical('splitter', ['best', 'random'])
    dt_max_depth = trial.suggest_int('max_depth', 2, 300, log=True)
    dt_min_samples_split = trial.suggest_int('min_samples_split', 2, 6)
    classifier_obj = DecisionTreeClassifier(criterion=dt_criterion, splitter=dt_splitter, max_depth=dt_max_depth, min_samples_split=dt_min_samples_split)

    score = cross_val_score(classifier_obj, X_train, y_train, cv=5, scoring="f1", verbose=1)
    accuracy = score.mean()

    return accuracy

# Create a study object and optimize the objective function.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)

print(f"\n--- Decision Tree Classifier - Optimization with Optuna performed in %s seconds ---" % (time.time() - start_time))
print(f"Best params : {study.best_params}")

[32m[I 2023-02-24 18:17:12,145][0m A new study created in memory with name: no-name-abc80054-eb3d-488e-9b6a-d377e1dba0a5[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
[32m[I 2023-02-24 18:17:12,306][0m Trial 0 finished with value: 0.41255746875915567 and parameters: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 10, 'min_samples_split': 3}. Best is trial 0 with value: 0.41255746875915567.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[32m[I 2023-02-24 18:17:12,419][0m Trial 1 finished with value: 0.3895286348518959 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 13, 'min_samples_split': 6}. Best is trial 0 with value: 0.41255746875915567.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Para


--- Decision Tree Classifier - Optimization with Optuna performed in 1.1701242923736572 seconds ---
Best params : {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 16, 'min_samples_split': 3}


### Fit best model

In [20]:
params = study.best_params
model = DecisionTreeClassifier(**params)
model.fit(X_train, y_train)

model.feature_importances_
X_train


Unnamed: 0,place,catu,sexe,trajet,locp,actp,etatp,an,mois,lum,...,vosp,prof,plan,surf,infra,situ,senc,catv,age_cls,joursem
1559992,2,2,2,0,0,0,0,2014,3,5,...,0,1,1,1,5,1,0,7,4,5
1992544,1,1,1,1,0,0,0,2017,11,3,...,0,2,1,1,0,1,2,7,3,2
1319310,1,1,1,1,0,0,0,2012,10,1,...,0,1,1,2,0,1,0,7,1,5
1256159,1,1,2,5,0,0,0,2012,3,1,...,0,1,1,1,0,1,0,7,1,5
1561576,1,1,1,4,0,0,0,2014,9,2,...,0,1,1,1,0,1,0,-1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251931,1,1,1,5,0,0,0,2006,5,1,...,0,1,1,1,0,3,0,7,3,5
477973,0,3,1,0,4,3,1,2007,7,2,...,0,2,1,1,0,1,0,7,0,1
57525,1,1,1,5,0,0,0,2005,5,2,...,0,1,1,1,0,1,0,7,2,6
1572756,1,1,1,0,0,0,0,2014,6,1,...,0,3,1,1,0,1,0,33,2,0
