In [3]:
import pandas as pd

quick = True

if quick:
    start_year, end_year, chk, sampled, filename = [2005, 2021, False, True, 'df-light.pkl']
else:
    start_year, end_year, chk, sampled, filename = [2005, 2021, True, False, 'df-full.pkl']

from sklearn.model_selection import train_test_split

df = pd.read_pickle(f'./{filename}')
data = df.iloc[:, 1:]
target = df['grav']

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=222)

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from category_encoders import TargetEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
import time

start_time = time.time()

cols_target_encoded = ['dep', 'age']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)
scaler = StandardScaler()

encoder_target = TargetEncoder(cols=cols_target_encoded)
encoder_onehot = OneHotEncoder(cols=cols_onehot_encoded)
sampler        = SMOTE(random_state=42)
# sampler        = RandomUnderSampler()
# sampler        = RandomOverSampler()
model          = DecisionTreeClassifier()


X_train_te = encoder_target.fit_transform(X_train, y_train)
X_test_te  = encoder_target.transform(X_test)

X_train_oh = encoder_onehot.fit_transform(X_train_te, y_train)
X_test_oh  = encoder_onehot.transform(X_test_te)

X_train_sc = scaler.fit_transform(X_train_oh)
X_test_sc  = scaler.transform(X_test_oh)

X_train_rs, y_train_rs = sampler.fit_resample(X_train_sc, y_train)
X_test_rs, y_test_rs   = sampler.fit_resample(X_test_sc, y_test)

params = {
    'max_depth' : [10, 20, 30, 40 ,50],
    'criterion' : ('gini', 'entropy'),
    'max_features' : ('auto', 'sqrt', 'log2'),
    'min_samples_split' : (2,4,6)
}

grid = GridSearchCV(estimator=model, param_grid=params, cv = 3, n_jobs=-1)
grid.fit(X_train_rs, y_train_rs)

print('Best score  : ', grid.best_score_)
print('Best params : ', grid.best_params_)

y_pred = grid.predict(X_test_rs)
print(classification_report_imbalanced(y_test_rs, y_pred))

print("--- performed in %s seconds ---" % (time.time() - start_time))

Best score  :  0.7918674873804986
Best params :  {'criterion': 'gini', 'max_depth': 30, 'max_features': 'auto', 'min_samples_split': 6}
                   pre       rec       spe        f1       geo       iba       sup

          0       0.76      0.83      0.74      0.80      0.79      0.62     15490
          1       0.81      0.74      0.83      0.78      0.79      0.61     15490

avg / total       0.79      0.79      0.79      0.79      0.79      0.62     30980

--- performed in 99.72857713699341 seconds ---


In [14]:
feats = {}
for feature, importance in zip(X_train_oh.columns, grid.best_estimator_.feature_importances_):
    feats[feature] = importance

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})

#Affichage des 8 variables les plus importantes
importances.sort_values(by='Gini-importance', ascending=False).head(8)

Unnamed: 0,Gini-importance
agg_1,0.081774
age,0.065079
dep,0.053181
catv_1,0.046492
col_1,0.040865
catr_1,0.039829
trajet_1,0.031065
col_6,0.023298


In [13]:
len(grid.best_estimator_.feature_importances_)
X_train_oh

Unnamed: 0,place_1,place_2,place_3,place_4,place_5,place_6,place_7,place_8,place_9,place_10,...,catv_6,catv_7,age,joursem_1,joursem_2,joursem_3,joursem_4,joursem_5,joursem_6,joursem_7
1622124,1,0,0,0,0,0,0,0,0,0,...,0,0,0.226834,1,0,0,0,0,0,0
1859743,0,1,0,0,0,0,0,0,0,0,...,0,0,0.230889,0,1,0,0,0,0,0
334332,1,0,0,0,0,0,0,0,0,0,...,0,0,0.176117,0,0,1,0,0,0,0
1178903,1,0,0,0,0,0,0,0,0,0,...,0,0,0.198079,0,0,0,1,0,0,0
575589,1,0,0,0,0,0,0,0,0,0,...,0,0,0.188881,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965755,1,0,0,0,0,0,0,0,0,0,...,0,0,0.226834,0,1,0,0,0,0,0
1138032,1,0,0,0,0,0,0,0,0,0,...,0,0,0.216652,0,0,0,0,1,0,0
693454,1,0,0,0,0,0,0,0,0,0,...,0,0,0.230889,0,0,0,0,0,1,0
1625316,1,0,0,0,0,0,0,0,0,0,...,0,0,0.345679,0,0,1,0,0,0,0


In [27]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

cols_target_encoded = ['dep', 'age']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)
scaler = StandardScaler()

encoder_target = TargetEncoder(cols=cols_target_encoded)
encoder_onehot = OneHotEncoder(cols=cols_onehot_encoded)

smt   = SMOTE(random_state=42)
model = DecisionTreeClassifier(criterion='entropy', max_depth=40, max_features='auto',min_samples_split=2)

X_train_enc_t = encoder_target.fit_transform(X_train, y_train)
X_train_enc_oh = encoder_onehot.fit_transform(X_train_enc_t, y_train)
X_train_sc = scaler.fit_transform(X_train_enc_oh)
X_train_rs, y_train_rs = smt.fit_resample(X_train_sc, y_train)
model.fit(X_train_rs, y_train_rs)

X_test_enc_t = encoder_target.transform(X_test)
X_test_enc_oh = encoder_onehot.transform(X_test_enc_t)
X_test_sc = scaler.transform(X_test_enc_oh)
X_test_rs, y_test_rs = smt.fit_resample(X_test_sc, y_test)
y_pred = model.predict(X_test_rs)

print(f"Train score : {model.score(X_train_rs, y_train_rs)}")
print(f"Test score : {model.score(X_test_rs, y_test_rs)}")

y_pred = grid.predict(X_test_rs)
print(classification_report_imbalanced(y_test_rs, y_pred))

print("--- performed in %s seconds ---" % (time.time() - start_time))

Train score : 0.9940097402597402
Test score : 0.7841833440929632
                   pre       rec       spe        f1       geo       iba       sup

          0       0.78      0.82      0.77      0.80      0.79      0.63     15490
          1       0.81      0.77      0.82      0.79      0.79      0.63     15490

avg / total       0.80      0.79      0.79      0.79      0.79      0.63     30980

--- performed in 1261.5334310531616 seconds ---


In [None]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
import time

start_time = time.time()

cols_target_encoded = ['dep', 'age']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)
scaler = StandardScaler()

encoder_target = TargetEncoder(cols=cols_target_encoded)
encoder_onehot = OneHotEncoder(cols=cols_onehot_encoded)
smt   = SMOTE(random_state=42)
model = DecisionTreeClassifier()

dt_pipe = Pipeline([
                    ('encoding_target', encoder_target),
                    ('encoding_onehot', encoder_onehot),
                    ('scaling', scaler),
                    ('smote', smt),
                    # ('selection', selector),
                    ('model', model)
                    ])
param_grid = {
    'model__max_depth' : [10, 20, 30, 40 ,50],
    'model__criterion' : ('gini', 'entropy'),
    'model__max_features' : ('auto', 'sqrt', 'log2'),
    'model__min_samples_split' : (2,4,6)
}

grid = GridSearchCV(dt_pipe, param_grid=param_grid, cv = 3, n_jobs=-1)
grid.fit(X_train, y_train)

print('Best score  : ', grid.best_score_)
print('Best params : ', grid.best_params_)

y_pred = grid.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))

print("--- performed in %s seconds ---" % (time.time() - start_time))