In [1]:
import pandas as pd

quick = True

if quick:
    start_year, end_year, chk, sampled, filename = [2005, 2021, False, True, 'df-light.pkl']
else:
    start_year, end_year, chk, sampled, filename = [2005, 2021, True, False, 'df-full.pkl']

from sklearn.model_selection import train_test_split

df = pd.read_pickle(f'./{filename}')
data = df.iloc[:, 1:]
target = df['grav']

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=222)

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from category_encoders import TargetEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
import time

start_time = time.time()

cols_target_encoded = ['dep', 'age']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)
scaler = StandardScaler()

encoder_target = TargetEncoder(cols=cols_target_encoded)
encoder_onehot = OneHotEncoder(cols=cols_onehot_encoded)
sampler        = SMOTE(random_state=42)
# sampler        = RandomUnderSampler()
# sampler        = RandomOverSampler()
model          = DecisionTreeClassifier()


X_train_te = encoder_target.fit_transform(X_train, y_train)
X_test_te  = encoder_target.transform(X_test)

X_train_oh = encoder_onehot.fit_transform(X_train_te, y_train)
X_test_oh  = encoder_onehot.transform(X_test_te)

X_train_sc = scaler.fit_transform(X_train_oh)
X_test_sc  = scaler.transform(X_test_oh)

X_train_rs, y_train_rs = sampler.fit_resample(X_train_sc, y_train)
X_test_rs, y_test_rs   = sampler.fit_resample(X_test_sc, y_test)

params = {
    'max_depth' : [10, 20, 30, 40 ,50],
    'criterion' : ('gini', 'entropy'),
    'max_features' : ('auto', 'sqrt', 'log2'),
    'min_samples_split' : (2,4,6)
}

grid = GridSearchCV(estimator=model, param_grid=params, cv = 3, verbose=10)
grid.fit(X_train_rs, y_train_rs)

print('Best score  : ', grid.best_score_)
print('Best params : ', grid.best_params_)

y_pred = grid.predict(X_test_rs)
print(classification_report_imbalanced(y_test_rs, y_pred))

print("--- performed in %s seconds ---" % (time.time() - start_time))

Fitting 3 folds for each of 90 candidates, totalling 270 fits
[CV 1/3; 1/90] START criterion=gini, max_depth=10, max_features=auto, min_samples_split=2
[CV 1/3; 1/90] END criterion=gini, max_depth=10, max_features=auto, min_samples_split=2;, score=0.641 total time=   0.2s
[CV 2/3; 1/90] START criterion=gini, max_depth=10, max_features=auto, min_samples_split=2
[CV 2/3; 1/90] END criterion=gini, max_depth=10, max_features=auto, min_samples_split=2;, score=0.786 total time=   0.2s
[CV 3/3; 1/90] START criterion=gini, max_depth=10, max_features=auto, min_samples_split=2
[CV 3/3; 1/90] END criterion=gini, max_depth=10, max_features=auto, min_samples_split=2;, score=0.792 total time=   0.2s
[CV 1/3; 2/90] START criterion=gini, max_depth=10, max_features=auto, min_samples_split=4
[CV 1/3; 2/90] END criterion=gini, max_depth=10, max_features=auto, min_samples_split=4;, score=0.649 total time=   0.2s
[CV 2/3; 2/90] START criterion=gini, max_depth=10, max_features=auto, min_samples_split=4
[CV 

In [3]:
feats = {}
for feature, importance in zip(X_train_oh.columns, grid.best_estimator_.feature_importances_):
    feats[feature] = importance

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})

#Affichage des 8 variables les plus importantes
importances.sort_values(by='Gini-importance', ascending=False).head(20)

Unnamed: 0,Gini-importance
agg_1,0.071696
dep,0.068432
catv_1,0.046981
catr_1,0.041504
age,0.040795
etatp_1,0.03461
trajet_3,0.02926
circ_1,0.026124
agg_2,0.023798
col_7,0.021816


In [4]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

cols_target_encoded = ['dep', 'age']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)
scaler = StandardScaler()

encoder_target = TargetEncoder(cols=cols_target_encoded)
encoder_onehot = OneHotEncoder(cols=cols_onehot_encoded)

smt   = SMOTE(random_state=42)
model = DecisionTreeClassifier(criterion='entropy', max_depth=40, max_features='auto',min_samples_split=2)

X_train_enc_t = encoder_target.fit_transform(X_train, y_train)
X_train_enc_oh = encoder_onehot.fit_transform(X_train_enc_t, y_train)
X_train_sc = scaler.fit_transform(X_train_enc_oh)
X_train_rs, y_train_rs = smt.fit_resample(X_train_sc, y_train)
model.fit(X_train_rs, y_train_rs)

X_test_enc_t = encoder_target.transform(X_test)
X_test_enc_oh = encoder_onehot.transform(X_test_enc_t)
X_test_sc = scaler.transform(X_test_enc_oh)
X_test_rs, y_test_rs = smt.fit_resample(X_test_sc, y_test)
y_pred = model.predict(X_test_rs)

print(f"Train score : {model.score(X_train_rs, y_train_rs)}")
print(f"Test score : {model.score(X_test_rs, y_test_rs)}")

y_pred = grid.predict(X_test_rs)
print(classification_report_imbalanced(y_test_rs, y_pred))

print("--- performed in %s seconds ---" % (time.time() - start_time))

Train score : 0.9897526043776226
Test score : 0.7857004393900232
                   pre       rec       spe        f1       geo       iba       sup

          0       0.77      0.81      0.76      0.79      0.78      0.61     15476
          1       0.80      0.76      0.81      0.78      0.78      0.61     15476

avg / total       0.78      0.78      0.78      0.78      0.78      0.61     30952

--- performed in 68.98097157478333 seconds ---


In [16]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
import time

start_time = time.time()

cols_target_encoded = ['dep', 'age']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)
scaler = StandardScaler()

encoder_target = TargetEncoder(cols=cols_target_encoded)
encoder_onehot = OneHotEncoder(cols=cols_onehot_encoded)
smt   = SMOTE(random_state=42)
model = DecisionTreeClassifier()

dt_pipe = Pipeline([
                    ('encoding_target', encoder_target),
                    ('encoding_onehot', encoder_onehot),
                    ('scaling', scaler),
                    ('smote', smt),
                    # ('selection', selector),
                    ('model', model)
                    ])
param_grid = {
    'model__max_depth' : [10, 20, 30, 40 ,50],
    'model__criterion' : ('gini', 'entropy'),
    'model__max_features' : ('auto', 'sqrt', 'log2'),
    'model__min_samples_split' : (2,4,6)
}

grid = GridSearchCV(estimator=dt_pipe, param_grid=param_grid, cv = 3, verbose=10)
grid.fit(X_train, y_train)

print('Best score  : ', grid.best_score_)
print('Best params : ', grid.best_params_)

y_pred = grid.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))

print("--- performed in %s seconds ---" % (time.time() - start_time))

Fitting 3 folds for each of 90 candidates, totalling 270 fits
[CV 1/3; 1/90] START model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=2
[CV 1/3; 1/90] END model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=2;, score=0.729 total time=   7.3s
[CV 2/3; 1/90] START model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=2
[CV 2/3; 1/90] END model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=2;, score=0.734 total time=   6.4s
[CV 3/3; 1/90] START model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=2
[CV 3/3; 1/90] END model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=2;, score=0.728 total time=   6.6s
[CV 1/3; 2/90] START model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=4
[CV 1/3; 2/90] END 

KeyboardInterrupt: 

In [20]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
import time

start_time = time.time()

cols_target_encoded = ['dep', 'age']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)
scaler = StandardScaler()

encoder_target = TargetEncoder(cols=cols_target_encoded)
encoder_onehot = OneHotEncoder(cols=cols_onehot_encoded)
smt   = SMOTE(random_state=42)
model = DecisionTreeClassifier()

dt_pipe = Pipeline([
    ('encoding_target', encoder_target),
    ('encoding_onehot', encoder_onehot),
    ('scaling', scaler),
    ('smote', smt),
    # ('selection', selector),
    ('model', model)
], memory='./cache')
param_grid = {
    'model__max_depth' : [10, 20, 30, 40 ,50],
    'model__criterion' : ('gini', 'entropy'),
    'model__max_features' : ('auto', 'sqrt', 'log2'),
    'model__min_samples_split' : (2,4,6)
}

grid = GridSearchCV(estimator=dt_pipe, param_grid=param_grid, cv = 3, verbose=10)
grid.fit(X_train, y_train)

print('Best score  : ', grid.best_score_)
print('Best params : ', grid.best_params_)

y_pred = grid.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))

print("--- performed in %s seconds ---" % (time.time() - start_time))

Fitting 3 folds for each of 90 candidates, totalling 270 fits
[CV 1/3; 1/90] START model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=2
[CV 1/3; 1/90] END model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=2;, score=0.709 total time=   2.2s
[CV 2/3; 1/90] START model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=2
[CV 2/3; 1/90] END model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=2;, score=0.719 total time=   1.8s
[CV 3/3; 1/90] START model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=2
[CV 3/3; 1/90] END model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=2;, score=0.723 total time=   4.8s
[CV 1/3; 2/90] START model__criterion=gini, model__max_depth=10, model__max_features=auto, model__min_samples_split=4
[CV 1/3; 2/90] END 

KeyboardInterrupt: 