In [22]:
import pandas as pd

quick = True

if quick:
    start_year, end_year, chk, sampled, filename = [2005, 2021, False, True, 'df-light.pkl']
else:
    start_year, end_year, chk, sampled, filename = [2005, 2021, True, False, 'df-full.pkl']

from sklearn.model_selection import train_test_split

df = pd.read_pickle(f'./{filename}')
data = df.iloc[:, 1:]
target = df['grav']

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=222)

In [2]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

cols_target_encoded = ['dep', 'age']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)
scaler = StandardScaler()

encoder_target = TargetEncoder(cols=cols_target_encoded)
encoder_onehot = OneHotEncoder(cols=cols_onehot_encoded)

smt   = SMOTE(random_state=42)
model = DecisionTreeClassifier(max_depth=10)

X_train_enc_t = encoder_target.fit_transform(X_train, y_train)
X_train_enc_oh = encoder_onehot.fit_transform(X_train_enc_t, y_train)
X_train_sc = scaler.fit_transform(X_train_enc_oh)
X_train_rs, y_train_rs = smt.fit_resample(X_train_sc, y_train)
model.fit(X_train_rs, y_train_rs)

X_test_enc_t = encoder_target.transform(X_test)
X_test_enc_oh = encoder_onehot.transform(X_test_enc_t)
X_test_sc = scaler.transform(X_test_enc_oh)
X_test_rs, y_test_rs = smt.fit_resample(X_test_sc, y_test)
y_pred = model.predict(X_test_rs)

print(classification_report_imbalanced(y_test_rs, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.80      0.85      0.79      0.82      0.82      0.67     15490
          1       0.84      0.79      0.85      0.81      0.82      0.66     15490

avg / total       0.82      0.82      0.82      0.82      0.82      0.67     30980



In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
import time

start_time = time.time()

cols_target_encoded = ['dep', 'age']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)
scaler = StandardScaler()

encoder_target = TargetEncoder(cols=cols_target_encoded)
encoder_onehot = OneHotEncoder(cols=cols_onehot_encoded)
smt   = SMOTE(random_state=42)
model = DecisionTreeClassifier(max_depth=10)


X_train_te = encoder_target.fit_transform(X_train, y_train)
X_test_te  = encoder_target.transform(X_test)

X_train_oh = encoder_onehot.fit_transform(X_train_te, y_train)
X_test_oh  = encoder_onehot.transform(X_test_te)

X_train_sc = scaler.fit_transform(X_train_oh)
X_test_sc  = scaler.transform(X_test_oh)

X_train_rs, y_train_rs = smt.fit_resample(X_train_sc, y_train)
X_test_rs, y_test_rs   = smt.fit_resample(X_test_sc, y_test)

params = {
    'max_depth' : [10, 20, 30, 40 ,50],
    'criterion' : ('gini', 'entropy'),
    'max_features' : ('auto', 'sqrt', 'log2'),
    'min_samples_split' : (2,4,6)
}

grid = GridSearchCV(estimator=model, param_grid=params, cv = 3, n_jobs=-1)
grid.fit(X_train_rs, y_train_rs)

print('Best score  : ', grid.best_score_)
print('Best params : ', grid.best_params_)

y_pred = grid.predict(X_test_rs)
print(classification_report_imbalanced(y_test_rs, y_pred))

print("--- performed in %s seconds ---" % (time.time() - start_time))

Best score  :  0.7911775766926734
Best params :  {'criterion': 'entropy', 'max_depth': 40, 'max_features': 'auto', 'min_samples_split': 4}
                   pre       rec       spe        f1       geo       iba       sup

          0       0.76      0.84      0.74      0.80      0.79      0.62     15490
          1       0.82      0.74      0.84      0.78      0.79      0.61     15490

avg / total       0.79      0.79      0.79      0.79      0.79      0.62     30980

--- performed in 102.34112548828125 seconds ---


In [27]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
import time

start_time = time.time()

cols_target_encoded = ['dep', 'age']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)
scaler = StandardScaler()

encoder_target = TargetEncoder(cols=cols_target_encoded)
encoder_onehot = OneHotEncoder(cols=cols_onehot_encoded)
smt   = SMOTE(random_state=42)
model = DecisionTreeClassifier()

dt_pipe = Pipeline([
                    ('encoding_target', encoder_target),
                    ('encoding_onehot', encoder_onehot),
                    ('scaling', scaler),
                    ('smote', smt),
                    # ('selection', selector),
                    ('model', model)
                    ])
param_grid = {
    'model__max_depth' : [10, 20, 30, 40 ,50],
    'model__criterion' : ('gini', 'entropy'),
    'model__max_features' : ('auto', 'sqrt', 'log2'),
    'model__min_samples_split' : (2,4,6)
}

grid = GridSearchCV(dt_pipe, param_grid=param_grid, cv = 3, n_jobs=-1)
grid.fit(X_train, y_train)

print('Best score  : ', grid.best_score_)
print('Best params : ', grid.best_params_)

y_pred = grid.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))

print("--- performed in %s seconds ---" % (time.time() - start_time))

Best score  :  0.7483715552909672
Best params :  {'model__criterion': 'gini', 'model__max_depth': 20, 'model__max_features': 'auto', 'model__min_samples_split': 2}
                   pre       rec       spe        f1       geo       iba       sup

          0       0.85      0.84      0.47      0.84      0.63      0.41     15490
          1       0.46      0.47      0.84      0.47      0.63      0.38      4469

avg / total       0.76      0.76      0.55      0.76      0.63      0.40     19959

--- performed in 1528.062789440155 seconds ---


In [None]:
# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.over_sampling import RandomOverSampler, SMOTE
#
# if undersampling:
#     # Random Undersampling
#     rUs = RandomUnderSampler()
#     # X_ru, y_ru = rUs.fit_resample(X_train, y_train)
#     X_train, y_train = rUs.fit_resample(X_train, y_train)
#     # print('Classes échantillon undersampled :', y_ru.value_counts())
#     print('Classes échantillon undersampled :', y_train.value_counts())
# if oversampling:
#     rOs = RandomOverSampler()
#     X_train, y_train = rOs.fit_resample(X_train, y_train)
#     print('Classes échantillon oversampled :', dict(pd.Series(y_train).value_counts()))
# if smote:
#     smo = SMOTE()
#     X_train, y_train = smo.fit_resample(X_train, y_train)
#     print('Classes échantillon SMOTE :', dict(pd.Series(y_train).value_counts()))

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.tree import DecisionTreeClassifier
#
# parameters = {'max_depth'         : (10,20,30,40,50),
#               'criterion'         : ('gini', 'entropy'),
#               'max_features'      : ('auto', 'sqrt', 'log2'),
#               'min_samples_split' : (2,4,6)
#               }
#
# grid = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=parameters, cv=5, verbose=True)
#
# grid.fit(X_train, y_train)

In [None]:
# from imblearn.metrics import classification_report_imbalanced
#
# model = grid.best_estimator_
# print(model)
#
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(pd.crosstab(y_test, y_pred, rownames=['Classe réelle'], colnames=['Classe prédite']))
#
# print(f"\n{classification_report_imbalanced(y_test, y_pred)}")


In [None]:
# feats = {}
# for feature, importance in zip(data.columns, model.feature_importances_):
#     feats[feature] = importance
#
# importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
#
# #Affichage des 8 variables les plus importantes
# importances.sort_values(by='Gini-importance', ascending=False).head(8)

In [None]:
# from sklearn import tree
# tree.plot_tree(model);


In [None]:
# print(f"Train accuracy : {model.score(X_train, y_train):.3f}")
# print(f"Test accuracy : {model.score(X_test, y_test):.3f}")