In [1]:
import pandas as pd
from my_libs import lib_tools as pt # Project Tools
from time import time

quick = True

if quick:
    start_year, end_year, chk, sampled = [2005, 2021, True, True]
else:
    start_year, end_year, chk, sampled = [2005, 2021, True, False]

undersampling = False
oversampling = False
smote = True

In [2]:
# df = pt.load_proj_df(start_year, end_year, chk, sampled)
#
# df.to_pickle("./project_df.pkl")

In [3]:
df = pd.read_pickle("./project_df.pkl")

data = df.iloc[:, 1:]
target = df['grav']

col_target_encoded = ['jour', 'age', 'dep']

# target encoding for features with lots of modalities
for col in col_target_encoded:
    data = pt.encode_target_col(data, col, target, chk)

# hot encoding for other columns
for col in data.columns:
    if not (col in col_target_encoded):
        if col in data.columns:
            data = pt.encode_dummies_col(data, col, chk)

Column jour has been target encoded
Column age has been target encoded
Column dep has been target encoded
Column place has been dummies encoded
Column catu has been dummies encoded
Column sexe has been dummies encoded
Column trajet has been dummies encoded
Column locp has been dummies encoded
Column actp has been dummies encoded
Column etatp has been dummies encoded
Column mois has been dummies encoded
Column lum has been dummies encoded
Column agg has been dummies encoded
Column int has been dummies encoded
Column atm has been dummies encoded
Column col has been dummies encoded
Column catr has been dummies encoded
Column circ has been dummies encoded
Column nbv has been dummies encoded
Column vosp has been dummies encoded
Column prof has been dummies encoded
Column plan has been dummies encoded
Column surf has been dummies encoded
Column infra has been dummies encoded
Column situ has been dummies encoded
Column senc has been dummies encoded
Column catv has been dummies encoded
Column 

### Train test split + Normalization

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=222)

# normalisation (only for non categorical variables)
cols = col_target_encoded
sc = StandardScaler()
X_train[cols] = sc.fit_transform(X_train[cols])
X_test[cols] = sc.transform(X_test[cols])

### UnderSampler

In [5]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

if undersampling:
    # Random Undersampling
    rUs = RandomUnderSampler()
    # X_ru, y_ru = rUs.fit_resample(X_train, y_train)
    X_train, y_train = rUs.fit_resample(X_train, y_train)
    # print('Classes échantillon undersampled :', y_ru.value_counts())
    print('Classes échantillon undersampled :', y_train.value_counts())
if oversampling:
    rOs = RandomOverSampler()
    X_train, y_train = rOs.fit_resample(X_train, y_train)
    print('Classes échantillon oversampled :', dict(pd.Series(y_train).value_counts()))
if smote:
    smo = SMOTE()
    X_train, y_train = smo.fit_resample(X_train, y_train)
    print('Classes échantillon SMOTE :', dict(pd.Series(y_train).value_counts()))

Classes échantillon SMOTE : {0: 61586, 1: 61586}


### Balanced Random Forest

In [6]:
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier

t0 = time()

# Classifieur Balanced Random Forest
model = BalancedRandomForestClassifier()

model.fit(X_train, y_train.values)
y_pred = model.predict(X_test)
print("\nClassifieur Balanced Random Forest:")
if undersampling: print("avec undersampling")
if oversampling: print("avec oversampling")
if smote: print("avec smote")

print(pd.crosstab(y_test.values, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

tt = time() - t0
print("Réalisé en {} secondes".format(round(tt,3)))


Classifieur Balanced Random Forest:
avec smote
col_0      0     1
row_0             
0      13731  1785
1       2199  2243
                   pre       rec       spe        f1       geo       iba       sup

          0       0.86      0.88      0.50      0.87      0.67      0.46     15516
          1       0.56      0.50      0.88      0.53      0.67      0.43      4442

avg / total       0.79      0.80      0.59      0.80      0.67      0.46     19958

Réalisé en 61.743 secondes


### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

t0 = time()

model = DecisionTreeClassifier()

model.fit(X_train, y_train.values.ravel())
y_pred = model.predict(X_test)

print("\nDecision Tree:")
if undersampling: print("avec undersampling")
if oversampling: print("avec oversampling")
if smote: print("avec smote")

print(pd.crosstab(y_test.values, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

tt = time() - t0
print("Réalisé en {} secondes".format(round(tt,3)))


Decision Tree:
avec smote
col_0      0     1
row_0             
0      12434  3082
1       2354  2088
                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.80      0.47      0.82      0.61      0.39     15516
          1       0.40      0.47      0.80      0.43      0.61      0.36      4442

avg / total       0.74      0.73      0.54      0.73      0.61      0.38     19958

Réalisé en 4.027 secondes


In [8]:
2050/(2050+3101)

0.39798097456804504

### Support Vector Machine

In [9]:
from sklearn.svm import SVC

t0 = time()

model = SVC(gamma='scale', max_iter=1000)

model.fit(X_train, y_train.values.ravel())
y_pred = model.predict(X_test)

print("\nClassifieur SVM:\n")

if undersampling: print("avec undersampling")
if oversampling: print("avec oversampling")
if smote: print("avec smote")

print(pd.crosstab(y_test.values, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

tt = time() - t0
print("Réalisé en {} secondes".format(round(tt,3)))




Classifieur SVM:

avec smote
col_0     0      1
row_0             
0      2067  13449
1       888   3554
                   pre       rec       spe        f1       geo       iba       sup

          0       0.70      0.13      0.80      0.22      0.33      0.10     15516
          1       0.21      0.80      0.13      0.33      0.33      0.11      4442

avg / total       0.59      0.28      0.65      0.25      0.33      0.10     19958

Réalisé en 51.629 secondes
