In [3]:
import pandas as pd
import lib_tools as pt # Project Tools
from time import time

quick = True

# Every run covers 2005-2021 with checks switched on; the `quick` switch only
# decides whether the sampled dataset is requested.
start_year, end_year = 2005, 2021
chk = True
sampled = True if quick else False

In [4]:
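# One-off cache build: uncomment to regenerate ./project_df.pkl, which the
# next cell loads (e.g. after changing `quick`).
# df = pt.load_proj_df(start_year, end_year, chk, sampled)
#
# df.to_pickle("./project_df.pkl")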

In [5]:
# NOTE(security): unpickling can execute arbitrary code, so only load a
# project_df.pkl that was produced by this project.
df = pd.read_pickle("./project_df.pkl")

# Split label and features by column name rather than by position, so the
# target can never end up among the features if the column order changes.
target = df['grav']
data = df.drop(columns='grav')

col_target_encoded = ['jour', 'age', 'dep']

# Target encoding for features with lots of modalities.
# NOTE(review): this runs on the full dataset, before the train/test split.
# If pt.encode_target_col derives its values from `target`, test labels leak
# into the training features - TODO confirm and move after the split if so.
for col in col_target_encoded:
    data = pt.encode_target_col(data, col, target, chk)

# One-hot (dummies) encoding for every remaining column. Iterate over a
# snapshot of the column names because `data` is rebound inside the loop;
# the membership check skips columns that an earlier step already replaced.
for col in list(data.columns):
    if col not in col_target_encoded and col in data.columns:
        data = pt.encode_dummies_col(data, col, chk)

Column jour has been target encoded
Column age has been target encoded
Column dep has been target encoded
Column place has been dummies encoded
Column catu has been dummies encoded
Column sexe has been dummies encoded
Column trajet has been dummies encoded
Column locp has been dummies encoded
Column actp has been dummies encoded
Column etatp has been dummies encoded
Column mois has been dummies encoded
Column lum has been dummies encoded
Column agg has been dummies encoded
Column int has been dummies encoded
Column atm has been dummies encoded
Column col has been dummies encoded
Column catr has been dummies encoded
Column circ has been dummies encoded
Column nbv has been dummies encoded
Column vosp has been dummies encoded
Column prof has been dummies encoded
Column plan has been dummies encoded
Column surf has been dummies encoded
Column infra has been dummies encoded
Column situ has been dummies encoded
Column senc has been dummies encoded
Column catv has been dummies encoded
Column 

### Train/test split + standardization

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=222)

# Work on explicit copies: the frames returned by the split are derived from
# `data`, and assigning columns on them can raise pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardise only the non-categorical (target-encoded) columns. The scaler is
# fitted on the training rows only and then applied unchanged to the test rows.
cols = list(col_target_encoded)
sc = StandardScaler()
X_train[cols] = sc.fit_transform(X_train[cols])
X_test[cols] = sc.transform(X_test[cols])

### UnderSampler

In [7]:
from imblearn.under_sampling import RandomUnderSampler

# Set to True to rebalance the training set before fitting the models below.
undersampling = False

if undersampling:
    # Random undersampling of the training rows. The sampler is seeded so that
    # the resampled training set is the same on every run.
    rUs = RandomUnderSampler(random_state=222)
    X_train, y_train = rUs.fit_resample(X_train, y_train)
    print('Classes échantillon undersampled :', y_train.value_counts())

### Balanced Random Forest

In [8]:
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier

t0 = time()

# Balanced Random Forest classifier. It is seeded so that the confusion matrix
# and the report below are reproducible from one run to the next.
model = BalancedRandomForestClassifier(random_state=222)

model.fit(X_train, y_train.values)
y_pred = model.predict(X_test)

# Confusion matrix (rows: true class, columns: predicted class), followed by
# the imbalanced-learn classification report.
print("\nClassifieur Balanced Random Forest:\n")
print(pd.crosstab(y_test.values, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

# Wall-clock time for fit + predict + reporting.
tt = time() - t0
print("Réalisé en {} secondes".format(round(tt, 3)))


Classifieur Balanced Random Forest:

col_0      0     1
row_0             
0      11226  4290
1       1000  3442
                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.72      0.77      0.81      0.75      0.56     15516
          1       0.45      0.77      0.72      0.57      0.75      0.56      4442

avg / total       0.81      0.73      0.76      0.76      0.75      0.56     19958

Réalisé en 15.936 secondes


### Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier

t0 = time()

# Decision tree with default hyper-parameters. It is seeded so that the
# confusion matrix and the report below are reproducible across runs.
model = DecisionTreeClassifier(random_state=222)

model.fit(X_train, y_train.values.ravel())
y_pred = model.predict(X_test)

# Confusion matrix (rows: true class, columns: predicted class), followed by
# the imbalanced-learn classification report.
print("\nDecision Tree:\n")
print(pd.crosstab(y_test.values, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

# Wall-clock time for fit + predict + reporting.
tt = time() - t0
print("Réalisé en {} secondes".format(round(tt, 3)))


Decision Tree:

col_0      0     1
row_0             
0      12725  2791
1       2456  1986
                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.82      0.45      0.83      0.61      0.38     15516
          1       0.42      0.45      0.82      0.43      0.61      0.35      4442

avg / total       0.74      0.74      0.53      0.74      0.61      0.37     19958

Réalisé en 2.617 secondes


### Support Vector Machine

In [10]:
from sklearn.svm import SVC

t0 = time()

# Support vector classifier. The solver is capped at 1000 iterations, so the
# fit may stop before it has converged.
model = SVC(max_iter=1000, gamma='scale')
model.fit(X_train, y_train.values.ravel())
y_pred = model.predict(X_test)

# Title, then confusion matrix (rows: true class, columns: predicted class),
# then the imbalanced-learn classification report.
print("\nClassifieur SVM:\n")
print(
    pd.crosstab(y_test.values, y_pred),
    classification_report_imbalanced(y_test, y_pred),
    sep="\n",
)

# Wall-clock time for fit + predict + reporting.
tt = time() - t0
print("Réalisé en {} secondes".format(round(tt, 3)))




Classifieur SVM:

col_0      0     1
row_0             
0      11638  3878
1       2558  1884
                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.75      0.42      0.78      0.56      0.33     15516
          1       0.33      0.42      0.75      0.37      0.56      0.31      4442

avg / total       0.71      0.68      0.50      0.69      0.56      0.32     19958

Réalisé en 39.692 secondes
