# Aujourd'hui on roule sur les mecs de l'ENS


https://challengedata.ens.fr/en/challenge/39/prediction_of_transaction_claims_status.html

# Imports des librairies de bases

On ajoutera celles qui manquent au fur et à mesure de nos besoins

In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import pandas as pd
import os, gc

# Définition de la seed pour le random

Très important pour qu'on voie les mêmes résultats sur nos deux ordis

In [None]:
# One seed for the whole notebook so that runs are reproducible
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Définition des paramètres pour Matplot

Rien de bien intéressant

In [None]:
# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Set des variables globales

Attention, je n'utilise les variables globales que pour la gestion des fichiers. Sinon, c'est mort

In [None]:
# Where the processed data files are read from (path relative to this notebook)
PROJECT_ROOT_DIR = ".."
DATA_PROCESSED = os.path.join(PROJECT_ROOT_DIR, "data_processed")

# Fonction pour charger les données

En vrai, on a juste besoin de pd.read_csv, mais c'était pour faire joli

In [None]:
def load_data(file, data_path=DATA_PROCESSED, sep=';'):
    """Read a delimited text file into a DataFrame.

    Parameters
    ----------
    file : str
        File name, relative to ``data_path``.
    data_path : str
        Directory containing the file (defaults to ``DATA_PROCESSED``).
    sep : str
        Field separator handed to ``pandas.read_csv`` (defaults to ``';'``).

    Returns
    -------
    pandas.DataFrame
    """
    csv_path = os.path.join(data_path, file)
    # Forward the caller's separator; it used to be ignored in favour of a
    # hard-coded ';'.
    return pd.read_csv(csv_path, sep=sep)

# On load les jeux de données

In [None]:
# Load the training file with the default directory and separator
TX_data = load_data(file="train.csv")

In [None]:
# Remove the four payment-method columns from the frame, in place
TX_data.drop(
    columns=['CARD_PAYMENT', 'COUPON_PAYMENT', 'RSP_PAYMENT', 'WALLET_PAYMENT'],
    inplace=True,
)

In [None]:
TX_data.info() # 42 columns, a pleasing number

## Vérification des corrélations

In [None]:
# Spearman correlation between all non-object columns, cast to float first
corr_map = (
    TX_data
    .select_dtypes(exclude="object")
    .astype(float)
    .corr(method='spearman')
)

In [None]:
# `sns` is used below but the notebook never imports it; import it here so the
# cell does not fail with a NameError.
import seaborn as sns

colormap = plt.cm.BrBG
plt.figure(figsize=(12,12))
# corr_map is built with method='spearman', so the title must say Spearman
# (it previously claimed Pearson).
plt.title('Spearman Correlation of Features', y=1.05, size=15)
sns.heatmap(corr_map,
            linewidths=0.1,
            vmax=1.0,
            square=True,
            cmap=colormap,
            linecolor='white',
            annot=False)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows, stratified on the CLAIM_TYPE column
train_set, test_set = train_test_split(
    TX_data,
    test_size=0.3,
    random_state=RANDOM_SEED,
    stratify=TX_data["CLAIM_TYPE"],
)
# Drop the reference to the full frame now that it has been split
del TX_data

# Jointure entre les X et Y

In [None]:
def _to_numeric_or_keep(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def datapreprocess(data):
    """Split ``data`` into a numeric, imputed feature frame and its target.

    Steps:
      1. Convert every column that can be parsed as numbers to a numeric dtype.
      2. Separate the ``CLAIM_TYPE`` column (target) from the features.
      3. Drop the feature columns that are still of ``object`` dtype.
      4. Replace each missing value with the median of the other values in
         the same row.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame, with or without a ``CLAIM_TYPE`` column.

    Returns
    -------
    X : pandas.DataFrame
        Float features, re-indexed from 0 (as the previous array-based
        imputation did).
    Y : pandas.Series or int
        The ``CLAIM_TYPE`` column with its original index, or ``0`` when the
        column is absent.

    Raises
    ------
    ValueError
        If a row that needs imputing contains only missing values.
    """
    # Replaces `errors='ignore'`, which is deprecated in recent pandas.
    data = data.apply(_to_numeric_or_keep)

    # Explicit column test instead of a bare `except:` that hid every error.
    if "CLAIM_TYPE" in data.columns:
        Y = data["CLAIM_TYPE"]
        X = data.drop("CLAIM_TYPE", axis=1)
    else:
        Y = 0
        X = data

    # Exclude the categorical (object) columns that were not encoded upstream
    X = X.select_dtypes(exclude=['object'])
    X = X.astype(float).reset_index(drop=True)

    # Row-wise median imputation, written with pandas because
    # sklearn.preprocessing.Imputer has been removed from scikit-learn.
    # NOTE(review): the original used Imputer(axis=1), i.e. the median across
    # the features of one row; a per-column median may have been intended.
    row_medians = X.median(axis=1)
    if (X.isna().any(axis=1) & row_medians.isna()).any():
        raise ValueError("Some rows only contain missing values")
    X = X.apply(lambda column: column.fillna(row_medians))

    return X, Y

In [None]:
# Build features and target for each split; datapreprocess is run on each split independently
X_train, Y_train = datapreprocess(train_set)
X_test, Y_test = datapreprocess(test_set)

# Explicit garbage-collection pass after preprocessing
gc.collect()

In [None]:
def multiclass_roc_auc_score(truth, pred):
    """Return the weighted-average ROC AUC of predicted labels against true labels.

    Both label vectors are binarized with a ``LabelBinarizer`` fitted on
    ``truth`` before being handed to ``roc_auc_score``. The score is therefore
    computed from hard class predictions, not from class probabilities.
    """
    from sklearn.preprocessing import LabelBinarizer
    from sklearn.metrics import roc_auc_score

    binarizer = LabelBinarizer().fit(truth)
    truth_bin = binarizer.transform(truth)
    pred_bin = binarizer.transform(pred)
    return roc_auc_score(truth_bin, pred_bin, average="weighted")

In [None]:
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(matrix):
    """Show ``matrix`` as a colour image with a colorbar on a new 8x8 figure."""
    figure = plt.figure(figsize=(8, 8))
    axes = figure.add_subplot(111)
    image = axes.matshow(matrix)
    figure.colorbar(image)

# MODEL!

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

In [None]:
# One weight per training row, derived from the 'balanced' class weighting of Y_train
sample_weight_arr = compute_sample_weight(class_weight='balanced', y=Y_train)

## XGBoost

#### Core XGBoost Library VS scikit-learn API

Models can be trained in two different ways:

1. Directly using the core library – this is closer to the implementation of the caret-package in R
2. Using the scikit-learn API – this means that the models are implemented in a way that lets the scikit package recognize them as its own models.

Nous, on va travailler avec l'API de scikit-learn : ce n'est pas optimisé, mais c'est plus simple. De toute façon, je n'arrive pas à utiliser le Core, à cause des DMatrix qui ne veulent que des valeurs numériques en entrée

Doc des paramètres: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

Doc sur le tuning : https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [None]:
from xgboost import XGBClassifier

###  XGBoost solo

In [None]:
# Hyper-parameters of the stand-alone XGBoost model
params_XGB = {
# General Parameters -  the overall functioning
    'booster': 'gbtree',
    'silent': 0,
    #'nthread':4, # left commented out: the number of usable cores is detected automatically
    'n_estimators': 1000,

# Booster Parameters - the individual booster (tree/regression) at each step
    'learning_rate': 0.1,
    'min_child_weight': 1, #A smaller value is chosen because it is a highly imbalanced class problem and leaf nodes can have smaller size groups.
    'max_depth': 3,
    #'max_leaf_nodes':None, #If this is defined, GBM will ignore max_depth.
    'gamma': 0.3,
    'max_delta_step': 4, #it might help in logistic regression when class is extremely imbalanced/ 1-10 might help control the update
    'subsample': 0.55,
    'colsample_bytree': 0.85,
    'colsample_bylevel': 1, #default
    'reg_lambda': 1, #default
    'reg_alpha': 0,
    # scale_pos_weight is a single positive/negative ratio for binary tasks.
    # It used to receive the per-sample weight array, which is not a valid
    # value; class imbalance is handled through `sample_weight` in fit().
    'scale_pos_weight': 1, #default

# Learning Task Parameters -  the optimization performed
    'objective': 'multi:softmax', # you also need to set an additional num_class (number of classes)
    'num_class': len(Y_train.unique()),
    'eval_metric': "auc",
    'seed': RANDOM_SEED,
}

In [None]:
# Instantiate the classifier from the parameter dictionary
xgb_clf = XGBClassifier(**params_XGB)

In [None]:
# Train on the training split, weighting every row with sample_weight_arr;
# no evaluation set and no early stopping are used.
xgb_clf.fit(
    X=X_train, 
    y=Y_train, 
    sample_weight=sample_weight_arr, 
    eval_set=None, 
    eval_metric='auc', 
    early_stopping_rounds=None, 
    verbose=True, 
    xgb_model=None
)

In [None]:
# Class predictions on both splits, to compare train and test performance
y_pred_xgb_train = xgb_clf.predict(X_train)
y_pred_xgb = xgb_clf.predict(X_test)

In [None]:
# Weighted multiclass ROC AUC of the predicted labels on each split
train_mAUC = multiclass_roc_auc_score(Y_train, y_pred_xgb_train)
test_mAUC = multiclass_roc_auc_score(Y_test, y_pred_xgb)

In [None]:
# Report the train and test scores
print("Performance sur le train : {}".format(train_mAUC))
print("Performance sur le test : {}".format(test_mAUC))

Performance sur le train : 0.6589482571844301

Performance sur le test : 0.6180906043249655


In [None]:
# Confusion matrix on the test split, each row divided by its row total before plotting
conf_mx = confusion_matrix(Y_test, y_pred_xgb)
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
plot_confusion_matrix(norm_conf_mx)

#### C'est un beau score pour le XGBoost

Cependant, j'ai optimisé pour la mauvaise métrique, et j'ai toujours pas fait le Grid Search

In [None]:
# Importance of each input column in the fitted model, largest first.
# The column labelled "Feature" holds the importance values.
pd.DataFrame(
    xgb_clf.feature_importances_,
    index=X_train.columns,
    columns=["Feature"],
).sort_values(by="Feature", ascending=False)

In [None]:
sortie en erreur les copines!

### XGBoost Grid Search Iterate

Comme mon PC est pourri, je vais chercher les paramètres de façon itérative

In [None]:
from sklearn.model_selection import GridSearchCV

#### Paramètres pour le XGB qui ne changent pas

In [None]:
# XGBoost parameters that stay fixed during the grid searches
params_XGB = {
# General Parameters -  the overall functioning
    'booster': 'gbtree',
    'silent': 0,
    #'nthread':4, # left commented out: the number of usable cores is detected automatically
    'n_estimators': 100,

# Booster Parameters - the individual booster (tree/regression) at each step
    'learning_rate': 0.1,
    'colsample_bylevel': 1, #default
    'reg_lambda': 1, #default
    # scale_pos_weight is a single positive/negative ratio for binary tasks.
    # It used to receive the per-sample weight array, which is not a valid
    # value; class imbalance is handled through `sample_weight` in fit().
    'scale_pos_weight': 1, #default

# Learning Task Parameters -  the optimization performed
    'objective': 'multi:softmax',
    'num_class': len(Y_train.unique()),
    'eval_metric': "auc",
    'seed': RANDOM_SEED,
}

#### Paramètres pour la méthode `fit` de XGB qui ne changent pas

In [None]:
# Keyword arguments forwarded unchanged to the estimator's fit() by every grid search
fit_params_xgb_cv={
    'sample_weight': sample_weight_arr, 
    'eval_set' : None, 
    'eval_metric' : 'auc', 
    'early_stopping_rounds' : None, 
    'verbose':True, 
    'xgb_model':None
}

###  Optim 1 : max_depth, min_child_weight et max_delta_step

In [None]:
# Grid explored in step 1: tree depth, minimum child weight and max delta step
params_XGB_CV = {
    'max_depth': [2, 3, 4],
    'min_child_weight': [1, 2, 3, 4, 5],
    'max_delta_step': [1, 2, 3, 4],
}

In [None]:
# Exhaustive search over params_XGB_CV on all available cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score, not by the ROC AUC printed afterwards.
xgb_gs_cv = GridSearchCV(XGBClassifier(**params_XGB), 
                              params_XGB_CV,
                              n_jobs=-1,
                              verbose=1)

In [None]:
# Run the search on the training split, then show the best estimator and
# its ROC score on the held-out test split.
xgb_gs_cv.fit(
    X = X_train, 
    y=Y_train, 
    groups=None, 
    **fit_params_xgb_cv
)
print(xgb_gs_cv.best_estimator_)
print("ROC score : {}".format(multiclass_roc_auc_score(Y_test, xgb_gs_cv.predict(X_test))))

Donc les paramètres optimaux sont:
1. `max_depth` :
2. `min_child_weight` :
3. `max_delta_step` :
    

###  Optim 2 : gamma, et subsample

In [None]:
# Grid explored in step 2: split-loss threshold (gamma) and row subsampling ratio
params_XGB_CV = {
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
    'subsample': [0.6, 0.7, 0.8, 0.9],
}

In [None]:
# Exhaustive search over params_XGB_CV on all available cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score, not by the ROC AUC printed afterwards.
xgb_gs_cv = GridSearchCV(XGBClassifier(**params_XGB), 
                              params_XGB_CV,
                              n_jobs=-1,
                              verbose=1)

In [None]:
# Run the search on the training split, then show the best estimator and
# its ROC score on the held-out test split.
xgb_gs_cv.fit(
    X = X_train, 
    y=Y_train, 
    groups=None, 
    **fit_params_xgb_cv
)
print(xgb_gs_cv.best_estimator_)
print("ROC score : {}".format(multiclass_roc_auc_score(Y_test, xgb_gs_cv.predict(X_test))))

Donc les paramètres optimaux sont:
1. `gamma` :
2. `subsample` :

###  Optim 3 : colsample_bytree, et reg_alpha

In [None]:
# Grid explored in step 3: column subsampling per tree and L1 regularisation
params_XGB_CV = {
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05],
}

In [None]:
# Exhaustive search over params_XGB_CV on all available cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score, not by the ROC AUC printed afterwards.
xgb_gs_cv = GridSearchCV(XGBClassifier(**params_XGB), 
                              params_XGB_CV,
                              n_jobs=-1,
                              verbose=1)

In [None]:
# Run the search on the training split, then show the best estimator and
# its ROC score on the held-out test split.
xgb_gs_cv.fit(
    X = X_train, 
    y=Y_train, 
    groups=None, 
    **fit_params_xgb_cv
)
print(xgb_gs_cv.best_estimator_)
print("ROC score : {}".format(multiclass_roc_auc_score(Y_test, xgb_gs_cv.predict(X_test))))

Donc les paramètres optimaux sont:
1. `colsample_bytree` :
2. `reg_alpha` :

In [None]:
# `y_pred_xgb_cv` was never assigned anywhere in the notebook, so this cell
# raised a NameError; take the predictions of the last grid search (made by
# its refitted best estimator) on the test split.
y_pred_xgb_cv = xgb_gs_cv.predict(X_test)

# Confusion matrix on the test split, each row divided by its row total before plotting
conf_mx = confusion_matrix(Y_test, y_pred_xgb_cv)
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
plot_confusion_matrix(norm_conf_mx)