# Aujourd'hui on roule sur les mecs de l'ENS


https://challengedata.ens.fr/en/challenge/39/prediction_of_transaction_claims_status.html

# Imports des librairies de bases

On ajoutera celles qui manquent au fur et à mesure de nos besoins

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import pandas as pd
import os, gc

# Définition de la seed pour le random

Très important pour qu'on voit les mêmes choses entre nos deux ordis

In [2]:
RANDOM_SEED = 42;
np.random.seed(RANDOM_SEED)

# Définition des paramètres pour Matplot

Rien de bien intéréssant

In [3]:
# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Set des variables globales

Attention, je n'utilise les variables globales pour la gestion des fichiers. Sinon, c'est mort

In [4]:
# Where to save the figures
PROJECT_ROOT_DIR = "."
DATA_PROCESSED = os.path.join(PROJECT_ROOT_DIR, "data_processed")

# Fonction pour load les libraires

En vrai, on a juste besoin de pd.read_csv, mais c'était pour faire joli

In [5]:
def load_data(file,data_path=DATA_PROCESSED, sep=';'):
    csv_path = os.path.join(data_path, file)
    return pd.read_csv(csv_path, sep=';')

# On load les jeux de données

In [6]:
TX_data = load_data(file = "train.csv");
TEST_DATA = load_data(file = "test.csv");

In [7]:
RESULTS = pd.DataFrame({'ID' : []})
RESULTS["ID"]=TEST_DATA["ID"]

In [8]:
TEST_DATA.drop("ID", axis=1, inplace=True)

In [9]:
TX_data.drop(['CARD_PAYMENT','COUPON_PAYMENT','RSP_PAYMENT','WALLET_PAYMENT'], axis = 1, inplace = True)
TEST_DATA.drop(['CARD_PAYMENT','COUPON_PAYMENT','RSP_PAYMENT','WALLET_PAYMENT'], axis = 1, inplace = True)

In [10]:
TX_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 57 columns):
SHIPPING_MODE                 100000 non-null object
SHIPPING_PRICE                100000 non-null int64
WARRANTIES_FLG                100000 non-null bool
WARRANTIES_PRICE              100000 non-null int64
PRICECLUB_STATUS              100000 non-null int64
REGISTRATION_DATE             100000 non-null int64
PURCHASE_COUNT                100000 non-null int64
BUYER_BIRTHDAY_DATE           100000 non-null float64
BUYER_DEPARTMENT              100000 non-null int64
BUYING_DATE                   100000 non-null int64
SELLER_SCORE_COUNT            100000 non-null int64
SELLER_SCORE_AVERAGE          100000 non-null float64
SELLER_COUNTRY                100000 non-null object
SELLER_DEPARTMENT             100000 non-null int64
PRODUCT_TYPE                  100000 non-null object
PRODUCT_FAMILY                100000 non-null object
ITEM_PRICE                    100000 non-null int64

# Jointure entre les X et Y

In [11]:
def datapreprocess(data):
    data=data.apply(pd.to_numeric, errors='ignore')
    
    # Y and X
    try :
        Y=data["CLAIM_TYPE"]
        X=data.drop("CLAIM_TYPE", axis=1,inplace=False)
    except:
        Y=0
        X=data
    # Exclude Objets
    X=X.select_dtypes(exclude=['object'])
    
    # Work on fare
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN',strategy='median', axis=1)
    X=pd.DataFrame(imp.fit_transform(X),columns=X.columns.values)
 
    return X, Y

In [12]:
X_train, Y_train = datapreprocess(TX_data)
TEST_DATA, _ = datapreprocess(TEST_DATA)

#del TX_data;
gc.collect()

38

# MODEL!

## XGBoost

#### Core XGBoost Library VS scikit-learn API

Models can be trained in two different ways:

1. Directly using the core library – this is closer to the implementation of the caret-package in R
2. Using the scikit-learn API – this means that the models are implemented in a way that lets the scikit package recognize it as one of it’s own models.

Nous, on va travailler avec l'API de Sklearn, c'est pas optimisé mais plus simple. De toute façon, j'arrive pas à utiliser le Core, a cause des DMatrix qui veulent que des numerics en entrées

Doc des paramètres: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

Doc sur le tunning : https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [13]:
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight

In [14]:
sample_weight_arr = compute_sample_weight(class_weight='balanced', y=Y_train)

In [15]:
params_XGB={
# General Parameters -  the overall functioning
    'booster':'gbtree',
    'silent':0,
    #'nthread':4, # Je le commente, puisque il détecte automatiquement le nombre de cores qu'il peut utiliser.
    'n_estimators' : 1000,
    
# Booster Parameters - the individual booster (tree/regression) at each step
    'learning_rate' : 0.09,
    'min_child_weight' : 1, #A smaller value is chosen because it is a highly imbalanced class problem and leaf nodes can have smaller size groups.
    'max_depth' : 4,
    #'max_leaf_nodes':None, #If this is defined, GBM will ignore max_depth.
    'gamma' : 0.3,
    'max_delta_step':10, #it might help in logistic regression when class is extremely imbalanced/ 1-10 might help control the update
    'subsample' : 0.7,
    'colsample_bytree' : 0.8,
    'colsample_bylevel':1, #default
    'reg_lambda' : 1, #default
    'reg_alpha':0,
    'scale_pos_weight' : sample_weight_arr,

# Learning Task Parameters -  the optimization performed
    'objective' : 'multi:softmax', # you also need to set an additional num_class (number of classes)
    'num_class' : len(Y_train.unique()),
    'eval_metric':"auc",
    'seed' : RANDOM_SEED,
    'random_state':RANDOM_SEED
}

In [16]:
xgb_clf = XGBClassifier(**params_XGB)

In [17]:
xgb_clf.fit(
    X=X_train, 
    y=Y_train, 
    sample_weight=sample_weight_arr, 
    eval_set=None, 
    eval_metric='auc', 
    early_stopping_rounds=None, 
    verbose=True, 
    xgb_model=None
)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, eval_metric='auc', gamma=0.3,
       learning_rate=0.09, max_delta_step=10, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, num_class=8, objective='multi:softprob',
       random_state=42, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=array([ 2.11077,  0.25012, ...,  1.78546,  0.25012]),
       seed=42, silent=0, subsample=0.7)

In [18]:
y_pred_gb = xgb_clf.predict(TEST_DATA)

In [19]:
RESULTS["CLAIM_TYPE"] = pd.DataFrame(y_pred_gb)

In [20]:
RESULTS.head()

Unnamed: 0,ID,CLAIM_TYPE
0,100000,SELLER_CANCEL_POSTERIORI
1,100001,WITHDRAWAL
2,100002,SELLER_CANCEL_POSTERIORI
3,100003,-
4,100004,DAMAGED


In [21]:
filename = DATA_PROCESSED+"/submission_XBG_3.csv"

RESULTS.to_csv(filename, index=False, sep=";")