In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold, GridSearchCV



In [2]:
def gini(pred, y):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def normalized_gini(pred, y):
    return gini(pred, y) / gini(y, y)

# custom normalized gini score for xgb model
def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', normalized_gini(pred, y)

In [3]:
train = pd.read_csv('./Data/train.csv')
test = pd.read_csv('./Data/test.csv')

In [4]:
categorical_features = train.filter(like='cat', axis=1).columns.values.tolist()
categorical_features.remove('ps_car_11_cat') # cette variable catégorie possède beaucoup de catégories, il faut la traiter à part

In [5]:
porto_train = pd.get_dummies(train, columns=categorical_features)
porto_test = pd.get_dummies(test, columns=categorical_features)

In [6]:
# Convert train & test to numpy arrays
X = porto_train.drop('target', axis=1).values
y = porto_train['target'].values
test = porto_test.values

# Create submission dataframe
submission = porto_test['id'].to_frame()
submission['target'] = 0

In [7]:
kfold = 5
nrounds=500
params = {
    'eta': 0.07,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree':0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'silent': True,
    'n_jobs':-1 # use all CPU cores
}

kf = KFold(n_splits=kfold, shuffle=True, random_state=322)

for i, (train_index, cv_index) in enumerate(kf.split(X)):

    print(' xgb kfold: {} of {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index,:], X[cv_index,:]
    y_train, y_eval = y[train_index], y[cv_index]

    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_eval, y_eval)

    watchlist = [(d_train, 'train'), (d_valid, 'eval')]
    xgb_model = xgb.train(params,
                          d_train,
                          nrounds,
                          watchlist,
                          early_stopping_rounds=100,
                          feval=gini_xgb,
                          maximize=True,
                          verbose_eval=50)
    # Updating prediction by taking the average
    submission['target'] += xgb_model.predict(xgb.DMatrix(test)) / kfold

 xgb kfold: 1 of 5 : 
[0]	train-gini:0.219501	eval-gini:0.19356
Multiple eval metrics have been passed: 'eval-gini' will be used for early stopping.

Will train until eval-gini hasn't improved in 100 rounds.
[50]	train-gini:0.292937	eval-gini:0.245516
[100]	train-gini:0.340085	eval-gini:0.26271
[150]	train-gini:0.372633	eval-gini:0.263936
[200]	train-gini:0.399579	eval-gini:0.263366
[250]	train-gini:0.423076	eval-gini:0.263659
Stopping. Best iteration:
[157]	train-gini:0.376417	eval-gini:0.264479

 xgb kfold: 2 of 5 : 
[0]	train-gini:0.22065	eval-gini:0.211444
Multiple eval metrics have been passed: 'eval-gini' will be used for early stopping.

Will train until eval-gini hasn't improved in 100 rounds.
[50]	train-gini:0.290834	eval-gini:0.261783
[100]	train-gini:0.335917	eval-gini:0.277149
[150]	train-gini:0.367988	eval-gini:0.27897
[200]	train-gini:0.395879	eval-gini:0.277472
Stopping. Best iteration:
[122]	train-gini:0.35151	eval-gini:0.279694

 xgb kfold: 3 of 5 : 
[0]	train-gini:0.2

In [10]:
submission.to_csv('submission2.csv',index=False)