In [1]:
import numpy as np
import pandas as pd

# The data
The data we are going to use here is the following :

Categorical variables are transformed with get_dummies

ps_car_11_cat is not dropped

null values are not transformed

In [2]:
X = np.load('X.npy')
X_test = np.load('X_test.npy')
y = np.load('y.npy')

# XGBoost

In [6]:
import xgboost as xgb
from sklearn.model_selection import KFold, GridSearchCV

In [7]:
def gini(pred, y):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def normalized_gini(pred, y):
    return gini(pred, y) / gini(y, y)

# custom normalized gini score for xgb model
def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini_f', normalized_gini(pred, y)

In [8]:
submission = np.zeros(X_test.shape[0])

In [11]:
kfold = 5
nrounds=500
params = {
    'eta': 0.07,
    'max_depth': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'colsample_bytree':0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'silent': True,
    'n_jobs':-1
}

kf = KFold(n_splits=kfold, shuffle=True, random_state=322)

for i, (train_index, cv_index) in enumerate(kf.split(X)):

    print(' xgb kfold: {} of {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index,:], X[cv_index,:]
    y_train, y_eval = y[train_index], y[cv_index]

    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_eval, y_eval)

    watchlist = [(d_train, 'train_f'), (d_valid, 'eval')]
    xgb_model = xgb.train(params,
                          d_train,
                          nrounds,
                          watchlist,
                          early_stopping_rounds=100,
                          feval=gini_xgb,
                          maximize=True,
                          verbose_eval=50)
    # Updating prediction by taking the average
    submission += xgb_model.predict(xgb.DMatrix(X_test)) / kfold

 xgb kfold: 1 of 5 : 
[0]	train_f-gini_f:0.198841	eval-gini_f:0.174244
Multiple eval metrics have been passed: 'eval-gini_f' will be used for early stopping.

Will train until eval-gini_f hasn't improved in 100 rounds.
[50]	train_f-gini_f:0.274279	eval-gini_f:0.237939
[100]	train_f-gini_f:0.308353	eval-gini_f:0.259246
[150]	train_f-gini_f:0.327653	eval-gini_f:0.263725
[200]	train_f-gini_f:0.342632	eval-gini_f:0.263689
[250]	train_f-gini_f:0.354656	eval-gini_f:0.264048
[300]	train_f-gini_f:0.366592	eval-gini_f:0.263808
Stopping. Best iteration:
[240]	train_f-gini_f:0.352256	eval-gini_f:0.264606

 xgb kfold: 2 of 5 : 
[0]	train_f-gini_f:0.206068	eval-gini_f:0.195686
Multiple eval metrics have been passed: 'eval-gini_f' will be used for early stopping.

Will train until eval-gini_f hasn't improved in 100 rounds.
[50]	train_f-gini_f:0.270441	eval-gini_f:0.250905
[100]	train_f-gini_f:0.304832	eval-gini_f:0.273319
[150]	train_f-gini_f:0.325364	eval-gini_f:0.279364
[200]	train_f-gini_f:0.3413

In [12]:
porto_test_id = pd.read_csv('data/test.csv',usecols=['id'])
def make_submission(name, pred):
    s = porto_test_id['id'].to_frame()
    s['target'] = pred
    s.to_csv('Submissions/'+name+'.csv',index=False)

In [14]:
make_submission('fxgb',submission)

This gives a Kaggle private score of 0.28593