In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold, GridSearchCV



In [2]:
def gini(pred, y):
    """Return the (unnormalized) Gini score of predictions against labels.

    The rows are ordered by descending prediction, with ties broken by the
    original position, and the score is derived from the cumulative sum of
    the labels taken in that order.

    Parameters
    ----------
    pred : sequence of numbers
        Predicted scores, one per observation.
    y : sequence of numbers
        Observed labels, same length as ``pred``.

    Returns
    -------
    float
        The Gini score. The result is NaN when the labels sum to zero,
        because the cumulative sum is divided by ``sum(y)``.
    """
    n = len(y)
    # Builtin float replaces the np.float alias, which NumPy removed in 1.24.
    g = np.asarray(np.c_[y, pred, np.arange(n)], dtype=float)
    # lexsort uses its LAST key as the primary one: descending prediction
    # first, original row index as the tie-breaker.
    g = g[np.lexsort((g[:, 2], -1 * g[:, 1]))]
    gs = g[:, 0].cumsum().sum() / g[:, 0].sum()
    gs -= (n + 1) / 2.
    return gs / n

def normalized_gini(pred, y):
    """Scale the Gini score of ``pred`` by the score of a perfect ranking.

    The denominator is ``gini(y, y)``, i.e. the score obtained when the
    labels themselves are used as predictions.
    """
    achieved = gini(pred, y)
    best_possible = gini(y, y)
    return achieved / best_possible

# Evaluation callback that reports the normalized Gini score to xgboost
def gini_xgb(pred, y):
    """Return ``('gini', score)`` for use as a custom xgboost metric.

    ``y`` must expose ``get_label()``; the labels it returns are compared
    with ``pred`` through ``normalized_gini``.
    """
    labels = y.get_label()
    return 'gini', normalized_gini(pred, labels)

In [3]:
# Read the training table first, then the test table, from the local Data folder.
train, test = pd.read_csv('./Data/train.csv'), pd.read_csv('./Data/test.csv')

In [12]:
# Count the rows per value of the 'target' column to inspect the class balance.
train['target'].value_counts()

0    573518
1     21694
Name: target, dtype: int64

In [4]:
# Group the raw columns by naming convention: names containing 'cat' are
# treated as categorical, names containing 'bin' as binary.
cat_feat = train.filter(like='cat', axis=1).columns.values.tolist()
bin_feat = train.filter(like='bin', axis=1).columns.values.tolist()
# Every remaining column except the label, kept in the frame's own column
# order. A set difference would return the names in an arbitrary order that
# changes from one kernel to the next.
num_feat = [
    c for c in train.columns
    if c not in cat_feat and c not in bin_feat and c != 'target'
]
# NOTE: 'ps_car_11_cat' has many categories and should be handled separately;
# it is currently left in cat_feat.

In [6]:
# One-hot encode the categorical features of both tables.
porto_train = pd.get_dummies(train, columns=cat_feat)
porto_test = pd.get_dummies(test, columns=cat_feat)
# The two tables are encoded independently, so a category level that occurs in
# only one of them would produce mismatched columns. Force the test table onto
# the training layout (minus the label): missing dummies are added as all-zero
# columns and test-only dummies are discarded.
porto_test = porto_test.reindex(
    columns=[c for c in porto_train.columns if c != 'target'],
    fill_value=0,
)

In [20]:
# Drop the columns whose name ends in '-1' (presumably the dummy columns
# generated for the -1 placeholder value - confirm against the encoded frame).
# The list is computed here so the cell works on a fresh kernel.
cols = [c for c in porto_train.columns if c.endswith('-1')]
porto_train = porto_train.drop(cols, axis=1)
# The test table may lack some of these columns; skip the absent ones.
porto_test = porto_test.drop(cols, axis=1, errors='ignore')

In [22]:
# Columns whose name contains 'bin', plus 'ps_car_11_cat', form the
# categorical group; every other column of the encoded frame is continuous.
cat_col = list(porto_train.filter(like='bin', axis=1).columns.values) + ['ps_car_11_cat']
cont_col = list(set(porto_train.columns) - set(cat_col))

In [23]:
# sklearn.preprocessing.Imputer was removed from scikit-learn; SimpleImputer
# is its replacement (always column-wise, so no axis argument).
from sklearn.impute import SimpleImputer

# -1 is the missing-value marker passed to both imputers.
I_cont = SimpleImputer(missing_values=-1, strategy='median')
I_cat = SimpleImputer(missing_values=-1, strategy='most_frequent')

# Leave the label out: it is not a feature, and the same column list must be
# usable on the test table. Rebinding the name (rather than list.remove, which
# returns None and mutates in place) keeps this cell safe to re-run.
cont_col = [c for c in cont_col if c != 'target']
cont_col_test = cont_col

porto_train[cont_col] = I_cont.fit_transform(porto_train[cont_col])
#porto_train[cat_col] = I_cat.fit_transform(porto_train[cat_col])

# Reuse the medians learned on the training data instead of re-fitting on the
# test set, so both tables are filled with the same values.
porto_test[cont_col_test] = I_cont.transform(porto_test[cont_col_test])
#porto_test[cat_col] = I_cat.transform(porto_test[cat_col])

In [7]:
# Turn the prepared frames into plain numpy arrays for xgboost.
# NOTE(review): only 'target' is dropped, so the 'id' column stays in X and in
# the test array as a feature - confirm this is intended.
y = porto_train['target'].values
X = porto_train.drop(labels=['target'], axis=1).values
test = porto_test.values

# Submission frame: the test ids plus a 'target' column initialised to 0.
submission = porto_test['id'].to_frame().assign(target=0)

In [9]:
# Display the dimensions of the feature matrix (rows, columns).
X.shape

(595212, 228)

In [10]:
kfold = 5
nrounds=500
# NOTE(review): 'silent' and 'n_jobs' may not be recognised by the native
# xgb.train API of the installed xgboost version ('verbosity' / 'nthread' are
# the native names) - confirm against the xgboost parameter documentation.
params = {
    'eta': 0.07,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree':0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'silent': True,
    'n_jobs':-1 # use all CPU cores
}

kf = KFold(n_splits=kfold, shuffle=True, random_state=322)

# The test matrix is the same for every fold, so build it once.
d_test = xgb.DMatrix(test)
# Accumulate the fold average in a fresh array so that re-running this cell
# does not stack new predictions on top of the previous run's totals.
test_pred = np.zeros(len(test))

for i, (train_index, cv_index) in enumerate(kf.split(X)):

    print(' xgb kfold: {} of {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index,:], X[cv_index,:]
    y_train, y_eval = y[train_index], y[cv_index]

    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_eval, y_eval)

    watchlist = [(d_train, 'train'), (d_valid, 'eval')]
    xgb_model = xgb.train(params,
                          d_train,
                          nrounds,
                          watchlist,
                          early_stopping_rounds=100,
                          feval=gini_xgb,
                          maximize=True,
                          verbose_eval=50)
    # The native Booster.predict uses every boosted round by default,
    # including the rounds trained after the best iteration while waiting for
    # early stopping. Restrict prediction to the best iteration.
    fold_pred = xgb_model.predict(
        d_test, iteration_range=(0, xgb_model.best_iteration + 1))
    # Each fold contributes 1/kfold of the final averaged prediction.
    test_pred += fold_pred / kfold

submission['target'] = test_pred

 xgb kfold: 1 of 5 : 
[0]	train-gini:0.205311	eval-gini:0.187002
Multiple eval metrics have been passed: 'eval-gini' will be used for early stopping.

Will train until eval-gini hasn't improved in 100 rounds.
[50]	train-gini:0.292015	eval-gini:0.245046
[100]	train-gini:0.339373	eval-gini:0.263555
[150]	train-gini:0.370936	eval-gini:0.266609
[200]	train-gini:0.397009	eval-gini:0.266314
[250]	train-gini:0.421314	eval-gini:0.265049
Stopping. Best iteration:
[176]	train-gini:0.384292	eval-gini:0.267703

 xgb kfold: 2 of 5 : 
[0]	train-gini:0.20554	eval-gini:0.18746
Multiple eval metrics have been passed: 'eval-gini' will be used for early stopping.

Will train until eval-gini hasn't improved in 100 rounds.
[50]	train-gini:0.288797	eval-gini:0.258088
[100]	train-gini:0.336797	eval-gini:0.276406
[150]	train-gini:0.369561	eval-gini:0.278947
[200]	train-gini:0.396076	eval-gini:0.280499
[250]	train-gini:0.41788	eval-gini:0.278527
[300]	train-gini:0.439915	eval-gini:0.277282
Stopping. Best itera

In [11]:
# Cast the ids to integers, then write the submission file without the index.
submission['id'] = submission['id'].astype(int)
submission.to_csv('submission4.csv', index=False)