In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold, GridSearchCV



In [2]:
def gini(pred, y):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def normalized_gini(pred, y):
    return gini(pred, y) / gini(y, y)

# custom normalized gini score for xgb model
def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', normalized_gini(pred, y)

In [3]:
train = pd.read_csv('./Data/train.csv')
test = pd.read_csv('./Data/test.csv')

In [4]:
categorical_features = train.filter(like='cat', axis=1).columns.values.tolist()
categorical_features.remove('ps_car_11_cat') # cette variable catégorie possède beaucoup de catégories, il faut la traiter à part

In [5]:
porto_train = pd.get_dummies(train, columns=categorical_features)
porto_test = pd.get_dummies(test, columns=categorical_features)

In [6]:
cols = [c for c in porto_train.columns if c[-2:] == '-1']

In [7]:
porto_train = porto_train.drop(cols,1)
porto_test = porto_test.drop(cols,1)

In [8]:
cat_col =  porto_train.filter(like='bin', axis=1).columns.values.tolist()
cat_col.append('ps_car_11_cat')
cont_col= list(set(porto_train.columns.tolist()) - set(cat_col))

In [9]:
from sklearn.preprocessing import Imputer
I_cont = Imputer(missing_values=-1, strategy='mean', axis=0, verbose=0)
I_cat = Imputer(missing_values=-1, strategy='most_frequent', axis=0, verbose=0)

porto_train[cont_col] = I_cont.fit_transform(porto_train[cont_col])
porto_train[cat_col] = I_cat.fit_transform(porto_train[cat_col])

cont_col_test = cont_col.remove('target')
porto_test[cont_col] = I_cont.fit_transform(porto_test[cont_col])
porto_test[cat_col] = I_cat.fit_transform(porto_test[cat_col])

In [10]:
# Convert train & test to numpy arrays
X = porto_train.drop('target', axis=1).values
y = porto_train['target'].values
test = porto_test.values

# Create submission dataframe
submission = porto_test['id'].to_frame()
submission['target'] = 0

In [11]:
print(cont_col)

['ps_calc_02', 'ps_car_01_cat_7', 'ps_car_12', 'ps_car_15', 'ps_car_04_cat_5', 'ps_car_06_cat_17', 'ps_calc_10', 'ps_car_01_cat_8', 'ps_ind_01', 'ps_car_01_cat_5', 'ps_car_10_cat_0', 'ps_car_01_cat_0', 'ps_car_04_cat_2', 'ps_car_04_cat_6', 'ps_car_05_cat_1', 'ps_car_04_cat_8', 'ps_calc_11', 'ps_car_01_cat_4', 'ps_ind_05_cat_1', 'ps_car_09_cat_3', 'ps_calc_08', 'ps_ind_02_cat_1', 'ps_car_11', 'ps_car_09_cat_4', 'ps_car_10_cat_1', 'ps_ind_02_cat_3', 'ps_car_06_cat_1', 'ps_car_09_cat_1', 'ps_car_08_cat_1', 'ps_car_01_cat_6', 'ps_car_06_cat_13', 'ps_car_06_cat_9', 'ps_car_09_cat_0', 'ps_reg_02', 'ps_car_10_cat_2', 'ps_ind_15', 'ps_car_02_cat_0', 'ps_reg_01', 'ps_calc_12', 'ps_ind_02_cat_2', 'ps_ind_05_cat_4', 'ps_car_01_cat_3', 'ps_car_06_cat_16', 'ps_car_04_cat_3', 'ps_calc_05', 'ps_ind_04_cat_1', 'ps_calc_04', 'ps_ind_05_cat_6', 'ps_car_06_cat_4', 'ps_ind_05_cat_0', 'ps_car_06_cat_8', 'ps_car_01_cat_9', 'ps_calc_03', 'ps_reg_03', 'ps_car_05_cat_0', 'ps_car_09_cat_2', 'ps_car_03_cat_1', '

In [12]:
kfold = 5
nrounds=500
params = {
    'eta': 0.07,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree':0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'silent': True,
    'n_jobs':-1 # use all CPU cores
}

kf = KFold(n_splits=kfold, shuffle=True, random_state=322)

for i, (train_index, cv_index) in enumerate(kf.split(X)):

    print(' xgb kfold: {} of {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index,:], X[cv_index,:]
    y_train, y_eval = y[train_index], y[cv_index]

    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_eval, y_eval)

    watchlist = [(d_train, 'train'), (d_valid, 'eval')]
    xgb_model = xgb.train(params,
                          d_train,
                          nrounds,
                          watchlist,
                          early_stopping_rounds=100,
                          feval=gini_xgb,
                          maximize=True,
                          verbose_eval=50)
    # Updating prediction by taking the average
    submission['target'] += xgb_model.predict(xgb.DMatrix(test)) / kfold

 xgb kfold: 1 of 5 : 
[0]	train-gini:0.206053	eval-gini:0.197553
Multiple eval metrics have been passed: 'eval-gini' will be used for early stopping.

Will train until eval-gini hasn't improved in 100 rounds.
[50]	train-gini:0.291817	eval-gini:0.245459
[100]	train-gini:0.340191	eval-gini:0.265225
[150]	train-gini:0.371307	eval-gini:0.268517
[200]	train-gini:0.397043	eval-gini:0.268391
[250]	train-gini:0.421644	eval-gini:0.267512
Stopping. Best iteration:
[184]	train-gini:0.389053	eval-gini:0.269479

 xgb kfold: 2 of 5 : 
[0]	train-gini:0.203564	eval-gini:0.182044
Multiple eval metrics have been passed: 'eval-gini' will be used for early stopping.

Will train until eval-gini hasn't improved in 100 rounds.
[50]	train-gini:0.289598	eval-gini:0.25669
[100]	train-gini:0.335239	eval-gini:0.276632
[150]	train-gini:0.368825	eval-gini:0.280908
[200]	train-gini:0.394836	eval-gini:0.282208
[250]	train-gini:0.418681	eval-gini:0.279726
[300]	train-gini:0.44156	eval-gini:0.277845
Stopping. Best iter

In [15]:
submission2 = submission

In [20]:
submission2.id = submission2.id.astype(int)

In [21]:
submission2.to_csv('submission2.csv',index=False)