In [3]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_predict

In [6]:
# Load the raw train and test tables; entries equal to -1 are parsed as NaN.
read_options = {'na_values': -1}
train = pd.read_csv('data/train.csv', **read_options)
test = pd.read_csv('data/test.csv', **read_options)

In [7]:
# Categorical and binary features are picked by the 'cat' / 'bin' tag
# contained in their column name.
cat_feat = train.filter(like='cat', axis=1).columns.values.tolist()

bin_feat = train.filter(like='bin', axis=1).columns.values.tolist()

# Every remaining column except the label and the row identifier is treated
# as numeric. Filtering train.columns (instead of subtracting sets) keeps the
# original column order, so the feature layout is identical on every run.
non_numeric = set(cat_feat) | set(bin_feat) | {'target', 'id'}
num_feat = [col for col in train.columns if col not in non_numeric]

features_names = cat_feat + bin_feat + num_feat

In [8]:
# Cast every binary and categorical column to the pandas 'category' dtype,
# in both the train and the test frame.
for frame in (train, test):
    for col in bin_feat + cat_feat:
        frame[col] = frame[col].astype('category')

In [9]:
# Numeric and binary features are taken as they are.
Train_cnt = train[num_feat]
Train_bin = train[bin_feat]
Test_cnt = test[num_feat]
Test_bin = test[bin_feat]

# One-hot encode the categorical features; the first level of each feature is
# dropped and dummy columns are named '<feature>#<level>'.
Train_cat = pd.get_dummies(train[cat_feat],
                           prefix_sep='#',
                           drop_first=True)
Test_cat = pd.get_dummies(test[cat_feat],
                          prefix_sep='#',
                          drop_first=True)

# The two frames are encoded independently, so a level that appears in only
# one of them would give train and test different dummy columns. Align the
# test dummies on the train columns: missing columns are added as all-zero,
# columns unknown to the training set are discarded.
Test_cat = Test_cat.reindex(columns=Train_cat.columns, fill_value=0)

#### On met le tout dans de nouveaux dataframes train et test

In [None]:
# Assemble the model inputs column-wise: numeric, then binary, then dummies.
train_parts = [Train_cnt, Train_bin, Train_cat]
test_parts = [Test_cnt, Test_bin, Test_cat]
Train_mod = pd.concat(train_parts, axis='columns')
Test_mod = pd.concat(test_parts, axis='columns')

#### On remplace les valeurs manquantes par 0 (il semblerait que xgboost les gère mieux comme ça).

In [11]:
# Replace missing numeric values by 0.
Train_cnt = Train_cnt.fillna(0)
Test_cnt = Test_cnt.fillna(0)

# Train_mod / Test_mod were concatenated before this fill, so they still hold
# the NaNs: filling Train_cnt / Test_cnt alone never reaches the matrices
# used for training. Apply the same fill to the numeric columns of the
# concatenated frames (the per-column dict leaves the other columns untouched).
numeric_fill = dict.fromkeys(num_feat, 0)
Train_mod = Train_mod.fillna(numeric_fill)
Test_mod = Test_mod.fillna(numeric_fill)

#### On les met dans X et X_test

In [17]:
# Convert the feature frames to NumPy arrays.
# DataFrame.as_matrix() was deprecated and then removed in pandas 1.0;
# .values returns the same array and is available in every pandas version.
X = Train_mod.values
X_test = Test_mod.values

## XGBoost (Code de Binh)

In [19]:
import xgboost as xgb
from sklearn.model_selection import KFold, GridSearchCV



In [20]:
def gini(pred, y):
    """Return the (unnormalized) Gini score of `pred` against the labels `y`.

    Rows are ordered by decreasing prediction, ties being broken by original
    position, and the score is derived from the cumulative sum of the labels
    taken in that order.

    Parameters
    ----------
    pred : sequence of numbers, same length as `y`
        Scores used to rank the rows.
    y : sequence of numbers
        Labels. If they sum to zero the division below yields nan.

    Returns
    -------
    float
    """
    n = len(y)
    # Columns: label, prediction, original position.
    # Builtin `float` replaces the `np.float` alias, which was removed in
    # NumPy 1.24 and now raises AttributeError.
    g = np.asarray(np.c_[y, pred, np.arange(n)], dtype=float)
    # lexsort uses its last key as the primary one: sort by descending
    # prediction, then by ascending original position.
    g = g[np.lexsort((g[:, 2], -1 * g[:, 1]))]
    gs = g[:, 0].cumsum().sum() / g[:, 0].sum()
    gs -= (n + 1) / 2.
    return gs / n

def normalized_gini(pred, y):
    """Return gini(pred, y) divided by gini(y, y), i.e. the score obtained
    when the labels themselves are used as the ranking."""
    model_score = gini(pred, y)
    label_score = gini(y, y)
    return model_score / label_score

# Normalized Gini wrapped as a custom evaluation function for xgboost
def gini_xgb(pred, y):
    """Evaluate `pred` with the normalized Gini score.

    `y` must expose `get_label()`; the labels it returns are compared with
    `pred`. Returns the pair ('gini', score).
    """
    labels = y.get_label()
    score = normalized_gini(pred, labels)
    return 'gini', score

In [21]:
# Submission frame: the test ids plus a 'target' column initialised to 0.
submission = test['id'].to_frame().assign(target=0)

In [None]:
kfold = 5
nrounds = 500
params = {
    'eta': 0.07,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'silent': True,
    'n_jobs': -1  # use all CPU cores
}

# Training labels. `y` was used in the loop below without ever being defined
# in the notebook, which raises NameError on a fresh kernel.
y = train['target'].values

kf = KFold(n_splits=kfold, shuffle=True, random_state=322)

# The test matrix is the same for every fold: build it once.
d_test = xgb.DMatrix(X_test)

# Accumulate the averaged prediction locally and write it to the submission
# only at the end, so re-running this cell does not add on top of the
# predictions of a previous run.
test_pred = np.zeros(X_test.shape[0])

for i, (train_index, cv_index) in enumerate(kf.split(X)):

    print(' xgb kfold: {} of {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index, :], X[cv_index, :]
    y_train, y_eval = y[train_index], y[cv_index]

    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_eval, y_eval)

    # Both sets are monitored during training, with the custom Gini metric
    # (to be maximized) passed alongside the early-stopping setting.
    watchlist = [(d_train, 'train'), (d_valid, 'eval')]
    xgb_model = xgb.train(params,
                          d_train,
                          nrounds,
                          watchlist,
                          early_stopping_rounds=100,
                          feval=gini_xgb,
                          maximize=True,
                          verbose_eval=50)
    # Each fold contributes 1/kfold of the final averaged prediction.
    test_pred += xgb_model.predict(d_test) / kfold

submission['target'] = test_pred

In [24]:
# Store the ids as integers, then write the submission file without the index.
submission['id'] = submission['id'].astype(int)
submission.to_csv('Submissions/PS-02-03.csv', index=False)