# Preliminaries

In [None]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.externals import joblib

In [3]:
X_test = np.load('X_test.npy')

In [4]:
X = np.load('X.npy')
y = np.load('y.npy')

In [5]:
X_train, X_t, y_train, y_t = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

In [19]:
def gini(pred, y):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def normalized_gini(pred, y):
    return gini(pred, y) / gini(y, y)

#my_scorer = make_scorer(normalized_gini, greater_is_better=True)

In [7]:
porto_test_id = pd.read_csv('data/test.csv',usecols=['id'])
def make_submission(name, pred):
    s = porto_test_id['id'].to_frame()
    s['target'] = pred
    s.to_csv('Submissions/'+name+'.csv',index=False)

In [26]:
def rnd_gini(clf):
    x = datetime.now()
    clf.fit(X_train, y_train, eval_metric='auc')
    y = datetime.now()
    print('Time to fit : ' + str(x-y))
    tr = normalized_gini(clf.predict_proba(X_train)[:,1],y_train)
    te = normalized_gini(clf.predict_proba(X_t)[:,1],y_t)
    print('Train-gini :', tr)
    print('Test-gini :',  te)
    print('Difference-gini :', tr-te)
    x = str(datetime.now())
    joblib.dump(clf, "pkl/RandomForest/"+ x +".pkl")
    print("Model file : pkl/RandomForest/"+ x +".pkl")

In [9]:
def binarize(pred,threshold):
    s= pred
    s[s<threshold] = 0
    s[s>=threshold] = 1
    s.astype(int)
    return s

# xgb

In [10]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier



In [11]:
xgb1 = XGBClassifier(learning_rate =0.1,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)

In [22]:
rnd_gini(xgb1) # une heure

Time to fit : -1 day, 23:08:03.220847
Train-gini : 0.74955583372
Test-gini : 0.238792372355
Difference-gini : 0.510763461365
Model file : pkl/RandomForest/2018-01-09 18:16:33.299830.pkl


In [23]:
sub = xgb1.predict_proba(X_test)[:,1]
make_submission('X1',sub) # gini 0.22

In [24]:
xgb1 = XGBClassifier(learning_rate =0.1,
                     n_estimators=100,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)

In [25]:
rnd_gini(xgb1)

Time to fit : -1 day, 23:54:35.877236
Train-gini : 0.366180688013
Test-gini : 0.283100610116
Difference-gini : 0.0830800778971
Model file : pkl/RandomForest/2018-01-09 18:37:06.101585.pkl


In [26]:
sub = xgb1.predict_proba(X_test)[:,1]
make_submission('X2',sub) # gini 0.276

In [28]:
xgb1 = XGBClassifier(learning_rate =0.1,
                     n_estimators=100,
                     max_depth=3,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
rnd_gini(xgb1)

Time to fit : -1 day, 23:56:37.401772
Train-gini : 0.294438169987
Test-gini : 0.28303204244
Difference-gini : 0.0114061275476
Model file : pkl/RandomForest/2018-01-09 21:39:16.912158.pkl


In [29]:
sub = xgb1.predict_proba(X_test)[:,1]
make_submission('X3',sub) # gini 0.276

eta : The range is 0 to 1. Low eta value means model is more robust to overfitting.

gamma : The default value is set to 0. You need to specify minimum loss reduction required to make a further partition on a leaf node of the tree. The larger, the more conservative the algorithm will be. The range is 0 to ∞. Larger the gamma more conservative the algorithm is.

subsample : The default value is set to 1. You need to specify the subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting. The range is 0 to 1.

colsample_bytree : The default value is set to 1. You need to specify the subsample ratio of columns when constructing each tree. The range is 0 to 1.

In [11]:
xgb1 = XGBClassifier(learning_rate =0.1,
                     n_estimators=250,
                     max_depth=3,
                     # min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     # colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
rnd_gini(xgb1)

Time to fit : -1 day, 23:49:21.099864
Train-gini : 0.335892718081
Test-gini : 0.286187496908
Difference-gini : 0.0497052211732
Model file : pkl/RandomForest/2018-01-10 14:29:39.278114.pkl


In [12]:
sub = xgb1.predict_proba(X_test)[:,1]
make_submission('X4',sub) # gini 0.27994

In [13]:
xgb1 = XGBClassifier(learning_rate =0.1,
                     n_estimators=250,
                     max_depth=3,
                     # min_child_weight=1,
                     gamma=0,
                     subsample=0.6,
                     colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
rnd_gini(xgb1)

Time to fit : -1 day, 23:50:59.103499
Train-gini : 0.333627153402
Test-gini : 0.283350423365
Difference-gini : 0.050276730037
Model file : pkl/RandomForest/2018-01-10 14:47:15.788369.pkl


In [15]:
xgb1 = XGBClassifier(learning_rate =0.1,
                     n_estimators=200,
                     max_depth=2,
                     # min_child_weight=1,
                     gamma=0,
                     subsample=0.6,
                     colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
rnd_gini(xgb1)

Time to fit : -1 day, 23:55:00.317006
Train-gini : 0.290945829133
Test-gini : 0.282763014023
Difference-gini : 0.00818281511002
Model file : pkl/RandomForest/2018-01-10 14:53:08.012830.pkl


In [16]:
sub = xgb1.predict_proba(X_test)[:,1]
make_submission('X5',sub) # gini 0.27061

In [17]:
xgb1 = XGBClassifier(learning_rate =0.1,
                     n_estimators=200,
                     max_depth=4,
                     # min_child_weight=1,
                     gamma=0,
                     subsample=0.5,
                     colsample_bytree=0.7,
                     objective= 'binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
rnd_gini(xgb1)

Time to fit : -1 day, 23:51:28.319714
Train-gini : 0.366036985667
Test-gini : 0.282306951191
Difference-gini : 0.083730034476
Model file : pkl/RandomForest/2018-01-10 15:08:47.471931.pkl


In [24]:
xgb1 = XGBClassifier(learning_rate =0.05,
                     n_estimators=200,
                     max_depth=4,
                     # min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     #eval_metric ='auc',
                     nthread=4,
                     scale_pos_weight=1,
                     #n_jobs = -1,
                     seed=27)
rnd_gini(xgb1)

Time to fit : -1 day, 23:51:23.588497
Train-gini : 0.324838222356
Test-gini : 0.286266416983
Difference-gini : 0.0385718053724
Model file : pkl/RandomForest/2018-01-10 15:21:04.819062.pkl


In [25]:
xgb1 = XGBClassifier(learning_rate =0.05,
                     n_estimators=200,
                     max_depth=4,
                     # min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     #n_jobs = -1,
                     seed=27)
rnd_gini(xgb1)

Time to fit : -1 day, 23:50:55.546325
Train-gini : 0.324838222356
Test-gini : 0.286266416983
Difference-gini : 0.0385718053724
Model file : pkl/RandomForest/2018-01-10 16:03:13.047066.pkl


A partir d'ici la fonction rnd_clf est changée, clf.fit a été changé en rajoutant eval_metric = 'auc'

In [27]:
xgb1 = XGBClassifier(learning_rate =0.05,
                     n_estimators=200,
                     max_depth=4,
                     # min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     #n_jobs = -1,
                     seed=27)
rnd_gini(xgb1)

Time to fit : -1 day, 23:51:13.532016
Train-gini : 0.324838222356
Test-gini : 0.286266416983
Difference-gini : 0.0385718053724
Model file : pkl/RandomForest/2018-01-10 16:13:30.972190.pkl


# Tunning

In [29]:
def gini(pred, y):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def normalized_gini(pred, y):
    return gini(pred, y) / gini(y, y)

# custom normalized gini score for xgb model
def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini_f', normalized_gini(pred, y)

In [4]:
data_train = pd.read_csv('data/train.csv',index_col='id',na_values=-1)
data_test = pd.read_csv('data/test.csv',index_col='id',na_values=-1)

In [21]:
cat_feat = data_test.filter(like='cat', axis=1).columns.values.tolist()
bin_feat = data_test.filter(like='bin', axis=1).columns.values.tolist()
num_feat = list(set(list(data_test)) - set(cat_feat)- set(bin_feat))

predictors = cat_feat + bin_feat + num_feat

In [14]:
alg = XGBClassifier(
    learning_rate =0.1,
    n_estimators=50,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)

In [18]:
dtrain = data_train
cv_folds = 5
early_stopping_rounds=50
target = 'target'

xgb_param = alg.get_xgb_params()
xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
cvresult = xgb.cv(xgb_param,
                  xgtrain,
                  num_boost_round = alg.get_params()['n_estimators'],
                  nfold = cv_folds,
                  metrics='auc',
                  maximize = True,
                  early_stopping_rounds = early_stopping_rounds,
                  verbose_eval = 10,
                  #show_progress=False
                 )

[0]	train-auc:1+0	test-auc:1+0
[10]	train-auc:1+0	test-auc:1+0
[20]	train-auc:1+0	test-auc:1+0
[30]	train-auc:1+0	test-auc:1+0
[40]	train-auc:1+0	test-auc:1+0
