In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import cross_val_score

In [3]:
# Read the feature table used by the first two tuning rounds into a DataFrame.
cust_df = pd.read_csv("cust_df_selected_tuesday2.csv")

In [4]:
# Preview the first rows, transposed so that each column of the frame is listed as a row.
cust_df.head().T

Unnamed: 0,0,1,2,3,4
session_id,exxxxxshop.si-0.00021101047198248773,exxxxxshop.si-0.0002712991769484001,exxxxxshop.si-0.00033158788092821383,exxxxxshop.si-0.00039187658370279567,exxxxxshop.si-0.0004521652850530127
TARGET_successful_purchase,-1,-1,-1,-1,-1
test_or_train_flag,1,1,1,1,1
last_basket_element_number,1,3,12,1,4
last_click_num,1,4,60,13,56
last_customer_age,-1,42,31,35,-1
last_customer_value,-1,491,548,504,-1
last_duration_of_session,49.166,384.522,1981.1,1471.28,1271.37
last_last_order_of_customer,-1,64,42,15,-1
last_level_of_purchasing_process,1,1,1,1,1


In [5]:
# (number of rows, number of columns) of the loaded frame.
cust_df.shape

(49086, 37)

In [6]:
# Model inputs are every column after the first three; per the preview of cust_df those
# three are session_id, the target and the train/test flag.
# ("bemeno_valtozok" is Hungarian for "input variables".)
bemeno_valtozok = cust_df.columns[3:].tolist()
# Label column that the classifiers are trained to predict.
target = 'TARGET_successful_purchase'

In [25]:
# Number of input columns handed to the model.
len(bemeno_valtozok)

34

In [7]:
# Partition the rows on the flag column: 0 goes to `train`, 1 goes to `test`.
# Each part is copied so that columns added later do not write into a slice of cust_df.
split_flag = cust_df['test_or_train_flag']
train = cust_df.loc[split_flag == 0].copy()
test = cust_df.loc[split_flag == 1].copy()
print("train: ", train.shape)
print("test: ", test.shape)

train:  (24584, 37)
test:  (24502, 37)


# XGBoost and grid search
Ideas from:  
https://towardsdatascience.com/a-beginners-guide-to-xgboost-87f5d4c30ed7  
https://xgboost.readthedocs.io/en/latest/python/python_api.html

In [8]:
import xgboost as xgb

In [9]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Base estimator handed to the grid search.
# NOTE(review): random_state is fixed here while the parameter grid also sets `seed`;
# confirm which of the two the installed xgboost version actually honours.
xgb_model = xgb.XGBClassifier(random_state=42)

In [14]:
# Round-1 search space. Keys with a single candidate are held constant across all fits.
# NOTE(review): `silent` is reported as deprecated in favour of `verbosity` by newer
# xgboost releases -- confirm against the installed version before changing it.
parameters = dict(
    gamma=[0.1, 0.4, 1, 5],
    learning_rate=[0.05, 0.1, 0.15, 0.2],  # xgboost's `eta`
    max_depth=[5, 10, 15],
    min_child_weight=[5, 10],
    silent=[1],
    subsample=[0.8],
    colsample_bytree=[0.5, 1.0],
    n_estimators=[20, 40],  # number of trees; raise towards 1000 for better results
    missing=[-1],  # value the model should treat as "missing"
    seed=[1337],
)

In [15]:
# Grid search over `parameters`; candidates are ranked by 3-fold cross-validated ROC AUC
# and the fits are spread over five parallel workers.
clf = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=5,
    verbose=2,
)

In [16]:
# Run the search on the training rows only (every grid combination x 3 CV folds).
clf.fit(train[bemeno_valtozok], train[target])

Fitting 3 folds for each of 384 candidates, totalling 1152 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  2.8min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  9.9min
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed: 21.8min
[Parallel(n_jobs=5)]: Done 638 tasks      | elapsed: 40.4min
[Parallel(n_jobs=5)]: Done 1003 tasks      | elapsed: 76.3min
[Parallel(n_jobs=5)]: Done 1152 out of 1152 | elapsed: 88.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=42, reg_alpha=0, reg_...
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=5,
             param_grid={'colsample_bytree': [0.5, 1.0],
                         'gamma': [0.1, 0.4, 1, 5],
                         'learning_rate': [0.05, 0.1, 0.15, 0.2],
                         'max_depth': [5, 10, 15], 'min_child_weight': [5, 10],
 

In [17]:
# Mean cross-validated ROC AUC of the best parameter combination.
print(clf.best_score_)

0.8947999466084426


In [18]:
# Parameter combination that achieved the best score.
clf.best_params_

{'colsample_bytree': 1.0,
 'gamma': 1,
 'learning_rate': 0.15,
 'max_depth': 5,
 'min_child_weight': 5,
 'missing': -1,
 'n_estimators': 40,
 'seed': 1337,
 'silent': 1,
 'subsample': 0.8}

In [19]:
# Estimator carrying the winning parameters. With GridSearchCV's default refit=True it
# has already been refit on the full data passed to clf.fit().
bestmodel = clf.best_estimator_
bestmodel

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=1,
              learning_rate=0.15, max_delta_step=0, max_depth=5,
              min_child_weight=5, missing=-1, n_estimators=40, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1337,
              silent=1, subsample=0.8, verbosity=1)

Evaluation:

In [20]:
# 10-fold ROC AUC of the selected configuration on the training rows.
# NOTE(review): the hyper-parameters were chosen on these same rows, so this estimate
# is probably somewhat optimistic.
scores_best = cross_val_score(
    bestmodel,
    train[bemeno_valtozok],
    train[target],
    scoring='roc_auc',
    cv=10,
)
# Mean, worst fold and best fold, one value per line.
print(scores_best.mean(), scores_best.min(), scores_best.max(), sep="\n")

0.8955969706184795
0.886958164320369
0.9072021747912293


In [21]:
# Store the second predict_proba column (presumably the probability of a successful
# purchase -- verify the class order) as 'tipp', Hungarian for "guess".
test['tipp'] = bestmodel.predict_proba(test[bemeno_valtozok])[:,1]

In [22]:
# Submission frame: session id plus the predicted probability under the name 'prob'.
sub = test[['session_id', 'tipp']].rename(columns={'tipp': 'prob'})

In [23]:
# Quick look at the first submission rows.
sub.head()

Unnamed: 0,session_id,prob
0,exxxxxshop.si-0.00021101047198248773,0.041005
1,exxxxxshop.si-0.0002712991769484001,0.801088
2,exxxxxshop.si-0.00033158788092821383,0.850897
3,exxxxxshop.si-0.00039187658370279567,0.737692
4,exxxxxshop.si-0.0004521652850530127,0.52809


In [24]:
# Write the round-1 submission; the DataFrame index is left out of the file.
sub.to_csv("submission_tue_xgb.csv", index=False)

Public score: 89.2 %

In [26]:
# Same settings as the round-1 winner except n_estimators (100 here instead of 40),
# to check whether more trees improve the cross-validated score.
model2_params = dict(
    colsample_bytree=1.0,
    gamma=1,
    learning_rate=0.15,
    max_depth=5,
    min_child_weight=5,
    missing=-1,
    n_estimators=100,
    seed=1337,
    silent=1,
    subsample=0.8,
    random_state=42,
)
model2 = xgb.XGBClassifier(**model2_params)

In [27]:
# 10-fold ROC AUC of the 100-tree variant on the training rows.
scores_2 = cross_val_score(
    model2,
    train[bemeno_valtozok],
    train[target],
    scoring='roc_auc',
    cv=10,
)
# Mean, worst fold and best fold, one value per line.
print(scores_2.mean(), scores_2.min(), scores_2.max(), sep="\n")

0.8953819875309511
0.8846019247594051
0.9083426124764707


---

## 2nd round

In [28]:
# Round 2: a narrower grid around the round-1 optimum (gamma and colsample_bytree are
# pinned to the round-1 winners; learning rate, depth and child weight vary around them).
# Note that this rebinds `parameters`, replacing the round-1 grid.
xgb_model2 = xgb.XGBClassifier(random_state=42)
parameters = dict(
    gamma=[1],
    learning_rate=[0.12, 0.15, 0.18],  # xgboost's `eta`
    max_depth=[4, 5, 6],
    min_child_weight=[3, 5],
    silent=[1],
    subsample=[0.8],
    colsample_bytree=[1.0],
    n_estimators=[50, 100],  # number of trees; raise towards 1000 for better results
    missing=[-1],
    seed=[1337],
)
# Same search protocol as round 1: 3-fold CV, ROC AUC, five workers.
clf2 = GridSearchCV(
    estimator=xgb_model2,
    param_grid=parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=5,
    verbose=2,
)

In [29]:
# Run the round-2 search on the training rows.
clf2.fit(train[bemeno_valtozok], train[target])

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  4.8min
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed: 17.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=42, reg_alpha=0, reg_...
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=5,
             param_grid={'colsample_bytree': [1.0], 'gamma': [1],
                         'learning_rate': [0.12, 0.15, 0.18],
                         'max_depth': [4, 5, 6]

In [30]:
# Mean cross-validated ROC AUC of the best round-2 combination.
print(clf2.best_score_)

0.8958026217552304


In [31]:
# Parameter combination that won round 2.
clf2.best_params_

{'colsample_bytree': 1.0,
 'gamma': 1,
 'learning_rate': 0.12,
 'max_depth': 6,
 'min_child_weight': 3,
 'missing': -1,
 'n_estimators': 50,
 'seed': 1337,
 'silent': 1,
 'subsample': 0.8}

In [32]:
# Round-2 winner; with GridSearchCV's default refit=True it has already been refit on
# the full data passed to clf2.fit().
bestmodel2 = clf2.best_estimator_
bestmodel2

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=1,
              learning_rate=0.12, max_delta_step=0, max_depth=6,
              min_child_weight=3, missing=-1, n_estimators=50, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1337,
              silent=1, subsample=0.8, verbosity=1)

In [33]:
# 10-fold ROC AUC of the round-2 winner on the training rows.
scores_best2 = cross_val_score(
    bestmodel2,
    train[bemeno_valtozok],
    train[target],
    scoring='roc_auc',
    cv=10,
)
# Mean, worst fold and best fold, one value per line.
print(scores_best2.mean(), scores_best2.min(), scores_best2.max(), sep="\n")

0.896579345803268
0.8860024125772158
0.9081961345740873


In [34]:
# Overwrite 'tipp' with the round-2 model's second predict_proba column.
test['tipp'] = bestmodel2.predict_proba(test[bemeno_valtozok])[:,1]

In [35]:
# Round-2 submission frame: session id plus the prediction renamed to 'prob'.
sub = test[['session_id', 'tipp']].rename(columns={'tipp': 'prob'})
sub.head()

Unnamed: 0,session_id,prob
0,exxxxxshop.si-0.00021101047198248773,0.027484
1,exxxxxshop.si-0.0002712991769484001,0.8547
2,exxxxxshop.si-0.00033158788092821383,0.852925
3,exxxxxshop.si-0.00039187658370279567,0.70791
4,exxxxxshop.si-0.0004521652850530127,0.460031


In [36]:
# Write the round-2 submission; the DataFrame index is left out of the file.
sub.to_csv("submission_tue_xgb2.csv", index=False)

Public score: 0.89310

---  
# 3rd round - on the full cust_df

In [37]:
# Load the feature table used for the third tuning round.
cust_df3 = pd.read_csv("cust_df_tuesday2.csv")

In [38]:
# (number of rows, number of columns) of the round-3 frame.
cust_df3.shape

(49086, 82)

In [39]:
# Round-3 inputs: every column of cust_df3 after the first three.
# NOTE(review): assumes cust_df3 starts with the same three non-feature columns as
# cust_df (session id, target, train/test flag) -- confirm.
bemeno_valtozok3 = cust_df3.columns[3:].tolist()
target = 'TARGET_successful_purchase'
len(bemeno_valtozok3)

79

In [40]:
# Partition the round-3 frame on the flag column: 0 goes to `train3`, 1 goes to `test3`.
# Each part is copied so later column assignments do not write into a slice of cust_df3.
split_flag3 = cust_df3['test_or_train_flag']
train3 = cust_df3.loc[split_flag3 == 0].copy()
test3 = cust_df3.loc[split_flag3 == 1].copy()
print("train: ", train3.shape)
print("test: ", test3.shape)

train:  (24584, 82)
test:  (24502, 82)


In [43]:
# Round 3: grid around the round-2 optimum, to be fitted on the cust_df3 feature set.
# Note that this rebinds `parameters`, replacing the previous grid.
xgb_model3 = xgb.XGBClassifier(random_state=42)
parameters = dict(
    gamma=[1],
    learning_rate=[0.12, 0.15, 0.18],  # xgboost's `eta`
    max_depth=[5, 6, 10],
    min_child_weight=[2, 3],
    silent=[1],
    subsample=[0.8],
    colsample_bytree=[1.0],
    n_estimators=[40, 50, 60],  # number of trees; raise towards 1000 for better results
    missing=[-1],
    seed=[1337],
)
# Same search protocol as the earlier rounds: 3-fold CV, ROC AUC, five workers.
clf3 = GridSearchCV(
    estimator=xgb_model3,
    param_grid=parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=5,
    verbose=2,
)

In [44]:
# Run the round-3 search on the training rows of cust_df3.
clf3.fit(train3[bemeno_valtozok3], train3[target])

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  7.9min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed: 37.2min
[Parallel(n_jobs=5)]: Done 162 out of 162 | elapsed: 40.1min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=42, reg_alpha=0, reg_...
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=5,
             param_grid={'colsample_bytree': [1.0], 'gamma': [1],
                         'learning_rate': [0.12, 0.15, 0.18],
                         'max_depth': [5, 6, 10

In [45]:
# Best mean cross-validated ROC AUC of round 3, then the parameters that achieved it.
print(clf3.best_score_)
clf3.best_params_

0.8960514552714361


{'colsample_bytree': 1.0,
 'gamma': 1,
 'learning_rate': 0.12,
 'max_depth': 6,
 'min_child_weight': 3,
 'missing': -1,
 'n_estimators': 60,
 'seed': 1337,
 'silent': 1,
 'subsample': 0.8}

In [46]:
# Round-3 winner; with GridSearchCV's default refit=True it has already been refit on
# the full data passed to clf3.fit().
bestmodel3 = clf3.best_estimator_
bestmodel3

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=1,
              learning_rate=0.12, max_delta_step=0, max_depth=6,
              min_child_weight=3, missing=-1, n_estimators=60, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1337,
              silent=1, subsample=0.8, verbosity=1)

In [49]:
# 10-fold ROC AUC of the round-3 winner on the cust_df3 training rows.
scores_best3 = cross_val_score(
    bestmodel3,
    train3[bemeno_valtozok3],
    train3[target],
    scoring='roc_auc',
    cv=10,
)
# Mean, worst fold and best fold, one value per line.
print(scores_best3.mean(), scores_best3.min(), scores_best3.max(), sep="\n")

0.8977530567336036
0.8875036453776611
0.9116022429014556


In [50]:
# Score the round-3 model and build the submission entirely from `test3`, the frame the
# features come from. Previously the predictions were written into `test` (rows of a
# different CSV) and paired with its session ids. Assigning a NumPy array is positional,
# so that was only correct while both files had exactly the same row order, and it also
# overwrote the round-2 'tipp' column.
# NOTE(review): assumes cust_df3 has a 'session_id' column like cust_df -- confirm.
test3['tipp'] = bestmodel3.predict_proba(test3[bemeno_valtozok3])[:, 1]
# Submission frame: session id plus the prediction renamed to 'prob'.
sub = test3[['session_id', 'tipp']].rename(columns={'tipp': 'prob'})
sub.head()

Unnamed: 0,session_id,prob
0,exxxxxshop.si-0.00021101047198248773,0.027691
1,exxxxxshop.si-0.0002712991769484001,0.840585
2,exxxxxshop.si-0.00033158788092821383,0.887177
3,exxxxxshop.si-0.00039187658370279567,0.729537
4,exxxxxshop.si-0.0004521652850530127,0.36438


In [51]:
# Write the round-3 submission; the DataFrame index is left out of the file.
sub.to_csv("submission_tue_xgb3.csv", index=False)

Public score: 0.89526

Ideas for next steps:
* frequency encode: https://www.kaggle.com/cdeotte/200-magical-models-santander-0-920  
* ensemble more models: https://towardsdatascience.com/two-is-better-than-one-ensembling-models-611ee4fa9bd8  