In [1]:
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn import metrics
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the tab-separated challenge dataset.
# Steps that were tried here and are currently disabled: filling missing
# values (with 0 or with "none") and sampling 5% of the rows.
data_path = 'ds_challenge_data_final.csv'
df = pd.read_csv(data_path, delimiter='\t')


In [3]:
def _holiday_or_none(value):
    """Return 'none' for a zero placeholder (numeric 0 or the text '0'), else the value unchanged."""
    if str(value).strip() == '0' or value == 0:
        return 'none'
    return value


# Normalise the "no holiday" placeholder before the column is one-hot encoded.
df['holiday_nm'] = df['holiday_nm'].apply(_holiday_or_none)

In [4]:
# Label column that the classifier predicts.
target = 'PRO'

# Columns excluded from the feature matrix (the label itself is among them).
cols_to_drop = ['Transaction_Date', 'Hash_Trans_ID', target]

# Categorical columns that get expanded into one-hot indicator columns.
cat_cols = ['holiday_nm']
df = pd.get_dummies(df, columns=cat_cols)

In [5]:
# Split on Transaction_Year: rows before 2015 are used for training and the
# 2015 rows are held out for testing.
#
# DataFrame.as_matrix()/Series.as_matrix() no longer exist in pandas >= 1.0;
# `.values` returns the same NumPy array and is what modelfit() already uses.
df_train = df[df['Transaction_Year'] < 2015]
train_targets = df_train[target].values.astype('float32')
train_data = df_train.drop(cols_to_drop, axis=1).values.astype('float32')

df_test = df[df['Transaction_Year'] == 2015]
test_targets = df_test[target].values.astype('float32')
test_data = df_test.drop(cols_to_drop, axis=1).values.astype('float32')

### Cross-validation training/testing

In [48]:
def modelfit(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=5,
             early_stopping_rounds=50, target_col=None):
    """Fit ``alg`` on ``dtrain`` and print accuracy / ROC AUC for train and test.

    When ``useTrainCV`` is true, ``xgb.cv`` is first run on the training rows
    with the estimator's own booster parameters (AUC metric, early stopping),
    and the estimator's ``n_estimators`` is overwritten with the number of
    rows in the returned CV table before the final fit.

    Parameters
    ----------
    alg : estimator exposing ``get_xgb_params``, ``get_params``,
        ``set_params``, ``fit``, ``predict`` and ``predict_proba``
        (e.g. ``xgb.XGBClassifier``). It is modified in place.
    dtrain, dtest : DataFrames holding both the predictor columns and the
        label column.
    predictors : list of column names used as features.
    useTrainCV : run the cross-validation step described above.
    cv_folds : number of folds passed to ``xgb.cv``.
    early_stopping_rounds : early-stopping patience passed to ``xgb.cv``.
    target_col : name of the label column. Defaults to the notebook-level
        ``target`` variable, which keeps existing calls working.

    Returns
    -------
    None. The four metrics are printed.
    """
    if target_col is None:
        # Backward-compatible default: the label name defined at notebook level.
        target_col = target

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target_col].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data.
    # NOTE: `eval_metric` is intentionally not passed to fit(). That keyword is
    # deprecated in the xgboost sklearn API and rejected by newer releases, and
    # because no eval_set is supplied it never influenced the fitted model.
    alg.fit(dtrain[predictors], dtrain[target_col])

    # Predict on both the training and the test rows.
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    dtest_predictions = alg.predict(dtest[predictors])
    dtest_predprob = alg.predict_proba(dtest[predictors])[:,1]

    # Print model report:
    print("Accuracy  (Train): %.4g" % metrics.accuracy_score(dtrain[target_col].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target_col], dtrain_predprob))
    print("Accuracy  (Test): %.4g" % metrics.accuracy_score(dtest[target_col].values, dtest_predictions))
    print("AUC Score (Test): %f" % metrics.roc_auc_score(dtest[target_col], dtest_predprob))

### Get the baseline

In [49]:
# Feature columns: every training column that is not listed in cols_to_drop,
# kept in their original order.
keep_mask = ~df_train.columns.isin(cols_to_drop)
predictors = df_train.columns[keep_mask].tolist()

In [50]:
# Baseline classifier: depth-2 trees, learning rate 0.2, up to 100 rounds.
baseline_params = dict(
    max_depth=2,
    min_child_weight=1,
    learning_rate=0.2,
    n_estimators=100,
    verbosity=1,
    n_jobs=-1,
)
xgb1 = xgb.XGBClassifier(**baseline_params)
modelfit(xgb1, df_train, df_test, predictors)

Accuracy  (Train): 0.6972
AUC Score (Train): 0.711693
Accuracy  (Test): 0.611
AUC Score (Test): 0.712133


## Tune Max_Depth and Min_Child_Weight

In [86]:
# Grid over tree depth (2..8) and minimum child weight (1..5); the remaining
# settings are held fixed.
fixed_params = dict(learning_rate=0.2, n_estimators=100, verbosity=1, n_jobs=-1)
for max_depth in range(2, 9):
    for min_child_weight in range(1, 6):
        header = "Max Depth: " + str(max_depth) + " Min Child Weight: " + str(min_child_weight)
        print(header)
        xgb1 = xgb.XGBClassifier(
            max_depth=max_depth,
            min_child_weight=min_child_weight,
            **fixed_params
        )
        modelfit(xgb1, df_train, df_test, predictors)

Max Depth: 2 Min Child Weight: 1
Accuracy  (Train): 0.6981
AUC Score (Train): 0.717564
Accuracy  (Test): 0.6024
AUC Score (Test): 0.711162
Max Depth: 2 Min Child Weight: 2
Accuracy  (Train): 0.6982
AUC Score (Train): 0.717187
Accuracy  (Test): 0.6054
AUC Score (Test): 0.710551
Max Depth: 2 Min Child Weight: 3
Accuracy  (Train): 0.6983
AUC Score (Train): 0.717304
Accuracy  (Test): 0.6021
AUC Score (Test): 0.710533
Max Depth: 2 Min Child Weight: 4
Accuracy  (Train): 0.6973
AUC Score (Train): 0.717515
Accuracy  (Test): 0.6019
AUC Score (Test): 0.710653
Max Depth: 2 Min Child Weight: 5
Accuracy  (Train): 0.6975
AUC Score (Train): 0.717723
Accuracy  (Test): 0.6008
AUC Score (Test): 0.710092
Max Depth: 3 Min Child Weight: 1
Accuracy  (Train): 0.7113
AUC Score (Train): 0.748538
Accuracy  (Test): 0.5893
AUC Score (Test): 0.707135
Max Depth: 3 Min Child Weight: 2
Accuracy  (Train): 0.7103
AUC Score (Train): 0.748209
Accuracy  (Test): 0.5923
AUC Score (Test): 0.705359
Max Depth: 3 Min Child Weig

## Tune Gamma

In [89]:
# Sweep gamma with the tree-shape settings fixed at max_depth=2,
# min_child_weight=1.
fixed_params = dict(
    max_depth=2,
    min_child_weight=1,
    learning_rate=0.2,
    n_estimators=100,
    verbosity=1,
    n_jobs=-1,
)
for gamma in (0, 1, 2, 5):
    header = "Gamma: " + str(gamma)
    print(header)
    xgb1 = xgb.XGBClassifier(gamma=gamma, **fixed_params)
    modelfit(xgb1, df_train, df_test, predictors)

Gamma: 0
Accuracy  (Train): 0.6981
AUC Score (Train): 0.717564
Accuracy  (Test): 0.6024
AUC Score (Test): 0.711162
Gamma: 1
Accuracy  (Train): 0.6981
AUC Score (Train): 0.717564
Accuracy  (Test): 0.6024
AUC Score (Test): 0.711162
Gamma: 2
Accuracy  (Train): 0.6981
AUC Score (Train): 0.717564
Accuracy  (Test): 0.6024
AUC Score (Test): 0.711162
Gamma: 5
Accuracy  (Train): 0.6984
AUC Score (Train): 0.717556
Accuracy  (Test): 0.6008
AUC Score (Test): 0.708985


## Tune subsample and colsample_bytree

In [92]:
# Try every (subsample, colsample_bytree) pair from {0.75, 0.90} x {0.75, 0.90},
# iterating subsample in the outer position.
fixed_params = dict(
    max_depth=2,
    min_child_weight=1,
    gamma=0,
    learning_rate=0.2,
    n_estimators=100,
    verbosity=1,
    n_jobs=-1,
)
sampling_grid = [(row_frac, col_frac) for row_frac in (0.75, 0.90) for col_frac in (0.75, 0.90)]
for subsample, colsample_bytree in sampling_grid:
    header = "subsample: " + str(subsample) + " colsample_bytree: " + str(colsample_bytree)
    print(header)
    xgb1 = xgb.XGBClassifier(
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        **fixed_params
    )
    modelfit(xgb1, df_train, df_test, predictors)

subsample: 0.75 colsample_bytree: 0.75
Accuracy  (Train): 0.6974
AUC Score (Train): 0.718032
Accuracy  (Test): 0.6159
AUC Score (Test): 0.712284
subsample: 0.75 colsample_bytree: 0.9
Accuracy  (Train): 0.6981
AUC Score (Train): 0.718903
Accuracy  (Test): 0.6029
AUC Score (Test): 0.708487
subsample: 0.9 colsample_bytree: 0.75
Accuracy  (Train): 0.6982
AUC Score (Train): 0.718381
Accuracy  (Test): 0.6124
AUC Score (Test): 0.713309
subsample: 0.9 colsample_bytree: 0.9
Accuracy  (Train): 0.6979
AUC Score (Train): 0.718415
Accuracy  (Test): 0.5932
AUC Score (Test): 0.709267


## Tuning Regularization Parameter (Alpha)

In [93]:
# Sweep the L1 regularisation term (reg_alpha) with subsample=0.9 and
# colsample_bytree=0.75 held fixed.
fixed_params = dict(
    max_depth=2,
    min_child_weight=1,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.75,
    learning_rate=0.2,
    n_estimators=100,
    verbosity=1,
    n_jobs=-1,
)
for reg_alpha in (1e-5, 1e-2, 0.1, 1, 100):
    header = "reg_alpha: " + str(reg_alpha)
    print(header)
    xgb1 = xgb.XGBClassifier(reg_alpha=reg_alpha, **fixed_params)
    modelfit(xgb1, df_train, df_test, predictors)

reg_alpha: 1e-05
Accuracy  (Train): 0.6982
AUC Score (Train): 0.718381
Accuracy  (Test): 0.6124
AUC Score (Test): 0.713309
reg_alpha: 0.01
Accuracy  (Train): 0.6983
AUC Score (Train): 0.718431
Accuracy  (Test): 0.6086
AUC Score (Test): 0.712294
reg_alpha: 0.1
Accuracy  (Train): 0.6976
AUC Score (Train): 0.718475
Accuracy  (Test): 0.6111
AUC Score (Test): 0.712852
reg_alpha: 1
Accuracy  (Train): 0.6985
AUC Score (Train): 0.717625
Accuracy  (Test): 0.6101
AUC Score (Test): 0.712563
reg_alpha: 100
Accuracy  (Train): 0.6915
AUC Score (Train): 0.694471
Accuracy  (Test): 0.5698
AUC Score (Test): 0.699578


## Tune Learning Rate

In [95]:
# Coarse learning-rate sweep from 0.1 to 1.0 in steps of 0.1.
fixed_params = dict(
    max_depth=2,
    min_child_weight=1,
    reg_alpha=1e-5,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.75,
    n_estimators=100,
    verbosity=1,
    n_jobs=-1,
)
coarse_rates = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
for learning_rate in coarse_rates:
    header = "learning_rate: " + str(learning_rate)
    print(header)
    xgb1 = xgb.XGBClassifier(learning_rate=learning_rate, **fixed_params)
    modelfit(xgb1, df_train, df_test, predictors)

learning_rate: 0.1
Accuracy  (Train): 0.6924
AUC Score (Train): 0.701357
Accuracy  (Test): 0.5311
AUC Score (Test): 0.697049
learning_rate: 0.2
Accuracy  (Train): 0.6982
AUC Score (Train): 0.718381
Accuracy  (Test): 0.6124
AUC Score (Test): 0.713309
learning_rate: 0.3
Accuracy  (Train): 0.7035
AUC Score (Train): 0.725752
Accuracy  (Test): 0.6235
AUC Score (Test): 0.711375
learning_rate: 0.4
Accuracy  (Train): 0.7072
AUC Score (Train): 0.732592
Accuracy  (Test): 0.6267
AUC Score (Test): 0.707841
learning_rate: 0.5
Accuracy  (Train): 0.7089
AUC Score (Train): 0.735258
Accuracy  (Test): 0.642
AUC Score (Test): 0.711854
learning_rate: 0.6
Accuracy  (Train): 0.7113
AUC Score (Train): 0.740971
Accuracy  (Test): 0.6329
AUC Score (Test): 0.706418
learning_rate: 0.7
Accuracy  (Train): 0.7117
AUC Score (Train): 0.742178
Accuracy  (Test): 0.6264
AUC Score (Test): 0.697784
learning_rate: 0.8
Accuracy  (Train): 0.7125
AUC Score (Train): 0.742344
Accuracy  (Test): 0.6295
AUC Score (Test): 0.698846
l

In [96]:
# Finer learning-rate sweep around 0.2 (0.12 .. 0.22).
fixed_params = dict(
    max_depth=2,
    min_child_weight=1,
    reg_alpha=1e-5,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.75,
    n_estimators=100,
    verbosity=1,
    n_jobs=-1,
)
fine_rates = (0.12, 0.14, 0.16, 0.18, 0.2, 0.22)
for learning_rate in fine_rates:
    header = "learning_rate: " + str(learning_rate)
    print(header)
    xgb1 = xgb.XGBClassifier(learning_rate=learning_rate, **fixed_params)
    modelfit(xgb1, df_train, df_test, predictors)

learning_rate: 0.12
Accuracy  (Train): 0.6933
AUC Score (Train): 0.707418
Accuracy  (Test): 0.5659
AUC Score (Test): 0.705617
learning_rate: 0.14
Accuracy  (Train): 0.6943
AUC Score (Train): 0.709667
Accuracy  (Test): 0.5725
AUC Score (Test): 0.706899
learning_rate: 0.16
Accuracy  (Train): 0.6956
AUC Score (Train): 0.712565
Accuracy  (Test): 0.5805
AUC Score (Test): 0.705057
learning_rate: 0.18
Accuracy  (Train): 0.6973
AUC Score (Train): 0.715888
Accuracy  (Test): 0.5869
AUC Score (Test): 0.706624
learning_rate: 0.2
Accuracy  (Train): 0.6982
AUC Score (Train): 0.718381
Accuracy  (Test): 0.6124
AUC Score (Test): 0.713309
learning_rate: 0.22
Accuracy  (Train): 0.6988
AUC Score (Train): 0.720001
Accuracy  (Test): 0.6116
AUC Score (Test): 0.711649


## Tune Number of Estimators

In [111]:
# Sweep the boosting-round budget (n_estimators) with all other settings fixed.
fixed_params = dict(
    max_depth=2,
    min_child_weight=1,
    reg_alpha=1e-5,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.75,
    learning_rate=0.2,
    verbosity=1,
    n_jobs=-1,
)
for n_estimators in (5, 10, 25, 50, 100, 250, 500):
    header = "n_estimators: " + str(n_estimators)
    print(header)
    xgb1 = xgb.XGBClassifier(n_estimators=n_estimators, **fixed_params)
    modelfit(xgb1, df_train, df_test, predictors)

n_estimators: 5
Accuracy  (Train): 0.6808
AUC Score (Train): 0.655191
Accuracy  (Test): 0.4705
AUC Score (Test): 0.631883
n_estimators: 10
Accuracy  (Train): 0.6835
AUC Score (Train): 0.664935
Accuracy  (Test): 0.4941
AUC Score (Test): 0.669287
n_estimators: 25
Accuracy  (Train): 0.6886
AUC Score (Train): 0.685363
Accuracy  (Test): 0.546
AUC Score (Test): 0.695893
n_estimators: 50
Accuracy  (Train): 0.6923
AUC Score (Train): 0.702431
Accuracy  (Test): 0.5789
AUC Score (Test): 0.705769
n_estimators: 100
Accuracy  (Train): 0.6982
AUC Score (Train): 0.718381
Accuracy  (Test): 0.6124
AUC Score (Test): 0.713309
n_estimators: 250
Accuracy  (Train): 0.7109
AUC Score (Train): 0.742967
Accuracy  (Test): 0.6447
AUC Score (Test): 0.719179
n_estimators: 500
Accuracy  (Train): 0.7278
AUC Score (Train): 0.769086
Accuracy  (Test): 0.656
AUC Score (Test): 0.717381


## Best Model So Far

In [51]:
# Settings chosen from the sweeps in this notebook, with n_estimators=250.
best_params = dict(
    max_depth=2,
    min_child_weight=1,
    reg_alpha=1e-5,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.75,
    learning_rate=0.2,
    n_estimators=250,
    verbosity=1,
    n_jobs=-1,
)
best_mdl = xgb.XGBClassifier(**best_params)

In [52]:
# Fit the selected model and print its train/test accuracy and AUC.
modelfit(
    alg=best_mdl,
    dtrain=df_train,
    dtest=df_test,
    predictors=predictors,
)

Accuracy  (Train): 0.7088
AUC Score (Train): 0.737140
Accuracy  (Test): 0.6547
AUC Score (Test): 0.718894
