## Model Building in XGBoost

This is a great article for tunning XGboost: http://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [1]:
import os
windows=False
if (windows):
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
    os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import time
import csv
import boto # to download from AWS S3 buckets

import pickle
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 8
import math

_start_time = time.time()

def tic():
    global _start_time 
    _start_time = time.time()

def tac():
    t_sec = round(time.time() - _start_time)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60) 
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour,t_min,t_sec))

In [3]:
#Let's define variables that will define the behaviour of the whole script
s3_path = 'http://bbts-kaggle.s3.amazonaws.com/bimbo/Pablo/'
use_validation=True
scale_numericals=True
onehot_categoricals=False
lag = 4
num_clusters_cliente = 4000
shifted_target = True
trimmed = True

In [4]:
#now we load our modified train and test set
tic()
sufix=""
if (use_validation): sufix += "_holdout"
if (scale_numericals): sufix += "_scaled"
if (onehot_categoricals): sufix += "_onehot"
if (shifted_target): sufix += "_shifted"
sufix += "_" + repr(lag) + "lags"
sufix += "_" + repr(num_clusters_cliente) + "clusters"

print ('Downloading File: train_modified{}.csv  ...'.format(sufix))
train = pd.read_csv("{}train_modified{}.csv".format(s3_path,sufix),
                    dtype = {'Canal_ID': 'int8',
                            'log_target':  'float64',
                            'Log_Target_mean_lag1': 'float64',
                            'Log_Target_mean_lag2': 'float64',
                            'Log_Target_mean_lag3': 'float64',
                            'Log_Target_mean_lag4': 'float64',
                            'Log_Target_mean_lag5': 'float64',
                            'Lags_sum': 'float64',
                            'pairs_mean':  'float64',
                            'brand': 'int8',
                            'prodtype_cluster': 'int32',
                            'Qty_Ruta_SAK_Bin': 'int32',
                            'ZipCode': 'uint32',
                            'week_ct': 'int8',
                            'NombreCliente': 'int32',
                            'Producto_ID_clust_ID':'int32',
                            'Ruta_SAK_clust_ID':'int32',
                            'Agencia_ID_clust_ID':'int32',
                            'Cliente_ID_clust_ID':'int32'},
                   )

print ('Downloading File: val_modified{}.csv  ...'.format(sufix))
val = pd.read_csv("{}val_modified{}.csv".format(s3_path,sufix),
                    dtype = {'Canal_ID': 'int8',
                            'log_target':  'float64',
                            'Log_Target_mean_lag1': 'float64',
                            'Log_Target_mean_lag2': 'float64',
                            'Log_Target_mean_lag3': 'float64',
                            'Log_Target_mean_lag4': 'float64',
                            'Log_Target_mean_lag5': 'float64',
                            'Lags_sum': 'float64',
                            'pairs_mean':  'float64',
                            'brand': 'int8',
                            'prodtype_cluster': 'int32',
                            'Qty_Ruta_SAK_Bin': 'int32',
                            'ZipCode': 'uint32',
                            'week_ct': 'int8',
                            'NombreCliente': 'int32',
                            'Producto_ID_clust_ID':'int32',
                            'Ruta_SAK_clust_ID':'int32',
                            'Agencia_ID_clust_ID':'int32',
                            'Cliente_ID_clust_ID':'int32'},
                   ) 

print ('Downloading File: test_modified{}.csv  ...'.format(sufix))
test = pd.read_csv("{}test_modified{}.csv".format(s3_path,sufix),
                    dtype = {'id': 'uint32',
                            'Canal_ID': 'int8',
                            'Log_Target_mean_lag1': 'float64',
                            'Log_Target_mean_lag2': 'float64',
                            'Log_Target_mean_lag3': 'float64',
                            'Log_Target_mean_lag4': 'float64',
                            'Log_Target_mean_lag5': 'float64',
                            'Lags_sum': 'float64',
                            'pairs_mean':  'float64',
                            'brand': 'int8',
                            'prodtype_cluster': 'int32',
                            'Qty_Ruta_SAK_Bin': 'int32',
                            'ZipCode': 'uint32',
                            'week_ct': 'int8',
                            'NombreCliente': 'int32',
                            'Producto_ID_clust_ID':'int32',
                            'Ruta_SAK_clust_ID':'int32',
                            'Agencia_ID_clust_ID':'int32',
                            'Cliente_ID_clust_ID':'int32'},
                      )
tac()

Downloading File: train_modified_holdout_scaled_shifted_4lags_4000clusters.csv  ...
Downloading File: val_modified_holdout_scaled_shifted_4lags_4000clusters.csv  ...
Downloading File: test_modified_holdout_scaled_shifted_4lags_4000clusters.csv  ...
Time passed: 0hour:4min:46sec


In [11]:
#Define target and ID columns:
target = 'log_target'
IDcol = 'id'

## Train multiple models per client cluster

Ok, so we said on our prior step (Models wiht scikit-learn) that we need to deal with the data set high variance. Let's do this first:

Looking at the plot below, created on the clustering-by-demand on the feature engineering notebook, we see that some client clusters behave very differntly from others. So this explain why our model is failing on predicting accurately for all of them.
We are going then to create a wrapper function to create as many models as Client Clusters by demand are (Cliente_ID_clust_ID). The scores should be bettter individually, and the concatenation of all 400 models should yield a better overall RSMLE than our baseline 0.47.

![Image of Variables vs Hypothesis](./input-data/h2o-clustByDem_Cliente_ID_400.png)

In [6]:
import xgboost as xgb
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

def model_fit(alg, ctrain, cval, ctest, predictors, target, IDcol):
    
    #Fit the algorithm on the data
    watchlist = [(cval[predictors], cval[target])]
    alg.fit(ctrain[predictors], ctrain[target], eval_set=watchlist, eval_metric='rmse', early_stopping_rounds=20, verbose=False)


    #Predict training set:
    ctrain["predictions"] = alg.predict(ctrain[predictors])
    ctrain["predictions"] = np.maximum(ctrain["predictions"], 0)

    
    #Predict validation (holdout) set:
    cval["predictions"] = alg.predict(cval[predictors])
    cval["predictions"] = np.maximum(cval["predictions"], 0)# we make all negative numbers = 0 since there cannot be a negative demand

    
    #Predict on testing data: we need to revert it back to target by applying expm1
    ctest[target] = alg.predict(ctest[predictors])
    ctest[target] = np.maximum(ctest[target], 0) # we make all negative numbers = 0 since there cannot be a negative demand
    
    return ctrain[[target,"predictions","Demanda_uni_equil"]], cval[[target,"predictions","Demanda_uni_equil"]], ctest[[IDcol,target]]
    

In [7]:
def clusters_fit (alg, dtrain, dval, dtest, predictors, target, IDcol):
    
    train_predictions = pd.DataFrame(index=[target,"predictions","Demanda_uni_equil"])
    val_predictions = pd.DataFrame(index=[target,"predictions","Demanda_uni_equil"])
    test_predictions = pd.DataFrame(index=[IDcol,target])
    
    clusters_list = train.Cliente_ID_clust_ID.drop_duplicates().get_values()
    
    for cluster in clusters_list:
        
        #we get the cluster train,val, test data

        ctrain = dtrain.loc[dtrain["Cliente_ID_clust_ID"] == cluster]
        cval   = dval.loc[dval["Cliente_ID_clust_ID"] == cluster]
        ctest  = dtest.loc[dtest["Cliente_ID_clust_ID"] == cluster]
        
        #we train the cluster
        ctrain, cval, ctest = model_fit(model, ctrain, cval, ctest, predictors, target, IDcol)
        
        #rsmle_train =  np.sqrt(metrics.mean_squared_error(ctrain[target], ctrain["predictions"]))
        #rsmle_val = np.sqrt(metrics.mean_squared_error(cval[target], cval["predictions"]))
            
        #concatenate each cluster result
        train_predictions = pd.concat([train_predictions,ctrain],ignore_index=True)
        val_predictions = pd.concat([val_predictions,cval],ignore_index=True)
        test_predictions = pd.concat([test_predictions,ctest],ignore_index=True)
        
        #train_predictions.dropna(axis=0, how='any',inplace=True)
        #val_predictions.dropna(axis=0, how='any',inplace=True)
        #test_predictions.dropna(axis=0, how='any',inplace=True)
          
        #acc_rsmle_train =  np.sqrt(metrics.mean_squared_error(train_predictions[target], train_predictions["predictions"]))
        #acc_rsmle_val = np.sqrt(metrics.mean_squared_error(val_predictions[target], val_predictions["predictions"]))
        #rows_pct = cval.shape[0]*100/dval.shape[0]

        #print('Cluster: {:.0f}  RMSLE T: {:.4f} RMSLE V: {:.4f}  RowsPct: {:.4f} - ACC. RSMLE TRAIN: {:.4f} - ACC. RSMLE VAL: {:.4f}'.format(
        #      cluster, rsmle_train, rsmle_val ,rows_pct, acc_rsmle_train, acc_rsmle_val))
    
        
    #For some reason this function is adding to NaN rows at the beggining, I don't know why, but we'll remove them
    train_predictions.dropna(axis=0, how='any',inplace=True)
    val_predictions.dropna(axis=0, how='any',inplace=True)
    test_predictions.dropna(axis=0, how='any',inplace=True)
    
    return train_predictions, val_predictions, test_predictions
    

In [8]:
def report_submit(dtrain, dval, dtest, filename):
    
    #Print model report:
    print ("\nModel Report")
    if(shifted_target):
        print ('RMSLE TRAIN: ', np.sqrt(metrics.mean_squared_error(np.log1p(dtrain["Demanda_uni_equil"]), dtrain["predictions"])))
        print ('RMSLE VAL: ', np.sqrt(metrics.mean_squared_error(np.log1p(dval["Demanda_uni_equil"]), dval["predictions"])))
    else:
        print ('RMSLE TRAIN: ', np.sqrt(metrics.mean_squared_error(dtrain[target], dtrain["predictions"])))
        print ('RMSLE VAL: ', np.sqrt(metrics.mean_squared_error(dval[target], dval["predictions"])))
    
    #Predict on testing data: we need to revert it back to target by applying expm1
    dtest[target] = np.expm1(dtest[target])
    dtest[target] = np.maximum(dtest[target], 0) # we make all negative numbers = 0 since there cannot be a negative demand
  
    
    print ('NUM ROWS PREDICTED: ', dtest.shape[0] )
    print ('NUM NEGATIVES PREDICTED: ', dtest[target][dtest[target] < 0].count())
    print ('MIN TARGET PREDICTED: ', dtest[target].min())
    print ('MEAN TARGET PREDICTED: ', dtest[target].mean())
    print ('MAX TARGET PREDICTED: ', dtest[target].max())
    
    #Export submission file:
    submission = dtest.copy()
    submission[IDcol] = submission[IDcol].astype(int)
    submission.rename(columns={target: 'Demanda_uni_equil'}, inplace=True)
    submission.to_csv("./Submissions/"+filename, index=False)
    

### Alg6 - XGB - Train each client cluster separately

Let's try training each of the client clusters separately and see if we have good results

In [None]:
predictors = ['Canal_ID', 'Log_Target_mean_lag1', 'Log_Target_mean_lag2', 'Log_Target_mean_lag3', 'Log_Target_mean_lag4', 
              'Agencia_ID','Ruta_SAK','Cliente_ID','Producto_ID',
              'Lags_sum', 'brand', 'prodtype_cluster', 'Qty_Ruta_SAK_Bin', 'ZipCode', 'Producto_ID_clust_ID']


model = xgb.XGBRegressor(n_estimators = 50, objective="reg:linear", learning_rate= 0.1, max_depth=10,
                         subsample=0.85,colsample_bytree=0.7)

tic()
dt, dv, dte = clusters_fit(model, train, val, test, predictors, target, IDcol)
report_submit(dt, dv, dte, 'alg6.csv')
tac()

In [136]:
dte.head()

Unnamed: 0,id,log_target
2,924190.0,1.479542
3,4521987.0,1.744488
4,6217476.0,1.315279
5,970784.0,1.522702
6,3448837.0,2.396521


In [None]:
plt.hist(val['log_target'], 100, alpha=0.5, label='target')
plt.hist(val['predictions'], 100, alpha=0.5, label='predictions') 
plt.legend(loc='upper right')
plt.show()

### Alg6.2 - XGB - Train only one batch

And now let's compare it with training the complete training set

In [None]:
predictors = ['Canal_ID', 'Log_Target_mean_lag1', 'Log_Target_mean_lag2', 'Log_Target_mean_lag3', 'Log_Target_mean_lag4', 
              'Agencia_ID','Ruta_SAK','Cliente_ID','Producto_ID',
              'Lags_sum', 'brand', 'prodtype_cluster', 'Qty_Ruta_SAK_Bin', 'ZipCode', 'Producto_ID_clust_ID']

model = xgb.XGBRegressor(n_estimators = 300, objective="reg:linear", learning_rate= 0.1, max_depth=10,
                         subsample=0.85,colsample_bytree=0.7)

tic()
dt, dv, dte = model_fit(model, train, val, test, predictors, target, IDcol)
report_submit(dt, dv, dte, 'alg6.2.csv')
tac()

feat_imp = pd.Series(model.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')

In [None]:
plt.plot(model.evals_result()['validation_0']['rmse'])