
# Homework 4 - Rossmann Store Sales

In [17]:
import pylab
import csv
import datetime
import math
import re
import time
import random
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from pandas.tseries.offsets import *
from operator import *

from sklearn.model_selection import train_test_split
from sklearn.cross_validation import cross_val_predict, cross_val_score
from sklearn import linear_model
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import ensemble
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import xgboost as xgb
from xgboost.sklearn import XGBRegressor


%matplotlib inline
start_time = time.time()

# Create Scoring and Accuracy metrics

In [4]:
def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat/y-1) ** 2))

#Fitting log and then converting back to exponential
def rmspe_xg(yhat, y):
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)

#Make rmse score function for Scikit
scoring_fnc = make_scorer(rmspe)

## Setting seed

In [5]:
seed = 123

## Reading sales and store data

In [6]:
#Load the training, test and store data using pandas

nrows = None
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}

train = pd.read_csv('train.csv', nrows=nrows,parse_dates=['Date'],date_parser=(lambda dt: pd.to_datetime(dt, format='%Y-%m-%d')), 
                   dtype = types)
nrows = nrows
test = pd.read_csv('test.csv', nrows=nrows, parse_dates=['Date'],date_parser=(lambda dt: pd.to_datetime(dt, format='%Y-%m-%d')),
                  dtype = types)
store = pd.read_csv("store.csv")

#Assume store open, if not provided
train.fillna(1, inplace=True)
test.fillna(1, inplace=True)

#Consider only open stores for training
train = train[train["Open"] != 0]

#Filter sales bigger then zero else rmspe infinite
train = train[train["Sales"] > 0]

#Join with store
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

In [8]:
def create_features(features, data):
    
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
   
    #Selecting variables specific to Store
    features.extend(['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
                     'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear'])

    #Add SchoolDay
    features.append('SchoolHoliday')
    data['SchoolHoliday'] = data['SchoolHoliday'].astype(float)
    
    # Label encode categorical variables
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    data.StoreType.replace(mappings, inplace=True)
    data.Assortment.replace(mappings, inplace=True)
    data.StateHoliday.replace(mappings, inplace=True)

    #Create features specific to type of day
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    data['Year'] = data.Date.dt.year
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek
    data['WeekOfYear'] = data.Date.dt.weekofyear
    
    # Promo open time in months
    features.append('PromoOpen')
    data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['PromoOpen'] = data.PromoOpen.apply(lambda x: x if x > 0 else 0)
    data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0
    
    # Indicate that sales on that day are in promo interval
    features.append('IsPromoMonth')
    str_month = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
             7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    data['monthStr'] = data.Month.map(str_month)
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    
    #Create whether Month is promo month
    data['IsPromoMonth'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1

# Create train and test features

In [9]:
features = []
create_features(features, train)
create_features([], test)

In [12]:
print(features)

['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'PromoOpen', 'IsPromoMonth']


In [10]:
X_train = train[features]
y_train = np.log1p(train.Sales)
X_test = test[features]
#X_train, X_train_test, y_train, y_train_test = train_test_split(X, targets, test

In [11]:
std_scale = StandardScaler().fit(X_train)
X_train_std = std_scale.transform(X_train)
X_test_std = std_scale.transform(X_test)

# Linear Lasso Regression

In [15]:
# Lasso regularization 
param = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False], 'alpha':[0.1,1.0,10.0]}
lasso_reg = linear_model.Lasso()
gs = GridSearchCV(lasso_reg, param_grid = param, scoring= scoring_fnc, cv=5, verbose=1)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  4.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'fit_intercept': [True, False], 'normalize': [True, False], 'copy_X': [True, False], 'alpha': [0.1, 1.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(rmspe), verbose=1)

In [16]:
lasso_regr = gs.best_estimator_
lasso_regr.fit(X_train, y_train)
scores = cross_val_score(lasso_regr,X_train,y_train,cv=5,scoring=scoring_fnc)
print(scores.mean())
print(scores.std())

0.049452882111151876
0.0020098931954629505


In [31]:
test_probs=lasso_regr.predict(X_test_std)
# Make Submission
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("lasso_submission.csv", index=False)

**Private Score - 0.46**  
**Public Score - 0.429**

# Linear Ridge Regression

In [17]:
# Ridge regularization 
param = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False], 'alpha':[0.1,1.0,10.0]}
ridge_reg = linear_model.Ridge()
gs = GridSearchCV(ridge_reg, param_grid = param, scoring= scoring_fnc, cv=5, verbose=1)
gs.fit(X_train, y_train)

ridge_regr = gs.best_estimator_
ridge_regr.fit(X_train, y_train)
scores = cross_val_score(ridge_regr,X_train,y_train,cv=5,scoring=scoring_fnc)
print(scores.mean())
print(scores.std())

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.7min finished


0.04836313542743356
0.0020732140565679147


In [45]:
test_probs=ridge_regr.predict(X_test_std)
# Make Submissionccv_s
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("ridge_submission.csv", index=False)

**Private Score - 0.45**  
**Public Score - 0.421**

# Decision Tree Regression

In [39]:
X_train = train[features]
y_train = np.log1p(train.Sales)
X_test = test[features]
#X_train, X_train_test, y_train, y_train_test = train_test_split(X, targets, test_size=0.20, random_state=15)


param = {'max_depth': np.arange(3, 10), 'min_samples_split' : range(10,100,20)}
dtc = DecisionTreeRegressor()

gs = GridSearchCV(dtc, param_grid=param, scoring=scoring_fnc, cv=5, verbose=1)
gs.fit(X_train, y_train)
dt_model = gs.best_estimator_
dt_model.fit(X_train, y_train)
scores = cross_val_score(dt_model, X_train,y_train,cv=5,scoring=scoring_fnc)
print(scores.mean())
print(scores.std())

Fitting 5 folds for each of 35 candidates, totalling 175 fits


[Parallel(n_jobs=1)]: Done 175 out of 175 | elapsed:  6.3min finished


0.04819799017873186
0.003061535478218888


In [40]:
test_probs=dt_model.predict(X_test)
# Make Submission
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("dt_submission.csv", index=False)

**Private Score - 0.334**  
**Public Score - 0.32**

# Random Forest Regression

In [19]:
X_train = train[features]
y_train = np.log1p(train.Sales)
X_test = test[features]
#X_train, X_train_test, y_train, y_train_test = train_test_split(X, targets, test_size=0.20, random_state=15)

In [None]:
rf = ensemble.RandomForestRegressor(bootstrap = True)
param = {'max_depth': [10,20],
         'min_samples_split': [10,20],
         'n_estimators': [150, 200]}

gs = RandomizedSearchCV(rf, param_distributions=param, n_iter=6, scoring=scoring_fnc, cv=5, verbose=1, n_jobs =-1)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [None]:
gs.best_params_

In [13]:
rf_regr = ensemble.RandomForestRegressor(n_estimators=150, max_depth=20, min_samples_split=10, bootstrap=True, n_jobs=-1, 
                                         random_state=123, verbose = 1)
rf_regr.fit(X_train, y_train)
scores = cross_val_score(rf_regr,X_train,y_train,cv=5,scoring=scoring_fnc)
print(scores.mean())
print(scores.std())

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   42.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.0min finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.2min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.3min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.4min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.5s
[Parallel(n_j

0.04958165218512673
0.0018085442263876495


[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    0.6s finished


In [36]:
test_probs=rf_regr.predict(X_test)
# Make Submission
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("rf_submission.csv", index=False)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    0.1s finished


**Private Score - 0.166**  
**Public Score - 0.151**

# Linear SVR

In [55]:
X_train = train[features]
y_train = np.log1p(train.Sales)
X_test = test[features]
#X_train, X_train_test, y_train, y_train_test = train_test_split(X, targets, test_size=0.20, random_state=15)

In [None]:
kernels = ['rbf','poly']
Cs = [0.1, 1, 10]
gammas = [0.1, 1]
param = {'C': Cs, 'gamma' : gammas, 'kernel' : kernels}

sup_vec = SVR()
gs = GridSearchCV(sup_vec, param_grid=param, scoring=scoring_fnc, cv=5, verbose=1, njobs = -1)
gs.fit(X_train, y_train)
svr_regr = gs.best_estimator_
svr_regr.fit(X_train, y_train)
scores = cross_val_score(svr_regr,X_train,y_train,cv=5,scoring=scoring_fnc)
print(scores.mean())
print(scores.std())


Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [None]:
test_probs=svr_regr.predict(X_test)
# Make Submission
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("rf_submission.csv", index=False)

# Xgboost - with Train, Val and Test - Early Stopping

In [47]:
X_train, X_valid = train_test_split(train, test_size=0.2, random_state=10)
y_train = np.log1p(X_train.Sales)
y_valid = np.log1p(X_valid.Sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

In [42]:
params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.2,
          "max_depth": 8,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 123
          }
num_boost_round = 500

In [26]:
def runXGBCV(train_X, train_y, test_X, test_y = None):
    model = GridSearchCV(
        XGBRegressor(eta=0.2,objective='reg:linear', booster='gbtree', subsample=0.9, silent=1, seed=123, n),
        {
            'max_depth':[4,8,12],
            'colsample_bytree':[0.4,0.8]
        },cv=5, scoring=scoring_fnc)
    model.fit(train_X, train_y)
    print(model.best_params_, best_score_)
    pred_test_y = model.best_estimator_.predict(test_X)
    return pred_test_y

In [27]:
preds=runXGBCV(X_train, y_train, X_test)

KeyboardInterrupt: 

In [43]:
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

[0]	train-rmse:6.61832	eval-rmse:6.6192	train-rmspe:0.998705	eval-rmspe:0.998707
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 100 rounds.
[1]	train-rmse:5.29883	eval-rmse:5.2997	train-rmspe:0.994716	eval-rmspe:0.994726
[2]	train-rmse:4.2444	eval-rmse:4.24537	train-rmspe:0.984559	eval-rmspe:0.984586
[3]	train-rmse:3.40154	eval-rmse:3.40247	train-rmspe:0.964041	eval-rmspe:0.964093
[4]	train-rmse:2.7281	eval-rmse:2.72898	train-rmspe:0.92963	eval-rmspe:0.929695
[5]	train-rmse:2.191	eval-rmse:2.19183	train-rmspe:0.87991	eval-rmspe:0.879919
[6]	train-rmse:1.76333	eval-rmse:1.76415	train-rmspe:0.816333	eval-rmspe:0.81612
[7]	train-rmse:1.42232	eval-rmse:1.42304	train-rmspe:0.743026	eval-rmspe:0.742424
[8]	train-rmse:1.15202	eval-rmse:1.15273	train-rmspe:0.665262	eval-rmspe:0.663852
[9]	train-rmse:0.938984	eval-rmse:0.93969	train-rmspe:0.588598	eval-rmspe:0.585829
[10]	train-rmse:0.771353	eval-rmse:0.77201

[96]	train-rmse:0.140708	eval-rmse:0.143192	train-rmspe:0.198507	eval-rmspe:0.158202
[97]	train-rmse:0.140282	eval-rmse:0.142826	train-rmspe:0.198091	eval-rmspe:0.157687
[98]	train-rmse:0.139913	eval-rmse:0.142471	train-rmspe:0.197802	eval-rmspe:0.157346
[99]	train-rmse:0.139166	eval-rmse:0.141721	train-rmspe:0.197156	eval-rmspe:0.156576
[100]	train-rmse:0.138388	eval-rmse:0.140965	train-rmspe:0.196143	eval-rmspe:0.155788
[101]	train-rmse:0.138142	eval-rmse:0.140724	train-rmspe:0.195873	eval-rmspe:0.155531
[102]	train-rmse:0.137093	eval-rmse:0.139713	train-rmspe:0.197605	eval-rmspe:0.154428
[103]	train-rmse:0.136637	eval-rmse:0.139274	train-rmspe:0.197176	eval-rmspe:0.153967
[104]	train-rmse:0.136284	eval-rmse:0.138935	train-rmspe:0.196856	eval-rmspe:0.153644
[105]	train-rmse:0.135941	eval-rmse:0.138623	train-rmspe:0.196436	eval-rmspe:0.1533
[106]	train-rmse:0.135247	eval-rmse:0.137943	train-rmspe:0.195925	eval-rmspe:0.152594
[107]	train-rmse:0.134986	eval-rmse:0.137713	train-rmspe:0.1

[192]	train-rmse:0.110113	eval-rmse:0.114061	train-rmspe:0.167672	eval-rmspe:0.126308
[193]	train-rmse:0.10995	eval-rmse:0.11392	train-rmspe:0.167658	eval-rmspe:0.126151
[194]	train-rmse:0.10989	eval-rmse:0.113869	train-rmspe:0.167562	eval-rmspe:0.126088
[195]	train-rmse:0.109842	eval-rmse:0.113833	train-rmspe:0.167524	eval-rmspe:0.126054
[196]	train-rmse:0.109654	eval-rmse:0.113662	train-rmspe:0.167156	eval-rmspe:0.125837
[197]	train-rmse:0.109493	eval-rmse:0.113525	train-rmspe:0.166877	eval-rmspe:0.125682
[198]	train-rmse:0.109336	eval-rmse:0.113377	train-rmspe:0.166748	eval-rmspe:0.125519
[199]	train-rmse:0.109243	eval-rmse:0.113301	train-rmspe:0.166639	eval-rmspe:0.125428
[200]	train-rmse:0.109092	eval-rmse:0.11317	train-rmspe:0.166333	eval-rmspe:0.125281
[201]	train-rmse:0.108878	eval-rmse:0.112979	train-rmspe:0.166578	eval-rmspe:0.125061
[202]	train-rmse:0.108756	eval-rmse:0.112887	train-rmspe:0.166483	eval-rmspe:0.124975
[203]	train-rmse:0.108601	eval-rmse:0.112772	train-rmspe:0

[288]	train-rmse:0.099161	eval-rmse:0.10478	train-rmspe:0.149419	eval-rmspe:0.116102
[289]	train-rmse:0.09899	eval-rmse:0.10462	train-rmspe:0.149224	eval-rmspe:0.115932
[290]	train-rmse:0.098932	eval-rmse:0.104578	train-rmspe:0.14916	eval-rmspe:0.115883
[291]	train-rmse:0.098833	eval-rmse:0.104499	train-rmspe:0.149078	eval-rmspe:0.1158
[292]	train-rmse:0.098792	eval-rmse:0.104468	train-rmspe:0.149058	eval-rmspe:0.115775
[293]	train-rmse:0.098771	eval-rmse:0.104451	train-rmspe:0.14904	eval-rmspe:0.115755
[294]	train-rmse:0.098662	eval-rmse:0.104364	train-rmspe:0.148959	eval-rmspe:0.115671
[295]	train-rmse:0.098572	eval-rmse:0.104302	train-rmspe:0.142074	eval-rmspe:0.115611
[296]	train-rmse:0.098521	eval-rmse:0.104275	train-rmspe:0.14204	eval-rmspe:0.115589
[297]	train-rmse:0.098372	eval-rmse:0.104133	train-rmspe:0.141974	eval-rmspe:0.115434
[298]	train-rmse:0.098289	eval-rmse:0.104075	train-rmspe:0.141902	eval-rmspe:0.115388
[299]	train-rmse:0.098162	eval-rmse:0.103973	train-rmspe:0.141

[384]	train-rmse:0.092536	eval-rmse:0.099697	train-rmspe:0.131311	eval-rmspe:0.111035
[385]	train-rmse:0.092485	eval-rmse:0.099673	train-rmspe:0.131248	eval-rmspe:0.111019
[386]	train-rmse:0.09242	eval-rmse:0.099627	train-rmspe:0.131181	eval-rmspe:0.110973
[387]	train-rmse:0.092384	eval-rmse:0.099602	train-rmspe:0.131153	eval-rmspe:0.110943
[388]	train-rmse:0.092323	eval-rmse:0.09956	train-rmspe:0.131097	eval-rmspe:0.110893
[389]	train-rmse:0.092244	eval-rmse:0.099495	train-rmspe:0.131011	eval-rmspe:0.110827
[390]	train-rmse:0.092206	eval-rmse:0.099468	train-rmspe:0.131032	eval-rmspe:0.110804
[391]	train-rmse:0.09215	eval-rmse:0.099431	train-rmspe:0.130957	eval-rmspe:0.110764
[392]	train-rmse:0.092122	eval-rmse:0.099415	train-rmspe:0.130932	eval-rmspe:0.11075
[393]	train-rmse:0.092084	eval-rmse:0.0994	train-rmspe:0.128877	eval-rmspe:0.110738
[394]	train-rmse:0.092033	eval-rmse:0.099365	train-rmspe:0.129093	eval-rmspe:0.110717
[395]	train-rmse:0.091968	eval-rmse:0.099305	train-rmspe:0.1

[480]	train-rmse:0.088222	eval-rmse:0.096855	train-rmspe:0.118602	eval-rmspe:0.108103
[481]	train-rmse:0.088202	eval-rmse:0.096845	train-rmspe:0.118588	eval-rmspe:0.108104
[482]	train-rmse:0.088165	eval-rmse:0.096832	train-rmspe:0.118511	eval-rmspe:0.108093
[483]	train-rmse:0.088117	eval-rmse:0.096801	train-rmspe:0.118492	eval-rmspe:0.108066
[484]	train-rmse:0.088085	eval-rmse:0.096786	train-rmspe:0.118458	eval-rmspe:0.108054
[485]	train-rmse:0.088038	eval-rmse:0.096755	train-rmspe:0.118417	eval-rmspe:0.108026
[486]	train-rmse:0.088	eval-rmse:0.096723	train-rmspe:0.118392	eval-rmspe:0.108001
[487]	train-rmse:0.087982	eval-rmse:0.096716	train-rmspe:0.118377	eval-rmspe:0.108001
[488]	train-rmse:0.087948	eval-rmse:0.096695	train-rmspe:0.118344	eval-rmspe:0.107983
[489]	train-rmse:0.087928	eval-rmse:0.096688	train-rmspe:0.11832	eval-rmspe:0.107977
[490]	train-rmse:0.087875	eval-rmse:0.096676	train-rmspe:0.114696	eval-rmspe:0.107962
[491]	train-rmse:0.087858	eval-rmse:0.09667	train-rmspe:0.

In [48]:
print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)
# Make Submission
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("xgboost_submission_final.csv", index=False)

Make predictions on the test set


In [19]:
month_num = {'Jan' : 1,'Feb' : 2,'Mar' : 3,'Apr' : 4,'May' : 5,'Jun' : 6,'Jul' : 7,'Aug' : 8,'Sept' : 9, 'Oct' : 10,
            'Nov' : 11,'Dec' : 12 }

def monthToNum(date):
    return{
            'Jan' : 1,
            'Feb' : 2,
            'Mar' : 3,
            'Apr' : 4,
            'May' : 5,
            'Jun' : 6,
            'Jul' : 7,
            'Aug' : 8,
            'Sept' : 9, 
            'Oct' : 10,
            'Nov' : 11,
            'Dec' : 12
    }[date]

df_store['PromoInterval0'] = df_store['PromoInterval0'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['PromoInterval1'] = df_store['PromoInterval1'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['PromoInterval2'] = df_store['PromoInterval2'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['PromoInterval3'] = df_store['PromoInterval3'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)