In [2]:
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

from math import sqrt

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# special matplotlib argument for improved plots
from matplotlib import rcParams




In [179]:
%run scripts/rossman.py
%run scripts/helper.py

In [180]:
rossman = Rossman('./data/train.csv', './data/test.csv', './data/store.csv')

In [181]:
# merge with stores data
train_df_merged = rossman.merge_stores_data()

In [182]:
# consider only those entries with non-zero sales value
train_df_with_non_zero_sales = rossman.non_zero_sales_data()

In [183]:
# test dataset
test_df = rossman.test_df.copy()

In [184]:
# preprocessing - converting all categorical variables into numerical values
train_df_processed, test_df_processed = preprocessing(train_df_with_non_zero_sales, test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [185]:
# create three separate training examples for three years

# train_df_2013 = get_data(train_df, '2013-01-01', '2013-12-31')
train_df_2014_2015 = get_data(train_df, '2014-01-01', '2015-12-31')
# train_df_2015 = get_data(train_df, '2015-01-01', '2015-12-31')

In [186]:
features = train_df_2013.columns.drop(['Date', 'Sales'])

In [187]:
# X_train_2013 = train_df_2013[features]
X_train_2014_2015 = train_df_2014[features]
# X_train_2015 = train_df_2015[features]

In [123]:
# y_train_2013 = np.log1p(train_df_2013.Sales)
y_train_2014_2015 = np.log1p(train_df_2014.Sales)
# y_train_2015 = np.log1p(train_df_2015.Sales)

In [61]:
# Extreme Gradient Boosting
## Creating models on dataset from three different years
## and testing it out on the final 6 weeks of year 2015

import xgboost as xgb

In [124]:
# training a model on data from year 2013
# dtrain_2013 = xgb.DMatrix(X_train_2013, y_train_2013, missing=-999.0)
dtrain_2014 = xgb.DMatrix(X_train_2014, y_train_2014, missing=-999.0)
# dtrain_2015 = xgb.DMatrix(X_train_2015, y_train_2015, missing=-999.0)

In [128]:
Xtest = test_df_processed[features]

Xtest.Open.fillna(1, inplace=True)

Xtest_open_stores = Xtest[Xtest.Open == 1]
Xtest_closed_stores = Xtest[Xtest.Open == 0]

dtest = xgb.DMatrix(Xtest_open_stores, missing=-999.0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [129]:
params_2014 = dict((('silent', 1), ('nthread', 8), ('objective', 'reg:linear'),('eta', 0.05), 
                    ('subsample', 0.8), ('colsample_bytree', 0.7), ('min_child_weight', 5), ('max_depth', 8)))
num_round = 1500

model_2014 = xgb.train(params_2014, dtrain_2014, num_round, feval=rmspe_xg)

In [130]:
# predictions for the year 2014

predictions_test = np.expm1(model_2014.predict(dtest))

In [175]:
# predictions for open and closed stores and then stack them together
open_stores_test_ids = Xtest_open_stores.index.values + 1
closed_stores_test_ids = Xtest_closed_stores.index.values + 1

open_stores_preds = predictions_test
closed_stores_preds = [0.] * len(closed_stores_test_ids)

final_ids = np.hstack([open_stores_test_ids, closed_stores_test_ids])
final_preds = np.hstack([open_stores_preds, closed_stores_preds])

In [178]:
create_submission(final_ids, final_preds, 'xgb_only_2014.csv')

In [97]:
params_2015 = dict((('silent', 1), ('nthread', 8), ('objective', 'reg:linear'),('eta', 0.05), 
                    ('subsample', 0.8), ('colsample_bytree', 0.7), ('min_child_weight', 5), ('max_depth', 8)))
num_round = 1000

model_2015 = xgb.train(params_2015, dtrain_2015, num_round, feval=rmspe_xg)

In [98]:
# predictions for the year 2014
predictions_2015 = np.expm1(model_2015.predict(dtest))

print 'RMSPE error for model based on examples from the year 2015 ', rmspe(ytest, predictions_2015)

RMSPE error for model based on examples from the year 2015  0.224573258686


In [99]:
# find the correlations between three predictions
prediction_df = pd.DataFrame({'2013': predictions_2013, '2014': predictions_2014, '2015': predictions_2015})
prediction_df.corr()

Unnamed: 0,2013,2014,2015
2013,1.0,0.940708,0.918543
2014,0.940708,1.0,0.93355
2015,0.918543,0.93355,1.0


In [108]:
prediction_avg = .1 * predictions_2013 + 0.8 * predictions_2014 + .1 * predictions_2015

In [109]:
print 'RMSPE error for average of the predictions of three models ', rmspe(ytest, prediction_avg)

RMSPE error for average of the predictions of three models  0.171631453195


In [111]:
rossman.test_df.Open.value_counts()

1    35093
0     5984
Name: Open, dtype: int64