# Rossman Store ML - CS3244
### A0175196U, A0174733B

For more information on how we solved this problem, please refer to the README file. 

IMPORTANT : This code requires to import XGBOOST

In [2]:
import numpy as np
import pandas 
from math import sqrt
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.datasets import dump_svmlight_file
import xgboost as xgb

In [3]:
#read the files
store = pandas.read_csv('store.csv', sep = ',')
train_initial = pandas.read_csv('train_v2.csv',dtype={"StateHoliday": str})
test = pandas.read_csv('test_v2.csv', sep = ",") 

#take out the sales (we don't have this info in the test)
sales_train = train_initial['Sales']
train = train_initial.drop('Sales', axis=1)


#take out days were it's closed - When 0 customers will be identified in test data
# we will automatically put the test sales at 0.
sales_train = pandas.Series.to_frame(sales_train)
train = train[train.Customers != 0]
sales_train = sales_train[sales_train.Sales != 0]

In [4]:
# put the 2 types of data together
df = train.join(store.set_index('Store'), on='Store')
test = test.join(store.set_index('Store'), on='Store')

# extract the date as day/month/year/weak of the year
df['Date']  = pandas.to_datetime(df['Date'], errors='coerce')
df['Year']  = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['WeekOfYear'] = df['Date'].dt.weekofyear

test['Date']  = pandas.to_datetime(test['Date'], errors='coerce')
test['Year']  = test['Date'].dt.year
test['Month'] = test['Date'].dt.month
test['Day'] = test['Date'].dt.day
test['WeekOfYear'] = test['Date'].dt.weekofyear

In [5]:
# combine train and test (in that oder) to make sure all tranformation are the same for 
# test and train (same dummies etc)
number_train = df.shape[0]
number_test = test.shape[0]
frames = [df,test]
result = pandas.concat(frames)

In [6]:
#fill the missing values with -1
result = result.fillna(-1)

In [7]:
result.head()

Unnamed: 0,Store,DayOfWeek,Date,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,...,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear
0,1,5,2015-01-30,616,1,1,0,0,c,a,...,9.0,2008.0,0,-1.0,-1.0,-1,2015,1,30,5
1,2,5,2015-01-30,624,1,1,0,0,a,a,...,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,1,30,5
2,3,5,2015-01-30,678,1,1,0,0,a,a,...,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2015,1,30,5
3,4,5,2015-01-30,1632,1,1,0,0,c,c,...,9.0,2009.0,0,-1.0,-1.0,-1,2015,1,30,5
4,5,5,2015-01-30,617,1,1,0,0,a,a,...,4.0,2015.0,0,-1.0,-1.0,-1,2015,1,30,5


In [8]:
# detect if there is a promo2 on the day of the sales             
month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
         7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
result['monthStr'] = result.Month.map(month2str)
result.loc[result.PromoInterval == -1, 'PromoInterval'] = ''
result['IsPromoMonth'] = 0
for interval in result.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                result.loc[(result.monthStr == month) & (result.PromoInterval == interval), 'IsPromoMonth'] = 1
result.drop(['monthStr'], axis=1, inplace=True)



In [9]:
# detect if there is competition open at the time of the evaluation
result['Competition'] = 0
year_sale = result.Year
year_comp = result.CompetitionOpenSinceYear
month_sale = result.Month
month_comp = result.CompetitionOpenSinceMonth

competition_exist = (year_sale > year_comp) | ((year_sale > year_comp)&(month_sale>=month_comp))
result.loc[competition_exist, 'Competition'] = 1

In [10]:
#drop data that is not important anymore
result.drop(['PromoInterval','Date'], axis=1, inplace=True)

In [11]:
# It seemed a good idea to compare the time since the competition open
# and the time since the promo2 had stated. 
# --> However the model performed better without this extra information !

#-------------- Code below abandonned ------------------------------------

# def month_difference (comp_year, our_year, comp_month, our_month):
#     difference = our_month - comp_month + 12*(our_year-comp_year)
#     return difference
# def week_difference (comp_year, our_year, comp_month, our_month):
#     difference = our_week - comp_week + 52*(our_year-comp_year)
# 
# result['CompetitionOpenSinceHowLong'] = month_difference (result['CompetitionOpenSinceYear'],result['Year'],
#                                                   result['CompetitionOpenSinceMonth'],result['Month'])
# result['Promo2HowLong'] = month_difference(result['Promo2SinceYear'],result['Year'],
#                                                    result['Promo2SinceWeek'],result['Month'])

# Unfortunately these guys seem to get more error

In [12]:
# replace the caracters by numbers, to be used in decision trees later
mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
result.StateHoliday.replace(mappings, inplace=True)
result.StoreType.replace(mappings, inplace=True)
result.Assortment.replace(mappings, inplace=True)

In [14]:
# funny behaviour can happen in the beginning or the end of every month, as well as fridays
# in general the month of december behaves very differently than the others because of christmas.
#we add parameters to adress these issues

result['Friday'] = 0
result.loc[result.DayOfWeek == 5,'Friday'] = 1
result['December'] = 0
result.loc[result.Month == 12,'December'] = 1
result['EarlyInMonth'] = 0
result.loc[result.Day < 5, 'EarlyInMonth'] = 1
result['LateInMonth'] = 0
result.loc[result.Day > 25, 'LateInMonth'] = 1


In [15]:
result.drop(['Month'], axis=1, inplace=True)

In [16]:
# redivide the result dataframe in the testing and the training sets
df = result.iloc[:number_train,:]
test = result.iloc[number_train:,:]

In [17]:
# conversion everything to float32 to have uniform format.
df = df.astype(np.float32)
test = test.astype(np.float32)

In [18]:
#convert a numpy array
data_train = df.values
sales_train = pandas.DataFrame.as_matrix(sales_train)
sales_train = sales_train.reshape(len(sales_train),)
#make a copy of the test data in a numoy array format
testing = test.values

In [19]:
X = data_train
y = sales_train

In [20]:
# divide the train set into a training and a validation set
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.012)

In [21]:
# change to format supported by xgboost
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid,y_valid)
dtest = xgb.DMatrix(testing)

The best resut^ls so far are achieved by :

'max_depth': 6, 

'eta': 0.02, 

'silent': 1, 

'objective': 'reg:linear',  

'booster': 'gbtree',

'colsample_bytree': 0.9

"subsample": 0.9 

In [22]:
#set the parameters necessary to optimize gradient boosting

param = {
    'max_depth': 6,  # depth of tree
    'eta': 0.02,  # the training step 
    'silent': 1,  # logging mode - quiet
    'objective': 'reg:linear',  # error evaluation for multiclass training
    'booster': 'gbtree',
    'colsample_bytree': 0.9, #subsample ratio of columns when constructing each tree
    "subsample": 0.9 # subsample ratio of the training instance: prevent overfitting
        }
num_round = 10000  # the number of training iterations 
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

In [23]:
def rmspe(y, yhat):
    return np.sqrt(np.mean(((yhat - y)/y) ** 2))

def rmspe_xg(yhat, y):
    y = np.array(y.get_label())
    yhat = np.array(yhat)
    return "rmspe", rmspe(y,yhat)

In [24]:
# train the gradient boosting model
gbm = xgb.train(param, dtrain, num_round, evals=watchlist, maximize = False,
                early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=False)

In [25]:
#evaluate the model
prediction_of_sales = gbm.predict(dtest)
prediction_in_sample = gbm.predict(dtrain)
prediction_validation = gbm.predict(dvalid) 


In [26]:
# set the sales at 0 if there was no customers --> store closed
for i in range (number_test) :
    if test['Customers'].iloc[i] == 0 :
        prediction_of_sales[i] = 0

In [27]:
print ('error in samplle : %f', rmspe (y_train, prediction_in_sample))
print ('validation error : %f', rmspe (y_valid, prediction_validation))

error in samplle : %f 0.0380674031004
validation error : %f 0.056008644787


In [28]:
# write the prediction in a csv file
submission = pandas.DataFrame()
submission['Sales'] = prediction_of_sales
cols = ['Id','Sales']
submission['Id'] = submission.index + 1
submission = submission[cols]
submission.to_csv('submission.csv', index=False)

## results
We could get in sample errors of 0.01 with xgboost, however it did not generalize to the test dataset. 
After optimizing the parameters and assuring good generalization:

train data error : 0.036

validation_error : 0.054

test_error : 0.355

# ressources

1) https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html

2) https://docs.python.org/2/library/csv.html

3) http://xgboost.readthedocs.io/en/latest/parameter.html

4) http://www.akbarian.org/notes/sale-forecasting-time-series-analysis/

# Statement of Individual Work

We, A0175196U, A0174733B, certify that we have followed the CS 3244 Machine Learning class guidelines for homework assignments.  In particular, we expressly vow that we have followed the Facebook rule in discussing with others in doing the assignment and did not take notes (digital or printed) from the discussions. We’ve also followed Kaggle competition rules.
