In [29]:
#import libraries

import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from lightgbm import LGBMRegressor     
import category_encoders as ce

import datetime


from functions import metric   #metric from functions

In [30]:
df = pd.read_csv('./data/cleaned_data.csv',index_col=0)

df.head()

Unnamed: 0,Date,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
3,2013-01-01,377.0,2.0,0.0,0.0,0.0,0.0,a,1.0,a,c,100.0,6.0,2010.0,1.0,18.0,2010.0,"Feb,May,Aug,Nov"
5,2013-01-01,375.0,2.0,0.0,0.0,0.0,0.0,a,1.0,a,c,15710.0,2.0,2013.0,1.0,37.0,2009.0,"Jan,Apr,Jul,Oct"
9,2013-01-01,371.0,2.0,0.0,0.0,0.0,0.0,a,1.0,d,c,1970.0,7.0,2009.0,1.0,45.0,2014.0,"Feb,May,Aug,Nov"
10,2013-01-01,370.0,2.0,0.0,0.0,0.0,0.0,a,1.0,d,a,8250.0,10.0,2000.0,1.0,31.0,2009.0,"Jan,Apr,Jul,Oct"
12,2013-01-01,368.0,2.0,0.0,0.0,0.0,0.0,a,1.0,d,c,1450.0,4.0,2005.0,1.0,45.0,2009.0,"Feb,May,Aug,Nov"


In [31]:
#reset df index
df = df.reset_index(drop=True)

In [32]:
#get present year 
present_year = datetime.datetime.now().year

#change the CompetitionOpenSinceYear from year to age count

df['CompetitionOpenSinceAge'] = present_year - df['CompetitionOpenSinceYear']
df['Promo2SinceAge'] = present_year - df['Promo2SinceYear']

In [57]:
#replace Store with the mean sales

mean_encode = df.groupby('Store')['Sales'].mean().to_frame(name='Mean_Sales').reset_index()


#save mean_encode to disk for future data
mean_encode.to_csv('./data/mean_encode.csv')

In [34]:
#drop all rows where the Sales is zero

df = df[df['Sales'] != 0]

#drop all unused and unneccesary columns

df = df.drop(['Date','PromoInterval','CompetitionOpenSinceYear','Promo2SinceYear','Customers'],axis=1)

In [35]:
#split into train,test, split

y = df['Sales']
X = df.drop(['Sales'],axis=1)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,shuffle=False)

In [36]:
#merge mean_encode with X_train and drop Store

X_train = pd.merge(X_train,mean_encode, left_on="Store",right_on="Store",how='left',suffixes=('', ''))

In [37]:
#encodings
#ce_target = ce.TargetEncoder(cols = ['Store'])

ce_base = ce.BaseNEncoder(cols = ['StoreType','Assortment','StateHoliday'],base=2)

In [38]:
#drop the Sales
X_train = X_train.drop('Store',axis=1)

In [39]:
X_train.dtypes

DayOfWeek                    float64
Open                         float64
Promo                        float64
StateHoliday                  object
SchoolHoliday                float64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
Promo2                       float64
Promo2SinceWeek              float64
CompetitionOpenSinceAge      float64
Promo2SinceAge               float64
Mean_Sales                   float64
dtype: object

#### Baseline Model

In [40]:
#build baseline model
lr = LinearRegression()


lr_pipeline = Pipeline([
                        ('base_encode', ce_base),
                       ('linear_regression',lr)
                       ])
lr_pipeline.fit(X_train,y_train)

preds_lr = lr_pipeline.predict(X_test)

In [52]:
#error metric

def metric(preds, actuals):
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [53]:
#test lr_pipeline accuracy

metric(preds_lr,y_test.to_numpy())

1492.6059738462704

### Gradient Boosted trees

In [15]:
#Build a Baseline model - linear regression
gb_trees = LGBMRegressor(learning_rate=0.05,max_depth=5,n_estimators=500,n_jobs=-1, num_leaves=35,subsample=0.8)

gb_pipeline =  Pipeline([
                        ('base_encode', ce_base),
                       ('gb_trees',gb_trees)
                       ])
gb_pipeline.fit(X_train,y_train)

preds_tree = gb_pipeline.predict(X_test)  #predict on X_test

In [54]:
#test gb_pipeline

metric(preds_tree,y_test.to_numpy())

54.66364870413005

In [56]:
#save pipeline to disk
import pickle

filename = './pipeline/gb_pipeline.pkl'
pickle.dump(gb_pipeline, open(filename, 'wb'))