In [1]:
#import required libraries
import numpy as np
import pandas as pd

#Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#Import Warnings
import warnings 

#import statsmodel
import statsmodels.formula.api as smf

#Import rmse
from statsmodels.tools.eval_measures import rmse

#import linear regression from scikit-learn
from sklearn.linear_model import LinearRegression 

#import polynomial features
from sklearn.preprocessing import PolynomialFeatures

#import Metrics
from sklearn.metrics import mean_squared_error,r2_score

#configuration settings
%matplotlib inline
sns.set(color_codes=True)
warnings.filterwarnings('ignore') ##surpress warnings

## Load the data into a dataframe

In [2]:
#load the data into the dataframe as supermarket_till_transcations_df
supermarket_till_transcations_df=pd.read_csv('supermarket_till_transactions.csv')
supermarket_till_transcations_df.head(10)

Unnamed: 0,SHOP_WEEK,SHOP_DATE,SHOP_WEEKDAY,SHOP_HOUR,QUANTITY,SPEND,PROD_CODE,PROD_CODE_10,PROD_CODE_20,PROD_CODE_30,...,CUST_PRICE_SENSITIVITY,CUST_LIFESTAGE,BASKET_ID,BASKET_SIZE,BASKET_PRICE_SENSITIVITY,BASKET_TYPE,BASKET_DOMINANT_MISSION,STORE_CODE,STORE_FORMAT,STORE_REGION
0,200607,20060413,5,20,1,103,PRD0900097,CL00001,DEP00001,G00001,...,LA,YF,994100100532898,L,LA,Top Up,Fresh,STORE00001,LS,E02
1,200607,20060412,4,19,1,28,PRD0900353,CL00070,DEP00020,G00007,...,LA,YF,994100100532897,M,MM,Small Shop,Fresh,STORE00001,LS,E02
2,200607,20060413,5,20,3,84,PRD0900550,CL00167,DEP00055,G00016,...,LA,YF,994100100532898,L,LA,Top Up,Fresh,STORE00001,LS,E02
3,200607,20060412,4,19,1,221,PRD0901647,CL00010,DEP00003,G00002,...,LA,YF,994100100532897,M,MM,Small Shop,Fresh,STORE00001,LS,E02
4,200607,20060413,5,20,1,334,PRD0902064,CL00073,DEP00021,G00007,...,LA,YF,994100100532898,L,LA,Top Up,Fresh,STORE00001,LS,E02
5,200607,20060413,5,20,6,156,PRD0902293,CL00167,DEP00055,G00016,...,LA,YF,994100100532898,L,LA,Top Up,Fresh,STORE00001,LS,E02
6,200607,20060413,5,20,1,101,PRD0903074,CL00045,DEP00011,G00004,...,LA,YF,994100100532898,L,LA,Top Up,Fresh,STORE00001,LS,E02
7,200607,20060412,4,19,3,192,PRD0903409,CL00070,DEP00020,G00007,...,LA,YF,994100100532897,M,MM,Small Shop,Fresh,STORE00001,LS,E02
8,200607,20060413,5,20,1,27,PRD0903934,CL00137,DEP00048,G00013,...,LA,YF,994100100532898,L,LA,Top Up,Fresh,STORE00001,LS,E02
9,200607,20060412,4,19,3,465,PRD0904023,CL00076,DEP00022,G00007,...,LA,YF,994100100532897,M,MM,Small Shop,Fresh,STORE00001,LS,E02


In order to illustrate Multiple linear  Regression we just need two variables which are:
        1.SHOP_WEEKDAY
        2.SHOP_TOUR
        3.QUANTITY
        4.SPEND

In [3]:
supermarket_till_transcations_df=supermarket_till_transcations_df[['SHOP_WEEKDAY','SHOP_HOUR','QUANTITY','SPEND']]
supermarket_till_transcations_df.head(5)

Unnamed: 0,SHOP_WEEKDAY,SHOP_HOUR,QUANTITY,SPEND
0,5,20,1,103
1,4,19,1,28
2,5,20,3,84
3,4,19,1,221
4,5,20,1,334


## Using statsmodel

In [4]:
#Initialise and fit linear regression model using 'statsmodel'
stats_model=smf.ols('SPEND ~ SHOP_WEEKDAY + SHOP_HOUR + QUANTITY',data=supermarket_till_transcations_df)
stats_model=stats_model.fit()

We now estimate the parameters using model.params 

In [5]:
stats_model.params

Intercept       366.801757
SHOP_WEEKDAY    -18.779187
SHOP_HOUR       -12.306012
QUANTITY         53.434573
dtype: float64

In [6]:
#print a summary of the fiited model
stats_model.summary()

0,1,2,3
Dep. Variable:,SPEND,R-squared:,0.12
Model:,OLS,Adj. R-squared:,0.097
Method:,Least Squares,F-statistic:,5.24
Date:,"Fri, 07 Feb 2020",Prob (F-statistic):,0.002
Time:,19:50:34,Log-Likelihood:,-817.2
No. Observations:,119,AIC:,1642.0
Df Residuals:,115,BIC:,1654.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,366.8018,93.598,3.919,0.000,181.402,552.202
SHOP_WEEKDAY,-18.7792,18.221,-1.031,0.305,-54.872,17.313
SHOP_HOUR,-12.3060,5.712,-2.154,0.033,-23.621,-0.991
QUANTITY,53.4346,17.801,3.002,0.003,18.175,88.695

0,1,2,3
Omnibus:,119.23,Durbin-Watson:,2.114
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1503.081
Skew:,3.549,Prob(JB):,0.0
Kurtosis:,18.898,Cond. No.,64.1


In [7]:
#print the R-squared value of the model
stats_model.rsquared

0.12024717016691922

In [8]:
#to get the adj_rsquared value for the model
stats_model.rsquared_adj

0.09729709634518657

## Perform the prediction

In [9]:
new_shop_weekday=1
new_shop_hour= 18
new_quantity= 2
stats_model_ypred=stats_model.predict({"SHOP_WEEKDAY":new_shop_weekday,"SHOP_HOUR":new_shop_hour,"QUANTITY":new_quantity})
y = supermarket_till_transcations_df[['SPEND']]


## _RMSE_

The root-measure-squared-error is the frequently used measure of the differences between values(sample and population values) predicted by the model and the values actually observed.

In [10]:
#calc the rmse
stats_model_rmse=rmse(y,stats_model_ypred)
stats_model_rmse

array([249.57120334])

### Confidence Interval 

In [11]:
#print the confidence interval for the model ceeficients
stats_model.conf_int()

Unnamed: 0,0,1
Intercept,181.401596,552.201919
SHOP_WEEKDAY,-54.871733,17.31336
SHOP_HOUR,-23.621078,-0.990946
QUANTITY,18.174604,88.694541


### _Hypothesis testing and P-Values_

In [12]:
#print the p-values for the model coefficients
stats_model.pvalues

Intercept       0.000152
SHOP_WEEKDAY    0.304878
SHOP_HOUR       0.033304
QUANTITY        0.003293
dtype: float64

## _Using scikit-learn_ 

In [13]:
#Build linear regression model using SHOP_WEEKDAY,SHOP_HOUR,QUANTITY predictors 
#Split data into predictors X and output Y
predictors=['SHOP_WEEKDAY','SHOP_HOUR','QUANTITY']
x = supermarket_till_transcations_df[predictors]
y = supermarket_till_transcations_df['SPEND']


#Intialise and fit model

lm=LinearRegression()
scikit_model=lm.fit(x,y)



In [14]:
print(f'alpha = {scikit_model.intercept_}')
print(f'betas = {scikit_model.coef_}')

alpha = 366.8017572034543
betas = [-18.77918675 -12.30601205  53.43457262]


Therefore, our model can be written as:


SPEND=366.802 +(-SHOP_WEEKDAY) + (-SHOP_TOUR) + (QAUNTITY)
We can predict values by simply using.predict():

In [15]:
new_x = [[1,18,2]] #Sunday 6pm buying 2 items
scikit_learn_ypred = scikit_model.predict(new_x)

### Calculate the RMSE when scikit learn is used

In [16]:
#calc rmse 
scikit_learn_rmse = rmse(y,scikit_learn_ypred)
scikit_learn_rmse

249.5712033427086