### Modelling - second pass

In [1]:
import pandas as pd
import numpy as np

In [4]:
# load the cleaned activities/spend table from the project's clean_data folder
spend_csv_path = "clean_data/activities_spend.csv"
activities_spend = pd.read_csv(spend_csv_path)

In [5]:
# quick look at the first five rows to check the columns came through as expected
activities_spend.head(n=5)

Unnamed: 0,value,activity_type,spend_unit
0,1662,Shopping,million (£)
1,469,Live entertainment,million (£)
2,892,Evening out,million (£)
3,354,Attend personal event,million (£)
4,400,Live entertainment,million (£)


In [6]:
# creating dummy variables
#
# drop_first=True drops one level per categorical column so the dummies are not
# perfectly collinear with the intercept; the dropped level becomes the baseline.
# dtype=int keeps the indicators as 0/1 integers on every pandas version: from
# pandas 2.0 the default is bool, and a bool frame combined with the float
# constant added later for statsmodels is converted to object dtype, which
# sm.OLS rejects.
activities_spend = pd.get_dummies(activities_spend, drop_first=True, dtype=int)

activities_spend.head()

Unnamed: 0,value,activity_type_Day out,activity_type_Evening out,activity_type_Leisure activity,activity_type_Live entertainment,activity_type_Shopping,activity_type_Visit attraction,activity_type_Visit family/friends
0,1662,0,0,0,0,1,0,0
1,469,0,0,0,1,0,0,0
2,892,0,1,0,0,0,0,0
3,354,0,0,0,0,0,0,0
4,400,0,0,0,1,0,0,0


In [None]:
# this looks better - there are only 7 dummy variables, and therefore only 7 predictors now.
# Fewer predictors means less chance of overfitting.

In [None]:
# building the model

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
# split the frame into the predictor matrix (everything except "value") ...
predictors_vars = activities_spend.drop("value", axis=1)

# ... and the response series the model is trained to predict
response_var = activities_spend.loc[:, "value"]

In [9]:
# build the linear regression and fit it in one step (fit returns the estimator itself)
model = LinearRegression().fit(predictors_vars, response_var)

# show the fitted estimator as the cell output
model

LinearRegression()

In [10]:
# R-squared of the fitted model, scored on the same data it was fitted to (in-sample)
r_squared = model.score(predictors_vars, response_var)
r_squared

0.6909188806674171

In [None]:
# This is a lot better, and much closer to what I'd expect from a good model.
# The first one seemed massively overfitted, whereas this is more realistic. Hopefully I'll have good p-values here too.

In [11]:
# getting the coefficients
# The intercept is the fitted value when every activity_type dummy is 0, i.e. for the
# baseline category removed by drop_first=True (appears to be "Attend personal event",
# whose row is all zeros in the dummy table - confirm).

model.intercept_

229.71428571428595

In [12]:
model.coef_

# One coefficient per dummy column, in the column order of predictors_vars; each is the
# fitted difference in "value" between that activity type and the baseline category.
# These numbers don't seem as scattered as the first lot did - but maybe that's just because there are fewer of them?

array([ 210.71428571,  583.14285714,  271.95238095,  161.78571429,
       1230.57142857,  234.71428571, 1219.14285714])

In [13]:
# getting the p-values

In [14]:
import statsmodels.api as sm

  from pandas import Int64Index as NumericIndex


In [15]:
# statsmodels' OLS does not add an intercept by itself, so prepend a "const" column;
# has_constant="skip" leaves the frame untouched if a constant column is already there
predictors_vars = sm.add_constant(predictors_vars, prepend=True, has_constant="skip")

In [16]:
# specify the ordinary least squares model (response first, then predictors incl. constant)
ols_spec = sm.OLS(endog=response_var, exog=predictors_vars)

# fit it and print the full summary table, which includes the per-coefficient p-values
sm_model = ols_spec.fit()
print(sm_model.summary())

                            OLS Regression Results                            
Dep. Variable:                  value   R-squared:                       0.691
Model:                            OLS   Adj. R-squared:                  0.667
Method:                 Least Squares   F-statistic:                     28.74
Date:                Fri, 11 Feb 2022   Prob (F-statistic):           2.07e-20
Time:                        14:00:38   Log-Likelihood:                -678.36
No. Observations:                  98   AIC:                             1373.
Df Residuals:                      90   BIC:                             1393.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
cons