# Question 8: Is there any way to predict spending and visits for Visit Scotland?

## Load Cleaned Data & Libraries

In [1]:
# Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

# Load the cleaned international survey data from the project's clean_data folder
international_survey = pd.read_csv(
    '../data/clean_data/international_survey.csv'
)

### Predicting expenditure

In [2]:
# Inspect column names, dtypes and non-null counts before modelling
international_survey.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32049 entries, 0 to 32048
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             32049 non-null  int64  
 1   country          32049 non-null  object 
 2   purpose          32049 non-null  object 
 3   mode             32049 non-null  object 
 4   duration         32049 non-null  object 
 5   age              32049 non-null  object 
 6   visits_thousand  32049 non-null  float64
 7   nights_thousand  32049 non-null  float64
 8   millions_spent   32049 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 2.2+ MB


In [3]:
# Optional (disabled): convert `year` to a categorical dtype so it would be
# dummy-encoded per year. While this stays commented out, `year` remains int64
# and enters the regression as a single numeric predictor.
#international_survey['year'] = international_survey['year'].astype('category')

In [4]:
# Re-check the dtypes; with the `year` conversion disabled, nothing has changed
# since the previous .info() call.
international_survey.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32049 entries, 0 to 32048
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             32049 non-null  int64  
 1   country          32049 non-null  object 
 2   purpose          32049 non-null  object 
 3   mode             32049 non-null  object 
 4   duration         32049 non-null  object 
 5   age              32049 non-null  object 
 6   visits_thousand  32049 non-null  float64
 7   nights_thousand  32049 non-null  float64
 8   millions_spent   32049 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 2.2+ MB


#### Create dummy variables for categorical data

In [5]:
# One-hot encode every object column (country, purpose, mode, duration, age).
# get_dummies returns a new frame, so the source survey is left untouched.
#   * drop_first=True drops one level per variable, which then acts as the
#     baseline category for that variable.
#   * dtype='uint8' pins the dummies to 0/1 integers. Newer pandas (>= 2.0)
#     defaults to bool columns, and statsmodels' OLS refuses a frame that mixes
#     bool with float columns (it is cast to an object array).
international_dummied = pd.get_dummies(
    international_survey, drop_first=True, dtype='uint8'
)
international_dummied

Unnamed: 0,year,visits_thousand,nights_thousand,millions_spent,country_Australia,country_Austria,country_Belgium,country_Brazil,country_Bulgaria,country_C'wealth Caribbean,...,mode_Tunnel,duration_15+ nights,duration_4-7 nights,duration_8-14 nights,age_16-24,age_25-34,age_35-44,age_45-54,age_55-64,age_65+
0,2002,0.270000,1.600000,0.160000,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,2002,1.080000,4.620000,0.250000,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,2002,0.990000,5.030000,0.810000,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,2002,0.350000,1.800000,0.080000,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,2002,1.660000,10.090000,0.620000,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32044,2019,1.575168,14.176511,0.897375,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
32045,2019,0.631421,3.157103,0.228890,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
32046,2019,2.243179,11.633949,0.877631,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
32047,2019,0.550858,1.714171,0.135526,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


#### pick appropriate predictor and response variable

In [6]:
# Response is the `millions_spent` column; every remaining column is a predictor.
international_response = international_dummied['millions_spent']
international_predictor = international_dummied.drop(columns=['millions_spent'])

In [7]:
# Display the predictor matrix (numeric columns plus the 0/1 dummy columns)
international_predictor

Unnamed: 0,year,visits_thousand,nights_thousand,country_Australia,country_Austria,country_Belgium,country_Brazil,country_Bulgaria,country_C'wealth Caribbean,country_Canada,...,mode_Tunnel,duration_15+ nights,duration_4-7 nights,duration_8-14 nights,age_16-24,age_25-34,age_35-44,age_45-54,age_55-64,age_65+
0,2002,0.270000,1.600000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,2002,1.080000,4.620000,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,2002,0.990000,5.030000,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,2002,0.350000,1.800000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,2002,1.660000,10.090000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32044,2019,1.575168,14.176511,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
32045,2019,0.631421,3.157103,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
32046,2019,2.243179,11.633949,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
32047,2019,0.550858,1.714171,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [8]:
# Display the response series (`millions_spent`)
international_response

0        0.160000
1        0.250000
2        0.810000
3        0.080000
4        0.620000
           ...   
32044    0.897375
32045    0.228890
32046    0.877631
32047    0.135526
32048    0.103797
Name: millions_spent, Length: 32049, dtype: float64

#### create model & check values / coef ...

In [9]:
# Ordinary least-squares fit of spend on all predictors (scikit-learn).
# The fit call is left as the last expression so the cell shows the estimator.
model = LinearRegression()
model.fit(X=international_predictor, y=international_response)

LinearRegression()

In [10]:
# R-squared computed on the same rows the model was fitted to (in-sample)
model.score(international_predictor, international_response)

0.5147092983880975

In [11]:
# Fitted intercept of the scikit-learn model
model.intercept_

-46.48176447325952

In [12]:
# Pair each predictor column name with its fitted coefficient for inspection
coefficient_columns = {
    "variable": international_predictor.columns.values,
    "coefficient": model.coef_,
}
pd.DataFrame(coefficient_columns)

Unnamed: 0,variable,coefficient
0,year,0.022932
1,visits_thousand,0.535163
2,nights_thousand,0.027641
3,country_Australia,0.132380
4,country_Austria,0.124127
...,...,...
78,age_25-34,0.050338
79,age_35-44,0.179584
80,age_45-54,0.293061
81,age_55-64,0.304350


In [13]:
# statsmodels' OLS does not add an intercept on its own, so prepend a `const`
# column of ones to the predictor matrix before fitting.
international_predictor = sm.add_constant(data=international_predictor)

international_predictor

Unnamed: 0,const,year,visits_thousand,nights_thousand,country_Australia,country_Austria,country_Belgium,country_Brazil,country_Bulgaria,country_C'wealth Caribbean,...,mode_Tunnel,duration_15+ nights,duration_4-7 nights,duration_8-14 nights,age_16-24,age_25-34,age_35-44,age_45-54,age_55-64,age_65+
0,1.0,2002,0.270000,1.600000,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1.0,2002,1.080000,4.620000,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1.0,2002,0.990000,5.030000,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,1.0,2002,0.350000,1.800000,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,1.0,2002,1.660000,10.090000,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32044,1.0,2019,1.575168,14.176511,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
32045,1.0,2019,0.631421,3.157103,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
32046,1.0,2019,2.243179,11.633949,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
32047,1.0,2019,0.550858,1.714171,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [14]:
# Fit the same regression with statsmodels to get the full summary table
# (standard errors, t-statistics, p-values, confidence intervals).
ols_spec = sm.OLS(endog=international_response, exog=international_predictor)
sm_model = ols_spec.fit()

print(sm_model.summary())

                            OLS Regression Results                            
Dep. Variable:         millions_spent   R-squared:                       0.515
Model:                            OLS   Adj. R-squared:                  0.513
Method:                 Least Squares   F-statistic:                     408.5
Date:                Fri, 17 Feb 2023   Prob (F-statistic):               0.00
Time:                        12:31:47   Log-Likelihood:                -59490.
No. Observations:               32049   AIC:                         1.191e+05
Df Residuals:                   31965   BIC:                         1.199e+05
Df Model:                          83                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

### Predicting expenditure on clustered data

To see how the data has been clustered, see [here...](q8_clustering.ipynb)

#### Load clustered dataset

In [15]:
# Load the survey data with the cluster labels attached
clustered_survey = pd.read_csv(
    '../data/clean_data/int_survey_clustered.csv'
)

In [16]:
# Inspect columns and dtypes of the clustered dataset
clustered_survey.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32049 entries, 0 to 32048
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             32049 non-null  int64  
 1   country          32049 non-null  object 
 2   purpose          32049 non-null  object 
 3   mode             32049 non-null  object 
 4   duration         32049 non-null  object 
 5   age              32049 non-null  object 
 6   visits_thousand  32049 non-null  float64
 7   nights_thousand  32049 non-null  float64
 8   millions_spent   32049 non-null  float64
 9   clusters         32049 non-null  int64  
 10  cluster_tag      32049 non-null  object 
dtypes: float64(3), int64(2), object(6)
memory usage: 2.7+ MB


In [17]:
# One-hot encode the object columns of the clustered survey (including
# `cluster_tag`). `year` and the integer `clusters` column are numeric, so
# get_dummies passes them through unchanged.
#   * drop_first=True drops one level per variable as the baseline category.
#   * dtype='uint8' pins the dummies to 0/1 integers. Newer pandas (>= 2.0)
#     defaults to bool columns, and statsmodels' OLS refuses a frame that mixes
#     bool with float columns.
cluster_dum = pd.get_dummies(clustered_survey, drop_first=True, dtype='uint8')
cluster_dum

Unnamed: 0,year,visits_thousand,nights_thousand,millions_spent,clusters,country_Australia,country_Austria,country_Belgium,country_Brazil,country_Bulgaria,...,duration_4-7 nights,duration_8-14 nights,age_16-24,age_25-34,age_35-44,age_45-54,age_55-64,age_65+,cluster_tag_medium_spenders,cluster_tag_small_spenders
0,2002,0.270000,1.600000,0.160000,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,2002,1.080000,4.620000,0.250000,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
2,2002,0.990000,5.030000,0.810000,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
3,2002,0.350000,1.800000,0.080000,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
4,2002,1.660000,10.090000,0.620000,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32044,2019,1.575168,14.176511,0.897375,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
32045,2019,0.631421,3.157103,0.228890,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
32046,2019,2.243179,11.633949,0.877631,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
32047,2019,0.550858,1.714171,0.135526,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1


In [18]:
# Response is `millions_spent`; predictors are the remaining columns except
# the integer `clusters` label. That label appears to encode the same grouping
# as the `cluster_tag` dummies, so keeping both makes the design matrix
# rank-deficient and the individual cluster coefficients unidentifiable.
# Dropping it leaves the fitted values and R-squared unchanged, and matches the
# USA model further down, which also excludes `clusters`.
clustered_predictor = cluster_dum.drop(columns=['millions_spent', 'clusters'])
clustered_response = cluster_dum.millions_spent

#### create model

In [19]:
# Refit the scikit-learn regression, this time on the clustered predictors.
# Note that this rebinds `model`, replacing the earlier non-clustered fit.
model = LinearRegression()
model.fit(X=clustered_predictor, y=clustered_response)

LinearRegression()

In [20]:
# R-squared computed on the same rows the model was fitted to (in-sample)
model.score(clustered_predictor, clustered_response)

0.6666036883887174

In [21]:
# Prepend the intercept column, then fit and summarise the statsmodels OLS.
# Note that this rebinds `sm_model`, replacing the earlier non-clustered fit.
clustered_predictor = sm.add_constant(data=clustered_predictor)

clustered_ols = sm.OLS(endog=clustered_response, exog=clustered_predictor)
sm_model = clustered_ols.fit()

print(sm_model.summary())

                            OLS Regression Results                            
Dep. Variable:         millions_spent   R-squared:                       0.667
Model:                            OLS   Adj. R-squared:                  0.666
Method:                 Least Squares   F-statistic:                     751.9
Date:                Fri, 17 Feb 2023   Prob (F-statistic):               0.00
Time:                        12:31:48   Log-Likelihood:                -53474.
No. Observations:               32049   AIC:                         1.071e+05
Df Residuals:                   31963   BIC:                         1.078e+05
Df Model:                          85                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

Clustering the data has made a definite difference, raising the R-squared value from 0.51 to 0.67 (both measured on the same data the models were fitted to). Let's try predicting for one specific country, with and without the cluster labels, to see whether the models perform better on a single country than on the entire dataset.

### Predicting USA expenditure clustered and non clustered

The USA was selected because it accounts for the highest expenditure, visits and nights spent in Scotland.

#### Prepare data to be used

In [22]:
# USA rows without cluster labels come from their own pre-filtered CSV
usa_non_cluster = pd.read_csv('../data/clean_data/countries/usa.csv')

# USA rows with cluster labels: filter the clustered survey and discard the
# integer `clusters` column (the `cluster_tag` column is kept).
usa_clustered = (
    clustered_survey
    .query("country == 'USA'")
    .drop(columns='clusters')
)

#### Check data

In [23]:
# Sanity check: the non-clustered frame should contain only USA rows
usa_non_cluster.country.unique()

array(['USA'], dtype=object)

In [24]:
# Sanity check: the clustered frame should contain only USA rows
usa_clustered.country.unique()

array(['USA'], dtype=object)

#### Create dummies

In [25]:
# One-hot encode both USA frames. get_dummies returns new frames, so the
# originals are left untouched.
#   * drop_first=True drops one level per variable as the baseline category;
#     `country` has a single value here, so it produces no dummy columns.
#   * dtype='uint8' pins the dummies to 0/1 integers. Newer pandas (>= 2.0)
#     defaults to bool columns, and statsmodels' OLS refuses a frame that mixes
#     bool with float columns.
noclus_dummied = pd.get_dummies(usa_non_cluster, drop_first=True, dtype='uint8')

dummied_clus = pd.get_dummies(usa_clustered, drop_first=True, dtype='uint8')

#### Set predictor/response

In [26]:
# Non-clustered USA data: response is `millions_spent`, predictors are the
# remaining columns with an intercept column prepended.
noclus_response = noclus_dummied['millions_spent']
noclus_predictor = sm.add_constant(
    noclus_dummied.drop(columns=['millions_spent'])
)

# Clustered USA data, split the same way. Note that this rebinds the
# `clustered_*` names used for the full-dataset model above.
clustered_response = dummied_clus['millions_spent']
clustered_predictor = sm.add_constant(
    dummied_clus.drop(columns=['millions_spent'])
)

#### Create models

In [27]:
# Fit one OLS model per USA dataset so their summaries can be compared
clustered_model = sm.OLS(endog=clustered_response, exog=clustered_predictor).fit()
model_noncluster = sm.OLS(endog=noclus_response, exog=noclus_predictor).fit()

#### Non Clustered model results

In [28]:
# Full OLS summary for the USA model fitted without cluster labels
print(model_noncluster.summary())

                            OLS Regression Results                            
Dep. Variable:         millions_spent   R-squared:                       0.745
Model:                            OLS   Adj. R-squared:                  0.743
Method:                 Least Squares   F-statistic:                     581.3
Date:                Fri, 17 Feb 2023   Prob (F-statistic):               0.00
Time:                        12:31:49   Log-Likelihood:                -7673.3
No. Observations:                3609   AIC:                         1.538e+04
Df Residuals:                    3590   BIC:                         1.550e+04
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                  -88.0990 

#### Clustered model results

In [29]:
# Full OLS summary for the USA model fitted with the cluster_tag dummies
print(clustered_model.summary())

                            OLS Regression Results                            
Dep. Variable:         millions_spent   R-squared:                       0.815
Model:                            OLS   Adj. R-squared:                  0.814
Method:                 Least Squares   F-statistic:                     789.3
Date:                Fri, 17 Feb 2023   Prob (F-statistic):               0.00
Time:                        12:31:49   Log-Likelihood:                -7093.0
No. Observations:                3609   AIC:                         1.423e+04
Df Residuals:                    3588   BIC:                         1.436e+04
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             