# Question 8: Is there any way to predict spending and visits for Visit Scotland?

## Load Cleaned Data & Libraries

In [1]:
# Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

# Load the cleaned international survey data (path is relative to this notebook)
international_survey = pd.read_csv('../data/clean_data/international_survey.csv')

### Predicting expenditure

In [2]:
# Inspect column dtypes and non-null counts before modelling
international_survey.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32049 entries, 0 to 32048
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             32049 non-null  int64  
 1   country          32049 non-null  object 
 2   purpose          32049 non-null  object 
 3   mode             32049 non-null  object 
 4   duration         32049 non-null  object 
 5   age              32049 non-null  object 
 6   visits_thousand  32049 non-null  float64
 7   nights_thousand  32049 non-null  float64
 8   millions_spent   32049 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 2.2+ MB


In [3]:
# Store `year` as a categorical so it is dummy-encoded (one indicator per year)
# rather than entering the regression as a single numeric trend term.
international_survey = international_survey.astype({'year': 'category'})

In [4]:
# Re-check the dtypes to confirm the `year` conversion took effect
international_survey.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32049 entries, 0 to 32048
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   year             32049 non-null  category
 1   country          32049 non-null  object  
 2   purpose          32049 non-null  object  
 3   mode             32049 non-null  object  
 4   duration         32049 non-null  object  
 5   age              32049 non-null  object  
 6   visits_thousand  32049 non-null  float64 
 7   nights_thousand  32049 non-null  float64 
 8   millions_spent   32049 non-null  float64 
dtypes: category(1), float64(3), object(5)
memory usage: 2.0+ MB


#### Create dummy variables for categorical data

In [5]:
# One-hot encode every categorical/object column of the survey.
# - get_dummies returns a new frame, so `international_survey` is left untouched
#   and no explicit copy is needed.
# - drop_first=True drops one reference level per variable so the indicators
#   are not perfectly collinear with the regression intercept.
# - dtype=int keeps the indicators as 0/1 integers on every pandas version
#   (pandas >= 2.0 would otherwise emit booleans), so the frame stays purely
#   numeric for the regression code that follows.
international_dummied = pd.get_dummies(international_survey, drop_first=True, dtype=int)
international_dummied

Unnamed: 0,visits_thousand,nights_thousand,millions_spent,year_2003,year_2004,year_2005,year_2006,year_2007,year_2008,year_2009,...,mode_Tunnel,duration_15+ nights,duration_4-7 nights,duration_8-14 nights,age_16-24,age_25-34,age_35-44,age_45-54,age_55-64,age_65+
0,0.270000,1.600000,0.160000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1.080000,4.620000,0.250000,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,0.990000,5.030000,0.810000,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,0.350000,1.800000,0.080000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,1.660000,10.090000,0.620000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32044,1.575168,14.176511,0.897375,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
32045,0.631421,3.157103,0.228890,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
32046,2.243179,11.633949,0.877631,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
32047,0.550858,1.714171,0.135526,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


#### Pick appropriate predictor and response variables

In [6]:
# Response: the spend column. Predictors: every remaining column of the
# dummy-encoded frame.
international_response = international_dummied['millions_spent']
international_predictor = international_dummied.drop(columns=['millions_spent'])

In [7]:
# Preview the predictor matrix (every column except `millions_spent`)
international_predictor

Unnamed: 0,visits_thousand,nights_thousand,year_2003,year_2004,year_2005,year_2006,year_2007,year_2008,year_2009,year_2010,...,mode_Tunnel,duration_15+ nights,duration_4-7 nights,duration_8-14 nights,age_16-24,age_25-34,age_35-44,age_45-54,age_55-64,age_65+
0,0.270000,1.600000,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1.080000,4.620000,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,0.990000,5.030000,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,0.350000,1.800000,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,1.660000,10.090000,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32044,1.575168,14.176511,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
32045,0.631421,3.157103,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
32046,2.243179,11.633949,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
32047,0.550858,1.714171,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [8]:
# Preview the response series (`millions_spent`)
international_response

0        0.160000
1        0.250000
2        0.810000
3        0.080000
4        0.620000
           ...   
32044    0.897375
32045    0.228890
32046    0.877631
32047    0.135526
32048    0.103797
Name: millions_spent, Length: 32049, dtype: float64

#### Create the model and check its score, intercept and coefficients

In [9]:
# Ordinary least-squares fit of spend on all predictors using scikit-learn.
# The model is fitted on the full data set; no hold-out split is made here.
model = LinearRegression()

model.fit(X=international_predictor, y=international_response)

LinearRegression()

In [10]:
# R-squared of the fitted model. NOTE: this is computed on the same rows the
# model was fitted on, so it measures in-sample fit, not predictive accuracy
# on unseen data.
model.score(international_predictor, international_response)

0.515521096326943

In [11]:
# Fitted intercept: the predicted spend when every predictor is zero, i.e. at
# the dropped reference level of each dummy-encoded variable.
model.intercept_

-0.5143907518917362

In [12]:
# Tabulate each predictor column alongside its fitted coefficient
# (model.coef_ is in the same order as the columns passed to fit).
pd.DataFrame({
    "variable": international_predictor.columns.to_numpy(),
    "coefficient": model.coef_,
})

Unnamed: 0,variable,coefficient
0,visits_thousand,0.538460
1,nights_thousand,0.027658
2,year_2003,0.020899
3,year_2004,-0.036725
4,year_2005,-0.048917
...,...,...
94,age_25-34,0.050874
95,age_35-44,0.181631
96,age_45-54,0.293741
97,age_55-64,0.304550


In [13]:
# statsmodels' OLS does not add an intercept on its own, so prepend an explicit
# `const` column of ones to the predictors.
# NOTE(review): this rebinds `international_predictor`; re-running the earlier
# scikit-learn cells after this one would feed them the extra `const` column.
international_predictor = sm.add_constant(international_predictor)

# Preview the predictor matrix, now with the leading `const` column
international_predictor

Unnamed: 0,const,visits_thousand,nights_thousand,year_2003,year_2004,year_2005,year_2006,year_2007,year_2008,year_2009,...,mode_Tunnel,duration_15+ nights,duration_4-7 nights,duration_8-14 nights,age_16-24,age_25-34,age_35-44,age_45-54,age_55-64,age_65+
0,1.0,0.270000,1.600000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1.0,1.080000,4.620000,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1.0,0.990000,5.030000,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,1.0,0.350000,1.800000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,1.0,1.660000,10.090000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32044,1.0,1.575168,14.176511,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
32045,1.0,0.631421,3.157103,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
32046,1.0,2.243179,11.633949,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
32047,1.0,0.550858,1.714171,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [14]:
# Refit the same regression with statsmodels to obtain standard errors,
# t-statistics and p-values for every coefficient.
sm_model = sm.OLS(endog=international_response, exog=international_predictor).fit()

# summary() renders the full regression table as text
print(sm_model.summary())

                            OLS Regression Results                            
Dep. Variable:         millions_spent   R-squared:                       0.516
Model:                            OLS   Adj. R-squared:                  0.514
Method:                 Least Squares   F-statistic:                     343.4
Date:                Thu, 16 Feb 2023   Prob (F-statistic):               0.00
Time:                        12:23:44   Log-Likelihood:                -59463.
No. Observations:               32049   AIC:                         1.191e+05
Df Residuals:                   31949   BIC:                         1.200e+05
Df Model:                          99                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     