In [3]:
import pandas as pd
import seaborn as sns
import numpy as np

# Load the raw dataset; the CSV uses ';' as its field separator.
data = pd.read_csv('data/dataset.csv', sep=';')

# Log-transform the target and the regressors used by the models below.
data['log_value'] = np.log(data['value'])
data['log_area'] = np.log(data['area'])
# Distances may be 0 and log(0) = -inf, so the transform is log(1 + x).
# np.log1p computes exactly that and stays accurate for distances near zero.
data['log_dist_beach'] = np.log1p(data['dist_beach'])
data['log_dist_pharmacy'] = np.log1p(data['dist_pharmacy'])

# <font color = 'red' style = 'font-size: 30px;'>Creating the *Training and Test Datasets* </font>
<hr style = 'border: 2px solid red;'>

## Importing *train_test_split* from the *scikit-learn* library

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [2]:
from sklearn.model_selection import train_test_split

## Creating a Series (pandas) to store the Property Price (y)

In [4]:
# Dependent variable: the log of the property value, selected as a Series.
y = data.loc[:, 'log_value']

## Creating a DataFrame (pandas) to store the explanatory variables (X)

In [5]:
# Explanatory variables for the first specification: all three log regressors.
x = data.loc[
    :,
    [
        'log_area',
        'log_dist_beach',
        'log_dist_pharmacy',
    ],
]

## Creating the training and test datasets

In [8]:
# Hold out 20% of the observations for testing; the fixed random_state makes
# the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=2811,
    test_size=0.2,
)

# Linear Regression
<hr>

<p style = 'font-size: 20px; line-height: 2; margin: 10px 50px; text-align: justify; '> Regression analysis is the study of the dependence of one variable (the <b>dependent</b> variable) on one or more other variables (the explanatory variables), with the aim of estimating and/or predicting the average value of the former given known or fixed values of the latter. </p>


## scikit-learn (https://scikit-learn.org/stable/)

<p style = 'font-size: 20px; line-height: 2; margin: 10px 50px; text-align: justify; '> <i>scikit-learn</i> is a Python module specialized in solutions for <i>machine learning</i>. </p>

<img width='800px' src='data/img/Log-linear.png'>

## Importing the statsmodels library

https://www.statsmodels.org/stable/index.html

In [9]:
import statsmodels.api as sm

## Estimating the model with statsmodels

In [10]:
# sm.OLS does not add an intercept by itself, so put a column of ones
# ('const') in front of the training regressors.
x_train_with_const = sm.add_constant(x_train, prepend=True)

In [11]:
# Inspect the design matrix; 'const' should now appear as the first column.
x_train_with_const

Unnamed: 0,const,log_area,log_dist_beach,log_dist_pharmacy
2661,1.0,5.945421,0.000000,0.382273
912,1.0,3.135494,0.972865,0.605015
3042,1.0,4.317488,1.794961,0.486594
141,1.0,3.401197,0.310455,0.599609
3854,1.0,5.676754,0.032193,0.101903
...,...,...,...,...
3657,1.0,5.075174,2.023480,0.333605
979,1.0,4.174387,2.296141,0.156465
2389,1.0,4.394449,1.367741,0.409727
447,1.0,3.951244,2.166841,0.217381


In [15]:
# Ordinary least squares of log_value on the design matrix. hasconst=True
# tells statsmodels that the intercept column is already part of exog.
ols_spec = sm.OLS(endog=y_train, exog=x_train_with_const, hasconst=True)
model_statsmodels = ols_spec.fit()

# <font color = 'red' style = 'font-size: 30px;'>Evaluating the Estimated Model </font>
<hr style = 'border: 2px solid red;'>

## Evaluating model test statistics

In [16]:
# Render the full OLS report: fit statistics, coefficient table and
# residual diagnostics.
model_statsmodels.summary()

0,1,2,3
Dep. Variable:,log_value,R-squared:,0.805
Model:,OLS,Adj. R-squared:,0.805
Method:,Least Squares,F-statistic:,5495.0
Date:,"Tue, 04 May 2021",Prob (F-statistic):,0.0
Time:,16:22:20,Log-Likelihood:,-2044.9
No. Observations:,4000,AIC:,4098.0
Df Residuals:,3996,BIC:,4123.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.3417,0.060,154.734,0.000,9.223,9.460
log_area,1.0580,0.012,89.320,0.000,1.035,1.081
log_dist_beach,-0.4905,0.009,-56.690,0.000,-0.508,-0.474
log_dist_pharmacy,-0.0167,0.032,-0.521,0.603,-0.080,0.046

0,1,2,3
Omnibus:,64.751,Durbin-Watson:,1.971
Prob(Omnibus):,0.0,Jarque-Bera (JB):,106.858
Skew:,0.136,Prob(JB):,6.2499999999999996e-24
Kurtosis:,3.753,Cond. No.,47.6


# <font color = 'red' style = 'font-size: 30px;'>Modifying the Model and Reassessing the Fit </font>
<hr style = 'border: 2px solid red;'>

## Creating a new set of explanatory variables (X)

In [17]:
# Reduced specification: keep only log_area and log_dist_beach as regressors.
x = data.loc[:, ['log_area', 'log_dist_beach']]

## Creating the training and test datasets

In [18]:
# Split the reduced feature set with the same test_size and random_state as
# before, so both models are fitted on the same training rows.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=2811,
    test_size=0.2,
)

In [19]:
# Check that the training features now hold only log_area and log_dist_beach.
x_train

Unnamed: 0,log_area,log_dist_beach
2661,5.945421,0.000000
912,3.135494,0.972865
3042,4.317488,1.794961
141,3.401197,0.310455
3854,5.676754,0.032193
...,...,...
3657,5.075174,2.023480
979,4.174387,2.296141
2389,4.394449,1.367741
447,3.951244,2.166841


## Estimating the model with statsmodels

In [21]:
# Rebuild the design matrix for the reduced model: put the intercept column
# of ones in front of the regressors, then display it to verify the layout.
x_train_with_const = sm.add_constant(x_train, prepend=True)
x_train_with_const

Unnamed: 0,const,log_area,log_dist_beach
2661,1.0,5.945421,0.000000
912,1.0,3.135494,0.972865
3042,1.0,4.317488,1.794961
141,1.0,3.401197,0.310455
3854,1.0,5.676754,0.032193
...,...,...,...
3657,1.0,5.075174,2.023480
979,1.0,4.174387,2.296141
2389,1.0,4.394449,1.367741
447,1.0,3.951244,2.166841


In [22]:
# Re-estimate by OLS on the reduced design matrix. hasconst=True because the
# 'const' column is already included in exog.
ols_spec = sm.OLS(endog=y_train, exog=x_train_with_const, hasconst=True)
model_statsmodels = ols_spec.fit()

## Evaluating the test statistics of the new model

### Test of joint significance of the parameters

Prob (F-statistic) < 0.05 (OK)

### Test of individual significance of the parameters

P> |t| < 0.05 (OK)

## Evaluating test statistics for the new model

In [23]:
# Print the plain-text version of the OLS report for the reduced model.
print(model_statsmodels.summary().as_text())

                            OLS Regression Results                            
Dep. Variable:              log_value   R-squared:                       0.805
Model:                            OLS   Adj. R-squared:                  0.805
Method:                 Least Squares   F-statistic:                     8244.
Date:                Tue, 04 May 2021   Prob (F-statistic):               0.00
Time:                        16:38:09   Log-Likelihood:                -2045.1
No. Observations:                4000   AIC:                             4096.
Df Residuals:                    3997   BIC:                             4115.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              9.3349      0.059    158.