In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics


# Read the semicolon-delimited beer consumption CSV into a DataFrame.
data = pd.read_csv('data/beer_consumption.csv', sep=';')

# <font color = 'red' style = 'font-size: 30px;'> Comparing Models </font>
<hr style = 'border: 2px solid red;'>

## Estimating a new model, replacing the explanatory variable Maximum Temperature with Average Temperature

In [8]:
# Response variable shared by both candidate models.
y = data['consumption']

# The two designs differ only in the temperature regressor;
# 'rain' and 'weekend' appear in both.
X = data[['max_temp', 'rain', 'weekend']]    # maximum-temperature design
X2 = data[['mean_temp', 'rain', 'weekend']]  # mean-temperature design

## Creating the training and test datasets

In [11]:
# Hold out 30% of the rows for testing. Both calls receive the same
# test_size and random_state.

# Maximum-temperature design
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2811,
)

# Mean-temperature design
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y,
    test_size=0.3,
    random_state=2811,
)

## Instantiating the class *LinearRegression()*

In [12]:
# One separate, unfitted estimator per candidate design.
model = LinearRegression()    # maximum-temperature model
model_2 = LinearRegression()  # mean-temperature model

## Using the *fit()* method of the "model" and "model_2" objects to estimate the linear models using TRAINING data (X_train / y_train and X2_train / y2_train)

In [13]:
# Fit each estimator on its own training partition.
# Maximum-temperature model
model.fit(X=X_train, y=y_train)

# Mean-temperature model (last expression, so its repr is the cell output)
model_2.fit(X=X2_train, y=y2_train)

LinearRegression()

## Obtaining the coefficient of determination (R²) of the new estimated model and comparing it with the result of the previous model

In [14]:
# In-sample (training) R² of the maximum-temperature model.
# The builtin round() replaces the `.round()` method: that method exists only
# on NumPy scalars, so the cell raises AttributeError whenever score() hands
# back a plain Python float. round() accepts both.
print('Model with max temp')
print('R^2 = {}'.format(round(model.score(X_train, y_train), 2)))

Model with max temp
R^2 = 0.73


In [17]:
# In-sample (training) R² of the mean-temperature model.
# The builtin round() replaces the `.round()` method: that method exists only
# on NumPy scalars, so the cell raises AttributeError whenever score() hands
# back a plain Python float. round() accepts both.
print('Model with mean temp')
print('R^2 = {}'.format(round(model_2.score(X2_train, y2_train), 2)))

Model with mean temp
R^2 = 0.66


## Generating forecasts for TEST data (X_test and X2_test) using the *predict()* method of the "model" and "model_2" objects

In [18]:
# Predictions from each fitted model on its own held-out test features.
predicted_y = model.predict(X=X_test)        # maximum-temperature model
predicted_y_2 = model_2.predict(X=X2_test)   # mean-temperature model

## Obtaining the coefficient of determination (R²) for the predictions of the two models

In [21]:
# Test-set R² of the mean-temperature model's predictions.
# The builtin round() replaces the `.round()` method: that method exists only
# on NumPy scalars, so the cell raises AttributeError whenever r2_score()
# returns a plain Python float. round() accepts both.
print('Model with mean temp')
print('R^2 = {}'.format(round(metrics.r2_score(y2_test, predicted_y_2), 2)))

Model with mean temp
R^2 = 0.66


In [22]:
# Test-set R² of the maximum-temperature model's predictions.
# The builtin round() replaces the `.round()` method: that method exists only
# on NumPy scalars, so the cell raises AttributeError whenever r2_score()
# returns a plain Python float. round() accepts both.
print('Model with max temp')
print('R^2 = {}'.format(round(metrics.r2_score(y_test, predicted_y), 2)))

Model with max temp
R^2 = 0.69


# <font color = 'red' style = 'font-size: 30px;'> Other Regression Metrics </font>
<hr style = 'border: 2px solid red;'>

## Regression metrics
<hr>

source: https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

Some statistics obtained from the regression model are very useful as a criterion for comparing estimated models and selecting the best model. The main regression metrics that scikit-learn makes available for linear models are as follows:

### Mean Squared Error

Mean of the squared errors. Better-fitting models have a lower $EQM$ (the mean squared error, MSE).

$$EQM(y, \hat{y}) = \frac 1n\sum_{i=0}^{n-1}(y_i-\hat{y}_i)^2$$

### Root of the Mean Squared Error

Square root of the mean of the squared errors. Better-fitting models have a lower $\sqrt{EQM}$ (the root mean squared error, RMSE).

$$\sqrt{EQM(y, \hat{y})} = \sqrt{\frac 1n\sum_{i=0}^{n-1}(y_i-\hat{y}_i)^2}$$

## Obtaining metrics for the Average Temperature model

In [28]:
# Test-set error metrics for the mean-temperature model.
# The unrounded MSE is kept so the RMSE is derived from the exact value
# rather than from a figure already rounded to two decimals.
mse_2_exact = metrics.mean_squared_error(y2_test, predicted_y_2)

# The builtin round() is used instead of `.round()`: the method exists only on
# NumPy scalars and fails if a metric returns a plain Python float.
mse_2 = round(mse_2_exact, 2)
sqrt_mse_2 = round(np.sqrt(mse_2_exact), 2)
r2_2 = round(metrics.r2_score(y2_test, predicted_y_2), 2)

# One-column summary table; the row labels are passed explicitly as the index.
pd.DataFrame(
    [mse_2, sqrt_mse_2, r2_2],
    index=['MSE', 'SQRT MSE', 'R^2'],
    columns=['Metrics'],
)

Unnamed: 0,Metrics
MSE,6060775.46
SQRT MSE,2461.86
R^2,0.66


## Obtaining metrics for the Max Temperature model

In [30]:
# Test-set error metrics for the maximum-temperature model.
# The unrounded MSE is kept so the RMSE is derived from the exact value
# rather than from a figure already rounded to two decimals.
mse_exact = metrics.mean_squared_error(y_test, predicted_y)

# The builtin round() is used instead of `.round()`: the method exists only on
# NumPy scalars and fails if a metric returns a plain Python float.
mse = round(mse_exact, 2)
sqrt_mse = round(np.sqrt(mse_exact), 2)
r2 = round(metrics.r2_score(y_test, predicted_y), 2)

# One-column summary table; the row labels are passed explicitly as the index.
pd.DataFrame(
    [mse, sqrt_mse, r2],
    index=['MSE', 'SQRT MSE', 'R^2'],
    columns=['Metrics'],
)

Unnamed: 0,Metrics
MSE,5471976.38
SQRT MSE,2339.23
R^2,0.69
