In [None]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
import warnings

# Blanket-ignore every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn warnings raised by later cells.
warnings.simplefilter('ignore')

# Validation Strategy

The validation strategy is one of the most important steps in machine learning. If you fail to validate your machine learning model effectively, its results on real data may be unexpectedly bad.

The idea is to create steps that will help you make a good estimate of your error on data as it will come in real life. We've seen that:
- if your model is too complex, it may memorize your training data (high variance)
- if your model is too simple, you'll have performance problems (high bias)

If your model memorizes your training data, you'll probably obtain a good performance score on your data, but when real data comes to you, you'll lose performance. We say that the model didn't `GENERALIZE` the patterns to unseen data. It means your error estimates were wrong. What can we do to avoid that?

We can simulate what real life would look like. For example, one strategy is to hide some data from your model, to check its reliability on this never-seen data.

## Hold-Out

The idea of the hold-out is to hide a part of the dataset and use it to test your model performance. The errors measured on your `Test set` will be a better estimate of the model performance in real life.

An important observation is that your model may be performing fairly well on your `Training Set`, but your `Test Set` performance may be poor. This may indicate your model is memorizing your data. 

There are several things that can be happening if you observe that your model is performing much better on the `Training data` than on your `Test data`.

- Model is too complex
    - solution: use a strategy to penalize model complexity if it doesn't bring much gain (regularization)
    
- Data Leakage
    - you may be using some information in your training that you shouldn't have at that point, for example information from the future.

In [None]:
# Read the CSV, treating '?' as a missing value, and keep only complete rows.
tb_auto = pd.read_csv('data/tb_autompg.csv', na_values='?').dropna()

In [None]:
# Summarize column dtypes and non-null counts of the table after dropna().
tb_auto.info()

In [None]:
# Preview the first rows of the table.
tb_auto.head()

In [None]:
# Swap spaces for underscores so column names can be used in formulas.
tb_auto.columns = tb_auto.columns.map(lambda name: name.replace(' ', '_'))

## EDA

In [None]:
# Pairwise scatter plots of the columns of the raw table.
sns.pairplot(tb_auto)

In [None]:
# Add log-transformed versions of weight, horsepower and displacement.
tb_auto['log_weight'] = np.log(tb_auto['weight'])
tb_auto['log_horsepower'] = np.log(tb_auto['horsepower'])
tb_auto['log_displacement'] = np.log(tb_auto['displacement'])

# Predictors for the EDA: everything except the target, the car name and the
# raw columns that were just replaced by their logs.
X_eda = tb_auto.drop(['mpg', 'car_name', 'weight', 'horsepower', 'displacement'], axis = 1)

# Standardize, then rotate onto principal components. Both steps are fitted on
# the FULL table here, i.e. before any train/test split happens.
scaler = StandardScaler().fit(X_eda)
X_eda_scaled = scaler.transform(X_eda)
pca_auto = PCA()
pca_auto.fit(X_eda_scaled)
pca_eda = pca_auto.transform(X_eda_scaled)

# Carry tb_auto's index over to the component table. dropna() keeps the
# original row labels, so tb_auto's index may have gaps; a default 0..n-1 index
# here would make the label-aligned `mpg` assignment below attach values to the
# wrong rows and leave NaNs for labels that do not exist in tb_auto.
tb_pca_eda = pd.DataFrame(
    pca_eda,
    columns = ['PC' + str(i) for i in range(pca_eda.shape[1])],
    index = tb_auto.index,
)
tb_pca_eda['mpg'] = tb_auto['mpg']
tb_pca_eda['log_mpg'] = np.log(tb_pca_eda['mpg'])

In [None]:
# Pairwise scatter plots of the principal components together with mpg and log_mpg.
sns.pairplot(tb_pca_eda)

## Holdout in Python

Usually, people tend to separate approximately 20% of the dataset as a test (or holdout) set.

In [None]:
# Response is log(mpg); predictors are every remaining column of the component table.
y = tb_pca_eda['log_mpg']
X = tb_pca_eda.drop(columns=['mpg', 'log_mpg'])

In [None]:
# Hand-made interaction term: element-wise product of the first two components.
X['PC0_PC1'] = X['PC0'].mul(X['PC1'])

In [None]:
# Preview the predictors, including the new PC0_PC1 column.
X.head()

Criar interações entre variáveis é um processo tedioso utilizando apenas a biblioteca pandas...

In [None]:
import patsy

In [None]:
# Build the response and design matrices from a formula instead of by hand.
# This rebinds the X and y created above; the formula has no PC0:PC1 term, so
# the hand-made interaction column is not carried over.
y, X = patsy.dmatrices('log_mpg ~ PC0 + PC1 + PC2', data = tb_pca_eda, return_type="dataframe")

In [None]:
# Display the design matrix produced by patsy.
X

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set. The split is random, so the seed is
# pinned: without it every metric below changes on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Your error should be estimated using the `Test Set`. This will be a better estimate of your true error.

Not only that, you should also calculate the error on your `Training Set` (called `training error`). This will be a good comparison to check whether your results on your `Test Set` (called `test error`) are getting too far from the results on your `Training Set`, which, again, can indicate overfitting.

## Data Leakage example

In [None]:
# Fit a linear regression on the training split; the trailing expression shows the estimator.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# R^2 (coefficient of determination) of the predictions on the test split,
# measured on the log(mpg) scale the model was fitted on.
print(model.score(X_test, y_test))

In [None]:
# Undo the log transform so actual and predicted values are both in mpg.
tb_pred_leak = (
    np.exp(y_test['log_mpg'])
    .to_frame(name='y_real')
    .assign(pred=np.exp(model.predict(X_test)))
)

In [None]:
# Predicted vs. actual mpg on the test split.
sns.scatterplot(data = tb_pred_leak, x = 'pred', y = 'y_real')

### Algumas medidas de erro

In [None]:
# Residual in mpg: actual minus predicted.
tb_pred_leak['erro_pred'] = tb_pred_leak['y_real'].sub(tb_pred_leak['pred'])

#### RMSE

O RMSE é uma medida de erro equivalente ao desvio padrão dos resíduos, ou seja, ele mede o erro de previsão do modelo nas unidades da variável resposta

In [None]:
# RMSE: square root of the mean squared residual, in mpg.
squared_err = tb_pred_leak['erro_pred'] ** 2
print(np.sqrt(squared_err.mean()))

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Same RMSE, obtained through scikit-learn's mean_squared_error as a cross-check.
np.sqrt(mean_squared_error(tb_pred_leak['y_real'], tb_pred_leak['pred']))

#### RMSPE

Se a nossa variável resposta é positiva (Y > 0) então podemos calcular o erro médio percentual a partir do RMSPE

In [None]:
# RMSPE: root mean square of the residuals expressed as a fraction of the actual value.
rel_err = tb_pred_leak['erro_pred'].div(tb_pred_leak['y_real'])
print(np.sqrt((rel_err ** 2).mean()))

#### MAPE

O MAPE é uma medida de erro muito utilizada em áreas que realizam previsões de demanda. Ele é semelhante ao RMSPE mas utiliza o valor absoluto (módulo) para corrigir erros negativos.

In [None]:
# MAPE: mean of the absolute residuals expressed as a fraction of the actual value.
abs_rel_err = tb_pred_leak['erro_pred'].abs().div(tb_pred_leak['y_real'])
print(abs_rel_err.mean())

In [None]:
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
# Same MAPE, obtained through scikit-learn as a cross-check.
mean_absolute_percentage_error(tb_pred_leak['y_real'], tb_pred_leak['pred'])

**What's wrong with the process above?**

## Solution: the EXACT same steps used to prepare your training data should be used on the new data 


So, you fit the standard scaler on your training data, and don't fit it again on your test data. Effectively, you'll be using the `mean` and `standard deviation` from the StandardScaler as you've seen on your training data (<b>pipelines</b> will soon come to rescue us for that).

In [None]:
# Start again from the table itself: target on the mpg scale, predictors still unscaled.
y_pre = tb_auto['mpg']
X_pre = tb_auto.drop(columns=['mpg', 'car_name'])

In [None]:
# Split BEFORE any fitting so the scaler and PCA never see the test rows.
# The seed is pinned so the reported metrics are reproducible across runs.
X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size = 0.2, random_state = 42)

In [None]:
# Feature engineering on the TRAINING rows only. `assign` returns a new frame,
# so the object handed back by train_test_split is never written to in place
# (in-place column writes on it can raise pandas' chained-assignment warning).
X_train_pre = X_train_pre.assign(
    log_weight = np.log(X_train_pre['weight']),
    log_horsepower = np.log(X_train_pre['horsepower']),
    log_displacement = np.log(X_train_pre['displacement']),
).drop(['weight', 'horsepower', 'displacement'], axis = 1)

# The scaler and the PCA learn their parameters from the training rows only;
# the test rows are later pushed through these already-fitted objects.
scaler = StandardScaler().fit(X_train_pre)
X_train_scaled = scaler.transform(X_train_pre)
pca_auto = PCA()
pca_auto.fit(X_train_scaled)
pca_train = pca_auto.transform(X_train_scaled)

# Built from a bare ndarray, so this frame gets a fresh 0..n-1 index; its rows
# match y_train_pre by position, not by label.
tb_X_pca_train = pd.DataFrame(pca_train, columns = ['PC' + str(i) for i in range(pca_train.shape[1])])

In [None]:
# Training target on the log scale, and the design matrix for the first three components.
y_train = np.log(y_train_pre)
X_train = patsy.dmatrix(
    ' ~ PC0 + PC1 + PC2',
    data=tb_X_pca_train,
    return_type="dataframe",
)

In [None]:
# Refit the linear regression on the training rows; the trailing expression shows the estimator.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Apply to the test rows the same log transforms and column drops used for training.
X_test_pre = (
    X_test_pre
    .assign(
        log_weight=lambda frame: np.log(frame['weight']),
        log_horsepower=lambda frame: np.log(frame['horsepower']),
        log_displacement=lambda frame: np.log(frame['displacement']),
    )
    .drop(columns=['weight', 'horsepower', 'displacement'])
)
X_test_pre.describe()

In [None]:
# Reuse the scaler and PCA fitted on the training rows; nothing is refitted here.
X_test_scaled = scaler.transform(X_test_pre)
pca_test = pca_auto.transform(X_test_scaled)
tb_X_pca_test = pd.DataFrame(
    pca_test,
    columns=[f'PC{k}' for k in range(pca_test.shape[1])],
)

In [None]:
# Test target on the log scale, and the test design matrix built with the training formula.
y_test = np.log(y_test_pre)
X_test = patsy.dmatrix(
    ' ~ PC0 + PC1 + PC2',
    data=tb_X_pca_test,
    return_type="dataframe",
)

In [None]:
# Bring predictions back to the mpg scale and compute the residual (actual minus predicted).
pred_mpg = np.exp(model.predict(X_test))
tb_pred_test = pd.DataFrame({'y_real' : y_test_pre, 'pred' : pred_mpg})
tb_pred_test['erro_pred'] = tb_pred_test['y_real'].sub(tb_pred_test['pred'])

In [None]:
# Predicted vs. actual mpg on the test rows for the pipeline fitted on training data only.
sns.scatterplot(data = tb_pred_test, x = 'pred', y = 'y_real')

In [None]:
# Hold-out metrics for the pipeline fitted on training data only, on the mpg scale.
# Both metrics read tb_pred_test; the second one is the percentage error (RMSPE),
# so it is labelled as such instead of repeating "RMSE".
rmse_test = np.sqrt(np.mean(tb_pred_test['erro_pred'] ** 2))
rmspe_test = np.sqrt(np.mean((tb_pred_test['erro_pred'] / tb_pred_test['y_real']) ** 2))
print(f"RMSE: {rmse_test}")
print(f"RMSPE: {rmspe_test}")