# Train test split to evaluate model

In [1]:

from modules import utils
utils.configure_plotly_template(showlegend=True)

## Data

In [2]:
import pandas as pd

df = pd.read_parquet('../../../data/statsmodels/AirPassengers.parquet').asfreq('ME')
df.columns = ['values']
df.index.name = 'datetime'

df

Unnamed: 0_level_0,values
datetime,Unnamed: 1_level_1
1949-01-31,112
1949-02-28,118
...,...
1960-11-30,390
1960-12-31,432


In [3]:
import numpy as np
df['values_log'] = np.log(df['values'])

df

Unnamed: 0_level_0,values,values_log
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
1949-01-31,112,4.718499
1949-02-28,118,4.770685
...,...,...
1960-11-30,390,5.966147
1960-12-31,432,6.068426


In [4]:
series = df["values_log"]
series

datetime
1949-01-31    4.718499
1949-02-28    4.770685
                ...   
1960-11-30    5.966147
1960-12-31    6.068426
Freq: ME, Name: values_log, Length: 144, dtype: float64

In [5]:
df_base = df.copy()

## Previous lessons: overfitting

1. Evaluate the model on the same series used for training.
2. Overfitting problem: the model is good to predict historical series, but not for the future.
3. Businesses depend on predicting the future, not the past.

In [6]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

model = SARIMAX(series, order=(0, 1, 1), seasonal_order=(0, 1, 1, 12))
model_fit = model.fit()

df['predictions_log'] = model_fit.predict()
df['predictions_log_exp'] = np.exp(df['predictions_log'])

df

Unnamed: 0_level_0,values,values_log,predictions_log,predictions_log_exp
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1949-01-31,112,4.718499,0.000000,1.000000
1949-02-28,118,4.770685,4.718499,112.000000
...,...,...,...,...
1960-11-30,390,5.966147,5.993307,400.737585
1960-12-31,432,6.068426,6.083396,438.515902


In [7]:
from sklearn.metrics import root_mean_squared_error

idx = 12 + 1

real = df["values"][idx:]
pred = df["predictions_log_exp"][idx:]

score = root_mean_squared_error(real, pred)

score

10.714860511741556

## Train test slpit to detect overfitting

### Split

In [8]:
from sklearn.model_selection import train_test_split

df = df[['values', 'values_log']]
df_train, df_test = train_test_split(df, shuffle=False, test_size=0.3)

In [9]:
df_train

Unnamed: 0_level_0,values,values_log
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
1949-01-31,112,4.718499
1949-02-28,118,4.770685
...,...,...
1957-03-31,356,5.874931
1957-04-30,348,5.852202


In [10]:
df_test

Unnamed: 0_level_0,values,values_log
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
1957-05-31,355,5.872118
1957-06-30,422,6.045005
...,...,...
1960-11-30,390,5.966147
1960-12-31,432,6.068426


### Evaluate model

In [11]:
model = SARIMAX(df_train['values_log'], order=(0, 1, 1), seasonal_order=(0, 1, 1, 12), enforce_invertibility=False, enforce_stationarity=False)
model_fit = model.fit()

#### Test data

In [12]:
df = df_test.copy()

start, end = df.index[[0,-1]]
df["predictions_log"] = model_fit.predict(start=start, end=end)
df["predictions_log_exp"] = np.exp(df["predictions_log"])

idx = 12 + 1

score = root_mean_squared_error(df["values"][idx:], df["predictions_log_exp"][idx:])
score

43.43902943845554

In [13]:
df_test = df.copy()

#### Train data

In [14]:
df = df_train.copy()
df

Unnamed: 0_level_0,values,values_log
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
1949-01-31,112,4.718499
1949-02-28,118,4.770685
...,...,...
1957-03-31,356,5.874931
1957-04-30,348,5.852202


In [15]:
start, end = df.index[[0,-1]]
df["predictions_log"] = model_fit.predict(start=start, end=end)
df["predictions_log_exp"] = np.exp(df["predictions_log"])

idx = 12 + 1

score = root_mean_squared_error(df["values"][idx:], df["predictions_log_exp"][idx:])
score

8.545512270973711

In [16]:
df_train = df.copy()

### Interpret overfitting

In [17]:
df_pred = pd.DataFrame({
    'train': df_train['values'],
    'test': df_test['values'],
    'train_forecast_sarima': df_train['predictions_log_exp'],
    'test_forecast_sarima': df_test['predictions_log_exp'],
})

df_pred

Unnamed: 0_level_0,train,test,train_forecast_sarima,test_forecast_sarima
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1949-01-31,112.0,,1.0,
1949-02-28,118.0,,112.0,
...,...,...,...,...
1960-11-30,,390.0,,438.001046
1960-12-31,,432.0,,499.663157


In [18]:
df_pred[12+1:].plot()

## Model comparison: SARIMA vs ETS vs Prophet

In [21]:
configs = {
    'sarima': {
        'model_params': {
            'order': (0, 1, 1),
            'seasonal_order': (0, 1, 1, 12),
            'enforce_invertibility': False,
            'enforce_stationarity': False,
        },
        'log_transform': True,
    },
    'ets': {
        'model_params': {
            'trend': 'add',
            'seasonal': 'mul',
            'damped_trend': False,
        },
        'log_transform': False,
    },
    'prophet': {
        'model_params': {
            'seasonality_mode': 'multiplicative',
            'yearly_seasonality': True,
        },
        'log_transform': True,
    },
}

In [20]:
train, test = train_test_split(df_base['values'], test_size=0.3, shuffle=False)

In [22]:
tf = utils.TimeSeriesForecaster(train=train, test=test, freq="ME", idx_offset=13)

from sklearn.metrics import root_mean_squared_error, mean_absolute_error

metrics = {
    'rmse': root_mean_squared_error,
    'mae': mean_absolute_error 
}

In [23]:
from modules.utils import TimeSeriesForecaster

tf = TimeSeriesForecaster(train=train, test=test, freq="ME")
df_forecast = tf.bulk_forecast(configs, metrics=metrics)
df_forecast

10:56:27 - cmdstanpy - INFO - Chain [1] start processing
10:56:27 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,model,split,values,datetime,rmse,mae
0,sarima,train,"[121.1607144324168, 140.94915253005868, 137.77...","DatetimeIndex(['1950-02-28', '1950-03-31', '19...",8.545512,6.675626
1,sarima,test,"[351.4576739515267, 404.5978119857047, 452.697...","DatetimeIndex(['1957-05-31', '1957-06-30', '19...",38.558619,33.725548
...,...,...,...,...,...,...
4,prophet,train,"[119.74588031063298, 138.14157103814293, 134.0...","DatetimeIndex(['1950-02-28', '1950-03-31', '19...",6.065670,4.589269
5,prophet,test,"[353.3604620080639, 408.01507923894104, 453.50...","DatetimeIndex(['1957-05-31', '1957-06-30', '19...",46.292891,39.641206


In [25]:
dfp = df_forecast.pivot(index=["split"], columns="model", values="rmse")
dfp.style.background_gradient(cmap="Greens_r", axis=None).format(precision=2)

model,ets,prophet,sarima
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,31.86,46.29,38.56
train,6.94,6.07,8.55


In [23]:
df_combined = tf.combine_with_historical(df_forecast)
df_combined

10:55:37 - cmdstanpy - INFO - Chain [1] start processing
10:55:37 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,model,split,datetime,values
0,ets,test,1957-05-31,346.973505
1,ets,test,1957-06-30,393.889927
...,...,...,...,...
522,historical,test,1960-11-30,390
523,historical,test,1960-12-31,432


In [24]:
import plotly.express as px

fig = px.line(
    data_frame=df_combined,
    x='datetime',
    y='values',
    color='model',
    facet_col='split',
    category_orders={'split': ['train', 'test']}
)

fig.update_xaxes(matches=None)

| Objetivo                                    | Confía más en... | Justificación                                                   |
| ------------------------------------------- | ---------------- | --------------------------------------------------------------- |
| **Forecast inmediato (pocos pasos)**        | Test split       | Optimizas el rendimiento empírico                               |
| **Modelo estable, reusable, interpretable** | Diagnostics      | Te aseguras que el modelo captura bien la estructura subyacente |
| **Forecast multistep largo**                | Diagnostics      | Modelos mal especificados se degradan con el horizonte          |


Buena pregunta. **No, no tiene sentido asegurar la calidad predictiva de un modelo basándote solo en los errores in-sample (residuos sobre el train)**. Pero **sí tiene sentido analizarlos para validar la estructura del modelo**.

---

### 🔍 Diferenciemos:

#### ✅ **Usar los residuos in-sample tiene sentido para:**

* Verificar que el modelo esté bien especificado (sin autocorrelación, varianza constante).
* Asegurarte de que no hay patrones no explicados.
* Validar que los supuestos del modelo se cumplen.

#### ❌ **Usar los residuos in-sample para reportar error de predicción es engañoso:**

* Estás midiendo el ajuste, no la capacidad de generalización.
* Es un error común que da modelos con bajo error in-sample pero que fallan fuera de muestra (overfitting).

---

### 📌 Ejemplo concreto:

* Un `ARIMA(12,1,1)` puede tener residuos in-sample muy pequeños.
* Pero si el modelo está sobreajustado o no generaliza, su error out-of-sample será alto.
* Solo evaluando el RMSE en el test set puedes confirmar su poder predictivo.

---

### ✅ Conclusión clara:

> **Diagnósticos in-sample te dicen si el modelo tiene sentido. Error out-of-sample te dice si sirve.**

¿Quieres una visualización clara que muestre ambos lados en una notebook o clase?