## Importamos librerías


In [77]:
import pandas as pd

## Cargamos la data


In [78]:
# Load the three pre-split partitions (train / validation / test) from parquet
# and pass each through `asfreq("D")` so every frame carries a daily frequency.
split_paths = (
    "../../../data/time_series/train_df.parquet",
    "../../../data/time_series/validation_df.parquet",
    "../../../data/time_series/test_df.parquet",
)
train_df, validation_df, test_df = (
    pd.read_parquet(path).asfreq("D") for path in split_paths
)

In [79]:
# Missing values
# ==============================================================================
# Count, for each split, the rows that hold at least one null value.
# `.sum()` over the per-row boolean mask gives the row count that the message
# announces; the previous `.mean()` printed the fraction of rows instead.
print(
    f"Number of rows in train with missing values: {train_df.isnull().any(axis=1).sum()}"
)
print(
    f"Number of rows in validation with missing values: {validation_df.isnull().any(axis=1).sum()}"
)
print(
    f"Number of rows in test with missing values: {test_df.isnull().any(axis=1).sum()}"
)

Number of rows in train with missing values: 0.0
Number of rows in validation with missing values: 0.0
Number of rows in test with missing values: 0.0


## Recursive multi-step forecasting


In [80]:
from sklearn.ensemble import RandomForestRegressor
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

In [81]:
# Baseline recursive forecaster: a random forest with default hyperparameters
# (seeded for reproducibility) that uses the previous 7 observations as lags.
rf_baseline = RandomForestRegressor(random_state=123)
forecaster = ForecasterAutoreg(regressor=rf_baseline, lags=7)

# Train on the "gap" series of the training split only.
forecaster.fit(y=train_df["gap"])

# Last expression: display the fitted forecaster summary.
forecaster

ForecasterAutoreg 
Regressor: RandomForestRegressor(random_state=123) 
Lags: [1 2 3 4 5 6 7] 
Transformer for y: None 
Transformer for exog: None 
Window size: 7 
Weight function included: False 
Differentiation order: None 
Exogenous included: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: [Timestamp('2001-11-01 00:00:00'), Timestamp('2015-07-02 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 123, 'verbose': 0, 'warm_start': False} 
fit_kwargs: {} 
Creation date: 2024-05-29 16:12:24 
Last fit date: 2024-05-29 16:12:27 
Skforecast version: 

## Predicciones en el set de validación


In [82]:
# Forecast one step per row of the validation split, starting right after the
# end of the training range.
steps = len(validation_df)
predictions = forecaster.predict(steps)

# Preview the first five predicted values.
predictions.head()

2015-07-03    68593.07
2015-07-04    69629.61
2015-07-05    66868.56
2015-07-06    70224.02
2015-07-07    66613.19
Freq: D, Name: pred, dtype: float64

In [83]:
import plotly.graph_objects as go

In [84]:
fig = go.Figure()

# One entry per line to draw, in drawing order:
# (x values, y values, legend name, line color).
series_specs = [
    # Training history
    (train_df.index, train_df["gap"], "Historical data", "royalblue"),
    # Observed validation values
    (validation_df.index, validation_df["gap"], "validation", "green"),
    # Random-forest forecast over the validation horizon
    (predictions.index, predictions, "forecast RF", "tomato"),
]
for xs, ys, label, color in series_specs:
    fig.add_trace(
        go.Scatter(x=xs, y=ys, mode="lines", name=label, line={"color": color})
    )

# Titles, theme and figure size.
layout_options = {
    "title": "Forecast con data histórica y validación",
    "xaxis_title": "Time",
    "yaxis_title": "Value",
    "legend_title": "Legend",
    "template": "plotly_white",
    "width": 800,
    "height": 400,
}
fig.update_layout(**layout_options)

# Render the figure
fig.show()

In [85]:
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

## Métricas con la data de validación


In [86]:
# Score the baseline forecast against the observed validation values.
error_rmse = root_mean_squared_error(y_true=validation_df["gap"], y_pred=predictions)
error_mape = mean_absolute_percentage_error(
    y_true=validation_df["gap"], y_pred=predictions
)
# Labels corrected: these scores are computed on the validation split (not on
# test), and the second metric is MAPE (mean absolute percentage error), not
# MAE. The MAPE is printed as a fraction, not multiplied by 100.
print(f"Validation error (RMSE): {error_rmse}")
print(f"Validation error (MAPE): {error_mape}")

Test error (RMSE): 20365.78652460191
Test error (MAE): 0.18189947271419799


## Hyperparameter tuning


In [87]:
# Search space: 3 lag configurations x 12 regressor settings = 36 candidates.
lags_grid = [3, 7, 10]
param_grid = {
    "n_estimators": [1, 2, 3, 5],
    "max_depth": [50, 100, 150],
}

# The `lags` given here is only a placeholder; every candidate in `lags_grid`
# replaces it during the search.
forecaster = ForecasterAutoreg(
    regressor=RandomForestRegressor(random_state=123),
    lags=7,
)

# The search is run on the training series alone: the first half is the
# initial training window and the remainder is backtested without refitting,
# in folds as long as the validation split.
y_train = train_df["gap"]
results_grid = grid_search_forecaster(
    forecaster=forecaster,
    y=y_train,
    lags_grid=lags_grid,
    param_grid=param_grid,
    metric=root_mean_squared_error,
    initial_train_size=int(len(y_train) * 0.5),
    steps=len(validation_df),
    fixed_train_size=False,
    refit=False,
    # With return_best=True, `forecaster` is refitted in place with the
    # best-found lags and parameters once the search finishes.
    return_best=True,
    n_jobs="auto",
    verbose=False,
)

Number of models compared: 36.


lags grid:   0%|          | 0/3 [00:00<?, ?it/s]

params grid:   0%|          | 0/12 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2 3 4 5 6 7] 
  Parameters: {'max_depth': 100, 'n_estimators': 2}
  Backtesting metric: 26805.785603257227



In [88]:
# Renumber the rows so that label 0 addresses the first row of the results
# table (presumably the best candidate; it matches the grid-search summary).
results_grid = results_grid.reset_index()

# Pull the regressor hyperparameters stored in that first row.
best_params = results_grid.loc[0, "params"]
best_max_depth = best_params["max_depth"]
best_n_estimators = best_params["n_estimators"]

print(f"Best max depth: {best_max_depth}")
print(f"Best n_estimators: {best_n_estimators}")

Best max depth: 100
Best n_estimators: 2


## Modelo final con los mejores hiperparámetros


In [89]:
# Stack train and validation row-wise: the final model is trained on both.
train_and_validation = pd.concat((train_df, validation_df), axis=0)

In [90]:
# Final model: random forest configured with the hyperparameters selected by
# the grid search.
regressor = RandomForestRegressor(
    n_estimators=best_n_estimators, max_depth=best_max_depth, random_state=123
)

# Take the lags from the same first results row that supplied the regressor
# hyperparameters, instead of hard-coding 7. The two only coincided because
# the search happened to select lags 1..7. `.iloc[0]` is positional, so it
# works whether or not the results index has been reset.
best_lags = results_grid.iloc[0]["lags"]

forecaster = ForecasterAutoreg(regressor=regressor, lags=best_lags)

# Refit on train + validation so the model sees all data preceding the test split.
forecaster.fit(y=train_and_validation["gap"])

## Predicciones con el modelo final en la data de test


In [91]:
# Forecast as many steps as there are rows in the test split, continuing from
# the end of the train + validation range.
predictions_test = forecaster.predict(len(test_df))

Calculamos el RMSE y el MAPE para la data de test


In [92]:
# Score the final model's forecast against the observed test values.
error_rmse = root_mean_squared_error(y_true=test_df["gap"], y_pred=predictions_test)
error_mape = mean_absolute_percentage_error(
    y_true=test_df["gap"], y_pred=predictions_test
)
print(f"Test error (RMSE): {error_rmse}")
# Label corrected: the value comes from mean_absolute_percentage_error, so it
# is a MAPE (printed as a fraction), not an MAE.
print(f"Test error (MAPE): {error_mape}")

Test error (RMSE): 16184.817656268688
Test error (MAE): 0.16631261904482017


## Visualización de los resultados con la data real y predicha para test


In [93]:
fig = go.Figure()

# One entry per line to draw, in drawing order:
# (x values, y values, legend name, line color).
series_specs = [
    # Data the final model was fitted on
    (
        train_and_validation.index,
        train_and_validation["gap"],
        "train and validation",
        "royalblue",
    ),
    # Observed test values
    (test_df.index, test_df["gap"], "test", "green"),
    # Final random-forest forecast over the test horizon
    (predictions_test.index, predictions_test, "forecast RF", "tomato"),
]
for xs, ys, label, color in series_specs:
    fig.add_trace(
        go.Scatter(x=xs, y=ys, mode="lines", name=label, line={"color": color})
    )

# Titles, theme and figure size.
layout_options = {
    "title": "Forecast with Historical Data",
    "xaxis_title": "Time",
    "yaxis_title": "Value",
    "legend_title": "Legend",
    "template": "plotly_white",
    "width": 800,
    "height": 400,
}
fig.update_layout(**layout_options)

# Render the figure
fig.show()

**Backtesting with refit (rolling origin)**


In [94]:
# Full history: train + validation followed by the test split.
full_df = pd.concat([train_and_validation, test_df])

# Everything up to the end of train + validation is the initial training
# window; the remaining observations (the test split) are backtested.
# NOTE(review): because `steps` equals the test length, the process log reports
# a single fold, so this run reproduces the direct test evaluation rather than
# a multi-fold rolling origin.
backtest_options = {
    "initial_train_size": len(train_and_validation["gap"]),
    "fixed_train_size": False,
    "steps": len(test_df),
    "metric": root_mean_squared_error,
    "refit": True,
    "verbose": True,
    "show_progress": True,
}
metric, predictions_backtest = backtesting_forecaster(
    forecaster=forecaster,
    y=full_df["gap"],
    **backtest_options,
)

print(f"Backtest metric (RMSE): {metric}")

Information of backtesting process
----------------------------------
Number of observations used for initial training: 5022
Number of observations used for backtesting: 30
    Number of folds: 1
    Number of steps per fold: 30
    Number of steps to exclude from the end of each train set before test (gap): 0

Fold: 0
    Training:   2001-11-01 00:00:00 -- 2015-08-01 00:00:00  (n=5022)
    Validation: 2015-08-02 00:00:00 -- 2015-08-31 00:00:00  (n=30)



  0%|          | 0/1 [00:00<?, ?it/s]

Backtest metric (RMSE): 16184.817656268688
