# |Modeling| Walk Forward Validation with Tree-based models
## Forecasting Cross Validation pipeline with Tree-based Scikit-learn models

**Objective**: The primary objective of this notebook is to experiment with tree-based models for multi-step-ahead forecasting.

**Conclusions**:


## 0 Imports and Config

In [4]:
import sys
import os

# sys.path.insert(0,'../..')
from concurrent.futures import ThreadPoolExecutor, as_completed

import warnings
from typing import Any
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import logging
from joblib import load, dump
from scipy.stats import uniform, randint, norm
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor

# from src.models.evaluate_model import walk_forward_validation, model_crossval_pipeline
# from src.config import *

In [5]:
# Parse the project configuration; the file handle is only needed while
# parsing, so every constant below is derived after it has been closed.
with open("../../src/configuration/project_config.yaml", 'r') as f:
    config = yaml.safe_load(f)

# Top-level sections of the configuration.
model_config = config['model_config']
data_config = config['data_config']
features_list = config['features_list']

# Directories used to read and write datasets and models.
PROCESSED_DATA_PATH = data_config['paths']['processed_data_path']
OUTPUT_DATA_PATH = data_config['paths']['output_data_path']
MODELS_PATH = data_config['paths']['models_path']

# File/table names stored inside those directories.
PROCESSED_DATA_NAME = data_config['table_names']['processed_table_name']
OUTPUT_DATA_NAME = data_config['table_names']['output_table_name']
DAILY_PERFORMANCE_DATA_NAME = data_config['table_names']['model_performance_table_name']
CROSS_VAL_DATA_NAME = data_config['table_names']['cross_validation_table_name']

# Modelling settings.
TARGET_COL = model_config['target_col']
CATEGORY_COL = model_config['category_col']
PREDICTED_COL = model_config['predicted_col']
FORECAST_HORIZON = model_config['forecast_horizon']
available_models = model_config['available_models']

# `import logging` does not load the `logging.config` submodule, so it is
# imported explicitly here instead of relying on another package having
# imported it as a side effect.
import logging.config

# Read the logging definition; the handle is closed before it is applied.
with open("../../src/configuration/logging_config.yaml", 'r') as f:
    logging_config = yaml.safe_load(f.read())

logging.config.dictConfig(logging_config)
# Only let matplotlib report errors so its messages do not flood the output.
logging.getLogger('matplotlib').setLevel(logging.ERROR)
logger = logging.getLogger(__name__)

In [6]:

def update_test_values(X: pd.DataFrame, y: pd.Series, day: int) -> tuple[pd.DataFrame, pd.Series]:
    """
    Extracts the single-row test set for one day, counted from the end of the data.

    `day` is relative to the end of the inputs: day 1 is the last row, day 2 the
    second-to-last row, and so on. The inputs themselves are never modified.

    Args:
        X (pd.DataFrame): The feature DataFrame containing all historical data.
        y (pd.Series): The target Series containing all historical target values.
        day (int): The day to extract for testing, relative to the end of the inputs.
                   1 is the last day, 2 is the second-to-last, etc.

    Returns:
        tuple[pd.DataFrame, pd.Series]: A tuple containing:
            - X_test (pd.DataFrame): One-row DataFrame with the features for the
              specified day, re-indexed to start at 0.
            - y_test (pd.Series): One-element Series with the target value for the
              specified day, keeping its original index label.

    Raises:
        IndexError: If `day` is smaller than 1 or larger than the number of rows
            available in `X` or `y`.
    """
    available_rows = min(len(X), len(y))
    if not 1 <= day <= available_rows:
        # Slicing never raises on its own: day=0 would silently return the first
        # row and an oversized day an empty frame, so fail loudly instead.
        raise IndexError(
            f"day={day} is out of bounds; expected a value between 1 and {available_rows}"
        )

    # Non-negative positions make day 1 a regular case (a `-day:-day+1` slice
    # degenerates to `-1:0`, which is empty).
    x_pos = len(X) - day
    y_pos = len(y) - day

    # `reset_index` returns a new frame, so the slice of `X` is not mutated in place.
    X_test = X.iloc[x_pos:x_pos + 1, :].reset_index(drop=True)
    y_test = y.iloc[y_pos:y_pos + 1]

    return X_test, y_test


def calculate_metrics(pred_df: pd.DataFrame, actuals, predictions) -> pd.DataFrame:
    """
    Computes the evaluation metrics and stores them as constant columns of `pred_df`.

    MAPE, RMSE, MAE and WAPE are all derived from `actuals` and `predictions`,
    so the result does not depend on how the prediction column of `pred_df`
    is named.

    Args:
        pred_df (pd.DataFrame): Prediction report; it is modified in place.
        actuals: Sequence with the observed target values.
        predictions: Sequence with the predicted values, aligned with `actuals`.

    Returns:
        pd.DataFrame: The same `pred_df` object with the columns "MAPE" (rounded
        to 4 decimals), "MAE", "WAPE" and "RMSE" (rounded to 2 decimals) added.
    """
    # Imported here because the notebook's import cell does not bring the
    # scikit-learn metric functions into scope.
    from sklearn.metrics import (
        mean_absolute_error,
        mean_absolute_percentage_error,
        mean_squared_error,
    )

    logger.debug("Calculating the evaluation metrics...")

    model_mape = round(mean_absolute_percentage_error(actuals, predictions), 4)
    model_rmse = round(np.sqrt(mean_squared_error(actuals, predictions)), 2)
    model_mae = round(mean_absolute_error(actuals, predictions), 2)

    # WAPE = sum(|actual - predicted|) / sum(actual). It is computed from the
    # arguments rather than from a hard-coded `FORECAST` column, which only
    # exists when the caller's prediction column happens to have that name.
    actual_values = np.asarray(actuals, dtype=float)
    predicted_values = np.asarray(predictions, dtype=float)
    model_wape = round(np.abs(actual_values - predicted_values).sum() / actual_values.sum(), 2)

    pred_df["MAPE"] = model_mape
    pred_df["MAE"] = model_mae
    pred_df["WAPE"] = model_wape
    pred_df["RMSE"] = model_rmse

    return pred_df


def stepwise_validation(X: pd.DataFrame, y: pd.Series, forecast_horizon: int, model_type: Any, ticker: str, tune_params: bool = False) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Iteratively performs 1-step-ahead forecast validation for a given model type and ticker symbol.

    The model is trained once on every row except the last `forecast_horizon`
    rows, where the horizon is first passed through `weekend_adj_forecast_horizon`
    (presumably to account for weekends - confirm against that helper). The
    held-out rows are then predicted one at a time in chronological order. From
    the second step onwards the test row is refreshed through
    `update_lag_features` and `update_ma_features`, which receive the training
    targets extended with the predictions made so far.

    Args:
        X (pd.DataFrame): Feature matrix; must contain a "DATE" column.
        y (pd.Series): Target variable, aligned with `X`.
        forecast_horizon (int): The number of days to forecast ahead.
        model_type (Any): The type of model to use (e.g., 'xgb', 'rf', 'et').
        ticker (str): The stock ticker symbol.
        tune_params (bool, optional): Whether to perform hyperparameter tuning. Defaults to False.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing:
            - pred_df: One row per forecasted day with the columns
                - DATE: The dates of the predictions.
                - ACTUAL: The actual target values.
                - PREDICTED_COL: The predicted values.
                - The metric columns added by `calculate_metrics`.
                - MODEL_TYPE: Class name of the fitted model.
                - CLASS: "Testing" (indicates the type of data).
            - X_testing_df: The feature rows that were fed to the model, with
              the predictions appended under PREDICTED_COL.

    Raises:
        ValueError: If the adjusted forecast horizon is smaller than 1 or does
            not leave any row for training.
    """
    forecast_horizon = weekend_adj_forecast_horizon(forecast_horizon, 2)

    if not 0 < forecast_horizon < len(X):
        # A horizon of 0 turns `iloc[:-0]` into an empty training set, and a
        # horizon >= len(X) leaves nothing to train on either.
        raise ValueError(
            f"Adjusted forecast horizon ({forecast_horizon}) must be between 1 and {len(X) - 1}"
        )

    # get the one-shot training set
    X_train = X.iloc[:-forecast_horizon, :]
    y_train = y.iloc[:-forecast_horizon]
    final_y = y_train.copy()

    logger.debug(f"Last training date: {X_train['DATE'].max().date()}")

    best_model = train_model(
        X_train.drop(columns=["DATE"]),
        y_train,
        model_type,
        ticker,
        tune_params,
        save_model=False
    )

    # Per-step results, collected in chronological order
    predictions = []
    actuals = []
    dates = []
    test_rows = []

    for day in range(forecast_horizon, 0, -1):
        X_test, y_test = update_test_values(X, y, day)
        logger.debug(f"Testing Date: {X_test['DATE'].min().date()}")

        if predictions:
            # From the second step onwards, refresh the test row using the
            # history that already includes the previous predictions.
            X_test = update_lag_features(X_test, -1, list(final_y.values), X_test.columns)
            X_test = update_ma_features(X_test, -1, list(final_y.values), X_test.columns)

        prediction = best_model.predict(X_test.drop("DATE", axis=1))

        # store the results
        predictions.append(prediction[0])
        actuals.append(y_test.values[0])
        dates.append(X_test["DATE"].max())

        # Extend the target history with the new prediction for the next step
        final_y = pd.concat([final_y, pd.Series(prediction[0])], axis=0, ignore_index=True)
        test_rows.append(X_test)

    pred_df = pd.DataFrame(list(zip(dates, actuals, predictions)), columns=["DATE", "ACTUAL", PREDICTED_COL])
    pred_df = calculate_metrics(pred_df, actuals, predictions)
    # The class name is read directly instead of being parsed out of str(type(...))
    pred_df["MODEL_TYPE"] = type(best_model).__name__
    pred_df["CLASS"] = "Testing"

    # Concatenate once instead of growing a DataFrame inside the loop
    X_testing_df = pd.concat(test_rows, axis=0, ignore_index=True)
    X_testing_df[PREDICTED_COL] = predictions

    # NOTE: plotting of the validation results and learning curves is currently disabled:
    # validation_metrics_fig = visualize_validation_results(pred_df, model_mape, model_mae, model_wape, ticker)
    # learning_curves_fig, feat_imp = extract_learning_curves(best_model, display=True)

    return pred_df, X_testing_df


In [13]:
def model_crossval_pipeline(tune_params, model_type, ticker):
    """
    Runs the stepwise validation for every requested ticker/model pair and saves the report.

    The featurized dataset is loaded from disk, optionally restricted to one
    ticker, and `stepwise_validation` is called for each combination of ticker
    and model. The resulting prediction reports are stacked and written as a
    CSV file named CROSS_VAL_DATA_NAME.

    Args:
        tune_params (bool): Forwarded to `stepwise_validation` to enable
            hyperparameter tuning.
        model_type (str | None): Name of a single model to validate, matched
            case-insensitively against the configured available models.
            None validates every available model.
        ticker (str | None): Ticker symbol without the '.SA' suffix. None (or an
            empty string) validates every ticker found in the dataset.

    Raises:
        ValueError: If `model_type` is given but is not one of the available models.
    """
    logger.debug("Loading the featurized dataset..")
    feature_df = pd.read_csv(os.path.join('../../'+PROCESSED_DATA_PATH, PROCESSED_DATA_NAME), parse_dates=["DATE"])

    # Check the ticker parameter
    if ticker:
        feature_df = feature_df[feature_df[CATEGORY_COL] == ticker.upper() + '.SA']

    # Check the model_type parameter
    models_to_run = model_config['available_models']
    if model_type is not None:
        # Normalise before validating: the model name is upper-cased before
        # use, so a lower-case name such as 'xgb' must not be rejected.
        requested_model = str(model_type).upper()
        if requested_model not in {str(name).upper() for name in models_to_run}:
            raise ValueError(f"Invalid model_type: {model_type}. Choose from: {models_to_run}")
        models_to_run = [requested_model]

    training_date = dt.datetime.today().date()
    reports = []

    # Distinct loop names keep the `ticker` / `model_type` arguments intact
    for ticker_symbol in feature_df[CATEGORY_COL].unique():
        ticker_df = feature_df[feature_df[CATEGORY_COL] == ticker_symbol].drop(columns=[CATEGORY_COL])

        for model_name in models_to_run:
            logger.info(f"Performing model cross validation for ticker symbol [{ticker_symbol}] using model [{model_name}]...")

            predictions_df, _ = stepwise_validation(
                X=ticker_df.drop(columns=[TARGET_COL]),
                y=ticker_df[TARGET_COL],
                forecast_horizon=FORECAST_HORIZON,
                model_type=model_name,
                ticker=ticker_symbol,
                tune_params=tune_params
            )

            predictions_df[CATEGORY_COL] = ticker_symbol
            predictions_df["TRAINING_DATE"] = training_date
            reports.append(predictions_df)

    # Concatenate once; an empty report is still written when nothing matched
    validation_report_df = pd.concat(reports, axis=0) if reports else pd.DataFrame()

    logger.info("Writing the testing results dataframe...")
    # The configured paths are relative to the project root, so the output path
    # gets the same '../../' prefix that is used for reading the input above.
    validation_report_df.to_csv(os.path.join('../../'+OUTPUT_DATA_PATH, CROSS_VAL_DATA_NAME), index=False)

In [14]:
# Validate every available model on every ticker, without hyperparameter tuning.
model_crossval_pipeline(tune_params=False, model_type=None, ticker=None)

2024-06-19 13:26:53,596 - __main__ - INFO - Performing model cross validation for ticker symbol [BOVA11.SA] using model [XGB]...


NameError: name 'weekend_adj_forecast_horizon' is not defined