# **Libraries**

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt
import xgboost as xgb

# **Load Datasets**

In [None]:
# Load the preprocessed dataset (relative path, resolved against the
# current working directory) and preview the first five rows.
df_full = pd.read_csv(
    '..//data//processed_datasets//processed_datasets.csv'
)
df_full.head()

In [None]:
# Index the frame by date on an explicit daily calendar.
# asfreq('D') reindexes onto every calendar day between the first and last
# date, so days missing from the file show up as all-NaN rows.
df_full = (
    df_full
    .assign(date=pd.to_datetime(df_full['date']))
    .set_index("date")
    .asfreq('D')
    .sort_index()
)
df_full.head()

In [None]:
# The model is tree-based, so the sin/cos cyclical encodings are not used.
# Build the '<feature>_sin' / '<feature>_cos' column names to discard.
cyclical_encoding_columns = [
    f'{feature}_{suffix}'
    for feature in (
        'dayofweek',
        'quarter',
        'month',
        'dayofyear',
        'dayofmonth',
        'weekofyear',
    )
    for suffix in ('sin', 'cos')
]

# Remove the encoded columns and preview the remaining features
df_full = df_full.drop(columns=cyclical_encoding_columns)
df_full.head()

# **dataloader**

In [None]:
def create_xy_multiple_input_single_output(
    df,
    target_column,
    exog_columns,
    window_size,
    prediction_horizon,
    flatten=False,
    return_shape=False
    ):
    """
    Transform time series data into supervised input-output pairs.

    Each sample uses `window_size` consecutive rows (target + exogenous
    columns) as input and the next `prediction_horizon` target values as
    output. Consecutive samples are shifted by one row.

    Parameters:
      - df (pandas.DataFrame): The DataFrame containing the time series data.
      - target_column (str): The name of the target column for prediction.
      - exog_columns (list): A list of column names representing exogenous variables.
      - window_size (int): The size of the lookback window (past observations) used as input.
      - prediction_horizon (int): The number of steps ahead to predict.
      - flatten (bool, optional): Whether to flatten each input window into a single row (default is False).
      - return_shape (bool, optional): Whether to print the shapes of the input and output arrays (default is False).

    Returns:
      - tuple: (x, y) where
          x has shape (n_samples, window_size, 1 + len(exog_columns)), or
            (n_samples, window_size * (1 + len(exog_columns))) when flatten is True.
            Within a window the rows go from oldest to newest, and within a
            row the target comes first followed by exog_columns in order.
          y has shape (n_samples, prediction_horizon).
        When df has fewer than window_size + prediction_horizon rows,
        n_samples is 0 and empty arrays of the shapes above are returned.
    """
    # Pull the data out of pandas once; slicing NumPy arrays in the loop is
    # far cheaper than calling df.iloc for every window.
    target_values = df[target_column].values
    exog_values = df[exog_columns].values

    # Column 0 holds the target, the remaining columns follow exog_columns.
    features = np.hstack((np.atleast_2d(target_values).T, exog_values))

    n_samples = max(len(df) - window_size - prediction_horizon + 1, 0)

    if n_samples == 0:
        # Keep the array rank consistent so callers (and the flatten branch)
        # do not hit an IndexError on a rank-1 empty array.
        x = np.empty((0, window_size, features.shape[1]), dtype=features.dtype)
        y = np.empty((0, prediction_horizon), dtype=np.asarray(target_values).dtype)
    else:
        x = np.array([features[i:i + window_size] for i in range(n_samples)])
        y = np.array([
            target_values[i + window_size:i + window_size + prediction_horizon]
            for i in range(n_samples)
        ])

    if flatten:
        # Row-major reshape: features of the oldest lag come first.
        x = x.reshape((x.shape[0], x.shape[1] * x.shape[2]))

    if return_shape:
        print(f'x shape: {x.shape}')
        print(f'y shape: {y.shape}')

    return x, y

In [None]:
def create_single_fold(train_set, n_fold, ith_fold, test_size, lookback_window):
    """
    Slice one time-ordered cross-validation fold out of `train_set`.

    Folds are laid out back-to-back at the end of `train_set`: fold `n_fold`
    evaluates on the final `test_size` rows, fold `n_fold - 1` on the
    `test_size` rows before those, and so on.

    Parameters:
      - train_set (pd.DataFrame): The input dataframe from which subsets will be created
      - n_fold (int): Total number of folds
      - ith_fold (int): 1-based index of the fold to build
      - test_size (int): Number of rows in the evaluation block of each fold
      - lookback_window (int): Number of rows immediately before the evaluation block to include as lookback

    Returns:
        - tuple: (fold_train_set, fold_test_set, fold_test_lookback) where
          fold_train_set is every row before the evaluation block,
          fold_test_set is the evaluation block, and
          fold_test_lookback is the lookback rows followed by the evaluation block.
    """
    # Offsets are negative, i.e. counted from the end of train_set.
    folds_from_end = n_fold - ith_fold + 1
    test_start = -test_size * folds_from_end
    lookback_start = test_start - lookback_window

    # The last fold runs to the end of the frame; a stop of 0 would yield an
    # empty slice, so use an open-ended slice instead.
    test_stop = None if ith_fold == n_fold else test_start + test_size

    fold_train_set = train_set.iloc[:test_start]
    fold_test_set = train_set.iloc[test_start:test_stop]
    fold_test_lookback = train_set.iloc[lookback_start:test_stop]

    return fold_train_set, fold_test_set, fold_test_lookback

# **modelling**

In [None]:
def calculate_forecast_bias(pred_series, actual_series):
    """
    Forecast Bias (FB): the mean signed difference between forecast and actual.

    A positive value means the forecasts sum to more than the actuals
    (over-forecasting); a negative value means under-forecasting.

    Parameters
        - pred_series (pd.Series): The time series of predicted values.
        - actual_series (pd.Series): The time series of actual values

    Returns
        - float: (sum(pred_series) - sum(actual_series)) / len(actual_series)
    """
    n_observations = len(actual_series)
    total_gap = np.sum(pred_series) - np.sum(actual_series)
    return total_gap / n_observations

In [None]:

def run_xgb_full(data_train, data_test, target_column, exog_column, window_size, horizon, xgb_params=None):
    """
    Fit a direct multi-step XGBoost forecaster and evaluate its last forecast.

    One XGBRegressor is fitted per forecast step (via MultiOutputRegressor)
    on lagged windows built from `data_train`. The forecast made from the
    last window of `data_test` is compared with the last `horizon` rows of
    `data_test`.

    Parameters:
        - data_train (pandas.DataFrame): The DataFrame containing the training data.
        - data_test (pandas.DataFrame): Lookback rows followed by the rows to forecast.
          Callers in this notebook pass exactly window_size + horizon rows.
        - target_column (str): The name of the target column for prediction.
        - exog_column (list): A list of column names representing exogenous variables.
        - window_size (int): The size of the lookback window (past observations) used as input.
        - horizon (int): The number of steps ahead to predict.
        - xgb_params (dict, optional): Keyword arguments for xgb.XGBRegressor (default is None).
          random_state is always set to 42, so it must not appear in this dict.

    Returns:
        - tuple: A tuple containing the following elements:
            - A dictionary with keys 'mae', 'rmse', 'mape', 'smape', 'forecast_bias'.
            - A DataFrame with the actual target values and a 'Forecast' column.
            - The fitted MultiOutputRegressor.
    """
    # Create input-output pairs for training the model
    train_x, train_y = create_xy_multiple_input_single_output(
        df=data_train,
        target_column=target_column,
        exog_columns=exog_column,
        window_size=window_size,
        prediction_horizon=horizon,
        flatten=True,
        return_shape=True
    )

    # Create inputs for forecasting; the matching outputs are read from
    # data_test directly below, so the y array is not needed here.
    test_x, _ = create_xy_multiple_input_single_output(
        df=data_test,
        target_column=target_column,
        exog_columns=exog_column,
        window_size=window_size,
        prediction_horizon=horizon,
        flatten=True,
        return_shape=True
    )

    # Modelling
    estimator_params = {} if xgb_params is None else xgb_params
    model = MultiOutputRegressor(xgb.XGBRegressor(**estimator_params, random_state=42))

    model.fit(train_x, train_y)
    forecast = model.predict(test_x)

    # The last window is the one whose targets are the final `horizon` rows
    # of data_test. With exactly window_size + horizon rows there is a single
    # window, so this equals forecast[0].
    last_forecast = forecast[-1]

    # Forecasted and actual values; copy so that adding a column does not
    # write into a view of data_test.
    df_result = data_test.iloc[-horizon:][[target_column]].copy()
    df_result['Forecast'] = last_forecast

    # Error metrics on the forecast horizon
    target_series = df_result[target_column]
    errors = last_forecast - target_series
    abs_errors = np.abs(errors)

    metric_eval_dict = {}
    metric_eval_dict['mae'] = np.mean(abs_errors)
    metric_eval_dict['rmse'] = np.sqrt(np.mean(errors**2))
    metric_eval_dict['mape'] = np.mean(np.abs(errors / target_series)) * 100
    metric_eval_dict['smape'] = 2 * np.mean(abs_errors / (np.abs(last_forecast) + np.abs(target_series))) * 100
    metric_eval_dict['forecast_bias'] = calculate_forecast_bias(last_forecast, target_series)

    return metric_eval_dict, df_result, model

In [None]:

def plot_feature_importance(model, n_limit, target_column, exog_columns, window_size):
    """
    Plot the top normalized feature importances of a fitted multi-output XGBoost model.

    Importances are read from the first estimator only (model.estimators_[0])
    using XGBoost's 'weight' importance type, then normalized to sum to 1.

    Parameters:
        - model: The fitted MultiOutputRegressor wrapping XGBRegressor estimators.
        - n_limit (int): The number of features to plot with highest importance values.
        - target_column (str): The name of the target column for prediction.
        - exog_columns (list): A list of column names representing exogenous variables.
        - window_size (int): The size of the lookback window (past observations)

    Returns:
        - None

    Raises:
        - ValueError: If the first estimator reports no feature importance
          (for example, its trees contain no splits).
    """
    lst_col = [target_column, *exog_columns]

    # The flattened model input is time-major: all columns of the oldest lag,
    # then all columns of the next lag, and so on. The names must follow the
    # same order (lag outer, column inner) or importances are mislabelled.
    feature_name_list = [
        f'{col}_lag_{lag}'
        for lag in range(window_size, 0, -1)
        for col in lst_col
    ]

    # Set feature names for each booster
    for estimator in model.estimators_:
        if hasattr(estimator, 'get_booster'):
            estimator.get_booster().feature_names = feature_name_list

    # Get feature importance scores from the first estimator
    feature_important = model.estimators_[0].get_booster().get_score(importance_type='weight')
    if not feature_important:
        raise ValueError('The first estimator reports no feature importance to plot.')

    # Normalize feature importance scores
    total_importance = sum(feature_important.values())
    normalized_importance = {key: value / total_importance for key, value in feature_important.items()}

    # Keep the n_limit most important features, reversed so the most
    # important one ends up at the top of the horizontal bar chart.
    sorted_features = sorted(normalized_importance.items(), key=lambda x: x[1], reverse=True)[:n_limit][::-1]
    features, importance = zip(*sorted_features)

    # Plotting
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(features)), importance, align='center')
    plt.yticks(range(len(features)), features)
    plt.xlabel('Normalized Feature Importance')
    plt.ylabel('Features')
    plt.title('Top Feature Importance Plot')
    plt.show()

In [None]:
def run_xgb(data_train, data_test, target_column, exog_column, window_size, horizon, metric, xgb_params=None):
    """
    Fit a direct multi-step XGBoost forecaster and return one error metric.

    This is a lightweight variant of run_xgb_full used during tuning: it
    only returns the value of the requested metric, computed over every
    window that can be built from `data_test`.

    Parameters:
        - data_train (pandas.DataFrame): The DataFrame containing the training data.
        - data_test (pandas.DataFrame): The DataFrame containing the testing data.
        - target_column (str): The name of the target column for prediction.
        - exog_column (list): A list of column names representing exogenous variables.
        - window_size (int): The size of the lookback window (past observations) used as input.
        - horizon (int): The number of steps ahead to predict.
        - metric (str): Metric to return: 'mae', 'rmse', 'mape' or 'smape' (case-insensitive).
        - xgb_params (dict, optional): Keyword arguments for xgb.XGBRegressor (default is None).
          random_state is always set to 42, so it must not appear in this dict.

    Returns:
        - float: The value of the chosen metric.

    Raises:
        - ValueError: If `metric` is not one of the supported names.
    """
    # Validate before the expensive fit so a typo fails immediately.
    metric_name = metric.lower()
    if metric_name not in ('mae', 'rmse', 'mape', 'smape'):
        raise ValueError(
            f"Unsupported metric {metric!r}; expected one of 'mae', 'rmse', 'mape', 'smape'."
        )

    # Create data train
    train_x, train_y = create_xy_multiple_input_single_output(
        df=data_train,
        target_column=target_column,
        exog_columns=exog_column,
        window_size=window_size,
        prediction_horizon=horizon,
        flatten=True,
        return_shape=True
    )

    # Create data test
    test_x, test_y = create_xy_multiple_input_single_output(
        df=data_test,
        target_column=target_column,
        exog_columns=exog_column,
        window_size=window_size,
        prediction_horizon=horizon,
        flatten=True,
        return_shape=True
    )

    # Modelling
    estimator_params = {} if xgb_params is None else xgb_params
    model = MultiOutputRegressor(
        xgb.XGBRegressor(**estimator_params, random_state=42),
        n_jobs=-1
    )

    model.fit(train_x, train_y)
    forecast = model.predict(test_x)

    # Compute only the requested metric (the caller's choice is honoured;
    # it was previously overwritten with 'mae').
    errors = forecast - test_y
    if metric_name == 'mae':
        metric_val = np.mean(np.abs(errors))
    elif metric_name == 'rmse':
        metric_val = np.sqrt(np.mean(errors**2))
    elif metric_name == 'mape':
        metric_val = np.mean(np.abs(errors / test_y)) * 100
    else:  # 'smape'
        metric_val = 2 * np.mean(np.abs(errors) / (np.abs(forecast) + np.abs(test_y))) * 100

    return metric_val

# **tuning**

In [None]:
def generate_grid_search_combinations(param_distributions):
    """
    Generate every combination of values from the given parameter grid.

    Parameters:
        - param_distributions: A dictionary where keys are parameter names and values are
                            lists of possible values for each parameter.

    Returns:
        - A list of dictionaries, one per combination. Keys appear in the same
          order as in param_distributions; the first parameter varies fastest
          and the last one slowest. An empty grid, or a grid in which any
          parameter has no candidate values, yields an empty list.
    """
    if not param_distributions:
        return []

    # Start from a single empty combination and extend it one parameter at a
    # time. An empty value list correctly collapses the product to [] instead
    # of discarding the parameters processed so far.
    combinations = [{}]
    for key, values in param_distributions.items():
        combinations = [
            {**combination, key: value}
            for value in values
            for combination in combinations
        ]
    return combinations


In [None]:
import random
def cv_xgb(
    train_df,
    target_column,
    exog_columns,
    horizon,
    n_folds,
    param_grid,
    eval_metric='mae',
    method='random search',
    n_iter=5,
    return_result=True
):
    """
    Perform time-series cross-validation with XGBoost over a hyperparameter grid.
    Prints the best hyperparameter combination (lowest average metric).

    Parameters:
      - train_df (pandas.DataFrame): The DataFrame containing the training data.
      - target_column (str): The name of the target column for prediction.
      - exog_columns (list): A list of column names representing exogenous variables.
      - horizon (int): The number of steps ahead to predict.
      - n_folds (int): The number of folds for cross-validation.
      - param_grid (dict): A dictionary specifying the hyperparameter grid. It must contain
        a 'lookback_window' entry; all other entries are passed to XGBoost.
      - eval_metric (str, optional): The evaluation metric to use for model performance (default is 'mae').
      - method (str, optional): The method for parameter search, either 'random search' or 'grid search' (default is 'random search').
      - n_iter (int, optional): The number of combinations drawn for random search (default is 5).
        Capped at the total number of combinations in the grid.
      - return_result (bool, optional): Whether to return the results in a DataFrame (default is True).

    Returns:
      - pandas.DataFrame or None: A DataFrame with columns 'params' and 'eval_metric', sorted
        from best to worst, if return_result is True, otherwise None

    Raises:
      - ValueError: If `method` is neither 'random search' nor 'grid search'.
    """

    # Take in search space and create combinations of parameters
    full_params_combinations = generate_grid_search_combinations(param_grid)
    if method == 'random search':
        # A dedicated generator seeded with 42 draws the same sample as
        # random.seed(42) + random.sample, without touching the global RNG.
        rng = random.Random(42)
        n_draws = min(n_iter, len(full_params_combinations))
        params_combinations = rng.sample(full_params_combinations, n_draws)
    elif method == 'grid search':
        params_combinations = full_params_combinations
    else:
        raise ValueError(
            f"Unknown method {method!r}; expected 'random search' or 'grid search'."
        )

    # Initialize params list and score
    lst_params_combi = []
    lst_cv_performance_scores = []

    # Iterate over parameter combinations
    for param_combination in params_combinations:

        # 'lookback_window' configures the data windows, not XGBoost itself,
        # so split it off without mutating the sampled combination.
        window_size = param_combination['lookback_window']
        xgb_params = {
            key: value
            for key, value in param_combination.items()
            if key != 'lookback_window'
        }

        # Evaluation values of the n_folds folds for this combination
        performance_scores = []
        for ith_fold in range(1, n_folds + 1):

            ith_fold_train, ith_fold_test, ith_fold_lookback = create_single_fold(
                train_set=train_df,
                n_fold=n_folds,
                ith_fold=ith_fold,
                test_size=horizon,
                lookback_window=window_size
            )

            # The lookback frame (lookback rows + evaluation rows) is what the
            # model needs to build its single test window.
            model_performance = run_xgb(
                data_train=ith_fold_train,
                data_test=ith_fold_lookback,
                target_column=target_column,
                exog_column=exog_columns,
                window_size=window_size,
                horizon=horizon,
                metric=eval_metric,
                xgb_params=xgb_params
            )
            performance_scores.append(model_performance)

        # Save the combination (lookback_window listed last) and its
        # average score across folds
        lst_params_combi.append({**xgb_params, 'lookback_window': window_size})
        lst_cv_performance_scores.append(np.average(performance_scores))

    # Dataframe of params and performance, best (lowest error) first
    summary_df = pd.DataFrame(
        {
            'params': lst_params_combi,
            'eval_metric': lst_cv_performance_scores,
        }
    )
    summary_df = summary_df.sort_values(by=['eval_metric'], ascending=True)

    print(f"Best params: {summary_df.head(1)['params']} \nBest performance: {summary_df.head(1)['eval_metric']}")

    if return_result:
        return summary_df
    return None

# **utils**

In [None]:
def plot_forecast(df_result, actual_column, forecast_column, labels=('Actual Values', 'Forecast Values')):
    """
    Plot actual and forecasted values from a DataFrame on one line chart.

    Parameters:
        - df_result (pandas.DataFrame): The DataFrame containing actual and forecasted values.
        - actual_column (str): The name of the column containing actual values.
        - forecast_column (str): The name of the column containing forecasted values.
        - labels (sequence, optional): Two labels, for the actual and the forecasted series
          respectively (default is ('Actual Values', 'Forecast Values')). Any indexable
          sequence such as a list or tuple is accepted.

    Returns:
        - None
    """
    actual_label, forecast_label = labels[0], labels[1]

    plt.figure(figsize=(10, 6))

    # Actual series, then the forecast, against the frame's index
    plt.plot(df_result.index, df_result[actual_column], label=actual_label)
    plt.plot(df_result.index, df_result[forecast_column], label=forecast_label)

    # Adding labels and title
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.title(f'{actual_label} vs {forecast_label}')
    plt.legend()

    # Show plot
    plt.show()

# **Data Splitting**

In [None]:
# Column to forecast; every remaining column is an exogenous input.
TARGET = 'selling_price'
EXOGS = list(df_full.columns)
EXOGS.remove(TARGET)

In [None]:
# Forecasting constants, expressed in rows of df_full
# (one row per calendar day after asfreq('D')).
WINDOW_SIZE = 31  # lookback rows fed to the model for each forecast
HORIZON = 31  # steps forecast by each model call
HOLDOUT_SIZE = 310  # rows kept for the rolling holdout forecast (10 * HORIZON)

In [None]:
# Split data chronologically: train | holdout | test (oldest to newest)
data_train = df_full.iloc[:-(HORIZON + HOLDOUT_SIZE)]
data_holdout = df_full.iloc[-(HORIZON + HOLDOUT_SIZE):-HORIZON]
data_test = df_full.iloc[-HORIZON:]
# Test rows preceded by the WINDOW_SIZE rows needed as model input
data_lookback_test = df_full.iloc[-(WINDOW_SIZE + HORIZON):]
# Last WINDOW_SIZE rows of train followed by the first HORIZON rows of holdout
data_validation = df_full.iloc[-(HORIZON + HOLDOUT_SIZE + WINDOW_SIZE):-HOLDOUT_SIZE]

In [None]:
# Plot the target column of each split on a shared time axis
plt.figure(figsize=(10, 6))

for _label, _frame in (
    ('Train Data', data_train),
    ('Holdout Data', data_holdout),
    ('Test Data', data_test),
    ('Lookback Test Data', data_lookback_test),
    ('Validation Data', data_validation),
):
    plt.plot(_frame.index, _frame[TARGET], label=_label)

plt.xlabel('Date')
plt.ylabel('Target Column')
plt.title('Target Column Over Time')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Names of the flattened model inputs: for each lag from oldest
# (WINDOW_SIZE) to newest (1), the target followed by the exogenous columns.
lst_col = [TARGET] + EXOGS

feature_name_list = [
    f'{col}_lag_{lag}'
    for lag in range(WINDOW_SIZE, 0, -1)
    for col in lst_col
]

# **Simple Model**

In [None]:
# XGBoost model with default hyperparameter values, evaluated on the
# validation window. Uses the shared constants instead of repeating 31 so
# the call stays consistent with how data_validation was sliced.
metric_res, result, model = run_xgb_full(
    data_train=data_train,
    data_test=data_validation,
    target_column=TARGET,
    exog_column=EXOGS,
    window_size=WINDOW_SIZE,
    horizon=HORIZON,
    xgb_params=None
)

In [None]:
# Display results of error metrics 
metric_res

In [None]:
# Plot the forecast values. The actual column in `result` is named after
# TARGET, so reference the constant instead of repeating the literal.
plot_forecast(
    df_result=result,
    actual_column=TARGET,
    forecast_column='Forecast',
    labels=['Actual Values', 'Forecast Values']
)

In [None]:
# Plot feature importance. window_size must match the window the model was
# trained with, so pass the shared constant instead of a literal 31.
plot_feature_importance(
    model=model,
    n_limit=15,
    target_column=TARGET,
    exog_columns=EXOGS,
    window_size=WINDOW_SIZE
)

# **Tuning**

In [None]:
# Define search space for hyperparameters.
# 'lookback_window' is consumed by cv_xgb itself; the remaining keys are
# forwarded to xgb.XGBRegressor.
param_distributions = {
  'lookback_window': [31],
  'eta': [0.2, 0.3, 0.4],
  'n_estimators': [300, 400, 500, 600],
  'max_depth': [5, 6, 7],
  'colsample_bytree': [0.6, 0.8, 1],
  # NOTE(review): 0.1 breaks the ascending 0.6 / 0.8 / 1 pattern used for
  # colsample_bytree and may be a typo for 1 - confirm before re-running.
  'subsample': [0.6, 0.8, 0.1]
}
# Perform hyperparameter tuning using random search
result_df = cv_xgb(
    train_df=data_train,
    target_column=TARGET,
    exog_columns=EXOGS,
    horizon=HORIZON,
    n_folds=5,
    param_grid=param_distributions,
    eval_metric='mae',
    method='random search',
    n_iter=5,
    return_result=True
)

# Save results of the hyperparameter tuning stage to a CSV file
# (plain string: the f-prefix had no placeholders)
result_df.to_csv("tuning_xgb_results.csv")

# **Tuned Model**

In [None]:
# Tuned params
# NOTE(review): presumably copied by hand from the tuning results above;
# re-check them whenever the search space or data changes.
tuned_window = 31  # lookback window used with the tuned model
tuned_params = {
    'eta': 0.2, 
    'n_estimators': 600, 
    'max_depth': 6,
    'colsample_bytree': 0.8, 
    'subsample': 0.6
}

In [None]:
# XGBoost model with the tuned hyperparameters, fitted on data_train and
# evaluated on the same validation window as the default model above.
tuned_metric_res, tuned_result, tuned_model= run_xgb_full(
    data_train=data_train, 
    data_test=data_validation, 
    target_column=TARGET, 
    exog_column=EXOGS, 
    window_size=tuned_window, 
    horizon=HORIZON, 
    xgb_params=tuned_params
)

In [None]:
# Display values of error metrics
tuned_metric_res

In [None]:
# Plot forecasted values. The actual column in `tuned_result` is named after
# TARGET, so reference the constant instead of repeating the literal.
plot_forecast(
    df_result=tuned_result,
    actual_column=TARGET,
    forecast_column='Forecast',
    labels=['Actual Values', 'Forecast Values']
)

In [None]:
# Plot feature importance. The tuned model was trained with tuned_window,
# so the feature names must be generated with the same window size.
plot_feature_importance(
    model=tuned_model,
    n_limit=5,
    target_column=TARGET,
    exog_columns=EXOGS,
    window_size=tuned_window
)

# **Holdout Forecast**

In [None]:
# Train + holdout rows (everything except the final test block); the rolling
# holdout forecast slices its training and lookback data from this frame.
data_holdout_rolling = df_full.iloc[:-HORIZON]

In [None]:
# test
len(data_holdout_rolling) == len(data_train) + len(data_holdout)
# True

In [None]:
# test
print(data_holdout.head(1).index)
print(data_holdout.tail(1).index)
print(data_holdout.shape)

In [None]:
# Rolling-origin forecast over the holdout period: the holdout is covered by
# repeat_time consecutive blocks of HORIZON rows, oldest block first.
repeat_time = 10

index = []
forecast_values = []

for i in range(repeat_time, 0, -1):

    # Block i starts HORIZON * i rows before the end of the rolling frame
    block_start = -HORIZON * i
    # The last block (i == 1) runs to the end of the frame; a stop of 0
    # would produce an empty slice, hence None.
    block_stop = block_start + HORIZON if i > 1 else None

    # Train on everything before the block
    temp_train_data = data_holdout_rolling[:block_start]
    # Lookback rows followed by the block being forecast
    temp_forecast_lookback_data = data_holdout_rolling[block_start - WINDOW_SIZE:block_stop]

    # Refit the model and make forecast
    holdout_metric_res, holdout_result, holdout_model = run_xgb_full(
        data_train=temp_train_data,
        data_test=temp_forecast_lookback_data,
        target_column=TARGET,
        exog_column=EXOGS,
        window_size=WINDOW_SIZE,
        horizon=HORIZON,
        xgb_params=tuned_params
    )

    # Accumulate the dates and forecasts of this block
    index.extend(holdout_result.index.tolist())
    forecast_values.extend(holdout_result['Forecast'].values.tolist())


In [None]:
# Collect the rolling holdout forecasts into a frame indexed by date
holdout_result_dict = dict(
    index=index,
    forecast_values_xgb=forecast_values,
)

df_holdout_result = pd.DataFrame(holdout_result_dict).set_index('index')


In [None]:
# Plot forecasts on the holdout set against actual values.
# The actual series is read through TARGET so the plot keeps working if the
# target column is changed.
plt.figure(figsize=(10, 6))

plt.plot(df_holdout_result.index, df_holdout_result['forecast_values_xgb'], label='Holdout Forecast')
plt.plot(data_holdout.index, data_holdout[TARGET], label='Actual Holdout')

plt.title('Holdout forecast vs Actual')
plt.xlabel('Date')
plt.ylabel('Selling Price')
plt.legend()
plt.grid(True)

plt.show()

# **Test Period Forecast**

In [None]:
# Training data for the final refit: every row except the test block
data_train_refit = df_full.iloc[:-HORIZON]

In [None]:
# Plot the training period and testing period.
# The series are read through TARGET so the plot keeps working if the target
# column is changed.
plt.figure(figsize=(10, 6))

plt.plot(data_train_refit.index, data_train_refit[TARGET], label='Training Refit')
plt.plot(data_test.index, data_test[TARGET], label='Test Period')
# plt.plot(data_lookback_test.index, data_lookback_test['selling_price'], label='Lookback Test Period')

plt.title('Training and Testing Period')
plt.xlabel('Date')
plt.ylabel('Selling Price')
plt.legend()
plt.grid(True)

plt.show()

In [None]:
# Refit the model using training and holdout data and make forecast 
# data_lookback_test holds WINDOW_SIZE lookback rows followed by the
# HORIZON test rows, i.e. exactly one forecast window.
test_metric_res, test_result, test_model= run_xgb_full(
       data_train=data_train_refit, 
       data_test=data_lookback_test, 
       target_column=TARGET, 
       exog_column=EXOGS, 
       window_size=WINDOW_SIZE, 
       horizon=HORIZON, 
       xgb_params=tuned_params
    )

In [None]:
# Display values of error metrics
test_metric_res

In [None]:
# Plot actual vs forecast prices over the test period.
# The legend labels were swapped: the TARGET column holds the actual prices
# and the 'Forecast' column holds the model output.
plt.figure(figsize=(10, 6))

plt.plot(test_result.index, test_result[TARGET], label='Actual Test Prices')
plt.plot(test_result.index, test_result['Forecast'], label='Forecast Test Prices')
# plt.plot(data_lookback_test.index, data_lookback_test['selling_price'], label='Lookback Test Period')

plt.title('Testing Actual vs Forecast')
plt.xlabel('Date')
plt.ylabel('Selling Price')
plt.legend()
plt.grid(True)

plt.show()

In [None]:
# Feature importance. test_model was trained with WINDOW_SIZE, so the
# feature names must be generated with the same window size.
plot_feature_importance(
    model=test_model,
    n_limit=5,
    target_column=TARGET,
    exog_columns=EXOGS,
    window_size=WINDOW_SIZE
)

In [None]:
# Export the test forecast to a two-column frame (date, forecast).
# NOTE(review): unlike df_holdout_result, 'index' stays a regular column here
# (no set_index), so the saved CSV gets an extra row-number column - confirm
# whether downstream readers rely on that difference.
test_predictions_to_dict = test_result.reset_index()
dict_test_forecast = dict(
    index=test_predictions_to_dict['date'].tolist(),
    forecast_values_xgb=test_result['Forecast'].tolist(),
)

df_test_forecast = pd.DataFrame(dict_test_forecast)

# **Save Forecast**

In [None]:
# Save holdout forecast values (plain string: the f-prefix had no placeholders)
df_holdout_result.to_csv('..//results//xgboost_holdout_forecast.csv')

In [None]:
# Save test forecast values (plain string: the f-prefix had no placeholders)
df_test_forecast.to_csv('..//results//xgboost_test_forecast.csv')