In [None]:
# Cell 1 : Standard Library Imports

import os
import time
import logging
import joblib
import warnings
import random
from datetime import datetime, timedelta

# Third-Party Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import xgboost as xgb
import yfinance as yf
import keras_tuner as kt
import matplotlib as mpl
import matplotlib.dates as mdates
import ta
from pandas.tseries.offsets import BDay
import tensorflow as tf
from sklearn.linear_model import RidgeCV
import pandas_market_calendars as mcal
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, GRU, Bidirectional, Input, Dropout, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.metrics import (mean_squared_error, mean_absolute_error, 
                             mean_absolute_percentage_error, r2_score)
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import skew, kurtosis, shapiro
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout, MultiHeadAttention, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from xgboost import XGBRegressor

In [None]:
# Cell 2: Fetch the Stock Data (Time-series Only)

# Configure root logging once for the notebook: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Maps ticker -> DataFrame of daily data; filled by the fetch loop below and
# read (and overwritten) by later cells.
daily_data_dict = {}

# Tickers to download. Later cells iterate over this same list.
stocks = ['AAPL', 'MSFT', 'GOOGL', 'AMZN']

# Download window: from roughly ten years ago up to "now".
# NOTE: datetime.now() makes the window (and therefore every downstream result)
# depend on when the notebook is run; 10*365 days ignores leap days.
end_date = datetime.now()
start_date_daily = end_date - timedelta(days=10*365)   # 10 years of daily data

# Make sure the CSV output directory exists (relative to the notebook's working directory).
os.makedirs('../data/stock_data', exist_ok=True)

# Function to fetch stock data
def fetch_stock_data(ticker, start, end, interval):
    """Download price history for a single ticker with yfinance.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download``.
    start, end :
        Window boundaries, forwarded unchanged to ``yf.download``.
    interval : str
        Bar interval, forwarded unchanged (the notebook uses ``'1d'``).

    Returns
    -------
    pandas.DataFrame
        The downloaded frame with single-level column names and without an
        ``'Adj Close'`` column. An empty DataFrame is returned when the
        download raises; an empty download is logged as a warning and
        returned as-is (still empty).
    """
    try:
        data = yf.download(ticker, start=start, end=end, interval=interval)
        if data.empty:
            logging.warning(f"No data retrieved for {ticker} from {start} to {end} with interval {interval}")

        # Some yfinance releases return MultiIndex columns such as
        # ('Close', 'AAPL') even for a single ticker. Downstream cells index
        # with plain names (df['Close']), so keep only the first level.
        # NOTE(review): assumes level 0 holds the field names (yfinance's
        # default column grouping) — confirm against the installed version.
        if isinstance(data.columns, pd.MultiIndex):
            data = data.copy()
            data.columns = data.columns.get_level_values(0)

        return data.drop(columns=['Adj Close'], errors='ignore')
    except Exception as e:
        # Best-effort fetch: report the failure and let the caller skip this ticker.
        logging.error(f"Error fetching data for {ticker}: {e}")
        return pd.DataFrame()

# Download each ticker's daily history, keep it in memory and write it to CSV.
for stock in tqdm(stocks, desc="Fetching stocks data"):

    # Daily bars over the start_date_daily..end_date window.
    daily_data = fetch_stock_data(stock, start_date_daily, end_date, '1d')
    # Tickers whose download came back empty are NOT added to daily_data_dict,
    # so the dict may end up with fewer keys than `stocks`.
    if not daily_data.empty:
        daily_data_dict[stock] = daily_data
        daily_data.to_csv(f'../data/stock_data/{stock}_daily.csv', index=True)

    # Pause between requests (runs after every ticker, including failed ones)
    # to stay under API rate limits.
    time.sleep(2)

print("Time-series data fetching and saving complete.")

In [None]:
# Cell 3: Feature Engineering

def add_close_price_features(df):
    """Return a copy of ``df`` enriched with indicators derived from 'Close'.

    The input frame is left untouched: all cleaning and feature columns are
    applied to a copy, which makes the call safe to repeat on the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Date' column (used for ordering) and a 'Close' column.

    Returns
    -------
    pandas.DataFrame
        Rows with a non-numeric/missing 'Close' removed, sorted by 'Date',
        index reset to 0..n-1, with the indicator columns appended in a fixed
        order. Remaining NaNs are forward-filled and then back-filled.

    Notes
    -----
    The final back-fill copies the first available value into earlier rows
    (e.g. the first 'Lag_Close_1'), so the earliest rows carry values taken
    from slightly later dates.
    """
    # Work on a copy so the caller's DataFrame is never modified.
    df = df.copy()

    # Coerce 'Close' to numeric, drop unusable rows, and order chronologically.
    df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
    df = (
        df.dropna(subset=['Close'])
          .sort_values('Date')
          .reset_index(drop=True)
    )
    close = df['Close']

    # Simple and exponential moving averages (min_periods=1 -> defined from row 0).
    for window in (5, 10, 20):
        df[f'SMA_{window}'] = close.rolling(window=window, min_periods=1).mean()
    for span in (5, 10, 20):
        df[f'EMA_{span}'] = close.ewm(span=span, adjust=False, min_periods=1).mean()

    # Momentum: absolute and relative change versus 5 / 10 rows earlier.
    df['Momentum_5'] = close - close.shift(5)
    df['Momentum_10'] = close - close.shift(10)
    df['ROC_5'] = close.pct_change(periods=5)
    df['ROC_10'] = close.pct_change(periods=10)

    # Rolling standard deviation of the close.
    df['Volatility_5'] = close.rolling(window=5, min_periods=1).std()
    df['Volatility_10'] = close.rolling(window=10, min_periods=1).std()

    # RSI over 14 rows using simple rolling means of gains and losses.
    delta = close.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    average_gain = gain.rolling(window=14, min_periods=1).mean()
    average_loss = loss.rolling(window=14, min_periods=1).mean()
    rs = average_gain / (average_loss + 1e-10)  # epsilon avoids division by zero
    df['RSI_14'] = 100 - (100 / (1 + rs))

    # MACD: EMA(12) - EMA(26), its EMA(9) signal line, and their difference.
    ema_fast = close.ewm(span=12, adjust=False, min_periods=1).mean()
    ema_slow = close.ewm(span=26, adjust=False, min_periods=1).mean()
    df['MACD'] = ema_fast - ema_slow
    df['MACD_signal'] = df['MACD'].ewm(span=9, adjust=False, min_periods=1).mean()
    df['MACD_diff'] = df['MACD'] - df['MACD_signal']

    # Bollinger bands: 20-row mean +/- 2 rolling standard deviations.
    df['Middle_Band'] = close.rolling(window=20, min_periods=1).mean()
    df['Std_Dev'] = close.rolling(window=20, min_periods=1).std()
    df['Upper_Band'] = df['Middle_Band'] + (df['Std_Dev'] * 2)
    df['Lower_Band'] = df['Middle_Band'] - (df['Std_Dev'] * 2)
    df['Bollinger_Width'] = df['Upper_Band'] - df['Lower_Band']

    # %B: position of the close inside the band (epsilon guards a zero-width band).
    df['Percent_B'] = (close - df['Lower_Band']) / (df['Upper_Band'] - df['Lower_Band'] + 1e-10)

    # Williams %R computed from closes only (no High/Low), over 14 rows.
    df['Highest_Close_14'] = close.rolling(window=14, min_periods=1).max()
    df['Lowest_Close_14'] = close.rolling(window=14, min_periods=1).min()
    df['Williams_%R'] = (
        (df['Highest_Close_14'] - close)
        / (df['Highest_Close_14'] - df['Lowest_Close_14'] + 1e-10)
    ) * -100

    # Spreads between the short EMA and the longer EMAs.
    df['EMA_5_10_Diff'] = df['EMA_5'] - df['EMA_10']
    df['EMA_5_20_Diff'] = df['EMA_5'] - df['EMA_20']

    # Previous closes.
    for lag in (1, 2, 3):
        df[f'Lag_Close_{lag}'] = close.shift(lag)

    # Shape of the recent close distribution.
    df['Rolling_Skew_Close_5'] = close.rolling(window=5, min_periods=1).skew()
    df['Rolling_Kurt_Close_5'] = close.rolling(window=5, min_periods=1).kurt()

    # Fill the warm-up NaNs: forward first, then backward for leading gaps.
    df = df.ffill().bfill()

    return df.reset_index(drop=True)

# Build the feature-enriched frame for every ticker that was actually downloaded.
for stock in stocks:
    # The fetch cell only stores non-empty downloads, so a ticker may be absent;
    # skip it instead of failing with a KeyError.
    if stock not in daily_data_dict:
        logging.warning(f"No daily data available for {stock}. Skipping feature engineering.")
        continue

    df_daily = daily_data_dict[stock].copy()

    # Freshly downloaded frames carry the dates in the index; move them to a column.
    if 'Date' not in df_daily.columns:
        df_daily = df_daily.reset_index()

    # Ensure 'Date' is of datetime type and the rows are in chronological order.
    df_daily['Date'] = pd.to_datetime(df_daily['Date'])
    df_daily = df_daily.sort_values('Date').reset_index(drop=True)

    # Add the 'Close' price-based features.
    df_daily_with_features = add_close_price_features(df_daily)

    # Log information
    logging.info(f"'Close' price-based technical indicators added for daily data of {stock}")
    logging.info(f"Sample features for {stock}:\n{df_daily_with_features.tail(5)}")

    # Replace the raw frame with the enriched one for the following cells.
    daily_data_dict[stock] = df_daily_with_features

print("Feature engineering complete. Data is ready for splitting into training and testing sets.")

In [None]:
# Cell 4: Splitting Data into Training and Testing Sets (Modified to Include 'test_dates')

# Function to split time series data
def split_time_series_data(df, date_column='Date', target_column='Close', split_ratio=0.8):
    """Chronologically split a frame into train/test features and targets.

    Rows are ordered by ``date_column``; the first ``int(len(df) * split_ratio)``
    rows form the training part and the remainder the test part (no shuffling).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, feature_cols, test_dates)`` where
        ``feature_cols`` is every column of ``df`` except the date and target
        columns, and ``test_dates`` is the test part's date column re-indexed
        from 0.
    """
    ordered = df.sort_values(by=date_column).reset_index(drop=True)
    cutoff = int(len(ordered) * split_ratio)

    # Everything that is neither the date nor the target is treated as a feature.
    excluded = (date_column, target_column)
    feature_cols = [name for name in df.columns if name not in excluded]

    head, tail = ordered.iloc[:cutoff], ordered.iloc[cutoff:]

    return (
        head[feature_cols],
        tail[feature_cols],
        head[target_column],
        tail[target_column],
        feature_cols,
        tail[date_column].reset_index(drop=True),
    )

# Maps ticker -> dict with the train/test pieces produced by split_time_series_data.
split_data_dict = {}

# Split every ticker that has data.
for stock in stocks:
    # A ticker is missing from daily_data_dict when its download failed, so use
    # .get() rather than indexing (which would raise KeyError).
    df = daily_data_dict.get(stock)
    if df is None:
        logging.warning(f"No DataFrame found for {stock}. Skipping.")
        continue

    # Verify that the DataFrame is not empty
    if df.empty:
        logging.warning(f"The DataFrame for {stock} is empty. Skipping.")
        continue

    df = df.copy()

    # Chronological 80/20 split with 'Close' as the target.
    X_train, X_test, y_train, y_test, feature_cols, test_dates = split_time_series_data(
        df,
        date_column='Date',
        target_column='Close',
        split_ratio=0.8
    )

    # Store the split data in the dictionary
    split_data_dict[stock] = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'feature_columns': feature_cols,
        'test_dates': test_dates  # Dates of the test rows, used later for reporting
    }

    logging.info(f"Data for {stock} has been split into training and testing sets.")

# Function to verify the split data
def verify_split(split_dict, stocks):
    """Print a short report on the stored train/test split of each stock.

    For every name in ``stocks`` the function prints either a single line
    explaining why the entry cannot be reported (no entry, or the first of
    X_train / X_test / y_train / y_test / test_dates that is None or empty),
    or the set sizes, feature list and the first five test dates.
    Nothing is returned.
    """
    # Checked in this order; the first failing piece is the one reported.
    required_pieces = ('X_train', 'X_test', 'y_train', 'y_test', 'test_dates')

    for stock in stocks:
        entry = split_dict.get(stock)
        if entry is None:
            print(f"No data found for {stock}.")
            continue

        unusable = next(
            (piece for piece in required_pieces
             if entry.get(piece) is None or entry.get(piece).empty),
            None,
        )
        if unusable is not None:
            print(f"{unusable} is None or empty for {stock}.")
            continue

        columns = entry.get('feature_columns', [])
        report = [
            f"Verifying data split for {stock}:",
            f" - Training set size: {entry['X_train'].shape[0]} samples",
            f" - Testing set size: {entry['X_test'].shape[0]} samples",
            f" - Number of features: {len(columns)}",
            f" - Feature columns:\n{columns}",
            f" - Test Dates (first 5): {entry['test_dates'].head().tolist()}",
            "-" * 80,
        ]
        for line in report:
            print(line)

# Print the per-stock split report; stocks without an entry in split_data_dict
# are reported as "No data found".
print("\nVerifying Data Splitting:")
verify_split(split_data_dict, stocks)

print("\nData splitting complete. Ready for scaling in the next cell.")


In [None]:
# Cell 5: Scaling the Data Using MinMaxScaler

# Function to scale data using MinMaxScaler for both features and targets
def scale_data_with_target(split_data_dict, scaler_save_dir='../models/scalers'):
    """Min-max scale the features and target of every stock and persist the scalers.

    Both scalers are fitted on the training part only and then applied to the
    test part, so test values outside the training range fall outside [0, 1].

    Parameters
    ----------
    split_data_dict : dict
        Maps stock -> dict with 'X_train', 'X_test' (DataFrames), 'y_train',
        'y_test' (Series), 'feature_columns' and optionally 'test_dates'.
    scaler_save_dir : str, optional
        Directory (created if missing) where the fitted scalers are written as
        ``minmax_scaler_X_<stock>.joblib`` / ``minmax_scaler_y_<stock>.joblib``.
        Defaults to the location the notebook has always used.

    Returns
    -------
    dict
        Maps stock -> dict with 'X_train_scaled', 'X_test_scaled' (DataFrames
        keeping the original columns and index), 'y_train_scaled',
        'y_test_scaled' (1-D arrays), 'scaler_X', 'scaler_y',
        'feature_columns' and 'test_dates'.
    """
    scaled_data_dict = {}

    os.makedirs(scaler_save_dir, exist_ok=True)

    for stock, data in split_data_dict.items():
        logging.info(f"Scaling data for {stock}...")

        X_train = data['X_train']
        X_test = data['X_test']
        # MinMaxScaler expects 2-D input, hence the (n, 1) reshape of the targets.
        y_train = data['y_train'].values.reshape(-1, 1)
        y_test = data['y_test'].values.reshape(-1, 1)
        test_dates = data.get('test_dates')

        # Separate scalers so predictions can be mapped back to prices with scaler_y alone.
        scaler_X = MinMaxScaler()
        scaler_y = MinMaxScaler()

        # Fit on the training part only; the test part is transformed with the same parameters.
        X_train_scaled = pd.DataFrame(scaler_X.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
        X_test_scaled = pd.DataFrame(scaler_X.transform(X_test), columns=X_test.columns, index=X_test.index)

        y_train_scaled = scaler_y.fit_transform(y_train).flatten()  # back to 1-D
        y_test_scaled = scaler_y.transform(y_test).flatten()

        # Persist the fitted scalers for later inference.
        scaler_X_path = os.path.join(scaler_save_dir, f'minmax_scaler_X_{stock}.joblib')
        scaler_y_path = os.path.join(scaler_save_dir, f'minmax_scaler_y_{stock}.joblib')
        joblib.dump(scaler_X, scaler_X_path)
        joblib.dump(scaler_y, scaler_y_path)
        logging.info(f"Scalers saved for {stock} at {scaler_X_path} and {scaler_y_path}.")

        # Carry the test dates along; if they are missing, substitute placeholder
        # daily dates so downstream reporting code still has a date axis.
        # NOTE(review): the placeholder dates are fabricated and do not
        # correspond to the real trading days.
        if test_dates is not None and not test_dates.empty:
            logging.info(f"'test_dates' found for {stock}. Including in scaled data.")
        else:
            logging.warning(f"No 'test_dates' found for {stock}. Creating dummy dates.")
            test_dates = pd.date_range(start='2020-01-01', periods=len(y_test_scaled), freq='D')

        scaled_data_dict[stock] = {
            'X_train_scaled': X_train_scaled,
            'X_test_scaled': X_test_scaled,
            'y_train_scaled': y_train_scaled,
            'y_test_scaled': y_test_scaled,
            'scaler_X': scaler_X,
            'scaler_y': scaler_y,
            'feature_columns': data['feature_columns'],
            'test_dates': test_dates
        }

        logging.info(f"Completed scaling for {stock}.")

    return scaled_data_dict

# Scale every stock's split data; the result is the input of all model cells below.
logging.info("Starting to scale Daily Data with target...")
scaled_daily_data = scale_data_with_target(split_data_dict)
logging.info("Completed scaling Daily Data with target.")

# Print shapes, feature names and the first test dates per stock as a sanity check.
print("\nVerifying Scaled Data for Daily Data (Including 'test_dates'):")
for stock in scaled_daily_data.keys():
    data = scaled_daily_data[stock]
    X_train_scaled = data['X_train_scaled']
    X_test_scaled = data['X_test_scaled']
    y_train_scaled = data['y_train_scaled']
    y_test_scaled = data['y_test_scaled']
    test_dates = data['test_dates']
    
    print(f"Scaled data for {stock}:")
    print(f" - Scaled Training set shape: {X_train_scaled.shape}, Scaled Training targets shape: {y_train_scaled.shape}")
    print(f" - Scaled Testing set shape: {X_test_scaled.shape}, Scaled Testing targets shape: {y_test_scaled.shape}")
    print(f" - Feature columns: {X_train_scaled.columns.tolist()}")
    
    # 'test_dates' is a Series when it came from the split and a DatetimeIndex
    # when scale_data_with_target had to create placeholder dates; take the
    # first five entries in a way that suits each type.
    if isinstance(test_dates, pd.DatetimeIndex):
        # Slice the first five dates and convert to a list
        test_dates_list = test_dates[:5].tolist()
    elif isinstance(test_dates, pd.Series):
        # Use head() if it's a Series
        test_dates_list = test_dates.head(5).tolist()
    else:
        # Any other iterable: materialise and slice
        test_dates_list = list(test_dates)[:5]
    
    print(f" - Test Dates (first 5): {test_dates_list}")
    print("-" * 80)


In [None]:
# Cell 6: Training and Evaluating LSTM Models for Daily Data

# Seed NumPy, TensorFlow and Python's `random` so repeated runs start from the
# same pseudo-random state.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

# Hyper-parameters shared by the sequence models in this notebook.
TIMESTEPS = 60  # Number of consecutive past rows (days) in each input window
BATCH_SIZE = 32
EPOCHS = 100  # Upper bound; EarlyStopping in the training loop may stop sooner
VALIDATION_SPLIT = 0.1  # Fraction of training data to use for validation

# Define Evaluation Metrics Function
def evaluate_model(y_true, y_pred):
    """Return ``(rmse, mae, r2)`` for the given true and predicted values."""
    return (
        np.sqrt(mean_squared_error(y_true, y_pred)),  # root of the mean squared error
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# Function to Create Sequences
def create_sequences(X, y, timesteps):
    """Build sliding windows of ``timesteps`` rows and their next-row targets.

    Window ``k`` holds rows ``k .. k + timesteps - 1`` of ``X`` and is paired
    with ``y[k + timesteps]``, i.e. the target of the row right after the window.
    Rows are always addressed by position, never by index label.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature rows, one per time step.
    y : array-like or pandas.Series
        Targets, at least as long as ``X``.
    timesteps : int
        Window length.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_seq, y_seq)``. For 2-D ``X`` with ``n`` rows and ``f`` columns the
        shapes are ``(n - timesteps, timesteps, f)`` and ``(n - timesteps,)``.
        When ``n <= timesteps`` the arrays are empty but keep the trailing
        dimensions, so ``X_seq.shape[1:]`` is still meaningful.

    Raises
    ------
    ValueError
        If ``y`` has fewer entries than ``X``.
    """
    # Convert once up front: avoids slicing a DataFrame for every window and
    # makes the function work for plain arrays as well.
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)
    n_rows = len(X_arr)

    if len(y_arr) < n_rows:
        raise ValueError(
            f"y has {len(y_arr)} entries but X has {n_rows} rows; every row needs a target."
        )

    # Not enough history for even one window: return correctly shaped empties.
    if n_rows <= timesteps:
        empty_X = np.empty((0, timesteps) + X_arr.shape[1:], dtype=X_arr.dtype)
        return empty_X, np.empty((0,), dtype=y_arr.dtype)

    X_seq = np.stack([X_arr[i - timesteps:i] for i in range(timesteps, n_rows)])
    y_seq = y_arr[timesteps:n_rows].copy()
    return X_seq, y_seq

# Function to Build LSTM Model
def build_lstm_model(input_shape):
    """Create and compile the two-layer LSTM regressor used in this notebook.

    Architecture: Input -> LSTM(50, sequences) -> Dropout(0.2) -> LSTM(50)
    -> Dropout(0.2) -> Dense(25, relu) -> Dense(1). Compiled with the 'adam'
    optimizer and mean squared error loss.

    ``input_shape`` is the per-sample shape passed to the Input layer; the
    training loop supplies ``(timesteps, n_features)``.
    """
    layers = [
        Input(shape=input_shape),  # Explicit Input Layer
        LSTM(units=50, return_sequences=True),
        Dropout(0.2),
        LSTM(units=50, return_sequences=False),
        Dropout(0.2),
        Dense(units=25, activation='relu'),
        Dense(units=1),  # Output layer
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

# Maps stock -> {'RMSE', 'MAE', 'R2'} for the LSTM models (prices in original units).
model_performance = {}

# Directories to save models and scalers
model_save_dir = '../models/lstm_models'
scaler_save_dir = '../models/scalers'
os.makedirs(model_save_dir, exist_ok=True)
os.makedirs(scaler_save_dir, exist_ok=True)

# Train, checkpoint and evaluate one LSTM per stock.
for stock in scaled_daily_data.keys():
    print(f"\n{'='*50}\nTraining LSTM Model for {stock}\n{'='*50}")

    # Retrieve Scaled Data
    data = scaled_daily_data[stock]
    X_train_scaled = data['X_train_scaled']
    X_test_scaled = data['X_test_scaled']
    y_train_scaled = data['y_train_scaled']
    y_test_scaled = data['y_test_scaled']
    scaler_X = data['scaler_X']
    scaler_y = data['scaler_y']

    # Turn the row-wise data into (samples, TIMESTEPS, features) windows.
    # Note: the first TIMESTEPS rows of each part only serve as history, so the
    # test metrics cover len(test) - TIMESTEPS rows.
    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, TIMESTEPS)
    X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, TIMESTEPS)

    print(f" - Training sequences: {X_train_seq.shape}, Training targets: {y_train_seq.shape}")
    print(f" - Testing sequences: {X_test_seq.shape}, Testing targets: {y_test_seq.shape}")

    # With TIMESTEPS or fewer rows no window can be built; skip the stock rather
    # than failing on the shape lookup / an empty fit below.
    if len(X_train_seq) == 0 or len(X_test_seq) == 0:
        print(f" - Not enough rows to build {TIMESTEPS}-step sequences for {stock}. Skipping.")
        continue

    # Build the Model
    model = build_lstm_model(input_shape=(X_train_seq.shape[1], X_train_seq.shape[2]))
    model.summary()

    # Stop when val_loss has not improved for 10 epochs; keep the best epoch on disk.
    best_model_path = os.path.join(model_save_dir, f'lstm_{stock}_best.keras')
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    checkpoint = ModelCheckpoint(
        filepath=best_model_path,
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    )

    # Train the Model
    history = model.fit(
        X_train_seq, y_train_seq,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=VALIDATION_SPLIT,
        callbacks=[early_stop, checkpoint],
        verbose=1
    )

    # Reload the checkpointed model; mirror the GRU cell and skip evaluation if
    # no checkpoint was written.
    # NOTE(review): a checkpoint left over from an earlier run would be loaded
    # here if this run never saved one.
    if os.path.exists(best_model_path):
        model = load_model(best_model_path)
        print(f" - Loaded best model from {best_model_path}")
    else:
        print(f" - Best LSTM model for {stock} not found at {best_model_path}.")
        continue  # Skip evaluation if model not saved

    # Predict on Test Data
    predictions_scaled = model.predict(X_test_seq).flatten()

    # Map scaled predictions and targets back to price units.
    predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
    y_test = scaler_y.inverse_transform(y_test_seq.reshape(-1, 1)).flatten()

    # Ensure Consistent Lengths
    print(f" - Length of y_test: {len(y_test)}")
    print(f" - Length of predictions: {len(predictions)}")

    # Evaluation Metrics
    rmse, mae, r2 = evaluate_model(y_test, predictions)
    model_performance[stock] = {'RMSE': rmse, 'MAE': mae, 'R2': r2}

    print(f" - Evaluation Metrics for {stock}: RMSE = {rmse:.4f}, MAE = {mae:.4f}, R2 = {r2:.4f}")

    print(f"Model training and evaluation completed for {stock}.\n")

# Summary of Model Performance
print(f"\n{'='*50}\nSummary of Model Performance\n{'='*50}")
for stock, metrics in model_performance.items():
    print(f"{stock}: RMSE = {metrics['RMSE']:.4f}, MAE = {metrics['MAE']:.4f}, R2 = {metrics['R2']:.4f}")


In [None]:
# Cell 8: Training and Evaluating GRU Models for Daily Data

# Hyper-parameters for the GRU run. These re-assign the same global names the
# LSTM cell uses, with the same values.
TIMESTEPS = 60  # Number of consecutive past rows (days) in each input window
BATCH_SIZE = 32
EPOCHS = 100
VALIDATION_SPLIT = 0.1
SEED = 42

# Re-seed NumPy, TensorFlow and Python's `random` before training the GRU models.
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

# Function to Create Sequences (Already Defined in Cell 6)
def create_sequences(X, y, timesteps):
    """Build sliding windows of ``timesteps`` rows and their next-row targets.

    Window ``k`` holds rows ``k .. k + timesteps - 1`` of ``X`` and is paired
    with ``y[k + timesteps]``, i.e. the target of the row right after the window.
    Rows are always addressed by position, never by index label.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature rows, one per time step.
    y : array-like or pandas.Series
        Targets, at least as long as ``X``.
    timesteps : int
        Window length.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_seq, y_seq)``. For 2-D ``X`` with ``n`` rows and ``f`` columns the
        shapes are ``(n - timesteps, timesteps, f)`` and ``(n - timesteps,)``.
        When ``n <= timesteps`` the arrays are empty but keep the trailing
        dimensions, so ``X_seq.shape[1:]`` is still meaningful.

    Raises
    ------
    ValueError
        If ``y`` has fewer entries than ``X``.
    """
    # Convert once up front: avoids slicing a DataFrame for every window and
    # makes the function work for plain arrays as well.
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)
    n_rows = len(X_arr)

    if len(y_arr) < n_rows:
        raise ValueError(
            f"y has {len(y_arr)} entries but X has {n_rows} rows; every row needs a target."
        )

    # Not enough history for even one window: return correctly shaped empties.
    if n_rows <= timesteps:
        empty_X = np.empty((0, timesteps) + X_arr.shape[1:], dtype=X_arr.dtype)
        return empty_X, np.empty((0,), dtype=y_arr.dtype)

    X_seq = np.stack([X_arr[i - timesteps:i] for i in range(timesteps, n_rows)])
    y_seq = y_arr[timesteps:n_rows].copy()
    return X_seq, y_seq

# Function to Build GRU Model
def build_gru_model(input_shape):
    """Create and compile the two-layer GRU regressor used in this notebook.

    Architecture: Input -> GRU(50, sequences) -> Dropout(0.2) -> GRU(50)
    -> Dropout(0.2) -> Dense(25, relu) -> Dense(1). Compiled with the 'adam'
    optimizer and mean squared error loss.

    ``input_shape`` is the per-sample shape passed to the Input layer; the
    training loop supplies ``(timesteps, n_features)``.
    """
    stack = (
        Input(shape=input_shape),
        GRU(units=50, return_sequences=True),
        Dropout(0.2),
        GRU(units=50, return_sequences=False),
        Dropout(0.2),
        Dense(units=25, activation='relu'),
        Dense(units=1),  # Output layer
    )
    network = Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

# Maps stock -> {'RMSE', 'MAE', 'R2'} for the GRU models (prices in original units).
gru_model_performance = {}

# Define Directory to Save GRU Models
gru_model_save_dir = '../models/gru_models'
os.makedirs(gru_model_save_dir, exist_ok=True)

# Train, checkpoint and evaluate one GRU per stock.
for stock in scaled_daily_data.keys():
    print(f"\n{'='*50}\nTraining GRU Model for {stock}\n{'='*50}")

    # Retrieve Scaled Data
    data = scaled_daily_data[stock]
    X_train_scaled = data['X_train_scaled']
    X_test_scaled = data['X_test_scaled']
    y_train_scaled = data['y_train_scaled']
    y_test_scaled = data['y_test_scaled']
    scaler_y = data['scaler_y']

    # Turn the row-wise data into (samples, TIMESTEPS, features) windows.
    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, TIMESTEPS)
    X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, TIMESTEPS)

    print(f" - Training sequences: {X_train_seq.shape}, Training targets: {y_train_seq.shape}")
    print(f" - Testing sequences: {X_test_seq.shape}, Testing targets: {y_test_seq.shape}")

    # With TIMESTEPS or fewer rows no window can be built; skip the stock rather
    # than failing on the shape lookup / an empty fit below.
    if len(X_train_seq) == 0 or len(X_test_seq) == 0:
        print(f" - Not enough rows to build {TIMESTEPS}-step sequences for {stock}. Skipping.")
        continue

    # Build the GRU Model
    model = build_gru_model(input_shape=(X_train_seq.shape[1], X_train_seq.shape[2]))
    model.summary()

    # Stop when val_loss has not improved for 10 epochs; keep the best epoch on disk.
    best_model_path = os.path.join(gru_model_save_dir, f'gru_{stock}_best.keras')
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    checkpoint = ModelCheckpoint(
        filepath=best_model_path,
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    )

    # Train the GRU Model
    history = model.fit(
        X_train_seq, y_train_seq,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=VALIDATION_SPLIT,
        callbacks=[early_stop, checkpoint],
        verbose=1
    )

    # Reload the checkpointed model; skip evaluation if no checkpoint exists.
    # NOTE(review): a checkpoint left over from an earlier run would be loaded
    # here if this run never saved one.
    if os.path.exists(best_model_path):
        model = load_model(best_model_path)
        print(f" - Loaded best model from {best_model_path}")
    else:
        print(f" - Best GRU model for {stock} not found at {best_model_path}.")
        continue  # Skip evaluation if model not saved

    # Predict on Test Data
    predictions_scaled = model.predict(X_test_seq).flatten()

    # Map scaled predictions and targets back to price units.
    predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
    y_test = scaler_y.inverse_transform(y_test_seq.reshape(-1, 1)).flatten()

    # Ensure Consistent Lengths
    print(f" - Length of y_test: {len(y_test)}")
    print(f" - Length of predictions: {len(predictions)}")

    # Evaluation Metrics
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    gru_model_performance[stock] = {'RMSE': rmse, 'MAE': mae, 'R2': r2}

    print(f" - Evaluation Metrics for GRU {stock}: RMSE = {rmse:.4f}, MAE = {mae:.4f}, R2 = {r2:.4f}")

    print(f"GRU model training and evaluation completed for {stock}.\n")

# Summary of GRU Model Performance
print(f"\n{'='*50}\nSummary of GRU Model Performance\n{'='*50}")
for stock, metrics in gru_model_performance.items():
    print(f"{stock}: RMSE = {metrics['RMSE']:.4f}, MAE = {metrics['MAE']:.4f}, R2 = {metrics['R2']:.4f}")


In [None]:
# Cell 9: Training and Evaluating XGBoost Models

# Parameters for the XGBoost run.
# NOTE: TIMESTEPS is re-assigned here but not referenced by the XGBoost code in
# this cell; `model_save_dir` re-uses (and overwrites) the global name that the
# LSTM cell also sets.
TIMESTEPS = 60  # Ensure consistency with LSTM
model_save_dir = '../models/xgb_models'
os.makedirs(model_save_dir, exist_ok=True)

# overall_metrics_xgb: stock -> overall RMSE/MAE/R2/MAPE.
# grouped_metrics_all_xgb: grouping name -> stock -> per-group metrics frame.
overall_metrics_xgb = {}
grouped_metrics_all_xgb = {'Month': {}, 'Quarter': {}, 'Season': {}}

# Function to Add Time Features
def add_time_features_xgb(eval_df):
    """Add 'Month', 'Quarter' and 'Season' columns derived from 'Date'.

    The frame is modified in place and also returned. Seasons follow the
    meteorological grouping: Dec-Feb Winter, Mar-May Spring, Jun-Aug Summer,
    everything else Autumn.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    date_parts = eval_df['Date'].dt
    eval_df['Month'] = date_parts.month
    eval_df['Quarter'] = date_parts.quarter
    # Any month not listed above falls through to 'Autumn'.
    eval_df['Season'] = eval_df['Month'].apply(
        lambda month: season_by_month.get(month, 'Autumn')
    )
    return eval_df

def _grouped_metrics_xgb(eval_df, key):
    """Return RMSE / MAE / R2 / MAPE(%) of 'Predicted' vs 'Actual' per group of ``key``."""
    return eval_df.groupby(key).apply(lambda x: pd.Series({
        'RMSE': np.sqrt(mean_squared_error(x['Actual'], x['Predicted'])),
        'MAE': mean_absolute_error(x['Actual'], x['Predicted']),
        'R2': r2_score(x['Actual'], x['Predicted']),
        'MAPE': mean_absolute_percentage_error(x['Actual'], x['Predicted']) * 100
    }), include_groups=False)


# Train, evaluate and save one XGBoost regressor per stock.
for stock in scaled_daily_data.keys():
    print(f"\n{'='*50}\nTraining and Evaluating XGBoost Model for {stock}\n{'='*50}")

    # Retrieve Scaled Data
    data = scaled_daily_data[stock]
    X_train_scaled = data['X_train_scaled']
    y_train_scaled = data['y_train_scaled']
    X_test_scaled = data['X_test_scaled']
    y_test_scaled = data['y_test_scaled']
    scaler_y = data['scaler_y']

    # Initialize and Train XGBoost Regressor
    xgb_model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )

    # NOTE(review): each row's features are paired with the target of the SAME
    # row (no shift), so this fits a same-day relationship rather than a
    # forecast of a future close — confirm this is intended.
    xgb_model.fit(X_train_scaled, y_train_scaled)
    print(f" - XGBoost model trained for {stock}")

    # Predict on Test Data
    predictions_scaled = xgb_model.predict(X_test_scaled)

    # Map scaled predictions and targets back to price units.
    predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
    y_test = scaler_y.inverse_transform(y_test_scaled.reshape(-1, 1)).flatten()

    # Pick the date axis for the evaluation frame. The real test dates travel
    # in scaled_daily_data under 'test_dates'; the scaled feature frame itself
    # carries neither a DatetimeIndex nor a 'Date' column, so looking only
    # there would always end in fabricated dates and meaningless
    # month/quarter/season groupings.
    test_dates = data.get('test_dates')
    if test_dates is not None and len(test_dates) == len(y_test):
        dates = pd.to_datetime(pd.Series(test_dates)).reset_index(drop=True)
    elif isinstance(X_test_scaled, pd.DataFrame) and isinstance(X_test_scaled.index, pd.DatetimeIndex):
        dates = X_test_scaled.index
    elif isinstance(X_test_scaled, pd.DataFrame) and 'Date' in X_test_scaled.columns:
        dates = pd.to_datetime(X_test_scaled['Date'])
    else:
        print(f" - No Date information found for {stock}. Creating dummy dates.")
        dates = pd.date_range(start='2020-01-01', periods=len(y_test), freq='D')

    eval_df = pd.DataFrame({
        'Date': dates,
        'Actual': y_test,
        'Predicted': predictions
    })

    # Add Time Features
    eval_df = add_time_features_xgb(eval_df)

    # Calculate Overall Metrics
    rmse = np.sqrt(mean_squared_error(eval_df['Actual'], eval_df['Predicted']))
    mae = mean_absolute_error(eval_df['Actual'], eval_df['Predicted'])
    r2 = r2_score(eval_df['Actual'], eval_df['Predicted'])
    mape = mean_absolute_percentage_error(eval_df['Actual'], eval_df['Predicted']) * 100

    overall_metrics_xgb[stock] = {
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'MAPE': mape
    }

    print(f" - Overall Evaluation Metrics for {stock}:")
    print(f"    RMSE = {rmse:.4f}")
    print(f"    MAE = {mae:.4f}")
    print(f"    R2 = {r2:.4f}")
    print(f"    MAPE = {mape:.2f}%")

    # Same four metrics, broken down by calendar month, quarter and season.
    for grouping_key in ('Month', 'Quarter', 'Season'):
        grouped_metrics_all_xgb[grouping_key][stock] = _grouped_metrics_xgb(eval_df, grouping_key)

    # Save the trained model
    model_save_path = os.path.join(model_save_dir, f'xgb_{stock}_model.json')
    xgb_model.save_model(model_save_path)
    print(f" - XGBoost model saved for {stock} at {model_save_path}")

    print(f"Training and evaluation completed for {stock}.\n")

# One row per stock, one column per metric (the dict of dicts is transposed).
overall_metrics_xgb_df = pd.DataFrame(overall_metrics_xgb).T
print("\n" + "="*50)
print("Overall Evaluation Metrics for All Stocks - XGBoost")
print("="*50)
# `display` is not imported in the visible cells; presumably it is the
# IPython/Jupyter built-in — this line needs a notebook kernel to run.
display(overall_metrics_xgb_df)
# Written to the current working directory.
overall_metrics_xgb_df.to_csv('overall_evaluation_metrics_xgb.csv')
print("\n - Overall Evaluation Metrics table for XGBoost saved as 'overall_evaluation_metrics_xgb.csv'.")

# Function to Create Grouped Metrics Tables
def create_grouped_metrics_tables_xgb(grouped_metrics_all_xgb, grouping):
    """Turn the per-stock grouped metrics of one grouping into flat tables.

    For every stock stored under ``grouped_metrics_all_xgb[grouping]`` the
    metrics frame's index is moved into the first column, which is then named
    after ``grouping``. Returns a dict mapping stock -> table.
    """
    def _flatten(metrics):
        table = metrics.reset_index()
        # First column is the former index; label it with the grouping name.
        table.columns = [grouping, *table.columns[1:]]
        return table

    return {
        stock: _flatten(metrics)
        for stock, metrics in grouped_metrics_all_xgb[grouping].items()
    }

# For each grouping: show and save one table per stock, then one cross-stock
# comparison table per metric. All CSVs go to the current working directory.
for grouping in ['Month', 'Quarter', 'Season']:
    grouped_tables_xgb = create_grouped_metrics_tables_xgb(grouped_metrics_all_xgb, grouping)
    for stock, table in grouped_tables_xgb.items():
        print(f"\n{'='*50}\n{grouping} Evaluation Metrics for {stock} - XGBoost\n{'='*50}")
        display(table)
        filename = f'{stock}_{grouping}_evaluation_metrics_xgb.csv'
        table.to_csv(filename, index=False)
        print(f" - {grouping} Evaluation Metrics table for {stock} saved as '{filename}'.")
    
    # Comparison across stocks: rows are the groups, columns are the stocks,
    # values are the chosen metric.
    for metric in ['RMSE', 'MAE', 'R2', 'MAPE']:
        comparative_df_xgb = pd.DataFrame({stock: grouped_metrics_all_xgb[grouping][stock][metric] for stock in grouped_metrics_all_xgb[grouping].keys()})
        comparative_df_xgb.index.name = grouping
        print(f"\n{'='*50}\nComparative {metric} Across {grouping} for All Stocks - XGBoost\n{'='*50}")
        display(comparative_df_xgb)
        filename = f'comparative_{metric}_across_{grouping}_xgb.csv'
        comparative_df_xgb.to_csv(filename)
        print(f" - Comparative {metric} Across {grouping} table for XGBoost saved as '{filename}'.")


In [None]:
# Cell 10: Training and Evaluating Random Forest Models

# Output directory for the Random Forest models.
# NOTE: this re-assigns the global `model_save_dir` that earlier cells also set.
model_save_dir = '../models/random_forest_models'
os.makedirs(model_save_dir, exist_ok=True)

# overall_metrics_rf: stock -> overall metrics.
# grouped_metrics_all_rf: grouping name -> stock -> per-group metrics.
overall_metrics_rf = {}
grouped_metrics_all_rf = {'Month': {}, 'Quarter': {}, 'Season': {}}

# Function to Add Time Features (if not already added)
def add_time_features_rf(eval_df):
    """Add 'Month', 'Quarter' and 'Season' columns derived from ``eval_df['Date']``.

    The frame is modified in place and the same object is returned.

    Args:
        eval_df: DataFrame with a datetime-like 'Date' column.

    Returns:
        The input ``eval_df`` with the three calendar columns added.
    """
    def _season_of(month):
        # Meteorological seasons; anything outside the listed months is 'Autumn'.
        if month in [12, 1, 2]:
            return 'Winter'
        if month in [3, 4, 5]:
            return 'Spring'
        if month in [6, 7, 8]:
            return 'Summer'
        return 'Autumn'

    calendar = eval_df['Date'].dt
    eval_df['Month'] = calendar.month
    eval_df['Quarter'] = calendar.quarter
    eval_df['Season'] = eval_df['Month'].apply(_season_of)
    return eval_df

# Helper: Regression Metrics for One Slice of the Evaluation DataFrame
def _regression_metrics_rf(frame):
    """Compute RMSE, MAE, R2 and MAPE (in percent) from 'Actual' vs 'Predicted'.

    Args:
        frame: DataFrame with 'Actual' and 'Predicted' columns (the whole
            evaluation frame or a single group of it).

    Returns:
        pd.Series indexed by 'RMSE', 'MAE', 'R2' and 'MAPE'.
    """
    actual = frame['Actual']
    predicted = frame['Predicted']
    return pd.Series({
        'RMSE': np.sqrt(mean_squared_error(actual, predicted)),
        'MAE': mean_absolute_error(actual, predicted),
        'R2': r2_score(actual, predicted),
        'MAPE': mean_absolute_percentage_error(actual, predicted) * 100
    })

# Iterate Through Each Stock for Evaluation and Plotting
for stock in scaled_daily_data.keys():
    print(f"\n{'='*50}\nTraining and Evaluating Random Forest Model for {stock}\n{'='*50}")

    # Retrieve Scaled Data
    data = scaled_daily_data[stock]
    X_train_scaled = data['X_train_scaled']
    y_train_scaled = data['y_train_scaled']
    X_test_scaled = data['X_test_scaled']
    y_test_scaled = data['y_test_scaled']
    scaler_y = data['scaler_y']

    # Initialize and Train Random Forest Regressor
    # random_state is fixed so repeated runs give the same forest.
    rf_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
        n_jobs=-1
    )

    rf_model.fit(X_train_scaled, y_train_scaled)
    print(f" - Random Forest model trained for {stock}")

    # Predict on Test Data
    predictions_scaled = rf_model.predict(X_test_scaled)

    # Inverse Transform Predictions and Targets
    # Metrics below are therefore reported in the target's original units.
    predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
    y_test = scaler_y.inverse_transform(y_test_scaled.reshape(-1, 1)).flatten()

    # Retrieve 'test_dates' from scaled_daily_data
    test_dates = data.get('test_dates')

    # Create Evaluation DataFrame using 'test_dates'
    if test_dates is not None and len(test_dates) == len(y_test):
        dates = test_dates
    else:
        # Report the actual reason for the fallback: missing dates and a length
        # mismatch are different problems upstream.
        if test_dates is None:
            print(f" - No 'test_dates' found for {stock}. Creating dummy dates.")
        else:
            print(f" - 'test_dates' has {len(test_dates)} entries but {len(y_test)} test targets exist for {stock}. Creating dummy dates.")
        print(" - Month/Quarter/Season metrics below are based on placeholder dates and are not meaningful.")
        dates = pd.date_range(start='2020-01-01', periods=len(y_test), freq='D')

    eval_df = pd.DataFrame({
        'Date': dates,
        'Actual': y_test,
        'Predicted': predictions
    })

    # Add Time Features
    eval_df = add_time_features_rf(eval_df)

    # Calculate Overall Metrics
    overall = _regression_metrics_rf(eval_df)
    rmse = overall['RMSE']
    mae = overall['MAE']
    r2 = overall['R2']
    mape = overall['MAPE']

    overall_metrics_rf[stock] = {
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'MAPE': mape
    }

    print(f" - Overall Evaluation Metrics for {stock}:")
    print(f"    RMSE = {rmse:.4f}")
    print(f"    MAE = {mae:.4f}")
    print(f"    R2 = {r2:.4f}")
    print(f"    MAPE = {mape:.2f}%")

    # Calculate Grouped Metrics
    # The same helper is applied per group, so overall and grouped numbers share one definition.
    grouped_metrics_month = eval_df.groupby('Month').apply(_regression_metrics_rf, include_groups=False)
    grouped_metrics_quarter = eval_df.groupby('Quarter').apply(_regression_metrics_rf, include_groups=False)
    grouped_metrics_season = eval_df.groupby('Season').apply(_regression_metrics_rf, include_groups=False)

    grouped_metrics_all_rf['Month'][stock] = grouped_metrics_month
    grouped_metrics_all_rf['Quarter'][stock] = grouped_metrics_quarter
    grouped_metrics_all_rf['Season'][stock] = grouped_metrics_season

    # Save the trained model
    model_save_path = os.path.join(model_save_dir, f'rf_{stock}_model.pkl')
    joblib.dump(rf_model, model_save_path)
    print(f" - Random Forest model saved for {stock} at {model_save_path}")

    print(f"Training and evaluation completed for {stock}.\n")

# Create Overall Metrics Table
# Rows are stocks, columns are the four metrics.
overall_metrics_rf_df = pd.DataFrame(overall_metrics_rf).T
print("\n" + "="*50)
print("Overall Evaluation Metrics for All Stocks - Random Forest")
print("="*50)
display(overall_metrics_rf_df)
overall_metrics_rf_df.to_csv('overall_evaluation_metrics_rf.csv')
print("\n - Overall Evaluation Metrics table for Random Forest saved as 'overall_evaluation_metrics_rf.csv'.")

# Function to Create Grouped Metrics Tables
def create_grouped_metrics_tables_rf(grouped_metrics_all_rf, grouping):
    """Turn the per-stock grouped Random Forest metrics of one grouping into flat tables.

    Args:
        grouped_metrics_all_rf: Mapping of grouping name -> {stock: metrics},
            where every metrics object supports ``reset_index()``.
        grouping: Key to read from ``grouped_metrics_all_rf``; it also becomes
            the name of the first column of every returned table.

    Returns:
        dict: ``{stock: DataFrame}`` where the former index is the leading
        column, renamed to ``grouping``.
    """
    tables_by_stock = {}
    per_stock_metrics = grouped_metrics_all_rf[grouping]
    for ticker in per_stock_metrics:
        flat = per_stock_metrics[ticker].reset_index()
        # Keep every metric column as is; only the leading (former index) column is relabelled.
        tables_by_stock[ticker] = flat.set_axis([grouping, *flat.columns[1:]], axis=1)
    return tables_by_stock

# Create and Save Grouped Metrics Tables
# For each grouping: display and export one table per stock, then one cross-stock
# comparison table per metric. CSV files go to the current working directory.
for grouping in ['Month', 'Quarter', 'Season']:
    grouped_tables_rf = create_grouped_metrics_tables_rf(grouped_metrics_all_rf, grouping)
    for stock, table in grouped_tables_rf.items():
        filename = f'{stock}_{grouping}_evaluation_metrics_rf.csv'
        print(f"\n{'='*50}\n{grouping} Evaluation Metrics for {stock} - Random Forest\n{'='*50}")
        display(table)
        table.to_csv(filename, index=False)
        print(f" - {grouping} Evaluation Metrics table for {stock} saved as '{filename}'.")

    # Create Comparative Metrics Tables Across Stocks
    # One column per stock, one row per group value, for a single metric at a time.
    stock_metrics_rf = grouped_metrics_all_rf[grouping]
    for metric in ['RMSE', 'MAE', 'R2', 'MAPE']:
        filename = f'comparative_{metric}_across_{grouping}_rf.csv'
        comparative_df_rf = pd.DataFrame({
            ticker: per_group[metric] for ticker, per_group in stock_metrics_rf.items()
        })
        comparative_df_rf.index.name = grouping
        print(f"\n{'='*50}\nComparative {metric} Across {grouping} for All Stocks - Random Forest\n{'='*50}")
        display(comparative_df_rf)
        comparative_df_rf.to_csv(filename)
        print(f" - Comparative {metric} Across {grouping} table for Random Forest saved as '{filename}'.")


In [None]:
# Cell 11A: Generating Meta-Features from Base Models

# Define Paths for Models
# Directories (relative to the notebook's working directory) holding the saved
# base models loaded by this cell; the meta-model directory is created if missing.
xgb_model_dir = '../models/xgb_models'
rf_model_dir = '../models/random_forest_models'
lstm_model_dir = '../models/lstm_models'
gru_model_dir = '../models/gru_models'
meta_model_dir = '../models/meta_model'
os.makedirs(meta_model_dir, exist_ok=True)

# Initialize Dictionaries to Store Meta-Features and Targets
# All four are keyed by stock symbol and filled by the loop below; a stock that
# is skipped there gets no entry in any of them.
meta_features_train_dict = {}
meta_features_test_dict = {}
y_train_dict = {}
y_test_dict = {}
# Function to Create Sequences (Assuming it's defined in a common cell)
def create_sequences(X, y, timesteps):
    """Build sliding-window samples for the sequence (LSTM/GRU) models.

    Sample ``k`` is rows ``k .. k + timesteps - 1`` of ``X`` and its target is
    ``y`` at position ``k + timesteps``, i.e. the row right after the window.

    Args:
        X: Feature rows as a DataFrame or array-like of shape (n_rows, n_features).
        y: Targets aligned row-by-row with ``X`` (NumPy array, list or pandas
            Series). Rows are always addressed by position, so a Series whose
            index labels do not start at 0 (e.g. the test part of a split) works.
        timesteps: Number of consecutive rows per window.

    Returns:
        tuple: ``(X_seq, y_seq)`` NumPy arrays with ``len(X) - timesteps``
        samples each; both are empty when ``len(X) <= timesteps``.
    """
    # Convert once so every lookup below is positional and works for both pandas and NumPy inputs.
    X_values = X.to_numpy() if hasattr(X, 'to_numpy') else np.asarray(X)
    y_values = y.to_numpy() if hasattr(y, 'to_numpy') else np.asarray(y)

    X_seq, y_seq = [], []
    for i in range(timesteps, len(X_values)):
        X_seq.append(X_values[i - timesteps:i])
        y_seq.append(y_values[i])
    return np.array(X_seq), np.array(y_seq)

# Iterate Through Each Stock to Populate Meta-Features
for stock in stocks:
    print(f"\n{'='*50}\nProcessing Stock: {stock}\n{'='*50}")

    # ----- Load Base Models -----
    # All four base models must load. Otherwise model variables left over from a
    # previous iteration would be reused, so the stock is skipped further down.
    missing_models = []

    # Load XGBoost Model
    xgb_model_path = os.path.join(xgb_model_dir, f'xgb_{stock}_model.json')
    if not os.path.exists(xgb_model_path):
        missing_models.append('XGBoost')
    else:
        try:
            xgb_model = xgb.XGBRegressor()
            xgb_model.load_model(xgb_model_path)
            print(f" - Loaded XGBoost model for {stock}.")
        except Exception as e:
            print(f" - Error loading XGBoost model for {stock}: {e}")
            missing_models.append('XGBoost')

    # Load Random Forest Model
    rf_model_path = os.path.join(rf_model_dir, f'rf_{stock}_model.pkl')
    if not os.path.exists(rf_model_path):
        missing_models.append('Random Forest')
    else:
        try:
            rf_model = joblib.load(rf_model_path)
            print(f" - Loaded Random Forest model for {stock}.")
        except Exception as e:
            print(f" - Error loading Random Forest model for {stock}: {e}")
            missing_models.append('Random Forest')

    # Load LSTM Model
    lstm_model_path = os.path.join(lstm_model_dir, f'lstm_{stock}_best.keras')
    if not os.path.exists(lstm_model_path):
        missing_models.append('LSTM')
    else:
        try:
            lstm_model = load_model(lstm_model_path)
            print(f" - Loaded LSTM model for {stock}.")
        except Exception as e:
            print(f" - Error loading LSTM model for {stock}: {e}")
            missing_models.append('LSTM')

    # Load GRU Model
    gru_model_path = os.path.join(gru_model_dir, f'gru_{stock}_best.keras')
    if not os.path.exists(gru_model_path):
        missing_models.append('GRU')
    else:
        try:
            gru_model = load_model(gru_model_path)
            print(f" - Loaded GRU model for {stock}.")
        except Exception as e:
            print(f" - Error loading GRU model for {stock}: {e}")
            missing_models.append('GRU')

    if missing_models:
        print(f" - Missing or failed to load models for {stock}: {', '.join(missing_models)}. Skipping.")
        continue

    # ----- Retrieve Scaled Data -----
    data = scaled_daily_data.get(stock)
    if data is None:
        print(f" - No scaled data found for {stock}. Skipping.")
        continue

    # Check for 'test_dates'
    if 'test_dates' not in data:
        print(f" - 'test_dates' not found for {stock}. Skipping.")
        continue
    test_dates = data['test_dates']

    X_train_scaled = data['X_train_scaled']
    X_test_scaled = data['X_test_scaled']
    y_train_scaled = data['y_train_scaled']
    y_test_scaled = data['y_test_scaled']
    scaler_y = data['scaler_y']

    # ----- Create Sequences for LSTM and GRU Models -----
    TIMESTEPS = 60  # Ensure consistency
    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, TIMESTEPS)
    X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, TIMESTEPS)

    print(f" - Training sequences: {X_train_seq.shape}, Training targets: {y_train_seq.shape}")
    print(f" - Testing sequences: {X_test_seq.shape}, Testing targets: {y_test_seq.shape}")

    # ----- Generate Predictions from Base Models -----
    # The tabular models skip the first TIMESTEPS rows so that their predictions
    # line up with the sequence targets (the first window ends at row TIMESTEPS - 1).
    # All predictions are converted back to the target's original scale.
    try:
        # XGBoost Predictions
        xgb_pred_train_scaled = xgb_model.predict(X_train_scaled.iloc[TIMESTEPS:])
        xgb_pred_test_scaled = xgb_model.predict(X_test_scaled.iloc[TIMESTEPS:])
        xgb_pred_train = scaler_y.inverse_transform(xgb_pred_train_scaled.reshape(-1, 1)).flatten()
        xgb_pred_test = scaler_y.inverse_transform(xgb_pred_test_scaled.reshape(-1, 1)).flatten()
        print(f" - XGBoost predictions generated for {stock}.")

        # Random Forest Predictions
        rf_pred_train_scaled = rf_model.predict(X_train_scaled.iloc[TIMESTEPS:])
        rf_pred_test_scaled = rf_model.predict(X_test_scaled.iloc[TIMESTEPS:])
        rf_pred_train = scaler_y.inverse_transform(rf_pred_train_scaled.reshape(-1, 1)).flatten()
        rf_pred_test = scaler_y.inverse_transform(rf_pred_test_scaled.reshape(-1, 1)).flatten()
        print(f" - Random Forest predictions generated for {stock}.")

        # LSTM Predictions
        lstm_pred_train_scaled = lstm_model.predict(X_train_seq).flatten()
        lstm_pred_test_scaled = lstm_model.predict(X_test_seq).flatten()
        lstm_pred_train = scaler_y.inverse_transform(lstm_pred_train_scaled.reshape(-1, 1)).flatten()
        lstm_pred_test = scaler_y.inverse_transform(lstm_pred_test_scaled.reshape(-1, 1)).flatten()
        print(f" - LSTM predictions generated for {stock}.")

        # GRU Predictions
        gru_pred_train_scaled = gru_model.predict(X_train_seq).flatten()
        gru_pred_test_scaled = gru_model.predict(X_test_seq).flatten()
        gru_pred_train = scaler_y.inverse_transform(gru_pred_train_scaled.reshape(-1, 1)).flatten()
        gru_pred_test = scaler_y.inverse_transform(gru_pred_test_scaled.reshape(-1, 1)).flatten()
        print(f" - GRU predictions generated for {stock}.")
    except Exception as e:
        print(f" - Error during prediction generation for {stock}: {e}")
        continue

    # ----- Align Predictions and Targets -----
    # Determine the minimum length to ensure alignment
    min_length_train = min(len(xgb_pred_train), len(rf_pred_train), len(lstm_pred_train), len(gru_pred_train), len(y_train_seq))
    min_length_test = min(len(xgb_pred_test), len(rf_pred_test), len(lstm_pred_test), len(gru_pred_test), len(y_test_seq))

    # Slice predictions and targets to min_length
    xgb_pred_train = xgb_pred_train[:min_length_train]
    rf_pred_train = rf_pred_train[:min_length_train]
    lstm_pred_train = lstm_pred_train[:min_length_train]
    gru_pred_train = gru_pred_train[:min_length_train]
    y_train = scaler_y.inverse_transform(y_train_seq[:min_length_train].reshape(-1, 1)).flatten()

    xgb_pred_test = xgb_pred_test[:min_length_test]
    rf_pred_test = rf_pred_test[:min_length_test]
    lstm_pred_test = lstm_pred_test[:min_length_test]
    gru_pred_test = gru_pred_test[:min_length_test]
    y_test = scaler_y.inverse_transform(y_test_seq[:min_length_test].reshape(-1, 1)).flatten()

    # ----- Align Test Dates with the Test Predictions -----
    # The first TIMESTEPS test rows only serve as look-back history, so the first
    # prediction belongs to test_dates[TIMESTEPS]. The aligned dates become the
    # index of the test meta-features; without them Cell 11B cannot find real
    # dates and falls back to placeholder dates for its Month/Quarter/Season columns.
    test_index = None
    if test_dates is not None and len(test_dates) == len(X_test_scaled):
        try:
            all_test_dates = pd.DatetimeIndex(pd.to_datetime(np.asarray(test_dates)))
            test_index = all_test_dates[TIMESTEPS:][:min_length_test]
        except (ValueError, TypeError) as e:
            print(f" - Could not convert 'test_dates' to datetimes for {stock}: {e}. Using a positional index.")
    else:
        print(f" - 'test_dates' does not match the test set length for {stock}. Using a positional index.")

    # ----- Populate Meta-Features for Training Data -----
    meta_features_train = pd.DataFrame({
        'XGB_Pred': xgb_pred_train,
        'RF_Pred': rf_pred_train,
        'LSTM_Pred': lstm_pred_train,
        'GRU_Pred': gru_pred_train
    })
    meta_features_train_dict[stock] = meta_features_train
    print(f" - Meta-features for training data populated for {stock}.")

    # ----- Populate Meta-Features for Test Data -----
    # index=None keeps the default positional index when no aligned dates exist.
    meta_features_test = pd.DataFrame({
        'XGB_Pred': xgb_pred_test,
        'RF_Pred': rf_pred_test,
        'LSTM_Pred': lstm_pred_test,
        'GRU_Pred': gru_pred_test
    }, index=test_index)
    meta_features_test_dict[stock] = meta_features_test
    print(f" - Meta-features for testing data populated for {stock}.")

    # ----- Store Target Variables -----
    y_train_dict[stock] = y_train
    y_test_dict[stock] = y_test
    print(f" - Target variables stored for {stock}.")

print("\n" + "="*50 + "\nMeta-Features DataFrame Shapes\n" + "="*50)
for stock in meta_features_train_dict.keys():
    print(f" - {stock}: meta_features_train shape: {meta_features_train_dict[stock].shape}, meta_features_test shape: {meta_features_test_dict[stock].shape}, y_train shape: {y_train_dict[stock].shape}, y_test shape: {y_test_dict[stock].shape}")
print("-" * 80)


In [None]:
# Cell 11B: Training and Evaluating the Stacking Meta-Model

# Initialize a dictionary to store meta-model performance
meta_model_per_stock = {}

print("\n" + "="*50 + "\nTraining Meta-Models for Each Stock\n" + "="*50)

for stock in meta_features_train_dict.keys():
    print(f"\nTraining Meta-Model for {stock}")

    meta_features_train = meta_features_train_dict[stock]
    meta_features_test = meta_features_test_dict[stock]
    y_train = y_train_dict[stock]
    y_test = y_test_dict[stock]

    # Check if meta_features_train and y_train are non-empty
    if meta_features_train.empty or len(y_train) == 0:
        print(f" - Empty meta-features or target variables for {stock}. Skipping.")
        continue

    # ----- Train Meta-Model (Ridge Regression with Cross-Validation) -----
    # Inputs are the base-model predictions from Cell 11A; the targets are the
    # matching actual values, both in the target's original scale.
    try:
        meta_model = RidgeCV()
        meta_model.fit(meta_features_train, y_train)
        print(f" - Meta-Model (Ridge Regression) trained successfully for {stock}.")
    except Exception as e:
        print(f" - Error training Meta-Model for {stock}: {e}")
        continue

    # ----- Save Meta-Model -----
    meta_model_path = os.path.join(meta_model_dir, f'stacking_meta_model_{stock}.pkl')
    try:
        joblib.dump(meta_model, meta_model_path)
        print(f" - Meta-Model saved at '{meta_model_path}'")
    except Exception as e:
        print(f" - Error saving Meta-Model for {stock}: {e}")
        continue

    # ----- Generate Meta-Predictions on Test Data -----
    try:
        meta_pred_test = meta_model.predict(meta_features_test)
        print(f" - Meta-Predictions generated for {stock}.")
    except Exception as e:
        print(f" - Error generating Meta-Predictions for {stock}: {e}")
        continue

    # ----- Evaluate Meta-Model -----
    try:
        rmse_meta = np.sqrt(mean_squared_error(y_test, meta_pred_test))
        mae_meta = mean_absolute_error(y_test, meta_pred_test)
        r2_meta = r2_score(y_test, meta_pred_test)
        mape_meta = mean_absolute_percentage_error(y_test, meta_pred_test) * 100

        meta_model_per_stock[stock] = {
            'RMSE': rmse_meta,
            'MAE': mae_meta,
            'R2': r2_meta,
            'MAPE': mape_meta
        }

        print(f"\nMeta-Model Evaluation Metrics for {stock}:")
        print(f"    RMSE = {rmse_meta:.4f}")
        print(f"    MAE = {mae_meta:.4f}")
        print(f"    R2 = {r2_meta:.4f}")
        print(f"    MAPE = {mape_meta:.2f}%")
    except Exception as e:
        print(f" - Error evaluating Meta-Model for {stock}: {e}")
        continue

    # ----- Create Evaluation DataFrame for Meta-Model -----
    # Reset per stock: if the block below fails, nothing from a previous
    # iteration may be saved under this stock's name.
    eval_df_meta = None
    try:
        # Align test_dates with predictions
        # Extract dates similar to Cell 10
        if isinstance(meta_features_test.index, pd.DatetimeIndex):
            adjusted_test_dates = meta_features_test.index
        elif 'Date' in meta_features_test.columns:
            adjusted_test_dates = pd.to_datetime(meta_features_test['Date'], errors='coerce')
        else:
            print(f" - No dates attached to the test meta-features for {stock}. Using placeholder dates; Month/Quarter/Season are not meaningful.")
            adjusted_test_dates = pd.date_range(start='2020-01-01', periods=len(meta_pred_test), freq='D')

        eval_df_meta = pd.DataFrame({
            'Date': adjusted_test_dates,
            'Actual': y_test,
            'Meta_Predicted': meta_pred_test
        })

        # ----- Add Time Features -----
        eval_df_meta['Month'] = eval_df_meta['Date'].dt.month
        eval_df_meta['Quarter'] = eval_df_meta['Date'].dt.quarter
        eval_df_meta['Season'] = eval_df_meta['Month'].apply(
            lambda month: 'Winter' if month in [12, 1, 2] else
                          'Spring' if month in [3, 4, 5] else
                          'Summer' if month in [6, 7, 8] else
                          'Autumn'
        )
        print(f" - Time features added to the meta-model evaluation DataFrame for {stock}.")

    except Exception as e:
        print(f" - Error adding time features for {stock}: {e}")
        # Discard a half-built frame so the save step below is skipped.
        eval_df_meta = None

    # ----- Save Meta-Model Predictions -----
    if eval_df_meta is None:
        print(f" - Skipping save of Meta-Model predictions for {stock}: evaluation DataFrame is unavailable.")
    else:
        try:
            meta_pred_save_path = os.path.join(meta_model_dir, f'meta_predictions_{stock}.csv')
            eval_df_meta.to_csv(meta_pred_save_path, index=False)
            print(f" - Meta-Model predictions saved at '{meta_pred_save_path}'")
        except Exception as e:
            print(f" - Error saving Meta-Model predictions for {stock}: {e}")

    # ----- Store Meta-Model Performance -----
    # The metrics were already written to meta_model_per_stock in the evaluation step.
    print(f" - Meta-Model performance metrics stored for {stock}.")

    # ----- Detailed Descriptive Statistics -----
    try:
        df = pd.DataFrame({
            'Actual': y_test,
            'Predicted': meta_pred_test
        })
        descriptive_stats = df.describe().T
        correlation = df.corr().loc['Actual', 'Predicted']
        descriptive_stats['Correlation'] = correlation
        print(f"\nDescriptive Statistics for {stock}:")
        display(descriptive_stats)

        # Save Descriptive Statistics
        descriptive_stats.to_csv(os.path.join(meta_model_dir, f'detailed_metrics_{stock}.csv'))
        print(f" - Detailed metrics saved for {stock} at '{meta_model_dir}/detailed_metrics_{stock}.csv'")
    except Exception as e:
        print(f" - Error generating detailed statistics for {stock}: {e}")

# ----- Create Overall Metrics Table -----
# reindex (rather than [[...]] selection) keeps this working when no stock was
# evaluated: the result is then an empty table with the four metric columns.
overall_metrics_meta_df = pd.DataFrame(meta_model_per_stock).T
overall_metrics_meta_df = overall_metrics_meta_df.reindex(columns=['RMSE', 'MAE', 'R2', 'MAPE']).round(4)
if overall_metrics_meta_df.empty:
    print("\n - No meta-model was evaluated; the overall metrics table is empty.")

print("\n" + "="*50)
print("Overall Evaluation Metrics for All Stocks - Meta Stacked Model")
print("="*50)
display(overall_metrics_meta_df)
overall_metrics_meta_df.to_csv('overall_evaluation_metrics_meta_stacked.csv')
print("\n - Overall Evaluation Metrics table for Meta Stacked Model saved as 'overall_evaluation_metrics_meta_stacked.csv'.")


In [None]:
# Cell 12: Define Essential Functions

# Function to load all necessary models and scalers for a given stock
def load_models(stock):
    """Load every saved artifact needed to forecast one stock.

    Args:
        stock: Ticker symbol used in the artifact file names.

    Returns:
        tuple | None: ``(xgb_model, rf_model, lstm_model, gru_model, meta_model,
        scaler_X, scaler_y)`` on success. If anything raises while loading, the
        error is printed and ``None`` is returned.
    """
    try:
        # Artifact directories, relative to the notebook's working directory
        artifact_dirs = {
            'xgb': '../models/xgb_models',
            'rf': '../models/random_forest_models',
            'lstm': '../models/lstm_models',
            'gru': '../models/gru_models',
            'meta': '../models/meta_model',
            'scalers': '../models/scalers',
        }

        def _artifact(kind, file_name):
            return os.path.join(artifact_dirs[kind], file_name)

        # XGBoost regressor (JSON booster file)
        xgb_file = _artifact('xgb', f'xgb_{stock}_model.json')
        xgb_model = xgb.XGBRegressor()
        xgb_model.load_model(xgb_file)
        print(f" - Loaded XGBoost model from '{xgb_file}'")

        # Random Forest (joblib pickle)
        rf_file = _artifact('rf', f'rf_{stock}_model.pkl')
        rf_model = joblib.load(rf_file)
        print(f" - Loaded Random Forest model from '{rf_file}'")

        # LSTM (Keras format)
        lstm_file = _artifact('lstm', f'lstm_{stock}_best.keras')
        lstm_model = load_model(lstm_file)
        print(f" - Loaded LSTM model from '{lstm_file}'")

        # GRU (Keras format)
        gru_file = _artifact('gru', f'gru_{stock}_best.keras')
        gru_model = load_model(gru_file)
        print(f" - Loaded GRU model from '{gru_file}'")

        # Stacking meta-model (joblib pickle)
        meta_file = _artifact('meta', f'stacking_meta_model_{stock}.pkl')
        meta_model = joblib.load(meta_file)
        print(f" - Loaded Meta-Model from '{meta_file}'")

        # Feature and target scalers
        scaler_X_file = _artifact('scalers', f'minmax_scaler_X_{stock}.joblib')
        scaler_y_file = _artifact('scalers', f'minmax_scaler_y_{stock}.joblib')
        scaler_X = joblib.load(scaler_X_file)
        scaler_y = joblib.load(scaler_y_file)
        print(f" - Loaded Scalers from '{scaler_X_file}' and '{scaler_y_file}'")

        loaded = (xgb_model, rf_model, lstm_model, gru_model, meta_model, scaler_X, scaler_y)
        return loaded
    except Exception as e:
        print(f" - Error loading models or scalers for {stock}: {e}")
        return None

# Function to generate features for the next day forecast
def generate_next_day_features(current_data_unscaled, scaler_X, timesteps=60):
    """Prepare scaled model inputs from the most recent ``timesteps`` rows.

    Args:
        current_data_unscaled: DataFrame with a 'Close' column, optionally a
            'Date' column, and every feature listed in ``scaler_X.feature_names_in_``.
        scaler_X: Fitted feature scaler exposing ``feature_names_in_`` and ``transform``.
        timesteps: Number of trailing rows that form the sequence window.

    Returns:
        tuple: ``(xgb_rf_features_scaled, lstm_gru_features_scaled, latest_features)``:
        the scaled last row, the scaled window reshaped to ``(1, timesteps, -1)``,
        and the unscaled last feature row as a Series.

    Raises:
        ValueError: Fewer than ``timesteps`` rows are available.
        KeyError: 'Close' or any expected feature column is missing.
    """
    if len(current_data_unscaled) < timesteps:
        raise ValueError("Insufficient data to generate features.")

    window = current_data_unscaled.tail(timesteps).copy()
    if 'Close' not in window.columns:
        raise KeyError("'Close' column is missing in current_data_unscaled.")

    # The target and (when present) the date are not model inputs
    non_feature_columns = ['Close', 'Date'] if 'Date' in window.columns else ['Close']
    candidate_features = window.drop(columns=non_feature_columns)

    # Select the columns in the order the scaler was fitted with
    expected_features = scaler_X.feature_names_in_
    missing_features = set(expected_features) - set(candidate_features.columns)
    if missing_features:
        raise KeyError(f"Missing required columns: {missing_features}")
    ordered_features = candidate_features[expected_features]

    # Tabular models take the newest row only; sequence models take the whole window
    newest_row_scaled = scaler_X.transform(ordered_features.iloc[-1:].copy())
    window_scaled = scaler_X.transform(ordered_features).reshape(1, timesteps, -1)

    return newest_row_scaled, window_scaled, ordered_features.iloc[-1].copy()


In [None]:
# Cell 13: Initialize and Prepare Data

# Define Paths
raw_data_dir = '../data/stock_data'
forecast_save_dir = '../models/future_forecasts'
os.makedirs(forecast_save_dir, exist_ok=True)

# Initialize a dictionary to store models and scaled data for each stock
models_per_stock = {}
timesteps = 60  # look-back window length (same value as TIMESTEPS in Cell 11A)

for stock in stocks:
    print(f"\n{'='*50}\nProcessing Stock: {stock}\n{'='*50}")

    # Load models
    models = load_models(stock)
    if models is None:
        print(f" - Skipping stock '{stock}' due to model loading issues.")
        continue
    xgb_model, rf_model, lstm_model, gru_model, meta_model, scaler_X, scaler_y = models

    # Load raw data
    raw_csv_path = os.path.join(raw_data_dir, f"{stock}_daily.csv")
    if not os.path.exists(raw_csv_path):
        print(f" - Raw data CSV not found at '{raw_csv_path}'. Skipping.")
        continue

    try:
        df = pd.read_csv(raw_csv_path)
        print(f" - Loaded raw data from '{raw_csv_path}'. Shape: {df.shape}")
    except Exception as e:
        print(f" - Error reading CSV for {stock}: {e}")
        continue

    # Ensure 'Date' column is present
    if 'Date' not in df.columns:
        print(f" - 'Date' column missing in '{raw_csv_path}'. Skipping.")
        continue

    # Convert 'Date' to datetime
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    if df['Date'].isnull().any():
        print(f" - Some 'Date' entries could not be converted to datetime for {stock}. Dropping these rows.")
        df = df.dropna(subset=['Date'])

    # Sort by Date
    df = df.sort_values('Date').reset_index(drop=True)

    # Add Enhanced 'Close' Price-Based Features
    try:
        df_fe = add_close_price_features(df)
        print(f" - Applied enhanced 'Close' price-based feature engineering. Shape: {df_fe.shape}")
    except Exception as e:
        print(f" - Error during feature engineering for {stock}: {e}")
        continue

    # The forecasting step needs a full window; fail here with a clear message
    # instead of storing data that generate_next_day_features will reject.
    if len(df_fe) < timesteps:
        print(f" - Only {len(df_fe)} rows available after feature engineering for {stock}; {timesteps} are required. Skipping.")
        continue

    # Keep only necessary recent data
    current_data_unscaled = df_fe.tail(timesteps).copy()

    # Prepare features ('Date' and 'Close' are excluded here, so no later drop is needed)
    feature_cols = [col for col in current_data_unscaled.columns if col not in ['Date', 'Close']]
    X_current = current_data_unscaled[feature_cols].copy()

    # Ensure features are in the same order as during training
    expected_features = scaler_X.feature_names_in_

    # Identify missing features
    missing_features = set(expected_features) - set(X_current.columns)
    if missing_features:
        print(f" - Missing Features for {stock}: {missing_features}")
        # Add missing features with default values (e.g., 0)
        # NOTE(review): a constant 0 is a best-effort placeholder and may distort predictions.
        for feature in missing_features:
            X_current[feature] = 0  # Alternatively, use df_fe[feature].iloc[-1] or another strategy
        print(f" - Added missing features with default values for {stock}.")

    # Reorder columns to match expected_features
    X_current = X_current[expected_features]

    # Scale features and target
    # The scaled frame gets a fresh positional index, so 'Close' and 'Date' are
    # attached as plain arrays to avoid index alignment with the unscaled frame.
    try:
        X_current_scaled = pd.DataFrame(scaler_X.transform(X_current), columns=X_current.columns)
        current_data_scaled = X_current_scaled.copy()
        current_data_scaled['Close'] = scaler_y.transform(current_data_unscaled['Close'].values.reshape(-1, 1)).flatten()
        current_data_scaled['Date'] = current_data_unscaled['Date'].values
        print(f" - Scaled current data for {stock}")
    except Exception as e:
        print(f" - Error scaling data for {stock}: {e}")
        continue

    # Store prepared data in models_per_stock
    models_per_stock[stock] = {
        'models': models,
        'current_data_scaled': current_data_scaled,
        'current_data_unscaled': current_data_unscaled,
        'scaler_y': scaler_y,
        # 'y_train_scaled': y_train_scaled,  # Uncomment if available
        # 'y_test_scaled': y_test_scaled     # Uncomment if available
    }

    print(f" - Current data for {stock} loaded and prepared. Total samples: {len(current_data_scaled)}")


In [None]:
# Cell 14: Forecasting Future Prices with Corrected Feature Engineering and pd.concat()

def inverse_scale_prediction(scaler_y, prediction_scaled):
    """Convert one scaled prediction back to the target's original units.

    Args:
        scaler_y: Fitted target scaler exposing ``inverse_transform``.
        prediction_scaled: A single scaled value.

    Returns:
        The element at ``[0][0]`` of the inverse-transformed 1x1 input.
    """
    # The scaler works on 2-D input, so wrap the scalar as one sample with one feature.
    single_sample = [[prediction_scaled]]
    restored = scaler_y.inverse_transform(single_sample)
    return restored[0][0]

# Define directories
forecast_save_dir = '../models/future_forecasts'
os.makedirs(forecast_save_dir, exist_ok=True)

# Define forecast days
forecast_days = 30

# Look-back window fed to the sequence models (same value as `timesteps` in Cell 13)
forecast_timesteps = 60

# Column names and order the meta-model was fitted on in Cell 11A/11B
meta_feature_columns = ['XGB_Pred', 'RF_Pred', 'LSTM_Pred', 'GRU_Pred']

# Initialize a dictionary to store forecast results
forecast_results = {stock: [] for stock in models_per_stock.keys()}

# Start forecasting
for stock in models_per_stock.keys():
    print(f"\nStarting Forecasting for {stock}")
    try:
        models = models_per_stock[stock]['models']
        current_data_unscaled = models_per_stock[stock]['current_data_unscaled']

        # Tuple layout is fixed by load_models() in Cell 12
        xgb_model, rf_model, lstm_model, gru_model, meta_model, scaler_X, scaler_y = models

        # Create a copy of current_data_unscaled to update with predictions
        updated_unscaled = current_data_unscaled.copy()

        for day in range(1, forecast_days + 1):
            # Generate features for XGBoost and Random Forest
            xgb_rf_features_scaled, lstm_gru_features_scaled, latest_features = generate_next_day_features(
                updated_unscaled,
                scaler_X,
                timesteps=forecast_timesteps
            )

            # Make predictions with base models
            xgb_pred_scaled = xgb_model.predict(xgb_rf_features_scaled)[0]
            rf_pred_scaled = rf_model.predict(xgb_rf_features_scaled)[0]
            lstm_pred_scaled = lstm_model.predict(lstm_gru_features_scaled)[0][0]
            gru_pred_scaled = gru_model.predict(lstm_gru_features_scaled)[0][0]

            # Debug: Print scaled predictions
            print(f" - Day {day}:")
            print(f"   XGBoost_scaled = {xgb_pred_scaled}")
            print(f"   RF_scaled      = {rf_pred_scaled}")
            print(f"   LSTM_scaled    = {lstm_pred_scaled}")
            print(f"   GRU_scaled     = {gru_pred_scaled}")

            # Convert the base predictions to the original price scale.
            # The meta-model was trained (Cell 11A/11B) on inverse-transformed base
            # predictions and inverse-transformed targets, so it must receive
            # original-scale inputs here and its output is already a price.
            base_predictions = [
                inverse_scale_prediction(scaler_y, scaled_value)
                for scaled_value in (xgb_pred_scaled, rf_pred_scaled, lstm_pred_scaled, gru_pred_scaled)
            ]

            # Debug: Print base predictions in the original scale
            print(f"   Base predictions (original scale) = {base_predictions}")

            # Meta-model prediction
            # A DataFrame with the training column names keeps the feature order explicit.
            meta_features = pd.DataFrame([base_predictions], columns=meta_feature_columns)
            meta_pred = float(meta_model.predict(meta_features)[0])

            # Debug: Print the meta-prediction
            print(f"   Meta-model_prediction (original scale) = {meta_pred}")

            # Append the prediction
            forecast_results[stock].append(meta_pred)

            # Update the unscaled data with the new prediction
            # Assuming 'Close' is the target variable
            new_row = latest_features.copy()
            new_row['Close'] = meta_pred
            new_row['Date'] = updated_unscaled['Date'].max() + BDay(1)  # Increment date by 1 business day

            # Debug: Print the new row before feature engineering
            print(f"   New row before feature engineering:\n{new_row}")

            # Convert new_row to DataFrame
            new_row_df = pd.DataFrame([new_row])

            # Append the new row to updated_unscaled using pd.concat()
            updated_unscaled = pd.concat([updated_unscaled, new_row_df], ignore_index=True)

            # Recalculate any derived features based on the new 'Close' price
            # NOTE(review): add_close_price_features is defined outside this cell. If it
            # drops warm-up rows, fewer than `forecast_timesteps` rows remain and the
            # next iteration raises "Insufficient data to generate features." - confirm.
            updated_unscaled = add_close_price_features(updated_unscaled)

            # Ensure that only the last 'timesteps' rows are kept
            updated_unscaled = updated_unscaled.tail(forecast_timesteps).reset_index(drop=True)

            # Debug: Print the last few rows of updated_unscaled to verify updates
            print(f"   Updated 'Close' prices after Day {day}:")
            print(updated_unscaled['Close'].tail(5).values)

        # Verify the number of predictions
        num_predictions = len(forecast_results[stock])
        print(f"{stock}: Number of predictions = {num_predictions}, Expected = {forecast_days}")

        # Truncate excess predictions if any
        if num_predictions > forecast_days:
            print(f"{stock}: Truncating {num_predictions - forecast_days} excess predictions.")
            forecast_results[stock] = forecast_results[stock][:forecast_days]
        elif num_predictions < forecast_days:
            print(f"{stock}: Missing {forecast_days - num_predictions} predictions.")
            # Optionally, handle missing predictions
            # For now, we'll skip saving forecasts for this stock
            continue

        # Create Forecast DataFrame
        forecast_df = pd.DataFrame({
            'Day': range(1, forecast_days + 1),
            f'{stock}_Predicted': forecast_results[stock]
        })

        # Save Forecast
        forecast_save_path = os.path.join(forecast_save_dir, f'future_forecasts_{stock}.csv')
        forecast_df.to_csv(forecast_save_path, index=False)
        print(f" - Forecast saved at '{forecast_save_path}'")

    except Exception as e:
        # Predictions appended before the failure stay in forecast_results[stock].
        print(f" - Error forecasting for {stock}: {e}")

# Display a sample of forecast_results
for stock in models_per_stock.keys():
    preds = forecast_results.get(stock, [])
    if preds:
        print(f"\nSample Predictions for {stock}: {preds[:5]} ...")
    else:
        print(f"\nNo predictions available for {stock}.")


In [None]:
# Cell 15: Loading Forecasts and Historical Data

def load_forecast(stock, forecast_dir):
    """Read the saved forecast CSV for one stock.

    Looks for ``future_forecasts_<stock>.csv`` inside ``forecast_dir`` and
    prints a status line describing the outcome.

    Args:
        stock: Ticker symbol used to build the file name.
        forecast_dir: Directory that holds the forecast CSV files.

    Returns:
        The file contents as a DataFrame, or None when the file does not
        exist or cannot be read.
    """
    csv_path = os.path.join(forecast_dir, f'future_forecasts_{stock}.csv')

    if os.path.exists(csv_path):
        try:
            forecast_frame = pd.read_csv(csv_path)
            print(f" - Loaded forecast for {stock} from '{csv_path}'. Shape: {forecast_frame.shape}")
            return forecast_frame
        except Exception as err:
            # Best-effort loader: report the problem and let the caller skip this stock.
            print(f" - Error loading forecast for {stock}: {err}")
            return None

    print(f" - Forecast file for {stock} not found at '{csv_path}'.")
    return None

def load_historical_data(stock, raw_data_dir, validation_days=30):
    """Load a stock's daily price CSV and hold out the most recent rows for validation.

    Reads ``<raw_data_dir>/<stock>_daily.csv``, parses the ``Date`` column
    (rows whose date cannot be parsed are dropped), sorts the rows by date and
    splits off the last ``validation_days`` rows.

    Args:
        stock: Ticker symbol used to build the file name.
        raw_data_dir: Directory that holds the ``*_daily.csv`` files.
        validation_days: Number of trailing rows to hold out; must be >= 0.
            With 0 the training frame holds every row and the validation
            frame is empty.

    Returns:
        ``(training_df, validation_df)`` on success. ``(None, None)`` when the
        file is missing, ``validation_days`` is negative, there are fewer than
        ``validation_days + 1`` usable rows, or reading fails; a message is
        printed in each case.
    """
    historical_path = os.path.join(raw_data_dir, f"{stock}_daily.csv")
    if not os.path.exists(historical_path):
        print(f" - Historical data for {stock} not found at '{historical_path}'.")
        return None, None
    if validation_days < 0:
        print(f" - Invalid validation_days for {stock}: {validation_days}. Must be >= 0.")
        return None, None
    try:
        df = pd.read_csv(historical_path)
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.dropna(subset=['Date']).sort_values('Date').reset_index(drop=True)

        # Split into training and validation sets
        if len(df) < validation_days + 1:
            print(f" - Not enough data for {stock} to perform backtesting. Required: {validation_days + 1}, Available: {len(df)}")
            return None, None

        # Slice on an explicit position. Slicing with -validation_days breaks
        # for validation_days == 0 because -0 == 0: training would come back
        # empty and validation would contain every row.
        split_at = len(df) - validation_days
        training_df = df.iloc[:split_at].copy()
        validation_df = df.iloc[split_at:].copy()

        print(f" - Loaded historical data for {stock} from '{historical_path}'. Training Shape: {training_df.shape}, Validation Shape: {validation_df.shape}")
        return training_df, validation_df
    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip this stock.
        print(f" - Error loading historical data for {stock}: {e}")
        return None, None

# Locations of the raw daily price CSVs and the saved forecast CSVs
# (both relative to the notebook; adjust if the layout differs).
raw_data_dir = '../data/stock_data'
forecast_dir = '../models/future_forecasts'

# Tickers to load
stocks = ['AAPL', 'MSFT', 'GOOGL', 'AMZN']

# Per-ticker results: forecasts[stock] is a DataFrame or None,
# historicals[stock] is the (training_df, validation_df) pair.
forecasts, historicals = {}, {}

# Forecast first, then history, so the status lines keep the same order per stock
for stock in stocks:
    print(f"\nLoading data for {stock}:")
    forecasts[stock], historicals[stock] = (
        load_forecast(stock, forecast_dir),
        load_historical_data(stock, raw_data_dir, validation_days=30),
    )

# Optional sanity check: preview whatever was loaded for each stock
for stock in stocks:
    print(f"\nInspecting data for {stock}:")
    forecast_df = forecasts.get(stock)
    historical_data = historicals.get(stock)

    # A missing entry is handled like a failed load: no training/validation frames
    training_df, validation_df = (None, None) if historical_data is None else historical_data

    if forecast_df is not None:
        print("Forecast DataFrame Head:", forecast_df.head(), sep="\n")

    if training_df is not None:
        print("Training DataFrame Tail:", training_df.tail(), sep="\n")

    if validation_df is not None:
        print("Validation DataFrame Tail:", validation_df.tail(), sep="\n")


In [None]:
# Import necessary libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay

# Define directories (adjust paths as necessary).
# forecast_dir is where load_forecast looks for 'future_forecasts_<stock>.csv';
# raw_data_dir is where load_historical_data looks for '<stock>_daily.csv'.
# Both are relative paths.
forecast_dir = '../models/future_forecasts'
raw_data_dir = '../data/stock_data'  # Ensure this path is correct relative to your notebook

# Function to load forecasts
def load_forecast(stock, forecast_dir):
    """Read the saved forecast CSV for one stock.

    Looks for ``future_forecasts_<stock>.csv`` inside ``forecast_dir`` and
    prints a status line describing the outcome.

    Args:
        stock: Ticker symbol used to build the file name.
        forecast_dir: Directory that holds the forecast CSV files.

    Returns:
        The file contents as a DataFrame, or None when the file does not
        exist or cannot be read.
    """
    csv_path = os.path.join(forecast_dir, f'future_forecasts_{stock}.csv')

    if os.path.exists(csv_path):
        try:
            forecast_frame = pd.read_csv(csv_path)
            print(f" - Loaded forecast for {stock} from '{csv_path}'. Shape: {forecast_frame.shape}")
            return forecast_frame
        except Exception as err:
            # Best-effort loader: report the problem and let the caller skip this stock.
            print(f" - Error loading forecast for {stock}: {err}")
            return None

    print(f" - Forecast file for {stock} not found at '{csv_path}'.")
    return None

# Function to load historical data
def load_historical_data(stock, raw_data_dir, validation_days=30):
    """Load a stock's daily price CSV and hold out the most recent rows for validation.

    Reads ``<raw_data_dir>/<stock>_daily.csv``, parses the ``Date`` column
    (rows whose date cannot be parsed are dropped), sorts the rows by date and
    splits off the last ``validation_days`` rows.

    Args:
        stock: Ticker symbol used to build the file name.
        raw_data_dir: Directory that holds the ``*_daily.csv`` files.
        validation_days: Number of trailing rows to hold out; must be >= 0.
            With 0 the training frame holds every row and the validation
            frame is empty.

    Returns:
        ``(training_df, validation_df)`` on success. ``(None, None)`` when the
        file is missing, ``validation_days`` is negative, there are fewer than
        ``validation_days + 1`` usable rows, or reading fails; a message is
        printed in each case.
    """
    historical_path = os.path.join(raw_data_dir, f"{stock}_daily.csv")
    if not os.path.exists(historical_path):
        print(f" - Historical data for {stock} not found at '{historical_path}'.")
        return None, None
    if validation_days < 0:
        print(f" - Invalid validation_days for {stock}: {validation_days}. Must be >= 0.")
        return None, None
    try:
        df = pd.read_csv(historical_path)
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.dropna(subset=['Date']).sort_values('Date').reset_index(drop=True)

        # Split into training and validation sets
        if len(df) < validation_days + 1:
            print(f" - Not enough data for {stock} to perform backtesting. Required: {validation_days + 1}, Available: {len(df)}")
            return None, None

        # Slice on an explicit position. Slicing with -validation_days breaks
        # for validation_days == 0 because -0 == 0: training would come back
        # empty and validation would contain every row.
        split_at = len(df) - validation_days
        training_df = df.iloc[:split_at].copy()
        validation_df = df.iloc[split_at:].copy()

        print(f" - Loaded historical data for {stock} from '{historical_path}'. Training Shape: {training_df.shape}, Validation Shape: {validation_df.shape}")
        return training_df, validation_df
    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip this stock.
        print(f" - Error loading historical data for {stock}: {e}")
        return None, None

# Per-ticker results: forecasts[stock] is a DataFrame or None,
# historicals[stock] is the (training_df, validation_df) pair.
forecasts, historicals = {}, {}

# Forecast first, then history, so the status lines keep the same order per stock
for stock in stocks:
    print(f"\nLoading data for {stock}:")
    forecasts[stock], historicals[stock] = (
        load_forecast(stock, forecast_dir),
        load_historical_data(stock, raw_data_dir, validation_days=30),
    )

# pandas display settings: the tables printed below are shown in full
pd.set_option('display.float_format', '{:.6f}'.format)  # six decimals for floats
pd.set_option('display.width', None)                    # no line wrapping
pd.set_option('display.max_columns', None)              # every column
pd.set_option('display.max_rows', None)                 # every row

# Plot each stock's last year of closing prices together with its saved forecast.

# Business-day offset that skips weekends and US federal holidays. It is the
# same for every stock, so it is built once instead of on every iteration.
# NOTE(review): the federal calendar is not the exchange calendar (for example,
# Good Friday is a market holiday but not a federal one), so a forecast date can
# occasionally land on a non-trading day - confirm whether that matters here.
us_bd = CustomBusinessDay(calendar=USFederalHolidayCalendar())

for stock in stocks:
    forecast_df = forecasts.get(stock)
    historical_data = historicals.get(stock)

    # load_historical_data signals failure with the tuple (None, None), which is
    # itself not None. Unpack first and test the frames; otherwise pd.concat
    # below receives two None values and raises ValueError.
    training_df, validation_df = historical_data if historical_data is not None else (None, None)

    if forecast_df is None or training_df is None or validation_df is None:
        print(f"Data not available for {stock}.")
        continue

    # Stitch training and validation back into one chronological history
    historical_df = pd.concat([training_df, validation_df], ignore_index=True)
    historical_df['Date'] = pd.to_datetime(historical_df['Date'])

    # The forecast starts on the first business day after the last known date
    last_historical_date = historical_df['Date'].max()
    forecast_days = forecast_df.shape[0]
    future_dates = pd.date_range(
        start=last_historical_date + pd.Timedelta(days=1),
        periods=forecast_days,
        freq=us_bd,
    )

    # Attach dates and a stock-independent column name to the forecast.
    # This intentionally updates the frame stored in `forecasts`, as before.
    forecast_df['Date'] = future_dates
    forecast_df.rename(columns={f'{stock}_Predicted': 'Predicted_Close'}, inplace=True)

    # Outer merge: historical rows carry 'Close', forecast rows carry 'Predicted_Close'
    combined_df = (
        pd.merge(
            historical_df[['Date', 'Close']],
            forecast_df[['Date', 'Predicted_Close']],
            on='Date',
            how='outer',
        )
        .sort_values('Date')
        .reset_index(drop=True)
    )

    # Focus on the last year of history plus the forecast horizon
    one_year_ago = last_historical_date - pd.DateOffset(years=1)
    mask = combined_df['Date'] >= one_year_ago
    combined_df_last_year = combined_df.loc[mask].reset_index(drop=True)

    # Print out the combined DataFrame for the last year
    print(f"\nCombined Data for {stock} - Last Year:")
    print(combined_df_last_year[['Date', 'Close', 'Predicted_Close']].to_string(index=False))

    # Plot actual vs. predicted closes, marking where the forecast begins
    fig, ax = plt.subplots(figsize=(14, 7))
    ax.plot(combined_df_last_year['Date'], combined_df_last_year['Close'], label='Actual Close Prices')
    ax.plot(combined_df_last_year['Date'], combined_df_last_year['Predicted_Close'], label='Predicted Close Prices', linestyle='--')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.set_title(f'Stock Price Prediction for {stock} - Last Year')
    ax.axvline(x=last_historical_date, color='grey', linestyle='--', label='Forecast Start')
    ax.legend()
    plt.show()
    # Release the figure so one-figure-per-stock does not accumulate in memory
    plt.close(fig)
