In [1]:
# Cell 1 : Standard Library Imports

import os
import time
import logging
import joblib
import warnings
import random
from datetime import datetime, timedelta

# Third-Party Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import xgboost as xgb
import yfinance as yf
import keras_tuner as kt
import matplotlib as mpl
import matplotlib.dates as mdates
import ta
from pandas.tseries.offsets import BDay
import tensorflow as tf
from sklearn.linear_model import RidgeCV
import pandas_market_calendars as mcal
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, GRU, Bidirectional, Input, Dropout, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.metrics import (mean_squared_error, mean_absolute_error, 
                             mean_absolute_percentage_error, r2_score)
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import skew, kurtosis, shapiro
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout, MultiHeadAttention, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from xgboost import XGBRegressor

In [2]:
# Cell 2: Fetch the Stock Data (Time-series Only)

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize dictionaries to store dataframes
daily_data_dict = {}

# List of stocks to fetch data for
stocks = ['AAPL', 'MSFT', 'GOOGL', 'AMZN']

# Define the time frames for data
end_date = datetime.now()
start_date_daily = end_date - timedelta(days=10*365)   # 10 years of daily data

# Create directories for the data
os.makedirs('../data/stock_data', exist_ok=True)

# Function to fetch stock data
def fetch_stock_data(ticker, start, end, interval):
    try:
        data = yf.download(ticker, start=start, end=end, interval=interval)
        if data.empty:
            logging.warning(f"No data retrieved for {ticker} from {start} to {end} with interval {interval}")
        return data.drop(columns=['Adj Close'], errors='ignore')
    except Exception as e:
        logging.error(f"Error fetching data for {ticker}: {e}")
        return pd.DataFrame()

# Fetch and save daily time-series data
for stock in tqdm(stocks, desc="Fetching stocks data"):

    # Daily Data (10 years)
    daily_data = fetch_stock_data(stock, start_date_daily, end_date, '1d')
    if not daily_data.empty:
        daily_data_dict[stock] = daily_data
        daily_data.to_csv(f'../data/stock_data/{stock}_daily.csv', index=True)

    # Add a delay to avoid API rate limits
    time.sleep(2)

print("Time-series data fetching and saving complete.")

[*********************100%%**********************]  1 of 1 completed00<?, ?it/s]
[*********************100%%**********************]  1 of 1 completed,  2.31s/it]
[*********************100%%**********************]  1 of 1 completed,  2.24s/it]
[*********************100%%**********************]  1 of 1 completed,  2.24s/it]
Fetching stocks data: 100%|███████████████████████| 4/4 [00:08<00:00,  2.23s/it]

Time-series data fetching and saving complete.





In [3]:
# Cell 3: Feature Engineering

def add_close_price_features(df):
    # Ensure the 'Close' column is numeric
    df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
    
    # Drop rows with NaN 'Close' values
    df.dropna(subset=['Close'], inplace=True)
    
    # Sort by 'Date' if not already sorted
    df.sort_values('Date', inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    # Moving Averages with min_periods
    df['SMA_5'] = df['Close'].rolling(window=5, min_periods=1).mean()
    df['SMA_10'] = df['Close'].rolling(window=10, min_periods=1).mean()
    df['SMA_20'] = df['Close'].rolling(window=20, min_periods=1).mean()
    df['EMA_5'] = df['Close'].ewm(span=5, adjust=False, min_periods=1).mean()
    df['EMA_10'] = df['Close'].ewm(span=10, adjust=False, min_periods=1).mean()
    df['EMA_20'] = df['Close'].ewm(span=20, adjust=False, min_periods=1).mean()

    # Momentum Indicators
    df['Momentum_5'] = df['Close'] - df['Close'].shift(5)
    df['Momentum_10'] = df['Close'] - df['Close'].shift(10)
    df['ROC_5'] = df['Close'].pct_change(periods=5)
    df['ROC_10'] = df['Close'].pct_change(periods=10)
    
    # Volatility Indicators with min_periods
    df['Volatility_5'] = df['Close'].rolling(window=5, min_periods=1).std()
    df['Volatility_10'] = df['Close'].rolling(window=10, min_periods=1).std()
    
    # Relative Strength Index (RSI) with min_periods
    delta = df['Close'].diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    average_gain = gain.rolling(window=14, min_periods=1).mean()
    average_loss = loss.rolling(window=14, min_periods=1).mean()
    rs = average_gain / (average_loss + 1e-10)  # Add small constant to avoid division by zero
    df['RSI_14'] = 100 - (100 / (1 + rs))
    
    # Moving Average Convergence Divergence (MACD)
    exp1 = df['Close'].ewm(span=12, adjust=False, min_periods=1).mean()
    exp2 = df['Close'].ewm(span=26, adjust=False, min_periods=1).mean()
    df['MACD'] = exp1 - exp2
    df['MACD_signal'] = df['MACD'].ewm(span=9, adjust=False, min_periods=1).mean()
    df['MACD_diff'] = df['MACD'] - df['MACD_signal']
    
    # Bollinger Bands with min_periods
    df['Middle_Band'] = df['Close'].rolling(window=20, min_periods=1).mean()
    df['Std_Dev'] = df['Close'].rolling(window=20, min_periods=1).std()
    df['Upper_Band'] = df['Middle_Band'] + (df['Std_Dev'] * 2)
    df['Lower_Band'] = df['Middle_Band'] - (df['Std_Dev'] * 2)
    df['Bollinger_Width'] = df['Upper_Band'] - df['Lower_Band']
    
    # Percent B (%B) Indicator
    df['Percent_B'] = (df['Close'] - df['Lower_Band']) / (df['Upper_Band'] - df['Lower_Band'] + 1e-10)
    
    # Simplified Williams %R with min_periods
    df['Highest_Close_14'] = df['Close'].rolling(window=14, min_periods=1).max()
    df['Lowest_Close_14'] = df['Close'].rolling(window=14, min_periods=1).min()
    df['Williams_%R'] = ((df['Highest_Close_14'] - df['Close']) / (df['Highest_Close_14'] - df['Lowest_Close_14'] + 1e-10)) * -100
    
    # Exponential Moving Average Differences
    df['EMA_5_10_Diff'] = df['EMA_5'] - df['EMA_10']
    df['EMA_5_20_Diff'] = df['EMA_5'] - df['EMA_20']
    
    # Lag Features
    df['Lag_Close_1'] = df['Close'].shift(1)
    df['Lag_Close_2'] = df['Close'].shift(2)
    df['Lag_Close_3'] = df['Close'].shift(3)
    
    # Rolling Statistics with min_periods
    df['Rolling_Skew_Close_5'] = df['Close'].rolling(window=5, min_periods=1).skew()
    df['Rolling_Kurt_Close_5'] = df['Close'].rolling(window=5, min_periods=1).kurt()
    
    # Handle NaN values appropriately
    # Replace deprecated fillna methods with ffill() and bfill()
    df.ffill(inplace=True)
    df.bfill(inplace=True)
    
    # Reset index after processing
    df.reset_index(drop=True, inplace=True)
    
    return df

# Apply the function to each stock's daily data
for stock in stocks:
    df_daily = daily_data_dict[stock].copy()
    
    # Reset index if 'Date' is not a column
    if 'Date' not in df_daily.columns:
        df_daily.reset_index(inplace=True)
    
    # Ensure 'Date' is of datetime type
    df_daily['Date'] = pd.to_datetime(df_daily['Date'])
    
    # Sort by 'Date'
    df_daily.sort_values('Date', inplace=True)
    df_daily.reset_index(drop=True, inplace=True)
    
    # Add enhanced 'Close' price-based features
    df_daily_with_features = add_close_price_features(df_daily)
    
    # Log information
    logging.info(f"'Close' price-based technical indicators added for daily data of {stock}")
    logging.info(f"Sample features for {stock}:\n{df_daily_with_features.tail(5)}")
    
    # Update the dictionary
    daily_data_dict[stock] = df_daily_with_features

print("Feature engineering complete. Data is ready for splitting into training and testing sets.")

2024-10-30 01:27:13,181 - INFO - 'Close' price-based technical indicators added for daily data of AAPL
2024-10-30 01:27:13,188 - INFO - Sample features for AAPL:
           Date        Open        High         Low       Close    Volume  \
2509 2024-10-23  234.080002  235.139999  227.759995  230.759995  52287000   
2510 2024-10-24  229.979996  230.820007  228.410004  230.570007  31109500   
2511 2024-10-25  229.740005  233.220001  229.570007  231.410004  38802300   
2512 2024-10-28  233.320007  234.729996  232.550003  233.399994  36087100   
2513 2024-10-29  233.100006  234.330002  232.320007  233.669998  35332800   

           SMA_5      SMA_10    SMA_20       EMA_5  ...  Highest_Close_14  \
2509  234.049997  232.376999  229.7270  233.270769  ...        236.479996   
2510  233.734000  232.530000  229.8795  232.370515  ...        236.479996   
2511  233.016000  232.916000  230.0605  232.050345  ...        236.479996   
2512  232.400000  233.125999  230.0805  232.500228  ...        236.

Feature engineering complete. Data is ready for splitting into training and testing sets.


In [4]:
# Cell 4: Splitting Data into Training and Testing Sets (Modified to Include 'test_dates')

# Function to split time series data
def split_time_series_data(df, date_column='Date', target_column='Close', split_ratio=0.8):
    # Sort the DataFrame by the date/time column
    df_sorted = df.sort_values(by=date_column).reset_index(drop=True)
    
    # Determine the split index
    split_index = int(len(df_sorted) * split_ratio)
    
    # Split the data
    train_df = df_sorted.iloc[:split_index]
    test_df = df_sorted.iloc[split_index:]

    # Prepare features and target
    feature_cols = [col for col in df.columns if col not in [date_column, target_column]]
    
    X_train = train_df[feature_cols]
    y_train = train_df[target_column]
    
    X_test = test_df[feature_cols]
    y_test = test_df[target_column]
    
    # Extract 'test_dates'
    test_dates = test_df[date_column].reset_index(drop=True)
    
    return X_train, X_test, y_train, y_test, feature_cols, test_dates

# Initialize a dictionary to hold split data for each stock
split_data_dict = {}

# Apply the function to each stock's data
for stock in stocks:
    df = daily_data_dict[stock].copy()
    
    # Verify that the DataFrame is not empty
    if df.empty:
        logging.warning(f"The DataFrame for {stock} is empty. Skipping.")
        continue
    
    # Split the data
    X_train, X_test, y_train, y_test, feature_cols, test_dates = split_time_series_data(
        df,
        date_column='Date',
        target_column='Close',
        split_ratio=0.8
    )
    
    # Store the split data in the dictionary
    split_data_dict[stock] = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'feature_columns': feature_cols,
        'test_dates': test_dates  # Include 'test_dates'
    }
    
    logging.info(f"Data for {stock} has been split into training and testing sets.")

# Function to verify the split data
def verify_split(split_dict, stocks):
    for stock in stocks:
        data = split_dict.get(stock)
        if data is None:
            print(f"No data found for {stock}.")
            continue
        
        X_train = data.get('X_train')
        X_test = data.get('X_test')
        y_train = data.get('y_train')
        y_test = data.get('y_test')
        feature_columns = data.get('feature_columns', [])
        test_dates = data.get('test_dates', None)
        
        # Check if any of the datasets are None or empty
        if X_train is None or X_train.empty:
            print(f"X_train is None or empty for {stock}.")
            continue
        if X_test is None or X_test.empty:
            print(f"X_test is None or empty for {stock}.")
            continue
        if y_train is None or y_train.empty:
            print(f"y_train is None or empty for {stock}.")
            continue
        if y_test is None or y_test.empty:
            print(f"y_test is None or empty for {stock}.")
            continue
        if test_dates is None or test_dates.empty:
            print(f"test_dates is None or empty for {stock}.")
            continue
        
        print(f"Verifying data split for {stock}:")
        print(f" - Training set size: {X_train.shape[0]} samples")
        print(f" - Testing set size: {X_test.shape[0]} samples")
        print(f" - Number of features: {len(feature_columns)}")
        print(f" - Feature columns:\n{feature_columns}")
        print(f" - Test Dates (first 5): {test_dates.head().tolist()}")
        print("-" * 80)

# Verify the data splitting for each stock
print("\nVerifying Data Splitting:")
verify_split(split_data_dict, stocks)

print("\nData splitting complete. Ready for scaling in the next cell.")


2024-10-30 01:27:13,241 - INFO - Data for AAPL has been split into training and testing sets.
2024-10-30 01:27:13,243 - INFO - Data for MSFT has been split into training and testing sets.
2024-10-30 01:27:13,245 - INFO - Data for GOOGL has been split into training and testing sets.
2024-10-30 01:27:13,246 - INFO - Data for AMZN has been split into training and testing sets.



Verifying Data Splitting:
Verifying data split for AAPL:
 - Training set size: 2011 samples
 - Testing set size: 503 samples
 - Number of features: 36
 - Feature columns:
['Open', 'High', 'Low', 'Volume', 'SMA_5', 'SMA_10', 'SMA_20', 'EMA_5', 'EMA_10', 'EMA_20', 'Momentum_5', 'Momentum_10', 'ROC_5', 'ROC_10', 'Volatility_5', 'Volatility_10', 'RSI_14', 'MACD', 'MACD_signal', 'MACD_diff', 'Middle_Band', 'Std_Dev', 'Upper_Band', 'Lower_Band', 'Bollinger_Width', 'Percent_B', 'Highest_Close_14', 'Lowest_Close_14', 'Williams_%R', 'EMA_5_10_Diff', 'EMA_5_20_Diff', 'Lag_Close_1', 'Lag_Close_2', 'Lag_Close_3', 'Rolling_Skew_Close_5', 'Rolling_Kurt_Close_5']
 - Test Dates (first 5): [Timestamp('2022-10-28 00:00:00'), Timestamp('2022-10-31 00:00:00'), Timestamp('2022-11-01 00:00:00'), Timestamp('2022-11-02 00:00:00'), Timestamp('2022-11-03 00:00:00')]
--------------------------------------------------------------------------------
Verifying data split for MSFT:
 - Training set size: 2011 samples

In [5]:
# Cell 5: Scaling the Data Using MinMaxScaler

# Function to scale data using MinMaxScaler for both features and targets
def scale_data_with_target(split_data_dict):
    scaled_data_dict = {}

    # Directories to save scalers
    scaler_save_dir = '../models/scalers'
    os.makedirs(scaler_save_dir, exist_ok=True)
    
    for stock, data in split_data_dict.items():
        logging.info(f"Scaling data for {stock}...")
        
        X_train = data['X_train']
        X_test = data['X_test']
        y_train = data['y_train'].values.reshape(-1, 1)  # Reshape for scaler
        y_test = data['y_test'].values.reshape(-1, 1)
        test_dates = data.get('test_dates')  # Retrieve 'test_dates'
        
        # Initialize scalers
        scaler_X = MinMaxScaler()
        scaler_y = MinMaxScaler()
        
        # Fit scalers on training data and transform both training and testing data
        X_train_scaled = pd.DataFrame(scaler_X.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
        X_test_scaled = pd.DataFrame(scaler_X.transform(X_test), columns=X_test.columns, index=X_test.index)
        
        y_train_scaled = scaler_y.fit_transform(y_train).flatten()  # Flatten to 1D array
        y_test_scaled = scaler_y.transform(y_test).flatten()
        
        # Save scalers using joblib
        scaler_X_path = os.path.join(scaler_save_dir, f'minmax_scaler_X_{stock}.joblib')
        scaler_y_path = os.path.join(scaler_save_dir, f'minmax_scaler_y_{stock}.joblib')
        joblib.dump(scaler_X, scaler_X_path)
        joblib.dump(scaler_y, scaler_y_path)
        logging.info(f"Scalers saved for {stock} at {scaler_X_path} and {scaler_y_path}.")
        
        # ----- Include 'test_dates' -----
        if test_dates is not None and not test_dates.empty:
            logging.info(f"'test_dates' found for {stock}. Including in scaled data.")
        else:
            logging.warning(f"No 'test_dates' found for {stock}. Creating dummy dates.")
            test_dates = pd.date_range(start='2020-01-01', periods=len(y_test_scaled), freq='D')
        
        # Update the scaled data dictionary with 'test_dates'
        scaled_data_dict[stock] = {
            'X_train_scaled': X_train_scaled,
            'X_test_scaled': X_test_scaled,
            'y_train_scaled': y_train_scaled,
            'y_test_scaled': y_test_scaled,
            'scaler_X': scaler_X,
            'scaler_y': scaler_y,
            'feature_columns': data['feature_columns'],
            'test_dates': test_dates  # Include 'test_dates'
        }
        
        logging.info(f"Completed scaling for {stock}.")
    
    return scaled_data_dict

# Scale the split daily data with target
logging.info("Starting to scale Daily Data with target...")
scaled_daily_data = scale_data_with_target(split_data_dict)
logging.info("Completed scaling Daily Data with target.")

# Verification
print("\nVerifying Scaled Data for Daily Data (Including 'test_dates'):")
for stock in scaled_daily_data.keys():
    data = scaled_daily_data[stock]
    X_train_scaled = data['X_train_scaled']
    X_test_scaled = data['X_test_scaled']
    y_train_scaled = data['y_train_scaled']
    y_test_scaled = data['y_test_scaled']
    test_dates = data['test_dates']
    
    print(f"Scaled data for {stock}:")
    print(f" - Scaled Training set shape: {X_train_scaled.shape}, Scaled Training targets shape: {y_train_scaled.shape}")
    print(f" - Scaled Testing set shape: {X_test_scaled.shape}, Scaled Testing targets shape: {y_test_scaled.shape}")
    print(f" - Feature columns: {X_train_scaled.columns.tolist()}")
    
    # Handle 'test_dates' based on its type
    if isinstance(test_dates, pd.DatetimeIndex):
        # Slice the first five dates and convert to a list
        test_dates_list = test_dates[:5].tolist()
    elif isinstance(test_dates, pd.Series):
        # Use head() if it's a Series
        test_dates_list = test_dates.head(5).tolist()
    else:
        # Convert to list and slice if it's another type
        test_dates_list = list(test_dates)[:5]
    
    print(f" - Test Dates (first 5): {test_dates_list}")
    print("-" * 80)


2024-10-30 01:27:13,255 - INFO - Starting to scale Daily Data with target...
2024-10-30 01:27:13,256 - INFO - Scaling data for AAPL...
2024-10-30 01:27:13,262 - INFO - Scalers saved for AAPL at ../models/scalers/minmax_scaler_X_AAPL.joblib and ../models/scalers/minmax_scaler_y_AAPL.joblib.
2024-10-30 01:27:13,262 - INFO - 'test_dates' found for AAPL. Including in scaled data.
2024-10-30 01:27:13,262 - INFO - Completed scaling for AAPL.
2024-10-30 01:27:13,262 - INFO - Scaling data for MSFT...
2024-10-30 01:27:13,271 - INFO - Scalers saved for MSFT at ../models/scalers/minmax_scaler_X_MSFT.joblib and ../models/scalers/minmax_scaler_y_MSFT.joblib.
2024-10-30 01:27:13,272 - INFO - 'test_dates' found for MSFT. Including in scaled data.
2024-10-30 01:27:13,272 - INFO - Completed scaling for MSFT.
2024-10-30 01:27:13,272 - INFO - Scaling data for GOOGL...
2024-10-30 01:27:13,279 - INFO - Scalers saved for GOOGL at ../models/scalers/minmax_scaler_X_GOOGL.joblib and ../models/scalers/minmax_sc


Verifying Scaled Data for Daily Data (Including 'test_dates'):
Scaled data for AAPL:
 - Scaled Training set shape: (2011, 36), Scaled Training targets shape: (2011,)
 - Scaled Testing set shape: (503, 36), Scaled Testing targets shape: (503,)
 - Feature columns: ['Open', 'High', 'Low', 'Volume', 'SMA_5', 'SMA_10', 'SMA_20', 'EMA_5', 'EMA_10', 'EMA_20', 'Momentum_5', 'Momentum_10', 'ROC_5', 'ROC_10', 'Volatility_5', 'Volatility_10', 'RSI_14', 'MACD', 'MACD_signal', 'MACD_diff', 'Middle_Band', 'Std_Dev', 'Upper_Band', 'Lower_Band', 'Bollinger_Width', 'Percent_B', 'Highest_Close_14', 'Lowest_Close_14', 'Williams_%R', 'EMA_5_10_Diff', 'EMA_5_20_Diff', 'Lag_Close_1', 'Lag_Close_2', 'Lag_Close_3', 'Rolling_Skew_Close_5', 'Rolling_Kurt_Close_5']
 - Test Dates (first 5): [Timestamp('2022-10-28 00:00:00'), Timestamp('2022-10-31 00:00:00'), Timestamp('2022-11-01 00:00:00'), Timestamp('2022-11-02 00:00:00'), Timestamp('2022-11-03 00:00:00')]
------------------------------------------------------

In [6]:
# Cell 6: Training and Evaluating LSTM Models for Daily Data

# Set random seeds for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

# Define Parameters
TIMESTEPS = 60  # Number of past days to use for prediction
BATCH_SIZE = 32
EPOCHS = 100  # Increased to allow more training
VALIDATION_SPLIT = 0.1  # Fraction of training data to use for validation

# Define Evaluation Metrics Function
def evaluate_model(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, r2

# Function to Create Sequences
def create_sequences(X, y, timesteps):
    X_seq, y_seq = [], []
    for i in range(timesteps, len(X)):
        X_seq.append(X[i-timesteps:i].values)
        y_seq.append(y[i])
    return np.array(X_seq), np.array(y_seq)

# Function to Build LSTM Model
def build_lstm_model(input_shape):
    model = Sequential()
    model.add(Input(shape=input_shape)) # Explicit Input Layer
    model.add(LSTM(units=50, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(units=25, activation='relu'))
    model.add(Dense(units=1))  # Output layer
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Initialize a dictionary to store model performance
model_performance = {}

# Directories to save models and scalers
model_save_dir = '../models/lstm_models'
scaler_save_dir = '../models/scalers'
os.makedirs(model_save_dir, exist_ok=True)
os.makedirs(scaler_save_dir, exist_ok=True)

# Iterate Through Each Stock
for stock in scaled_daily_data.keys():
    print(f"\n{'='*50}\nTraining LSTM Model for {stock}\n{'='*50}")
    
    # Retrieve Scaled Data
    data = scaled_daily_data[stock]
    X_train_scaled = data['X_train_scaled']
    X_test_scaled = data['X_test_scaled']
    y_train_scaled = data['y_train_scaled']
    y_test_scaled = data['y_test_scaled']
    scaler_X = data['scaler_X']
    scaler_y = data['scaler_y']
    
    # Create Sequences
    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, TIMESTEPS)
    X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, TIMESTEPS)
    
    print(f" - Training sequences: {X_train_seq.shape}, Training targets: {y_train_seq.shape}")
    print(f" - Testing sequences: {X_test_seq.shape}, Testing targets: {y_test_seq.shape}")
    
    # Build the Model
    model = build_lstm_model(input_shape=(X_train_seq.shape[1], X_train_seq.shape[2]))
    model.summary()

    # Define Callbacks
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    checkpoint = ModelCheckpoint(
        filepath=os.path.join(model_save_dir, f'lstm_{stock}_best.keras'),  # Changed to .keras
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    )

    # Train the Model
    history = model.fit(
        X_train_seq, y_train_seq,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=VALIDATION_SPLIT,
        callbacks=[early_stop, checkpoint],
        verbose=1
    )
    
    # Load the Best Model
    best_model_path = os.path.join(model_save_dir, f'lstm_{stock}_best.keras')
    model = load_model(best_model_path)
    print(f" - Loaded best model from {best_model_path}")
    
    # Predict on Test Data
    predictions_scaled = model.predict(X_test_seq).flatten()
    
    # Inverse Transform Predictions and Targets
    predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
    y_test = scaler_y.inverse_transform(y_test_seq.reshape(-1, 1)).flatten()
    
    # Ensure Consistent Lengths
    print(f" - Length of y_test: {len(y_test)}")
    print(f" - Length of predictions: {len(predictions)}")
    
    # Evaluation Metrics
    rmse, mae, r2 = evaluate_model(y_test, predictions)
    model_performance[stock] = {'RMSE': rmse, 'MAE': mae, 'R2': r2}
    
    print(f" - Evaluation Metrics for {stock}: RMSE = {rmse:.4f}, MAE = {mae:.4f}, R2 = {r2:.4f}")
    
    print(f"Model training and evaluation completed for {stock}.\n")

# Summary of Model Performance
print(f"\n{'='*50}\nSummary of Model Performance\n{'='*50}")
for stock, metrics in model_performance.items():
    print(f"{stock}: RMSE = {metrics['RMSE']:.4f}, MAE = {metrics['MAE']:.4f}, R2 = {metrics['R2']:.4f}")



Training LSTM Model for AAPL
 - Training sequences: (1951, 60, 36), Training targets: (1951,)
 - Testing sequences: (443, 60, 36), Testing targets: (443,)


Epoch 1/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0258
Epoch 1: val_loss improved from inf to 0.00551, saving model to ../models/lstm_models/lstm_AAPL_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - loss: 0.0255 - val_loss: 0.0055
Epoch 2/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0026
Epoch 2: val_loss did not improve from 0.00551
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - loss: 0.0026 - val_loss: 0.0103
Epoch 3/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0022
Epoch 3: val_loss improved from 0.00551 to 0.00426, saving model to ../models/lstm_models/lstm_AAPL_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - loss: 0.0022 - val_loss: 0.0043
Epoch 4/100
[1m53/55[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - loss: 0.0019
Epoch 4

Epoch 1/100
[1m53/55[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - loss: 0.0408
Epoch 1: val_loss improved from inf to 0.00422, saving model to ../models/lstm_models/lstm_MSFT_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 0.0395 - val_loss: 0.0042
Epoch 2/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.0029
Epoch 2: val_loss improved from 0.00422 to 0.00387, saving model to ../models/lstm_models/lstm_MSFT_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - loss: 0.0029 - val_loss: 0.0039
Epoch 3/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0028
Epoch 3: val_loss improved from 0.00387 to 0.00304, saving model to ../models/lstm_models/lstm_MSFT_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - loss: 0.0028 - val_loss: 0.0030
Epoch 4/100
[1m55/55[0m [32m━━━━━━━━━━━━━

Epoch 1/100
[1m53/55[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - loss: 0.0207
Epoch 1: val_loss improved from inf to 0.00319, saving model to ../models/lstm_models/lstm_GOOGL_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 0.0202 - val_loss: 0.0032
Epoch 2/100
[1m53/55[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - loss: 0.0031
Epoch 2: val_loss improved from 0.00319 to 0.00230, saving model to ../models/lstm_models/lstm_GOOGL_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - loss: 0.0031 - val_loss: 0.0023
Epoch 3/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0020
Epoch 3: val_loss improved from 0.00230 to 0.00187, saving model to ../models/lstm_models/lstm_GOOGL_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - loss: 0.0020 - val_loss: 0.0019
Epoch 4/100
[1m55/55[0m [32m━━━━━━━━━━

Epoch 1/100
[1m53/55[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - loss: 0.0489
Epoch 1: val_loss improved from inf to 0.01439, saving model to ../models/lstm_models/lstm_AMZN_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 0.0473 - val_loss: 0.0144
Epoch 2/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.0042
Epoch 2: val_loss improved from 0.01439 to 0.00321, saving model to ../models/lstm_models/lstm_AMZN_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - loss: 0.0042 - val_loss: 0.0032
Epoch 3/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0033
Epoch 3: val_loss did not improve from 0.00321
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - loss: 0.0033 - val_loss: 0.0058
Epoch 4/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0025
Epoch 4

In [7]:
# Cell 7: Training and Evaluating GRU Models for Daily Data

# Define Parameters
TIMESTEPS = 60  # Number of past days to use for prediction
BATCH_SIZE = 32
EPOCHS = 100
VALIDATION_SPLIT = 0.1
SEED = 42

# Set random seeds for reproducibility
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

# Function to Create Sequences (Already Defined in Cell 6)
def create_sequences(X, y, timesteps):
    X_seq, y_seq = [], []
    for i in range(timesteps, len(X)):
        X_seq.append(X[i-timesteps:i].values)
        y_seq.append(y[i])
    return np.array(X_seq), np.array(y_seq)

# Function to Build GRU Model
def build_gru_model(input_shape):
    model = Sequential()
    model.add(Input(shape=input_shape))
    model.add(GRU(units=50, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(GRU(units=50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(units=25, activation='relu'))
    model.add(Dense(units=1))  # Output layer
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Initialize a dictionary to store GRU model performance
gru_model_performance = {}

# Define Directory to Save GRU Models
gru_model_save_dir = '../models/gru_models'
os.makedirs(gru_model_save_dir, exist_ok=True)

# Iterate Through Each Stock
for stock in scaled_daily_data.keys():
    print(f"\n{'='*50}\nTraining GRU Model for {stock}\n{'='*50}")
    
    # Retrieve Scaled Data
    data = scaled_daily_data[stock]
    X_train_scaled = data['X_train_scaled']
    X_test_scaled = data['X_test_scaled']
    y_train_scaled = data['y_train_scaled']
    y_test_scaled = data['y_test_scaled']
    scaler_y = data['scaler_y']
    
    # Create Sequences
    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, TIMESTEPS)
    X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, TIMESTEPS)
    
    print(f" - Training sequences: {X_train_seq.shape}, Training targets: {y_train_seq.shape}")
    print(f" - Testing sequences: {X_test_seq.shape}, Testing targets: {y_test_seq.shape}")
    
    # Build the GRU Model
    model = build_gru_model(input_shape=(X_train_seq.shape[1], X_train_seq.shape[2]))
    model.summary()
    
    # Define Callbacks
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    checkpoint = ModelCheckpoint(
        filepath=os.path.join(gru_model_save_dir, f'gru_{stock}_best.keras'),
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    )
    
    # Train the GRU Model
    history = model.fit(
        X_train_seq, y_train_seq,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=VALIDATION_SPLIT,
        callbacks=[early_stop, checkpoint],
        verbose=1
    )
    
    # Load the Best Model
    best_model_path = os.path.join(gru_model_save_dir, f'gru_{stock}_best.keras')
    if os.path.exists(best_model_path):
        model = load_model(best_model_path)
        print(f" - Loaded best model from {best_model_path}")
    else:
        print(f" - Best GRU model for {stock} not found at {best_model_path}.")
        continue  # Skip evaluation if model not saved
    
    # Predict on Test Data
    predictions_scaled = model.predict(X_test_seq).flatten()
    
    # Inverse Transform Predictions and Targets
    predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
    y_test = scaler_y.inverse_transform(y_test_seq.reshape(-1, 1)).flatten()
    
    # Ensure Consistent Lengths
    print(f" - Length of y_test: {len(y_test)}")
    print(f" - Length of predictions: {len(predictions)}")
    
    # Evaluation Metrics
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    gru_model_performance[stock] = {'RMSE': rmse, 'MAE': mae, 'R2': r2}
    
    print(f" - Evaluation Metrics for GRU {stock}: RMSE = {rmse:.4f}, MAE = {mae:.4f}, R2 = {r2:.4f}")
    
    print(f"GRU model training and evaluation completed for {stock}.\n")

# Summary of GRU Model Performance
print(f"\n{'='*50}\nSummary of GRU Model Performance\n{'='*50}")
for stock, metrics in gru_model_performance.items():
    print(f"{stock}: RMSE = {metrics['RMSE']:.4f}, MAE = {metrics['MAE']:.4f}, R2 = {metrics['R2']:.4f}")



Training GRU Model for AAPL
 - Training sequences: (1951, 60, 36), Training targets: (1951,)
 - Testing sequences: (443, 60, 36), Testing targets: (443,)


Epoch 1/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0552
Epoch 1: val_loss improved from inf to 0.00408, saving model to ../models/gru_models/gru_AAPL_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - loss: 0.0546 - val_loss: 0.0041
Epoch 2/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0036
Epoch 2: val_loss improved from 0.00408 to 0.00248, saving model to ../models/gru_models/gru_AAPL_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - loss: 0.0036 - val_loss: 0.0025
Epoch 3/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0030
Epoch 3: val_loss did not improve from 0.00248
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 0.0030 - val_loss: 0.0029
Epoch 4/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0026
Epoch 4: va

Epoch 1/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0267
Epoch 1: val_loss improved from inf to 0.00245, saving model to ../models/gru_models/gru_MSFT_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - loss: 0.0264 - val_loss: 0.0025
Epoch 2/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0046
Epoch 2: val_loss improved from 0.00245 to 0.00229, saving model to ../models/gru_models/gru_MSFT_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 0.0046 - val_loss: 0.0023
Epoch 3/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0034
Epoch 3: val_loss improved from 0.00229 to 0.00094, saving model to ../models/gru_models/gru_MSFT_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 0.0034 - val_loss: 9.4191e-04
Epoch 4/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━

Epoch 1/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0625
Epoch 1: val_loss improved from inf to 0.00814, saving model to ../models/gru_models/gru_GOOGL_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - loss: 0.0619 - val_loss: 0.0081
Epoch 2/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0059
Epoch 2: val_loss improved from 0.00814 to 0.00198, saving model to ../models/gru_models/gru_GOOGL_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - loss: 0.0059 - val_loss: 0.0020
Epoch 3/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0034
Epoch 3: val_loss improved from 0.00198 to 0.00152, saving model to ../models/gru_models/gru_GOOGL_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 0.0034 - val_loss: 0.0015
Epoch 4/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━

Epoch 1/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0518
Epoch 1: val_loss improved from inf to 0.00528, saving model to ../models/gru_models/gru_AMZN_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - loss: 0.0513 - val_loss: 0.0053
Epoch 2/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0059
Epoch 2: val_loss improved from 0.00528 to 0.00383, saving model to ../models/gru_models/gru_AMZN_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 0.0059 - val_loss: 0.0038
Epoch 3/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0031
Epoch 3: val_loss improved from 0.00383 to 0.00296, saving model to ../models/gru_models/gru_AMZN_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 0.0031 - val_loss: 0.0030
Epoch 4/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━

In [8]:
# Cell 8: Training and Evaluating XGBoost Models

import os
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import pickle

# Define Parameters
TIMESTEPS = 60  # Ensure consistency with LSTM and GRU models
model_save_dir = '../models/xgb_models'
os.makedirs(model_save_dir, exist_ok=True)

# Initialize dictionaries to store metrics
overall_metrics_xgb = {}
grouped_metrics_all_xgb = {'Month': {}, 'Quarter': {}, 'Season': {}}

# Function to Add Time Features
def add_time_features_xgb(eval_df):
    eval_df['Month'] = eval_df['Date'].dt.month
    eval_df['Quarter'] = eval_df['Date'].dt.quarter
    eval_df['Season'] = eval_df['Month'].apply(
        lambda month: 'Winter' if month in [12, 1, 2] else
                      'Spring' if month in [3, 4, 5] else
                      'Summer' if month in [6, 7, 8] else
                      'Autumn'
    )
    return eval_df

# Iterate Through Each Stock for Evaluation and Plotting
for stock in scaled_daily_data.keys():
    print(f"\n{'='*50}\nTraining and Evaluating XGBoost Model for {stock}\n{'='*50}")
    
    # Retrieve Scaled Data
    data = scaled_daily_data[stock]
    X_train_scaled = data['X_train_scaled']
    y_train_scaled = data['y_train_scaled']
    X_test_scaled = data['X_test_scaled']
    y_test_scaled = data['y_test_scaled']
    scaler_X = data['scaler_X']
    scaler_y = data['scaler_y']
    test_dates = data.get('test_dates')  # Retrieve 'test_dates'
    
    # Ensure X_train_scaled is a DataFrame with feature names
    if isinstance(X_train_scaled, np.ndarray):
        X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=scaler_X.feature_names_in_, index=X_train_scaled.index)
    else:
        X_train_scaled_df = X_train_scaled
    
    # Initialize and Train XGBoost Regressor
    xgb_model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
    
    xgb_model.fit(X_train_scaled_df, y_train_scaled)
    print(f" - XGBoost model trained for {stock}")
    
    # Save feature names used in training
    feature_names = X_train_scaled_df.columns.tolist()
    feature_names_path = os.path.join(model_save_dir, f'feature_names_{stock}.pkl')
    with open(feature_names_path, 'wb') as f:
        pickle.dump(feature_names, f)
    print(f" - Feature names saved for {stock} at {feature_names_path}")
    
    # Prepare test data with correct feature names
    if isinstance(X_test_scaled, np.ndarray):
        X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=scaler_X.feature_names_in_, index=X_test_scaled.index)
    else:
        X_test_scaled_df = X_test_scaled
    
    # Predict on Test Data
    predictions_scaled = xgb_model.predict(X_test_scaled_df)
    
    # Inverse Transform Predictions and Targets
    predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
    y_test = scaler_y.inverse_transform(y_test_scaled.reshape(-1, 1)).flatten()
    
    # Create Evaluation DataFrame
    if isinstance(X_test_scaled_df.index, pd.DatetimeIndex):
        dates = X_test_scaled_df.index
    elif 'Date' in X_test_scaled_df.columns:
        dates = pd.to_datetime(X_test_scaled_df['Date'])
    elif test_dates is not None and len(test_dates) == len(y_test):
        dates = test_dates
    else:
        print(f" - No Date information found for {stock}. Creating dummy dates.")
        dates = pd.date_range(start='2020-01-01', periods=len(y_test), freq='D')
    
    eval_df = pd.DataFrame({
        'Date': dates,
        'Actual': y_test,
        'Predicted': predictions
    })
    
    # Add Time Features
    eval_df = add_time_features_xgb(eval_df)
    
    # Calculate Overall Metrics
    rmse = np.sqrt(mean_squared_error(eval_df['Actual'], eval_df['Predicted']))
    mae = mean_absolute_error(eval_df['Actual'], eval_df['Predicted'])
    r2 = r2_score(eval_df['Actual'], eval_df['Predicted'])
    mape = mean_absolute_percentage_error(eval_df['Actual'], eval_df['Predicted']) * 100
    
    overall_metrics_xgb[stock] = {
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'MAPE': mape
    }
    
    print(f" - Overall Evaluation Metrics for {stock}:")
    print(f"    RMSE = {rmse:.4f}")
    print(f"    MAE = {mae:.4f}")
    print(f"    R2 = {r2:.4f}")
    print(f"    MAPE = {mape:.2f}%")
    
    # Calculate Grouped Metrics
    grouped_metrics_month = eval_df.groupby('Month').apply(lambda x: pd.Series({
        'RMSE': np.sqrt(mean_squared_error(x['Actual'], x['Predicted'])),
        'MAE': mean_absolute_error(x['Actual'], x['Predicted']),
        'R2': r2_score(x['Actual'], x['Predicted']),
        'MAPE': mean_absolute_percentage_error(x['Actual'], x['Predicted']) * 100
    }))
    
    grouped_metrics_quarter = eval_df.groupby('Quarter').apply(lambda x: pd.Series({
        'RMSE': np.sqrt(mean_squared_error(x['Actual'], x['Predicted'])),
        'MAE': mean_absolute_error(x['Actual'], x['Predicted']),
        'R2': r2_score(x['Actual'], x['Predicted']),
        'MAPE': mean_absolute_percentage_error(x['Actual'], x['Predicted']) * 100
    }))
    
    grouped_metrics_season = eval_df.groupby('Season').apply(lambda x: pd.Series({
        'RMSE': np.sqrt(mean_squared_error(x['Actual'], x['Predicted'])),
        'MAE': mean_absolute_error(x['Actual'], x['Predicted']),
        'R2': r2_score(x['Actual'], x['Predicted']),
        'MAPE': mean_absolute_percentage_error(x['Actual'], x['Predicted']) * 100
    }))
    
    # Reset index to turn the grouping column into a regular column
    grouped_metrics_month = grouped_metrics_month.reset_index()
    grouped_metrics_quarter = grouped_metrics_quarter.reset_index()
    grouped_metrics_season = grouped_metrics_season.reset_index()
    
    grouped_metrics_all_xgb['Month'][stock] = grouped_metrics_month
    grouped_metrics_all_xgb['Quarter'][stock] = grouped_metrics_quarter
    grouped_metrics_all_xgb['Season'][stock] = grouped_metrics_season

    # Save the trained model
    model_save_path = os.path.join(model_save_dir, f'xgb_{stock}_model.json')
    xgb_model.save_model(model_save_path)
    print(f" - XGBoost model saved for {stock} at {model_save_path}")
    
    print(f"Training and evaluation completed for {stock}.\n")

# Create Overall Metrics Table
overall_metrics_xgb_df = pd.DataFrame(overall_metrics_xgb).T
print("\n" + "="*50)
print("Overall Evaluation Metrics for All Stocks - XGBoost")
print("="*50)
display(overall_metrics_xgb_df)
overall_metrics_xgb_df.to_csv('overall_evaluation_metrics_xgb.csv')
print("\n - Overall Evaluation Metrics table for XGBoost saved as 'overall_evaluation_metrics_xgb.csv'.")

# Function to Create Grouped Metrics Tables
def create_grouped_metrics_tables_xgb(grouped_metrics_all_xgb, grouping):
    grouped_metrics_tables_xgb = {}
    for stock, metrics in grouped_metrics_all_xgb[grouping].items():
        metrics_df = metrics  # Already reset index in the loop
        grouped_metrics_tables_xgb[stock] = metrics_df
    return grouped_metrics_tables_xgb

# Create and Save Grouped Metrics Tables
for grouping in ['Month', 'Quarter', 'Season']:
    grouped_tables_xgb = create_grouped_metrics_tables_xgb(grouped_metrics_all_xgb, grouping)
    for stock, table in grouped_tables_xgb.items():
        print(f"\n{'='*50}\n{grouping} Evaluation Metrics for {stock} - XGBoost\n{'='*50}")
        display(table)
        filename = f'{stock}_{grouping}_evaluation_metrics_xgb.csv'
        table.to_csv(filename, index=False)
        print(f" - {grouping} Evaluation Metrics table for {stock} saved as '{filename}'.")
    
    # Create Comparative Metrics Tables Across Stocks
    for metric in ['RMSE', 'MAE', 'R2', 'MAPE']:
        comparative_df_xgb = pd.DataFrame({stock: grouped_metrics_all_xgb[grouping][stock][metric] for stock in grouped_metrics_all_xgb[grouping].keys()})
        comparative_df_xgb.index = grouped_metrics_all_xgb[grouping][stock][grouping]
        comparative_df_xgb.index.name = grouping
        print(f"\n{'='*50}\nComparative {metric} Across {grouping} for All Stocks - XGBoost\n{'='*50}")
        display(comparative_df_xgb)
        filename = f'comparative_{metric}_across_{grouping}_xgb.csv'
        comparative_df_xgb.to_csv(filename)
        print(f" - Comparative {metric} Across {grouping} table for XGBoost saved as '{filename}'.")



Training and Evaluating XGBoost Model for AAPL
 - XGBoost model trained for AAPL
 - Feature names saved for AAPL at ../models/xgb_models/feature_names_AAPL.pkl
 - Overall Evaluation Metrics for AAPL:
    RMSE = 21.5184
    MAE = 12.9100
    R2 = 0.3334
    MAPE = 6.13%
 - XGBoost model saved for AAPL at ../models/xgb_models/xgb_AAPL_model.json
Training and evaluation completed for AAPL.


Training and Evaluating XGBoost Model for MSFT
 - XGBoost model trained for MSFT
 - Feature names saved for MSFT at ../models/xgb_models/feature_names_MSFT.pkl


  grouped_metrics_month = eval_df.groupby('Month').apply(lambda x: pd.Series({
  grouped_metrics_quarter = eval_df.groupby('Quarter').apply(lambda x: pd.Series({
  grouped_metrics_season = eval_df.groupby('Season').apply(lambda x: pd.Series({
  grouped_metrics_month = eval_df.groupby('Month').apply(lambda x: pd.Series({
  grouped_metrics_quarter = eval_df.groupby('Quarter').apply(lambda x: pd.Series({
  grouped_metrics_season = eval_df.groupby('Season').apply(lambda x: pd.Series({


 - Overall Evaluation Metrics for MSFT:
    RMSE = 53.0361
    MAE = 36.3703
    R2 = 0.3896
    MAPE = 8.81%
 - XGBoost model saved for MSFT at ../models/xgb_models/xgb_MSFT_model.json
Training and evaluation completed for MSFT.


Training and Evaluating XGBoost Model for GOOGL
 - XGBoost model trained for GOOGL
 - Feature names saved for GOOGL at ../models/xgb_models/feature_names_GOOGL.pkl
 - Overall Evaluation Metrics for GOOGL:
    RMSE = 12.2651
    MAE = 6.6552
    R2 = 0.8036
    MAPE = 4.05%
 - XGBoost model saved for GOOGL at ../models/xgb_models/xgb_GOOGL_model.json
Training and evaluation completed for GOOGL.


Training and Evaluating XGBoost Model for AMZN
 - XGBoost model trained for AMZN
 - Feature names saved for AMZN at ../models/xgb_models/feature_names_AMZN.pkl
 - Overall Evaluation Metrics for AMZN:
    RMSE = 3.0123
    MAE = 1.7337
    R2 = 0.9926
    MAPE = 1.12%


  grouped_metrics_month = eval_df.groupby('Month').apply(lambda x: pd.Series({
  grouped_metrics_quarter = eval_df.groupby('Quarter').apply(lambda x: pd.Series({
  grouped_metrics_season = eval_df.groupby('Season').apply(lambda x: pd.Series({
  grouped_metrics_month = eval_df.groupby('Month').apply(lambda x: pd.Series({
  grouped_metrics_quarter = eval_df.groupby('Quarter').apply(lambda x: pd.Series({
  grouped_metrics_season = eval_df.groupby('Season').apply(lambda x: pd.Series({


 - XGBoost model saved for AMZN at ../models/xgb_models/xgb_AMZN_model.json
Training and evaluation completed for AMZN.


Overall Evaluation Metrics for All Stocks - XGBoost


Unnamed: 0,RMSE,MAE,R2,MAPE
AAPL,21.518377,12.909998,0.333433,6.129564
MSFT,53.036127,36.370332,0.389636,8.805656
GOOGL,12.265134,6.655197,0.80362,4.04724
AMZN,3.012312,1.733682,0.992608,1.115294



 - Overall Evaluation Metrics table for XGBoost saved as 'overall_evaluation_metrics_xgb.csv'.

Month Evaluation Metrics for AAPL - XGBoost


Unnamed: 0,Month,RMSE,MAE,R2,MAPE
0,1,8.523657,6.369372,0.896859,3.478552
1,2,6.804222,5.086796,0.842367,2.791202
2,3,1.44209,1.087113,0.978204,0.658404
3,4,1.118708,0.900531,0.892222,0.5387
4,5,6.670529,4.572242,0.343929,2.440303
5,6,20.647992,16.177428,-1.657299,7.924047
6,7,35.747513,31.628786,-3.620084,14.57775
7,8,31.392237,24.412861,-1.190113,11.248215
8,9,33.645263,25.136786,-0.990752,11.3804
9,10,36.003623,25.089042,-0.531958,10.970298


 - Month Evaluation Metrics table for AAPL saved as 'AAPL_Month_evaluation_metrics_xgb.csv'.

Month Evaluation Metrics for MSFT - XGBoost


Unnamed: 0,Month,RMSE,MAE,R2,MAPE
0,1,36.305385,25.919526,0.77681,6.732587
1,2,49.648069,36.173885,0.560395,8.936369
2,3,52.757535,36.996665,0.513296,9.024713
3,4,55.333173,41.041689,0.224465,9.992887
4,5,55.227619,39.41485,-0.113806,9.469837
5,6,68.167317,47.658436,-0.676842,10.916555
6,7,77.853655,57.961684,-1.137751,13.109001
7,8,51.615486,36.97683,-0.346456,9.021289
8,9,60.327816,43.222759,-0.449236,10.230167
9,10,55.48282,39.271974,-0.08909,9.46412


 - Month Evaluation Metrics table for MSFT saved as 'MSFT_Month_evaluation_metrics_xgb.csv'.

Month Evaluation Metrics for GOOGL - XGBoost


Unnamed: 0,Month,RMSE,MAE,R2,MAPE
0,1,1.543017,1.047128,0.996436,0.875734
1,2,0.927376,0.721307,0.998463,0.635649
2,3,1.270706,1.00078,0.996931,0.82774
3,4,7.760894,5.526607,0.910253,3.588421
4,5,17.393014,12.480026,0.628874,7.3241
5,6,20.834226,14.850243,0.43649,8.489723
6,7,24.9996,18.166975,0.278244,10.098494
7,8,12.109877,8.993366,0.475421,5.588208
8,9,8.551095,6.071232,0.542098,3.847165
9,10,11.981554,8.61946,0.605677,5.322074


 - Month Evaluation Metrics table for GOOGL saved as 'GOOGL_Month_evaluation_metrics_xgb.csv'.

Month Evaluation Metrics for AMZN - XGBoost


Unnamed: 0,Month,RMSE,MAE,R2,MAPE
0,1,1.058915,0.844711,0.998764,0.747608
1,2,0.900813,0.672055,0.999378,0.553332
2,3,0.953712,0.793332,0.999436,0.645173
3,4,1.78252,1.433868,0.997907,0.988127
4,5,2.219901,1.676648,0.996289,1.056928
5,6,3.572012,2.40161,0.98586,1.440424
6,7,7.408559,4.827604,0.93995,2.580227
7,8,1.410268,1.065073,0.994563,0.678367
8,9,3.645815,2.458877,0.979642,1.403266
9,10,3.515444,2.814587,0.986898,1.694704


 - Month Evaluation Metrics table for AMZN saved as 'AMZN_Month_evaluation_metrics_xgb.csv'.

Comparative RMSE Across Month for All Stocks - XGBoost


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,8.523657,36.305385,1.543017,1.058915
2,6.804222,49.648069,0.927376,0.900813
3,1.44209,52.757535,1.270706,0.953712
4,1.118708,55.333173,7.760894,1.78252
5,6.670529,55.227619,17.393014,2.219901
6,20.647992,68.167317,20.834226,3.572012
7,35.747513,77.853655,24.9996,7.408559
8,31.392237,51.615486,12.109877,1.410268
9,33.645263,60.327816,8.551095,3.645815
10,36.003623,55.48282,11.981554,3.515444


 - Comparative RMSE Across Month table for XGBoost saved as 'comparative_RMSE_across_Month_xgb.csv'.

Comparative MAE Across Month for All Stocks - XGBoost


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,6.369372,25.919526,1.047128,0.844711
2,5.086796,36.173885,0.721307,0.672055
3,1.087113,36.996665,1.00078,0.793332
4,0.900531,41.041689,5.526607,1.433868
5,4.572242,39.41485,12.480026,1.676648
6,16.177428,47.658436,14.850243,2.40161
7,31.628786,57.961684,18.166975,4.827604
8,24.412861,36.97683,8.993366,1.065073
9,25.136786,43.222759,6.071232,2.458877
10,25.089042,39.271974,8.61946,2.814587


 - Comparative MAE Across Month table for XGBoost saved as 'comparative_MAE_across_Month_xgb.csv'.

Comparative R2 Across Month for All Stocks - XGBoost


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.896859,0.77681,0.996436,0.998764
2,0.842367,0.560395,0.998463,0.999378
3,0.978204,0.513296,0.996931,0.999436
4,0.892222,0.224465,0.910253,0.997907
5,0.343929,-0.113806,0.628874,0.996289
6,-1.657299,-0.676842,0.43649,0.98586
7,-3.620084,-1.137751,0.278244,0.93995
8,-1.190113,-0.346456,0.475421,0.994563
9,-0.990752,-0.449236,0.542098,0.979642
10,-0.531958,-0.08909,0.605677,0.986898


 - Comparative R2 Across Month table for XGBoost saved as 'comparative_R2_across_Month_xgb.csv'.

Comparative MAPE Across Month for All Stocks - XGBoost


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3.478552,6.732587,0.875734,0.747608
2,2.791202,8.936369,0.635649,0.553332
3,0.658404,9.024713,0.82774,0.645173
4,0.5387,9.992887,3.588421,0.988127
5,2.440303,9.469837,7.3241,1.056928
6,7.924047,10.916555,8.489723,1.440424
7,14.57775,13.109001,10.098494,2.580227
8,11.248215,9.021289,5.588208,0.678367
9,11.3804,10.230167,3.847165,1.403266
10,10.970298,9.46412,5.322074,1.694704


 - Comparative MAPE Across Month table for XGBoost saved as 'comparative_MAPE_across_Month_xgb.csv'.

Quarter Evaluation Metrics for AAPL - XGBoost


Unnamed: 0,Quarter,RMSE,MAE,R2,MAPE
0,1,6.294782,4.116058,0.892308,2.274706
1,2,12.349159,7.08158,0.236126,3.571376
2,3,33.590584,27.027238,-1.527266,12.390954
3,4,22.651456,13.045289,0.468602,6.119599


 - Quarter Evaluation Metrics table for AAPL saved as 'AAPL_Quarter_evaluation_metrics_xgb.csv'.

Quarter Evaluation Metrics for MSFT - XGBoost


Unnamed: 0,Quarter,RMSE,MAE,R2,MAPE
0,1,46.839806,33.043404,0.624667,8.23266
1,2,59.706445,42.586401,-0.08716,10.104347
2,3,63.990341,45.883925,-0.589401,10.75388
3,4,37.432592,24.057653,0.696435,6.155011


 - Quarter Evaluation Metrics table for MSFT saved as 'MSFT_Quarter_evaluation_metrics_xgb.csv'.

Quarter Evaluation Metrics for GOOGL - XGBoost


Unnamed: 0,Quarter,RMSE,MAE,R2,MAPE
0,1,1.277032,0.927616,0.99721,0.782831
1,2,16.283202,10.957774,0.666223,6.471797
2,3,16.783292,11.106801,0.36803,6.531439
3,4,7.154225,3.540482,0.923354,2.351612


 - Quarter Evaluation Metrics table for GOOGL saved as 'GOOGL_Quarter_evaluation_metrics_xgb.csv'.

Quarter Evaluation Metrics for AMZN - XGBoost


Unnamed: 0,Quarter,RMSE,MAE,R2,MAPE
0,1,0.974186,0.772005,0.999272,0.650198
1,2,2.61912,1.829004,0.994598,1.15708
2,3,4.800279,2.748368,0.964612,1.535643
3,4,2.276936,1.557946,0.995457,1.104349


 - Quarter Evaluation Metrics table for AMZN saved as 'AMZN_Quarter_evaluation_metrics_xgb.csv'.

Comparative RMSE Across Quarter for All Stocks - XGBoost


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,6.294782,46.839806,1.277032,0.974186
2,12.349159,59.706445,16.283202,2.61912
3,33.590584,63.990341,16.783292,4.800279
4,22.651456,37.432592,7.154225,2.276936


 - Comparative RMSE Across Quarter table for XGBoost saved as 'comparative_RMSE_across_Quarter_xgb.csv'.

Comparative MAE Across Quarter for All Stocks - XGBoost


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4.116058,33.043404,0.927616,0.772005
2,7.08158,42.586401,10.957774,1.829004
3,27.027238,45.883925,11.106801,2.748368
4,13.045289,24.057653,3.540482,1.557946


 - Comparative MAE Across Quarter table for XGBoost saved as 'comparative_MAE_across_Quarter_xgb.csv'.

Comparative R2 Across Quarter for All Stocks - XGBoost


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.892308,0.624667,0.99721,0.999272
2,0.236126,-0.08716,0.666223,0.994598
3,-1.527266,-0.589401,0.36803,0.964612
4,0.468602,0.696435,0.923354,0.995457


 - Comparative R2 Across Quarter table for XGBoost saved as 'comparative_R2_across_Quarter_xgb.csv'.

Comparative MAPE Across Quarter for All Stocks - XGBoost


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.274706,8.23266,0.782831,0.650198
2,3.571376,10.104347,6.471797,1.15708
3,12.390954,10.75388,6.531439,1.535643
4,6.119599,6.155011,2.351612,1.104349


 - Comparative MAPE Across Quarter table for XGBoost saved as 'comparative_MAPE_across_Quarter_xgb.csv'.

Season Evaluation Metrics for AAPL - XGBoost


Unnamed: 0,Season,RMSE,MAE,R2,MAPE
0,Autumn,28.796521,18.265201,0.056138,8.275229
1,Spring,4.049072,2.225362,0.847172,1.232589
2,Summer,30.101011,24.205393,-1.69706,11.302339
3,Winter,9.282704,6.736506,0.860473,3.628497


 - Season Evaluation Metrics table for AAPL saved as 'AAPL_Season_evaluation_metrics_xgb.csv'.

Season Evaluation Metrics for MSFT - XGBoost


Unnamed: 0,Season,RMSE,MAE,R2,MAPE
0,Autumn,48.823337,32.417538,0.443856,7.936328
1,Spring,54.444675,39.123588,0.307172,9.487843
2,Summer,66.423154,47.280988,-0.66395,10.970065
3,Winter,37.743444,26.154915,0.73253,6.724702


 - Season Evaluation Metrics table for MSFT saved as 'MSFT_Season_evaluation_metrics_xgb.csv'.

Season Evaluation Metrics for GOOGL - XGBoost


Unnamed: 0,Season,RMSE,MAE,R2,MAPE
0,Autumn,8.623782,5.262797,0.865441,3.383358
1,Spring,11.127692,6.39645,0.841629,3.945145
2,Summer,19.883711,13.871844,0.377868,7.993662
3,Winter,1.158802,0.815859,0.997685,0.709932


 - Season Evaluation Metrics table for GOOGL saved as 'GOOGL_Season_evaluation_metrics_xgb.csv'.

Season Evaluation Metrics for AMZN - XGBoost


Unnamed: 0,Season,RMSE,MAE,R2,MAPE
0,Autumn,3.013583,2.092224,0.991574,1.330543
1,Spring,1.737037,1.302144,0.998018,0.896566
2,Summer,4.782771,2.730331,0.968669,1.547347
3,Winter,0.993921,0.767793,0.99911,0.667275


 - Season Evaluation Metrics table for AMZN saved as 'AMZN_Season_evaluation_metrics_xgb.csv'.

Comparative RMSE Across Season for All Stocks - XGBoost


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Autumn,28.796521,48.823337,8.623782,3.013583
Spring,4.049072,54.444675,11.127692,1.737037
Summer,30.101011,66.423154,19.883711,4.782771
Winter,9.282704,37.743444,1.158802,0.993921


 - Comparative RMSE Across Season table for XGBoost saved as 'comparative_RMSE_across_Season_xgb.csv'.

Comparative MAE Across Season for All Stocks - XGBoost


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Autumn,18.265201,32.417538,5.262797,2.092224
Spring,2.225362,39.123588,6.39645,1.302144
Summer,24.205393,47.280988,13.871844,2.730331
Winter,6.736506,26.154915,0.815859,0.767793


 - Comparative MAE Across Season table for XGBoost saved as 'comparative_MAE_across_Season_xgb.csv'.

Comparative R2 Across Season for All Stocks - XGBoost


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Autumn,0.056138,0.443856,0.865441,0.991574
Spring,0.847172,0.307172,0.841629,0.998018
Summer,-1.69706,-0.66395,0.377868,0.968669
Winter,0.860473,0.73253,0.997685,0.99911


 - Comparative R2 Across Season table for XGBoost saved as 'comparative_R2_across_Season_xgb.csv'.

Comparative MAPE Across Season for All Stocks - XGBoost


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Autumn,8.275229,7.936328,3.383358,1.330543
Spring,1.232589,9.487843,3.945145,0.896566
Summer,11.302339,10.970065,7.993662,1.547347
Winter,3.628497,6.724702,0.709932,0.667275


 - Comparative MAPE Across Season table for XGBoost saved as 'comparative_MAPE_across_Season_xgb.csv'.


In [12]:
# Cell 9: Training and Evaluating Random Forest Models

# Import necessary libraries (if not already imported)
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import joblib
import pickle

# Define Parameters
model_save_dir = '../models/random_forest_models'
os.makedirs(model_save_dir, exist_ok=True)

# Initialize dictionaries to store metrics
overall_metrics_rf = {}
grouped_metrics_all_rf = {'Month': {}, 'Quarter': {}, 'Season': {}}

# Function to Add Time Features (if not already added)
def add_time_features_rf(eval_df):
    eval_df['Month'] = eval_df['Date'].dt.month
    eval_df['Quarter'] = eval_df['Date'].dt.quarter
    eval_df['Season'] = eval_df['Month'].apply(
        lambda month: 'Winter' if month in [12, 1, 2] else
                      'Spring' if month in [3, 4, 5] else
                      'Summer' if month in [6, 7, 8] else
                      'Autumn'
    )
    return eval_df

# Iterate Through Each Stock for Evaluation and Plotting
for stock in scaled_daily_data.keys():
    print(f"\n{'='*50}\nTraining and Evaluating Random Forest Model for {stock}\n{'='*50}")
    
    # Retrieve Scaled Data
    data = scaled_daily_data[stock]
    X_train_scaled = data['X_train_scaled']
    y_train_scaled = data['y_train_scaled']
    X_test_scaled = data['X_test_scaled']
    y_test_scaled = data['y_test_scaled']
    scaler_X = data['scaler_X']
    scaler_y = data['scaler_y']
    
    # Ensure X_train_scaled is a DataFrame with feature names
    if isinstance(X_train_scaled, np.ndarray):
        X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=scaler_X.feature_names_in_)
    else:
        X_train_scaled_df = X_train_scaled
    
    # Initialize and Train Random Forest Regressor
    rf_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
        n_jobs=-1
    )
    
    rf_model.fit(X_train_scaled_df, y_train_scaled)
    print(f" - Random Forest model trained for {stock}")
    
    # Save feature names used in training
    feature_names = X_train_scaled_df.columns.tolist()
    feature_names_path = os.path.join(model_save_dir, f'feature_names_{stock.upper()}.pkl')
    with open(feature_names_path, 'wb') as f:
        pickle.dump(feature_names, f)
    print(f" - Feature names saved for {stock} at {feature_names_path}")
    
    # Prepare test data with correct feature names
    if isinstance(X_test_scaled, np.ndarray):
        X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=scaler_X.feature_names_in_)
    else:
        X_test_scaled_df = X_test_scaled
    
    # Predict on Test Data
    predictions_scaled = rf_model.predict(X_test_scaled_df)
  
    # Inverse Transform Predictions and Targets
    predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
    y_test = scaler_y.inverse_transform(y_test_scaled.reshape(-1, 1)).flatten()
    
    # Retrieve 'test_dates' from scaled_daily_data
    test_dates = data.get('test_dates')
    
    # Create Evaluation DataFrame using 'test_dates'
    if test_dates is not None and len(test_dates) == len(y_test):
        dates = test_dates
    else:
        print(f" - No 'test_dates' found for {stock}. Creating dummy dates.")
        dates = pd.date_range(start='2020-01-01', periods=len(y_test), freq='D')
    
    eval_df = pd.DataFrame({
        'Date': dates,
        'Actual': y_test,
        'Predicted': predictions
    })
    
    # Add Time Features
    eval_df = add_time_features_rf(eval_df)
    
    # Calculate Overall Metrics
    rmse = np.sqrt(mean_squared_error(eval_df['Actual'], eval_df['Predicted']))
    mae = mean_absolute_error(eval_df['Actual'], eval_df['Predicted'])
    r2 = r2_score(eval_df['Actual'], eval_df['Predicted'])
    mape = mean_absolute_percentage_error(eval_df['Actual'], eval_df['Predicted']) * 100
    
    overall_metrics_rf[stock] = {
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'MAPE': mape
    }
    
    print(f" - Overall Evaluation Metrics for {stock}:")
    print(f"    RMSE = {rmse:.4f}")
    print(f"    MAE = {mae:.4f}")
    print(f"    R2 = {r2:.4f}")
    print(f"    MAPE = {mape:.2f}%")
    
    # Calculate Grouped Metrics
    grouped_metrics_month = eval_df.groupby('Month').apply(lambda x: pd.Series({
        'RMSE': np.sqrt(mean_squared_error(x['Actual'], x['Predicted'])),
        'MAE': mean_absolute_error(x['Actual'], x['Predicted']),
        'R2': r2_score(x['Actual'], x['Predicted']),
        'MAPE': mean_absolute_percentage_error(x['Actual'], x['Predicted']) * 100
    }))
    
    grouped_metrics_quarter = eval_df.groupby('Quarter').apply(lambda x: pd.Series({
        'RMSE': np.sqrt(mean_squared_error(x['Actual'], x['Predicted'])),
        'MAE': mean_absolute_error(x['Actual'], x['Predicted']),
        'R2': r2_score(x['Actual'], x['Predicted']),
        'MAPE': mean_absolute_percentage_error(x['Actual'], x['Predicted']) * 100
    }))
    
    grouped_metrics_season = eval_df.groupby('Season').apply(lambda x: pd.Series({
        'RMSE': np.sqrt(mean_squared_error(x['Actual'], x['Predicted'])),
        'MAE': mean_absolute_error(x['Actual'], x['Predicted']),
        'R2': r2_score(x['Actual'], x['Predicted']),
        'MAPE': mean_absolute_percentage_error(x['Actual'], x['Predicted']) * 100
    }))
    
    grouped_metrics_all_rf['Month'][stock] = grouped_metrics_month
    grouped_metrics_all_rf['Quarter'][stock] = grouped_metrics_quarter
    grouped_metrics_all_rf['Season'][stock] = grouped_metrics_season
    
    # Save the trained model using joblib and change the file extension to .joblib
    model_save_path = os.path.join(model_save_dir, f'rf_{stock.upper()}_model.joblib')
    joblib.dump(rf_model, model_save_path)
    print(f" - Random Forest model saved for {stock} at {model_save_path}")
    
    print(f"Training and evaluation completed for {stock}.\n")

# Create Overall Metrics Table
overall_metrics_rf_df = pd.DataFrame(overall_metrics_rf).T
print("\n" + "="*50)
print("Overall Evaluation Metrics for All Stocks - Random Forest")
print("="*50)
display(overall_metrics_rf_df)
overall_metrics_rf_df.to_csv('overall_evaluation_metrics_rf.csv')
print("\n - Overall Evaluation Metrics table for Random Forest saved as 'overall_evaluation_metrics_rf.csv'.")

# Function to Create Grouped Metrics Tables
def create_grouped_metrics_tables_rf(grouped_metrics_all_rf, grouping):
    grouped_metrics_tables_rf = {}
    for stock, metrics in grouped_metrics_all_rf[grouping].items():
        metrics_df = metrics.reset_index()
        metrics_df.columns = [grouping] + list(metrics_df.columns[1:])
        grouped_metrics_tables_rf[stock] = metrics_df
    return grouped_metrics_tables_rf

# Create and Save Grouped Metrics Tables
for grouping in ['Month', 'Quarter', 'Season']:
    grouped_tables_rf = create_grouped_metrics_tables_rf(grouped_metrics_all_rf, grouping)
    for stock, table in grouped_tables_rf.items():
        print(f"\n{'='*50}\n{grouping} Evaluation Metrics for {stock} - Random Forest\n{'='*50}")
        display(table)
        filename = f'{stock}_{grouping}_evaluation_metrics_rf.csv'
        table.to_csv(filename, index=False)
        print(f" - {grouping} Evaluation Metrics table for {stock} saved as '{filename}'.")
    
    # Create Comparative Metrics Tables Across Stocks
    for metric in ['RMSE', 'MAE', 'R2', 'MAPE']:
        comparative_df_rf = pd.DataFrame({stock: grouped_metrics_all_rf[grouping][stock][metric] for stock in grouped_metrics_all_rf[grouping].keys()})
        comparative_df_rf.index.name = grouping
        print(f"\n{'='*50}\nComparative {metric} Across {grouping} for All Stocks - Random Forest\n{'='*50}")
        display(comparative_df_rf)
        filename = f'comparative_{metric}_across_{grouping}_rf.csv'
        comparative_df_rf.to_csv(filename)
        print(f" - Comparative {metric} Across {grouping} table for Random Forest saved as '{filename}'.")



Training and Evaluating Random Forest Model for AAPL
 - Random Forest model trained for AAPL
 - Feature names saved for AAPL at ../models/random_forest_models/feature_names_AAPL.pkl
 - Overall Evaluation Metrics for AAPL:
    RMSE = 21.0155
    MAE = 12.3651
    R2 = 0.3642
    MAPE = 5.85%
 - Random Forest model saved for AAPL at ../models/random_forest_models/rf_AAPL_model.joblib
Training and evaluation completed for AAPL.


Training and Evaluating Random Forest Model for MSFT


  grouped_metrics_month = eval_df.groupby('Month').apply(lambda x: pd.Series({
  grouped_metrics_quarter = eval_df.groupby('Quarter').apply(lambda x: pd.Series({
  grouped_metrics_season = eval_df.groupby('Season').apply(lambda x: pd.Series({


 - Random Forest model trained for MSFT
 - Feature names saved for MSFT at ../models/random_forest_models/feature_names_MSFT.pkl
 - Overall Evaluation Metrics for MSFT:
    RMSE = 52.4910
    MAE = 35.9495
    R2 = 0.4021
    MAPE = 8.71%
 - Random Forest model saved for MSFT at ../models/random_forest_models/rf_MSFT_model.joblib
Training and evaluation completed for MSFT.


Training and Evaluating Random Forest Model for GOOGL


  grouped_metrics_month = eval_df.groupby('Month').apply(lambda x: pd.Series({
  grouped_metrics_quarter = eval_df.groupby('Quarter').apply(lambda x: pd.Series({
  grouped_metrics_season = eval_df.groupby('Season').apply(lambda x: pd.Series({


 - Random Forest model trained for GOOGL
 - Feature names saved for GOOGL at ../models/random_forest_models/feature_names_GOOGL.pkl
 - Overall Evaluation Metrics for GOOGL:
    RMSE = 11.8120
    MAE = 6.4051
    R2 = 0.8179
    MAPE = 3.91%
 - Random Forest model saved for GOOGL at ../models/random_forest_models/rf_GOOGL_model.joblib
Training and evaluation completed for GOOGL.


Training and Evaluating Random Forest Model for AMZN


  grouped_metrics_month = eval_df.groupby('Month').apply(lambda x: pd.Series({
  grouped_metrics_quarter = eval_df.groupby('Quarter').apply(lambda x: pd.Series({
  grouped_metrics_season = eval_df.groupby('Season').apply(lambda x: pd.Series({


 - Random Forest model trained for AMZN
 - Feature names saved for AMZN at ../models/random_forest_models/feature_names_AMZN.pkl
 - Overall Evaluation Metrics for AMZN:
    RMSE = 2.9358
    MAE = 1.6861
    R2 = 0.9930
    MAPE = 1.10%
 - Random Forest model saved for AMZN at ../models/random_forest_models/rf_AMZN_model.joblib
Training and evaluation completed for AMZN.


Overall Evaluation Metrics for All Stocks - Random Forest


  grouped_metrics_month = eval_df.groupby('Month').apply(lambda x: pd.Series({
  grouped_metrics_quarter = eval_df.groupby('Quarter').apply(lambda x: pd.Series({
  grouped_metrics_season = eval_df.groupby('Season').apply(lambda x: pd.Series({


Unnamed: 0,RMSE,MAE,R2,MAPE
AAPL,21.015516,12.365092,0.364223,5.849518
MSFT,52.490979,35.949461,0.402119,8.70581
GOOGL,11.81201,6.405069,0.817862,3.908214
AMZN,2.935761,1.686115,0.992978,1.100694



 - Overall Evaluation Metrics table for Random Forest saved as 'overall_evaluation_metrics_rf.csv'.

Month Evaluation Metrics for AAPL - Random Forest


Unnamed: 0,Month,RMSE,MAE,R2,MAPE
0,1,7.569826,5.483422,0.918651,3.012598
1,2,5.31665,3.941839,0.903758,2.179968
2,3,1.194161,0.994367,0.985054,0.610899
3,4,1.190077,0.955172,0.878032,0.569247
4,5,6.553307,4.408831,0.366785,2.346895
5,6,20.430917,16.00196,-1.60172,7.836589
6,7,35.061171,30.889723,-3.444378,14.22199
7,8,30.85999,23.13586,-1.116477,10.567676
8,9,32.438764,23.841383,-0.850537,10.747264
9,10,35.440438,24.593986,-0.484406,10.737619


 - Month Evaluation Metrics table for AAPL saved as 'AAPL_Month_evaluation_metrics_rf.csv'.

Month Evaluation Metrics for MSFT - Random Forest


Unnamed: 0,Month,RMSE,MAE,R2,MAPE
0,1,35.877189,25.667412,0.782043,6.682965
1,2,49.36663,36.275398,0.565365,9.004785
2,3,52.292542,36.505128,0.521837,8.88732
3,4,54.464016,40.143372,0.248637,9.753013
4,5,54.748695,39.281576,-0.094573,9.453885
5,6,67.669103,47.070929,-0.65242,10.764869
6,7,77.090155,57.129577,-1.096028,12.904041
7,8,51.158665,36.625654,-0.322728,8.935016
8,9,59.68421,42.696979,-0.418479,10.097535
9,10,54.634849,38.578609,-0.056054,9.292085


 - Month Evaluation Metrics table for MSFT saved as 'MSFT_Month_evaluation_metrics_rf.csv'.

Month Evaluation Metrics for GOOGL - Random Forest


Unnamed: 0,Month,RMSE,MAE,R2,MAPE
0,1,1.561016,1.166854,0.996353,0.97026
1,2,1.185179,0.879562,0.99749,0.770596
2,3,1.031326,0.888361,0.997979,0.758934
3,4,7.200949,4.994309,0.922736,3.236044
4,5,16.7291,12.015641,0.656666,7.063202
5,6,20.43709,14.52608,0.457768,8.294879
6,7,24.088874,17.420144,0.329872,9.674413
7,8,11.301584,8.337376,0.543112,5.175187
8,9,7.980653,5.535097,0.601154,3.503924
9,10,11.579684,8.35223,0.631685,5.154417


 - Month Evaluation Metrics table for GOOGL saved as 'GOOGL_Month_evaluation_metrics_rf.csv'.

Month Evaluation Metrics for AMZN - Random Forest


Unnamed: 0,Month,RMSE,MAE,R2,MAPE
0,1,1.212346,1.015518,0.99838,0.89261
1,2,1.154936,0.824639,0.998978,0.663614
2,3,0.987668,0.817505,0.999395,0.675989
3,4,1.758486,1.431511,0.997963,1.017657
4,5,1.995899,1.474755,0.997,0.943215
5,6,3.571356,2.292149,0.985865,1.360212
6,7,7.135266,4.424334,0.944299,2.344278
7,8,1.632501,1.13392,0.992715,0.740345
8,9,3.488826,2.415065,0.981357,1.402757
9,10,3.429965,2.681868,0.987527,1.614068


 - Month Evaluation Metrics table for AMZN saved as 'AMZN_Month_evaluation_metrics_rf.csv'.

Comparative RMSE Across Month for All Stocks - Random Forest


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,7.569826,35.877189,1.561016,1.212346
2,5.31665,49.36663,1.185179,1.154936
3,1.194161,52.292542,1.031326,0.987668
4,1.190077,54.464016,7.200949,1.758486
5,6.553307,54.748695,16.7291,1.995899
6,20.430917,67.669103,20.43709,3.571356
7,35.061171,77.090155,24.088874,7.135266
8,30.85999,51.158665,11.301584,1.632501
9,32.438764,59.68421,7.980653,3.488826
10,35.440438,54.634849,11.579684,3.429965


 - Comparative RMSE Across Month table for Random Forest saved as 'comparative_RMSE_across_Month_rf.csv'.

Comparative MAE Across Month for All Stocks - Random Forest


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,5.483422,25.667412,1.166854,1.015518
2,3.941839,36.275398,0.879562,0.824639
3,0.994367,36.505128,0.888361,0.817505
4,0.955172,40.143372,4.994309,1.431511
5,4.408831,39.281576,12.015641,1.474755
6,16.00196,47.070929,14.52608,2.292149
7,30.889723,57.129577,17.420144,4.424334
8,23.13586,36.625654,8.337376,1.13392
9,23.841383,42.696979,5.535097,2.415065
10,24.593986,38.578609,8.35223,2.681868


 - Comparative MAE Across Month table for Random Forest saved as 'comparative_MAE_across_Month_rf.csv'.

Comparative R2 Across Month for All Stocks - Random Forest


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.918651,0.782043,0.996353,0.99838
2,0.903758,0.565365,0.99749,0.998978
3,0.985054,0.521837,0.997979,0.999395
4,0.878032,0.248637,0.922736,0.997963
5,0.366785,-0.094573,0.656666,0.997
6,-1.60172,-0.65242,0.457768,0.985865
7,-3.444378,-1.096028,0.329872,0.944299
8,-1.116477,-0.322728,0.543112,0.992715
9,-0.850537,-0.418479,0.601154,0.981357
10,-0.484406,-0.056054,0.631685,0.987527


 - Comparative R2 Across Month table for Random Forest saved as 'comparative_R2_across_Month_rf.csv'.

Comparative MAPE Across Month for All Stocks - Random Forest


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3.012598,6.682965,0.97026,0.89261
2,2.179968,9.004785,0.770596,0.663614
3,0.610899,8.88732,0.758934,0.675989
4,0.569247,9.753013,3.236044,1.017657
5,2.346895,9.453885,7.063202,0.943215
6,7.836589,10.764869,8.294879,1.360212
7,14.22199,12.904041,9.674413,2.344278
8,10.567676,8.935016,5.175187,0.740345
9,10.747264,10.097535,3.503924,1.402757
10,10.737619,9.292085,5.154417,1.614068


 - Comparative MAPE Across Month table for Random Forest saved as 'comparative_MAPE_across_Month_rf.csv'.

Quarter Evaluation Metrics for AAPL - Random Forest


Unnamed: 0,Quarter,RMSE,MAE,R2,MAPE
0,1,5.344335,3.425283,0.922374,1.908975
1,2,12.21297,6.985832,0.252882,3.520528
2,3,32.793339,25.922342,-1.408725,11.832752
3,4,22.258604,12.757539,0.486875,5.974046


 - Quarter Evaluation Metrics table for AAPL saved as 'AAPL_Quarter_evaluation_metrics_rf.csv'.

Quarter Evaluation Metrics for MSFT - Random Forest


Unnamed: 0,Quarter,RMSE,MAE,R2,MAPE
0,1,46.451611,32.819715,0.630862,8.18978
1,2,59.10467,42.056838,-0.065355,9.971514
2,3,63.361484,45.318707,-0.558315,10.613755
3,4,36.867534,23.696669,0.705531,6.072605


 - Quarter Evaluation Metrics table for MSFT saved as 'MSFT_Quarter_evaluation_metrics_rf.csv'.

Quarter Evaluation Metrics for GOOGL - Random Forest


Unnamed: 0,Quarter,RMSE,MAE,R2,MAPE
0,1,1.276508,0.978402,0.997212,0.833074
1,2,15.785297,10.515985,0.686323,6.202031
2,3,16.038045,10.458518,0.422909,6.136738
3,4,6.940702,3.58341,0.927861,2.412062


 - Quarter Evaluation Metrics table for GOOGL saved as 'GOOGL_Quarter_evaluation_metrics_rf.csv'.

Quarter Evaluation Metrics for AMZN - Random Forest


Unnamed: 0,Quarter,RMSE,MAE,R2,MAPE
0,1,1.119772,0.885771,0.999038,0.744272
1,2,2.54911,1.722137,0.994883,1.101071
2,3,4.649197,2.625599,0.966805,1.479413
3,4,2.207269,1.487875,0.995731,1.067064


 - Quarter Evaluation Metrics table for AMZN saved as 'AMZN_Quarter_evaluation_metrics_rf.csv'.

Comparative RMSE Across Quarter for All Stocks - Random Forest


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,5.344335,46.451611,1.276508,1.119772
2,12.21297,59.10467,15.785297,2.54911
3,32.793339,63.361484,16.038045,4.649197
4,22.258604,36.867534,6.940702,2.207269


 - Comparative RMSE Across Quarter table for Random Forest saved as 'comparative_RMSE_across_Quarter_rf.csv'.

Comparative MAE Across Quarter for All Stocks - Random Forest


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3.425283,32.819715,0.978402,0.885771
2,6.985832,42.056838,10.515985,1.722137
3,25.922342,45.318707,10.458518,2.625599
4,12.757539,23.696669,3.58341,1.487875


 - Comparative MAE Across Quarter table for Random Forest saved as 'comparative_MAE_across_Quarter_rf.csv'.

Comparative R2 Across Quarter for All Stocks - Random Forest


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.922374,0.630862,0.997212,0.999038
2,0.252882,-0.065355,0.686323,0.994883
3,-1.408725,-0.558315,0.422909,0.966805
4,0.486875,0.705531,0.927861,0.995731


 - Comparative R2 Across Quarter table for Random Forest saved as 'comparative_R2_across_Quarter_rf.csv'.

Comparative MAPE Across Quarter for All Stocks - Random Forest


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.908975,8.18978,0.833074,0.744272
2,3.520528,9.971514,6.202031,1.101071
3,11.832752,10.613755,6.136738,1.479413
4,5.974046,6.072605,2.412062,1.067064


 - Comparative MAPE Across Quarter table for Random Forest saved as 'comparative_MAPE_across_Quarter_rf.csv'.

Season Evaluation Metrics for AAPL - Random Forest


Unnamed: 0,Season,RMSE,MAE,R2,MAPE
0,Autumn,28.108057,17.733575,0.10073,8.026258
1,Spring,3.961734,2.155534,0.853694,1.194306
2,Summer,29.588004,23.453232,-1.605912,10.916004
3,Winter,8.441225,5.892631,0.884623,3.171645


 - Season Evaluation Metrics table for AAPL saved as 'AAPL_Season_evaluation_metrics_rf.csv'.

Season Evaluation Metrics for MSFT - Random Forest


Unnamed: 0,Season,RMSE,MAE,R2,MAPE
0,Autumn,48.178252,31.892595,0.458455,7.803344
1,Spring,53.84366,38.624907,0.322384,9.359369
2,Summer,65.840428,46.69633,-0.634883,10.823939
3,Winter,37.427839,26.097501,0.736985,6.738499


 - Season Evaluation Metrics table for MSFT saved as 'MSFT_Season_evaluation_metrics_rf.csv'.

Season Evaluation Metrics for GOOGL - Random Forest


Unnamed: 0,Season,RMSE,MAE,R2,MAPE
0,Autumn,8.262286,5.080965,0.876486,3.284634
1,Spring,10.638116,6.02855,0.855258,3.719475
2,Summer,19.201819,13.290324,0.419807,7.6457
3,Winter,1.281814,0.966461,0.997167,0.839556


 - Season Evaluation Metrics table for GOOGL saved as 'GOOGL_Season_evaluation_metrics_rf.csv'.

Season Evaluation Metrics for AMZN - Random Forest


Unnamed: 0,Season,RMSE,MAE,R2,MAPE
0,Autumn,2.910614,2.002825,0.99214,1.285507
1,Spring,1.639379,1.240108,0.998234,0.877289
2,Summer,4.668892,2.586885,0.970144,1.466013
3,Winter,1.101751,0.880075,0.998907,0.759613


 - Season Evaluation Metrics table for AMZN saved as 'AMZN_Season_evaluation_metrics_rf.csv'.

Comparative RMSE Across Season for All Stocks - Random Forest


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Autumn,28.108057,48.178252,8.262286,2.910614
Spring,3.961734,53.84366,10.638116,1.639379
Summer,29.588004,65.840428,19.201819,4.668892
Winter,8.441225,37.427839,1.281814,1.101751


 - Comparative RMSE Across Season table for Random Forest saved as 'comparative_RMSE_across_Season_rf.csv'.

Comparative MAE Across Season for All Stocks - Random Forest


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Autumn,17.733575,31.892595,5.080965,2.002825
Spring,2.155534,38.624907,6.02855,1.240108
Summer,23.453232,46.69633,13.290324,2.586885
Winter,5.892631,26.097501,0.966461,0.880075


 - Comparative MAE Across Season table for Random Forest saved as 'comparative_MAE_across_Season_rf.csv'.

Comparative R2 Across Season for All Stocks - Random Forest


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Autumn,0.10073,0.458455,0.876486,0.99214
Spring,0.853694,0.322384,0.855258,0.998234
Summer,-1.605912,-0.634883,0.419807,0.970144
Winter,0.884623,0.736985,0.997167,0.998907


 - Comparative R2 Across Season table for Random Forest saved as 'comparative_R2_across_Season_rf.csv'.

Comparative MAPE Across Season for All Stocks - Random Forest


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,AMZN
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Autumn,8.026258,7.803344,3.284634,1.285507
Spring,1.194306,9.359369,3.719475,0.877289
Summer,10.916004,10.823939,7.6457,1.466013
Winter,3.171645,6.738499,0.839556,0.759613


 - Comparative MAPE Across Season table for Random Forest saved as 'comparative_MAPE_across_Season_rf.csv'.


In [14]:
# Cell 10A: Generating Meta-Features from Base Models

# Define Paths for Models
xgb_model_dir = '../models/xgb_models'
rf_model_dir = '../models/random_forest_models'
lstm_model_dir = '../models/lstm_models'
gru_model_dir = '../models/gru_models'
meta_model_dir = '../models/meta_model'
os.makedirs(meta_model_dir, exist_ok=True)

# Initialize Dictionaries to Store Meta-Features and Targets
meta_features_train_dict = {}
meta_features_test_dict = {}
y_train_dict = {}
y_test_dict = {}

# Function to Create Sequences (Assuming it's defined in a common cell)
def create_sequences(X, y, timesteps):
    X_seq, y_seq = [], []
    for i in range(timesteps, len(X)):
        X_seq.append(X[i-timesteps:i].values)
        y_seq.append(y[i])
    return np.array(X_seq), np.array(y_seq)

# Iterate Through Each Stock to Populate Meta-Features
for stock in stocks:
    stock_upper = stock.upper()  # Ensure stock symbol is uppercase
    print(f"\n{'='*50}\nProcessing Stock: {stock_upper}\n{'='*50}")

    # ----- Load Base Models -----
    missing_models = []
    
    # Load XGBoost Model
    xgb_model_path = os.path.join(xgb_model_dir, f'xgb_{stock_upper}_model.json')
    if not os.path.exists(xgb_model_path):
        missing_models.append('XGBoost')
    else:
        try:
            xgb_model = xgb.XGBRegressor()
            xgb_model.load_model(xgb_model_path)
            print(f" - Loaded XGBoost model for {stock_upper}.")
        except Exception as e:
            print(f" - Error loading XGBoost model for {stock_upper}: {e}")
            missing_models.append('XGBoost')
    
    # Load Random Forest Model
    rf_model_path = os.path.join(rf_model_dir, f'rf_{stock_upper}_model.joblib')
    if not os.path.exists(rf_model_path):
        missing_models.append('Random Forest')
    else:
        try:
            rf_model = joblib.load(rf_model_path)
            print(f" - Loaded Random Forest model for {stock_upper}.")
        except Exception as e:
            print(f" - Error loading Random Forest model for {stock_upper}: {e}")
            missing_models.append('Random Forest')

    # Load LSTM Model
    lstm_model_path = os.path.join(lstm_model_dir, f'lstm_{stock_upper}_best.keras')
    if not os.path.exists(lstm_model_path):
        missing_models.append('LSTM')
    else:
        try:
            lstm_model = load_model(lstm_model_path)
            print(f" - Loaded LSTM model for {stock_upper}.")
        except Exception as e:
            print(f" - Error loading LSTM model for {stock_upper}: {e}")
            missing_models.append('LSTM')

    # Load GRU Model
    gru_model_path = os.path.join(gru_model_dir, f'gru_{stock_upper}_best.keras')
    if not os.path.exists(gru_model_path):
        missing_models.append('GRU')
    else:
        try:
            gru_model = load_model(gru_model_path)
            print(f" - Loaded GRU model for {stock_upper}.")
        except Exception as e:
            print(f" - Error loading GRU model for {stock_upper}: {e}")
            missing_models.append('GRU')

    if missing_models:
        print(f" - Missing or failed to load models for {stock_upper}: {', '.join(missing_models)}. Skipping.")
        continue

    # ----- Retrieve Scaled Data -----
    data = scaled_daily_data.get(stock)
    if data is None:
        print(f" - No scaled data found for {stock_upper}. Skipping.")
        continue

    # Check for 'test_dates'
    if 'test_dates' not in data:
        print(f" - 'test_dates' not found for {stock_upper}. Skipping.")
        continue
    test_dates = data['test_dates']

    X_train_scaled = data['X_train_scaled']
    X_test_scaled = data['X_test_scaled']
    y_train_scaled = data['y_train_scaled']
    y_test_scaled = data['y_test_scaled']
    scaler_y = data['scaler_y']

    # ----- Create Sequences for LSTM and GRU Models -----
    TIMESTEPS = 60  # Ensure consistency
    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, TIMESTEPS)
    X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, TIMESTEPS)
    
    print(f" - Training sequences: {X_train_seq.shape}, Training targets: {y_train_seq.shape}")
    print(f" - Testing sequences: {X_test_seq.shape}, Testing targets: {y_test_seq.shape}")
    
    # ----- Generate Predictions from Base Models -----
    try:
        # XGBoost Predictions
        xgb_pred_train_scaled = xgb_model.predict(X_train_scaled.iloc[TIMESTEPS:])
        xgb_pred_test_scaled = xgb_model.predict(X_test_scaled.iloc[TIMESTEPS:])
        xgb_pred_train = scaler_y.inverse_transform(xgb_pred_train_scaled.reshape(-1, 1)).flatten()
        xgb_pred_test = scaler_y.inverse_transform(xgb_pred_test_scaled.reshape(-1, 1)).flatten()
        print(f" - XGBoost predictions generated for {stock_upper}.")
        
        # Random Forest Predictions
        rf_pred_train_scaled = rf_model.predict(X_train_scaled.iloc[TIMESTEPS:])
        rf_pred_test_scaled = rf_model.predict(X_test_scaled.iloc[TIMESTEPS:])
        rf_pred_train = scaler_y.inverse_transform(rf_pred_train_scaled.reshape(-1, 1)).flatten()
        rf_pred_test = scaler_y.inverse_transform(rf_pred_test_scaled.reshape(-1, 1)).flatten()
        print(f" - Random Forest predictions generated for {stock_upper}.")
        
        # LSTM Predictions
        lstm_pred_train_scaled = lstm_model.predict(X_train_seq).flatten()
        lstm_pred_test_scaled = lstm_model.predict(X_test_seq).flatten()
        lstm_pred_train = scaler_y.inverse_transform(lstm_pred_train_scaled.reshape(-1, 1)).flatten()
        lstm_pred_test = scaler_y.inverse_transform(lstm_pred_test_scaled.reshape(-1, 1)).flatten()
        print(f" - LSTM predictions generated for {stock_upper}.")
        
        # GRU Predictions
        gru_pred_train_scaled = gru_model.predict(X_train_seq).flatten()
        gru_pred_test_scaled = gru_model.predict(X_test_seq).flatten()
        gru_pred_train = scaler_y.inverse_transform(gru_pred_train_scaled.reshape(-1, 1)).flatten()
        gru_pred_test = scaler_y.inverse_transform(gru_pred_test_scaled.reshape(-1, 1)).flatten()
        print(f" - GRU predictions generated for {stock_upper}.")
    except Exception as e:
        print(f" - Error during prediction generation for {stock_upper}: {e}")
        continue
    
    # ----- Align Predictions and Targets -----
    # Determine the minimum length to ensure alignment
    min_length_train = min(len(xgb_pred_train), len(rf_pred_train), len(lstm_pred_train), len(gru_pred_train), len(y_train_seq))
    min_length_test = min(len(xgb_pred_test), len(rf_pred_test), len(lstm_pred_test), len(gru_pred_test), len(y_test_seq))
    
    # Slice predictions and targets to min_length
    xgb_pred_train = xgb_pred_train[:min_length_train]
    rf_pred_train = rf_pred_train[:min_length_train]
    lstm_pred_train = lstm_pred_train[:min_length_train]
    gru_pred_train = gru_pred_train[:min_length_train]
    y_train = scaler_y.inverse_transform(y_train_seq[:min_length_train].reshape(-1, 1)).flatten()
    
    xgb_pred_test = xgb_pred_test[:min_length_test]
    rf_pred_test = rf_pred_test[:min_length_test]
    lstm_pred_test = lstm_pred_test[:min_length_test]
    gru_pred_test = gru_pred_test[:min_length_test]
    y_test = scaler_y.inverse_transform(y_test_seq[:min_length_test].reshape(-1, 1)).flatten()
    
    # ----- Populate Meta-Features for Training Data -----
    meta_features_train = pd.DataFrame({
        'XGB_Pred': xgb_pred_train,
        'RF_Pred': rf_pred_train,
        'LSTM_Pred': lstm_pred_train,
        'GRU_Pred': gru_pred_train
    })
    meta_features_train_dict[stock] = meta_features_train
    print(f" - Meta-features for training data populated for {stock_upper}.")
    
    # ----- Populate Meta-Features for Test Data -----
    meta_features_test = pd.DataFrame({
        'XGB_Pred': xgb_pred_test,
        'RF_Pred': rf_pred_test,
        'LSTM_Pred': lstm_pred_test,
        'GRU_Pred': gru_pred_test
    })
    meta_features_test_dict[stock] = meta_features_test
    print(f" - Meta-features for testing data populated for {stock_upper}.")
    
    # ----- Store Target Variables -----
    y_train_dict[stock] = y_train
    y_test_dict[stock] = y_test
    print(f" - Target variables stored for {stock_upper}.")

print("\n" + "="*50 + "\nMeta-Features DataFrame Shapes\n" + "="*50)
for stock in meta_features_train_dict.keys():
    print(f" - {stock}: meta_features_train shape: {meta_features_train_dict[stock].shape}, meta_features_test shape: {meta_features_test_dict[stock].shape}, y_train shape: {y_train_dict[stock].shape}, y_test shape: {y_test_dict[stock].shape}")
print("-" * 80)



Processing Stock: AAPL
 - Loaded XGBoost model for AAPL.
 - Loaded Random Forest model for AAPL.
 - Loaded LSTM model for AAPL.
 - Loaded GRU model for AAPL.
 - Training sequences: (1951, 60, 36), Training targets: (1951,)
 - Testing sequences: (443, 60, 36), Testing targets: (443,)
 - XGBoost predictions generated for AAPL.
 - Random Forest predictions generated for AAPL.
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
 - LSTM predictions generated for AAPL.
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
 - GRU predictions generated for AAPL.
 - Meta-features for training data populated for AAPL.
 - Meta-features for testing data populated for AAPL.
 - Target variables stored for AAPL.

Processing Stock: MSFT
 - Loaded XGBoost model for MSFT.
 - Loaded Random Forest model for MSFT.
 - Loade

In [15]:
# Cell 10B: Training and Evaluating the Stacking Meta-Model

# Initialize a dictionary to store meta-model performance
meta_model_per_stock = {}

print("\n" + "="*50 + "\nTraining Meta-Models for Each Stock\n" + "="*50)

for stock in meta_features_train_dict.keys():
    print(f"\nTraining Meta-Model for {stock}")
    
    meta_features_train = meta_features_train_dict[stock]
    meta_features_test = meta_features_test_dict[stock]
    y_train = y_train_dict[stock]
    y_test = y_test_dict[stock]
    
    # Check if meta_features_train and y_train are non-empty
    if meta_features_train.empty or len(y_train) == 0:
        print(f" - Empty meta-features or target variables for {stock}. Skipping.")
        continue
    
    # ----- Train Meta-Model (Ridge Regression with Cross-Validation) -----
    try:
        meta_model = RidgeCV()
        meta_model.fit(meta_features_train, y_train)
        print(f" - Meta-Model (Ridge Regression) trained successfully for {stock}.")
    except Exception as e:
        print(f" - Error training Meta-Model for {stock}: {e}")
        continue
    
    # ----- Save Meta-Model -----
    meta_model_path = os.path.join(meta_model_dir, f'stacking_meta_model_{stock.upper()}.joblib')
    try:
        joblib.dump(meta_model, meta_model_path)
        print(f" - Meta-Model saved at '{meta_model_path}'")
    
        # Save the feature names used during training
        meta_feature_names = meta_features_train.columns.tolist()
        feature_names_path = os.path.join(meta_model_dir, f'meta_feature_names_{stock.upper()}.pkl')
        with open(feature_names_path, 'wb') as f:
            pickle.dump(meta_feature_names, f)
        print(f" - Meta-model feature names saved at '{feature_names_path}'")
    
        # Additional logging for file size and integrity check
        file_size = os.path.getsize(meta_model_path)
        print(f" - File size of saved meta-model for {stock}: {file_size} bytes")
    
        # Verify by loading the saved meta-model immediately
        loaded_meta_model = joblib.load(meta_model_path)
        print(f" - Meta-Model loaded successfully after saving for {stock}")
    except Exception as e:
        print(f" - Error saving or loading Meta-Model for {stock}: {e}")
        continue
    
    # ----- Generate Meta-Predictions on Test Data -----
    try:
        # Ensure meta_features_test has the same columns in the same order
        meta_features_test = meta_features_test[meta_feature_names]
    
        meta_pred_test = meta_model.predict(meta_features_test)
        print(f" - Meta-Predictions generated for {stock}.")
    except Exception as e:
        print(f" - Error generating Meta-Predictions for {stock}: {e}")
        continue
    
    # ----- Evaluate Meta-Model -----
    try:
        rmse_meta = np.sqrt(mean_squared_error(y_test, meta_pred_test))
        mae_meta = mean_absolute_error(y_test, meta_pred_test)
        r2_meta = r2_score(y_test, meta_pred_test)
        mape_meta = mean_absolute_percentage_error(y_test, meta_pred_test) * 100
        
        meta_model_per_stock[stock] = {
            'RMSE': rmse_meta,
            'MAE': mae_meta,
            'R2': r2_meta,
            'MAPE': mape_meta
        }
        
        print(f"\nMeta-Model Evaluation Metrics for {stock}:")
        print(f"    RMSE = {rmse_meta:.4f}")
        print(f"    MAE = {mae_meta:.4f}")
        print(f"    R2 = {r2_meta:.4f}")
        print(f"    MAPE = {mape_meta:.2f}%")
    except Exception as e:
        print(f" - Error evaluating Meta-Model for {stock}: {e}")
        continue
    
    # ----- Create Evaluation DataFrame for Meta-Model -----
    try:
        # Align test_dates with predictions
        adjusted_test_dates = pd.date_range(start='2020-01-01', periods=len(meta_pred_test), freq='D')
        
        eval_df_meta = pd.DataFrame({
            'Date': adjusted_test_dates,
            'Actual': y_test,
            'Meta_Predicted': meta_pred_test
        })
        
        # ----- Add Time Features -----
        eval_df_meta['Month'] = eval_df_meta['Date'].dt.month
        eval_df_meta['Quarter'] = eval_df_meta['Date'].dt.quarter
        eval_df_meta['Season'] = eval_df_meta['Month'].apply(
            lambda month: 'Winter' if month in [12, 1, 2] else
                          'Spring' if month in [3, 4, 5] else
                          'Summer' if month in [6, 7, 8] else
                          'Autumn'
        )
        print(f" - Time features added to the meta-model evaluation DataFrame for {stock}.")
        
        # ----- Save Meta-Model Predictions -----
        meta_pred_save_path = os.path.join(meta_model_dir, f'meta_predictions_{stock.upper()}.csv')
        eval_df_meta.to_csv(meta_pred_save_path, index=False)
        print(f" - Meta-Model predictions saved at '{meta_pred_save_path}'")
    except Exception as e:
        print(f" - Error saving Meta-Model predictions for {stock}: {e}")
    
    # ----- Store Meta-Model Performance -----
    print(f" - Meta-Model performance metrics stored for {stock}.")
    
    # ----- Detailed Descriptive Statistics -----
    try:
        df = pd.DataFrame({
            'Actual': y_test,
            'Predicted': meta_pred_test
        })
        descriptive_stats = df.describe().T
        correlation = df.corr().loc['Actual', 'Predicted']
        descriptive_stats['Correlation'] = correlation
        print(f"\nDescriptive Statistics for {stock}:")
        display(descriptive_stats)
        
        # Save Descriptive Statistics
        detailed_metrics_path = os.path.join(meta_model_dir, f'detailed_metrics_{stock.upper()}.csv')
        descriptive_stats.to_csv(detailed_metrics_path)
        print(f" - Detailed metrics saved for {stock} at '{detailed_metrics_path}'")
    except Exception as e:
        print(f" - Error generating detailed statistics for {stock}: {e}")
    
# ----- Create Overall Metrics Table -----
overall_metrics_meta_df = pd.DataFrame(meta_model_per_stock).T
overall_metrics_meta_df = overall_metrics_meta_df[['RMSE', 'MAE', 'R2', 'MAPE']].round(4)

print("\n" + "="*50)
print("Overall Evaluation Metrics for All Stocks - Meta Stacked Model")
print("="*50)
display(overall_metrics_meta_df)
overall_metrics_meta_df.to_csv('overall_evaluation_metrics_meta_stacked.csv')
print("\n - Overall Evaluation Metrics table for Meta Stacked Model saved as 'overall_evaluation_metrics_meta_stacked.csv'.")



Training Meta-Models for Each Stock

Training Meta-Model for AAPL
 - Meta-Model (Ridge Regression) trained successfully for AAPL.
 - Meta-Model saved at '../models/meta_model/stacking_meta_model_AAPL.joblib'
 - Meta-model feature names saved at '../models/meta_model/meta_feature_names_AAPL.pkl'
 - File size of saved meta-model for AAPL: 1036 bytes
 - Meta-Model loaded successfully after saving for AAPL
 - Meta-Predictions generated for AAPL.

Meta-Model Evaluation Metrics for AAPL:
    RMSE = 22.8628
    MAE = 14.3100
    R2 = -0.0161
    MAPE = 6.74%
 - Time features added to the meta-model evaluation DataFrame for AAPL.
 - Meta-Model predictions saved at '../models/meta_model/meta_predictions_AAPL.csv'
 - Meta-Model performance metrics stored for AAPL.

Descriptive Statistics for AAPL:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Correlation
Actual,443.0,187.283838,22.706633,143.0,172.175003,183.860001,194.965004,236.479996,0.688458
Predicted,443.0,173.247324,8.202596,143.090083,172.264543,176.5734,178.394041,179.893591,0.688458


 - Detailed metrics saved for AAPL at '../models/meta_model/detailed_metrics_AAPL.csv'

Training Meta-Model for MSFT
 - Meta-Model (Ridge Regression) trained successfully for MSFT.
 - Meta-Model saved at '../models/meta_model/stacking_meta_model_MSFT.joblib'
 - Meta-model feature names saved at '../models/meta_model/meta_feature_names_MSFT.pkl'
 - File size of saved meta-model for MSFT: 1036 bytes
 - Meta-Model loaded successfully after saving for MSFT
 - Meta-Predictions generated for MSFT.

Meta-Model Evaluation Metrics for MSFT:
    RMSE = 56.5649
    MAE = 41.0254
    R2 = 0.0324
    MAPE = 9.89%
 - Time features added to the meta-model evaluation DataFrame for MSFT.
 - Meta-Model predictions saved at '../models/meta_model/meta_predictions_MSFT.csv'
 - Meta-Model performance metrics stored for MSFT.

Descriptive Statistics for MSFT:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Correlation
Actual,443.0,366.145237,57.56926,242.710007,325.985001,374.070007,416.220001,467.559998,0.812971
Predicted,443.0,325.988641,25.195417,244.037868,326.062129,338.290217,339.737897,342.160162,0.812971


 - Detailed metrics saved for MSFT at '../models/meta_model/detailed_metrics_MSFT.csv'

Training Meta-Model for GOOGL
 - Meta-Model (Ridge Regression) trained successfully for GOOGL.
 - Meta-Model saved at '../models/meta_model/stacking_meta_model_GOOGL.joblib'
 - Meta-model feature names saved at '../models/meta_model/meta_feature_names_GOOGL.pkl'
 - File size of saved meta-model for GOOGL: 1036 bytes
 - Meta-Model loaded successfully after saving for GOOGL
 - Meta-Predictions generated for GOOGL.

Meta-Model Evaluation Metrics for GOOGL:
    RMSE = 12.9345
    MAE = 7.3082
    R2 = 0.7248
    MAPE = 4.38%
 - Time features added to the meta-model evaluation DataFrame for GOOGL.
 - Meta-Model predictions saved at '../models/meta_model/meta_predictions_GOOGL.csv'
 - Meta-Model performance metrics stored for GOOGL.

Descriptive Statistics for GOOGL:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Correlation
Actual,443.0,139.437404,24.682654,89.129997,123.584999,137.399994,161.635002,191.179993,0.92833
Predicted,443.0,132.529753,16.942178,89.901272,123.752378,137.4726,147.056648,148.985035,0.92833


 - Detailed metrics saved for GOOGL at '../models/meta_model/detailed_metrics_GOOGL.csv'

Training Meta-Model for AMZN
 - Meta-Model (Ridge Regression) trained successfully for AMZN.
 - Meta-Model saved at '../models/meta_model/stacking_meta_model_AMZN.joblib'
 - Meta-model feature names saved at '../models/meta_model/meta_feature_names_AMZN.pkl'
 - File size of saved meta-model for AMZN: 1036 bytes
 - Meta-Model loaded successfully after saving for AMZN
 - Meta-Predictions generated for AMZN.

Meta-Model Evaluation Metrics for AMZN:
    RMSE = 3.0815
    MAE = 1.7260
    R2 = 0.9904
    MAPE = 1.06%
 - Time features added to the meta-model evaluation DataFrame for AMZN.
 - Meta-Model predictions saved at '../models/meta_model/meta_predictions_AMZN.csv'
 - Meta-Model performance metrics stored for AMZN.

Descriptive Statistics for AMZN:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Correlation
Actual,443.0,149.469413,31.541288,90.730003,127.115002,147.419998,179.770004,200.0,0.996619
Predicted,443.0,148.343762,30.197046,91.127857,127.505768,146.79202,179.182356,185.401356,0.996619


 - Detailed metrics saved for AMZN at '../models/meta_model/detailed_metrics_AMZN.csv'

Overall Evaluation Metrics for All Stocks - Meta Stacked Model


Unnamed: 0,RMSE,MAE,R2,MAPE
AAPL,22.8628,14.31,-0.0161,6.7429
MSFT,56.5649,41.0254,0.0324,9.8878
GOOGL,12.9345,7.3082,0.7248,4.3816
AMZN,3.0815,1.726,0.9904,1.0578



 - Overall Evaluation Metrics table for Meta Stacked Model saved as 'overall_evaluation_metrics_meta_stacked.csv'.


In [16]:
# Cell 11: Define Essential Functions for all models

# Function to load all necessary models and scalers for a given stock
def load_models(stock):
    try:
        # Define model directories
        xgb_model_dir = '../models/xgb_models'
        rf_model_dir = '../models/random_forest_models'
        lstm_model_dir = '../models/lstm_models'
        gru_model_dir = '../models/gru_models'
        meta_model_dir = '../models/meta_model'
        scalers_dir = '../models/scalers'
        
        # Load XGBoost Model
        xgb_model_path = os.path.join(xgb_model_dir, f'xgb_{stock}_model.json')
        xgb_model = xgb.XGBRegressor()
        xgb_model.load_model(xgb_model_path)
        print(f" - Loaded XGBoost model from '{xgb_model_path}'")
        
        # Load Random Forest Model
        rf_model_path = os.path.join(rf_model_dir, f'rf_{stock}_model.pkl')
        rf_model = joblib.load(rf_model_path)
        print(f" - Loaded Random Forest model from '{rf_model_path}'")
        
        # Load LSTM Model
        lstm_model_path = os.path.join(lstm_model_dir, f'lstm_{stock}_best.keras')
        lstm_model = load_model(lstm_model_path)
        print(f" - Loaded LSTM model from '{lstm_model_path}'")
        
        # Load GRU Model
        gru_model_path = os.path.join(gru_model_dir, f'gru_{stock}_best.keras')
        gru_model = load_model(gru_model_path)
        print(f" - Loaded GRU model from '{gru_model_path}'")
        
        # Load Meta-Model
        meta_model_path = os.path.join(meta_model_dir, f'stacking_meta_model_{stock}.pkl')
        meta_model = joblib.load(meta_model_path)
        print(f" - Loaded Meta-Model from '{meta_model_path}'")
        
        # Load Scalers
        scaler_X_path = os.path.join(scalers_dir, f'minmax_scaler_X_{stock}.joblib')
        scaler_y_path = os.path.join(scalers_dir, f'minmax_scaler_y_{stock}.joblib')
        scaler_X = joblib.load(scaler_X_path)
        scaler_y = joblib.load(scaler_y_path)
        print(f" - Loaded Scalers from '{scaler_X_path}' and '{scaler_y_path}'")
        
        return xgb_model, rf_model, lstm_model, gru_model, meta_model, scaler_X, scaler_y
    except Exception as e:
        print(f" - Error loading models or scalers for {stock}: {e}")
        return None

# Function to generate features for the next day forecast
def generate_next_day_features(current_data_unscaled, scaler_X, timesteps=60):
    if len(current_data_unscaled) < timesteps:
        raise ValueError("Insufficient data to generate features.")
    
    # Extract the most recent 'timesteps' worth of data
    latest_data = current_data_unscaled.tail(timesteps).copy()

    if 'Close' not in latest_data.columns:
        raise KeyError("'Close' column is missing in current_data_unscaled.")

    # Drop 'Close' and 'Date' columns to get the feature columns
    if 'Date' in latest_data.columns:
        features_for_models = latest_data.drop(columns=['Close', 'Date'])
    else:
        features_for_models = latest_data.drop(columns=['Close'])

    # Ensure features are in the same order as during training
    expected_features = scaler_X.feature_names_in_
    
    # Check if there are any missing features compared to what was used during training
    missing_features = set(expected_features) - set(features_for_models.columns)
    if missing_features:
        raise KeyError(f"Missing required columns: {missing_features}")
    
    # Arrange features in the expected order
    features_for_models = features_for_models[expected_features]
    
    # 1. Scale features for XGBoost and Random Forest (single row for last day)
    xgb_rf_features_scaled = scaler_X.transform(features_for_models.iloc[-1:].copy())  # Last row
    
    # 2. Scale features for LSTM and GRU (entire sequence of 'timesteps')
    lstm_gru_features_scaled = scaler_X.transform(features_for_models).reshape(1, timesteps, -1)  # Sequence of 60 days
    
    # Latest features (last row) for future updates (e.g., updating 'Close' price)
    latest_features = features_for_models.iloc[-1].copy()

    # Return both single-row and sequence features
    return xgb_rf_features_scaled, lstm_gru_features_scaled, latest_features


In [17]:
# Cell 12: Initialize and Prepare Data

# Define Paths
raw_data_dir = '../data/stock_data'
forecast_save_dir = '../models/future_forecasts'
os.makedirs(forecast_save_dir, exist_ok=True)

# Initialize a dictionary to store models and scaled data for each stock
models_per_stock = {}
timesteps = 60

for stock in stocks:
    print(f"\n{'='*50}\nProcessing Stock: {stock}\n{'='*50}")
    
    # Load models
    models = load_models(stock)
    if models is None:
        print(f" - Skipping stock '{stock}' due to model loading issues.")
        continue
    xgb_model, rf_model, lstm_model, gru_model, meta_model, scaler_X, scaler_y = models
    
    # Load raw data
    raw_csv_path = os.path.join(raw_data_dir, f"{stock}_daily.csv")
    if not os.path.exists(raw_csv_path):
        print(f" - Raw data CSV not found at '{raw_csv_path}'. Skipping.")
        continue
    
    try:
        df = pd.read_csv(raw_csv_path)
        print(f" - Loaded raw data from '{raw_csv_path}'. Shape: {df.shape}")
    except Exception as e:
        print(f" - Error reading CSV for {stock}: {e}")
        continue
    
    # Ensure 'Date' column is present
    if 'Date' not in df.columns:
        print(f" - 'Date' column missing in '{raw_csv_path}'. Skipping.")
        continue
    
    # Convert 'Date' to datetime
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    if df['Date'].isnull().any():
        print(f" - Some 'Date' entries could not be converted to datetime for {stock}. Dropping these rows.")
        df.dropna(subset=['Date'], inplace=True)
    
    # Sort by Date
    df.sort_values('Date', inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    # Add Enhanced 'Close' Price-Based Features
    try:
        df_fe = add_close_price_features(df)
        print(f" - Applied enhanced 'Close' price-based feature engineering. Shape: {df_fe.shape}")
    except Exception as e:
        print(f" - Error during feature engineering for {stock}: {e}")
        continue
    
    # Keep only necessary recent data
    current_data_unscaled = df_fe.tail(timesteps).copy()
    
    # Prepare features
    feature_cols = [col for col in current_data_unscaled.columns if col not in ['Date', 'Close']]
    X_current = current_data_unscaled[feature_cols].copy()
    
    # Drop 'Date' column if present
    if 'Date' in X_current.columns:
        X_current = X_current.drop(columns=['Date'])
        print(" - Dropped 'Date' column from features.")
    
    # Ensure features are in the same order as during training
    expected_features = scaler_X.feature_names_in_
    
    # Identify missing features
    missing_features = set(expected_features) - set(X_current.columns)
    if missing_features:
        print(f" - Missing Features for {stock}: {missing_features}")
        # Add missing features with default values (e.g., 0)
        for feature in missing_features:
            X_current[feature] = 0  # Alternatively, use df_fe[feature].iloc[-1] or another strategy
        print(f" - Added missing features with default values for {stock}.")
    
    # Reorder columns to match expected_features
    X_current = X_current[expected_features]
    
    # Scale features and target
    try:
        X_current_scaled = pd.DataFrame(scaler_X.transform(X_current), columns=X_current.columns)
        current_data_scaled = X_current_scaled.copy()
        current_data_scaled['Close'] = scaler_y.transform(current_data_unscaled['Close'].values.reshape(-1, 1)).flatten()
        current_data_scaled['Date'] = current_data_unscaled['Date'].values
        print(f" - Scaled current data for {stock}")
    except Exception as e:
        print(f" - Error scaling data for {stock}: {e}")
        continue
    
    # Store prepared data in models_per_stock
    models_per_stock[stock] = {
        'models': models,
        'current_data_scaled': current_data_scaled,
        'current_data_unscaled': current_data_unscaled,
        'scaler_y': scaler_y,
    }
    
    print(f" - Current data for {stock} loaded and prepared. Total samples: {len(current_data_scaled)}")



Processing Stock: AAPL
 - Loaded XGBoost model from '../models/xgb_models/xgb_AAPL_model.json'
 - Error loading models or scalers for AAPL: [Errno 2] No such file or directory: '../models/random_forest_models/rf_AAPL_model.pkl'
 - Skipping stock 'AAPL' due to model loading issues.

Processing Stock: MSFT
 - Loaded XGBoost model from '../models/xgb_models/xgb_MSFT_model.json'
 - Error loading models or scalers for MSFT: [Errno 2] No such file or directory: '../models/random_forest_models/rf_MSFT_model.pkl'
 - Skipping stock 'MSFT' due to model loading issues.

Processing Stock: GOOGL
 - Loaded XGBoost model from '../models/xgb_models/xgb_GOOGL_model.json'
 - Error loading models or scalers for GOOGL: [Errno 2] No such file or directory: '../models/random_forest_models/rf_GOOGL_model.pkl'
 - Skipping stock 'GOOGL' due to model loading issues.

Processing Stock: AMZN
 - Loaded XGBoost model from '../models/xgb_models/xgb_AMZN_model.json'
 - Error loading models or scalers for AMZN: [Errn

In [18]:
# Cell 13: Forecasting Future Prices

def inverse_scale_prediction(scaler_y, prediction_scaled):
    return scaler_y.inverse_transform([[prediction_scaled]])[0][0]

# Define directories
forecast_save_dir = '../models/future_forecasts'
os.makedirs(forecast_save_dir, exist_ok=True)

# Define forecast days
forecast_days = 30

# Initialize a dictionary to store forecast results
forecast_results = {stock: [] for stock in models_per_stock.keys()}

# Start forecasting
for stock in models_per_stock.keys():
    print(f"\nStarting Forecasting for {stock}")
    try:
        models = models_per_stock[stock]['models']
        current_data_scaled = models_per_stock[stock]['current_data_scaled']
        current_data_unscaled = models_per_stock[stock]['current_data_unscaled']
        scaler_X = models_per_stock[stock]['models'][5]  # scaler_X
        scaler_y = models_per_stock[stock]['models'][6]  # scaler_y

        # Corrected model unpacking
        xgb_model, rf_model, lstm_model, gru_model, meta_model, scaler_X, scaler_y = models

        # Create a copy of current_data_unscaled to update with predictions
        updated_unscaled = current_data_unscaled.copy()

        for day in range(1, forecast_days + 1):
            try:
                # Determine the timesteps for this iteration
                timesteps = min(60, len(updated_unscaled))

                # Generate features for XGBoost and Random Forest
                xgb_rf_features_scaled, lstm_gru_features_scaled, latest_features = generate_next_day_features(
                    updated_unscaled,
                    scaler_X,
                    timesteps=timesteps  # Use the current timesteps
                )

                # Ensure lstm_gru_features_scaled has the correct shape for LSTM/GRU
                # Padding sequences if necessary
                if lstm_gru_features_scaled.shape[1] < 60:
                    padding_length = 60 - lstm_gru_features_scaled.shape[1]
                    padding = np.zeros((lstm_gru_features_scaled.shape[0], padding_length, lstm_gru_features_scaled.shape[2]))
                    lstm_gru_features_scaled = np.concatenate((padding, lstm_gru_features_scaled), axis=1)

                # Make predictions with base models
                # Ensure feature names are retained as pandas DataFrame
                xgb_rf_features_scaled_df = pd.DataFrame(xgb_rf_features_scaled, columns=scaler_X.feature_names_in_)

                xgb_pred_scaled = xgb_model.predict(xgb_rf_features_scaled_df)[0]
                rf_pred_scaled = rf_model.predict(xgb_rf_features_scaled_df)[0]

                # Flatten LSTM and GRU predictions to single values
                lstm_pred_scaled = lstm_model.predict(lstm_gru_features_scaled).flatten()[0]  # Ensure it's a single value
                gru_pred_scaled = gru_model.predict(lstm_gru_features_scaled).flatten()[0]    # Ensure it's a single value

                # Debug: Print scaled predictions
                print(f" - Day {day}:")
                print(f"   XGBoost_scaled = {xgb_pred_scaled}")
                print(f"   RF_scaled      = {rf_pred_scaled}")
                print(f"   LSTM_scaled    = {lstm_pred_scaled}")
                print(f"   GRU_scaled     = {gru_pred_scaled}")

                # Meta-model prediction
                meta_features = pd.DataFrame({
                    'XGB_Pred': [xgb_pred_scaled],
                    'RF_Pred': [rf_pred_scaled],
                    'LSTM_Pred': [lstm_pred_scaled],
                    'GRU_Pred': [gru_pred_scaled]
                })

                meta_pred_scaled = meta_model.predict(meta_features)[0]

                # Debug: Print scaled meta-prediction
                print(f"   Meta-model_scaled_prediction = {meta_pred_scaled}")

                # Inversely scale the meta-prediction
                meta_pred = inverse_scale_prediction(scaler_y, meta_pred_scaled)

                # Debug: Print inversely scaled prediction
                print(f"   Meta-model_prediction (original scale) = {meta_pred}")

                # Validation: Ensure predicted 'Close' price is realistic
                # Define realistic bounds based on previous 'Close' price
                last_close_price = updated_unscaled['Close'].iloc[-1]
                min_price = 0.5 * last_close_price
                max_price = 1.5 * last_close_price
                if meta_pred <= 0 or np.isnan(meta_pred):
                    print(f"   Warning: Predicted 'Close' is non-positive or NaN ({meta_pred}). Adjusting to previous 'Close' price.")
                    meta_pred = last_close_price
                else:
                    meta_pred = max(min(meta_pred, max_price), min_price)

                # Append the prediction
                forecast_results[stock].append(meta_pred)

                # Update the unscaled data with the new prediction
                # Assuming 'Close' is the target variable
                new_row = latest_features.copy()
                new_row['Close'] = meta_pred
                new_row['Date'] = updated_unscaled['Date'].max() + BDay(1)  # Increment date by 1 business day

                # Debug: Print the new row before feature engineering
                print(f"   New row before feature engineering:\n{new_row}")

                # Convert new_row to DataFrame
                new_row_df = pd.DataFrame([new_row])

                # Append the new row to updated_unscaled using pd.concat()
                updated_unscaled = pd.concat([updated_unscaled, new_row_df], ignore_index=True)

                # Recalculate any derived features based on the new 'Close' price
                updated_unscaled = add_close_price_features(updated_unscaled)

                # Handle NaN values if any
                updated_unscaled.ffill(inplace=True)
                updated_unscaled.bfill(inplace=True)
                updated_unscaled.fillna(0, inplace=True)

                # Only trim if necessary
                if len(updated_unscaled) > 60:
                    updated_unscaled = updated_unscaled.tail(60).reset_index(drop=True)

                # Debug: Print the last few rows of updated_unscaled to verify updates
                print(f"   Updated 'Close' prices after Day {day}:")
                print(updated_unscaled['Close'].tail(5).values)
                print(f"   Length of updated_unscaled after Day {day}: {len(updated_unscaled)}")
                print(f"   Number of NaNs after feature engineering: {updated_unscaled.isna().sum().sum()}")

            except Exception as e:
                print(f" - Error at Day {day}: {e}")
                break  # Exit the loop if an error occurs

        # Verify the number of predictions
        num_predictions = len(forecast_results[stock])
        print(f"{stock}: Number of predictions = {num_predictions}, Expected = {forecast_days}")

        # Truncate excess predictions if any
        if num_predictions > forecast_days:
            print(f"{stock}: Truncating {num_predictions - forecast_days} excess predictions.")
            forecast_results[stock] = forecast_results[stock][:forecast_days]
        elif num_predictions < forecast_days:
            print(f"{stock}: Missing {forecast_days - num_predictions} predictions.")
            # Optionally, handle missing predictions
            # For now, we'll skip saving forecasts for this stock
            continue

        # Create Forecast DataFrame
        forecast_df = pd.DataFrame({
            'Day': range(1, forecast_days + 1),
            f'{stock}_Predicted': forecast_results[stock]
        })

        # Save Forecast
        forecast_save_path = os.path.join(forecast_save_dir, f'future_forecasts_{stock}.csv')
        forecast_df.to_csv(forecast_save_path, index=False)
        print(f" - Forecast saved at '{forecast_save_path}'")

    except Exception as e:
        print(f" - Error forecasting for {stock}: {e}")

# Display a sample of forecast_results
for stock in models_per_stock.keys():
    preds = forecast_results.get(stock, [])
    if preds:
        print(f"\nSample Predictions for {stock}: {preds[:5]} ...")
    else:
        print(f"\nNo predictions available for {stock}.")


In [19]:
# Cell 14: Forecasting Future Prices

def inverse_scale_prediction(scaler_y, prediction_scaled):
    return scaler_y.inverse_transform([[prediction_scaled]])[0][0]

# Define directories
forecast_save_dir = '../models/future_forecasts'
os.makedirs(forecast_save_dir, exist_ok=True)

# Define forecast days
forecast_days = 30

# Initialize a dictionary to store forecast results
forecast_results = {stock: [] for stock in models_per_stock.keys()}

# Start forecasting
for stock in models_per_stock.keys():
    print(f"\nStarting Forecasting for {stock}")
    try:
        models = models_per_stock[stock]['models']
        current_data_scaled = models_per_stock[stock]['current_data_scaled']
        current_data_unscaled = models_per_stock[stock]['current_data_unscaled']
        scaler_X = models_per_stock[stock]['models'][5]  # scaler_X
        scaler_y = models_per_stock[stock]['models'][6]  # scaler_y

        # Corrected model unpacking
        xgb_model, rf_model, lstm_model, gru_model, meta_model, scaler_X, scaler_y = models

        # Create a copy of current_data_unscaled to update with predictions
        updated_unscaled = current_data_unscaled.copy()

        for day in range(1, forecast_days + 1):
            try:
                # Determine the timesteps for this iteration
                timesteps = min(60, len(updated_unscaled))

                # Generate features for XGBoost and Random Forest
                xgb_rf_features_scaled, lstm_gru_features_scaled, latest_features = generate_next_day_features(
                    updated_unscaled,
                    scaler_X,
                    timesteps=timesteps  # Use the current timesteps
                )

                # Ensure lstm_gru_features_scaled has the correct shape for LSTM/GRU
                # Padding sequences if necessary
                if lstm_gru_features_scaled.shape[1] < 60:
                    padding_length = 60 - lstm_gru_features_scaled.shape[1]
                    padding = np.zeros((lstm_gru_features_scaled.shape[0], padding_length, lstm_gru_features_scaled.shape[2]))
                    lstm_gru_features_scaled = np.concatenate((padding, lstm_gru_features_scaled), axis=1)

                # Make predictions with base models
                xgb_pred_scaled = xgb_model.predict(xgb_rf_features_scaled)[0]
                rf_pred_scaled = rf_model.predict(xgb_rf_features_scaled)[0]
                lstm_pred_scaled = lstm_model.predict(lstm_gru_features_scaled)[0][0]
                gru_pred_scaled = gru_model.predict(lstm_gru_features_scaled)[0][0]

                # Debug: Print scaled predictions
                print(f" - Day {day}:")
                print(f"   XGBoost_scaled = {xgb_pred_scaled}")
                print(f"   RF_scaled      = {rf_pred_scaled}")
                print(f"   LSTM_scaled    = {lstm_pred_scaled}")
                print(f"   GRU_scaled     = {gru_pred_scaled}")

                # Meta-model prediction
                meta_features = np.array([xgb_pred_scaled, rf_pred_scaled, lstm_pred_scaled, gru_pred_scaled]).reshape(1, -1)
                meta_pred_scaled = meta_model.predict(meta_features)[0]

                # Debug: Print scaled meta-prediction
                print(f"   Meta-model_scaled_prediction = {meta_pred_scaled}")

                # Inversely scale the meta-prediction
                meta_pred = inverse_scale_prediction(scaler_y, meta_pred_scaled)

                # Debug: Print inversely scaled prediction
                print(f"   Meta-model_prediction (original scale) = {meta_pred}")

                # Validation: Ensure predicted 'Close' price is realistic
                # Define realistic bounds based on previous 'Close' price
                last_close_price = updated_unscaled['Close'].iloc[-1]
                min_price = 0.5 * last_close_price
                max_price = 1.5 * last_close_price
                if meta_pred <= 0 or np.isnan(meta_pred):
                    print(f"   Warning: Predicted 'Close' is non-positive or NaN ({meta_pred}). Adjusting to previous 'Close' price.")
                    meta_pred = last_close_price
                else:
                    meta_pred = max(min(meta_pred, max_price), min_price)

                # Append the prediction
                forecast_results[stock].append(meta_pred)

                # Update the unscaled data with the new prediction
                # Assuming 'Close' is the target variable
                new_row = latest_features.copy()
                new_row['Close'] = meta_pred
                new_row['Date'] = updated_unscaled['Date'].max() + BDay(1)  # Increment date by 1 business day

                # Debug: Print the new row before feature engineering
                print(f"   New row before feature engineering:\n{new_row}")

                # Convert new_row to DataFrame
                new_row_df = pd.DataFrame([new_row])

                # Append the new row to updated_unscaled using pd.concat()
                updated_unscaled = pd.concat([updated_unscaled, new_row_df], ignore_index=True)

                # Recalculate any derived features based on the new 'Close' price
                updated_unscaled = add_close_price_features(updated_unscaled)

                # Handle NaN values if any
                updated_unscaled.ffill(inplace=True)
                updated_unscaled.bfill(inplace=True)
                updated_unscaled.fillna(0, inplace=True)

                # Only trim if necessary
                if len(updated_unscaled) > 60:
                    updated_unscaled = updated_unscaled.tail(60).reset_index(drop=True)

                # Debug: Print the last few rows of updated_unscaled to verify updates
                print(f"   Updated 'Close' prices after Day {day}:")
                print(updated_unscaled['Close'].tail(5).values)
                print(f"   Length of updated_unscaled after Day {day}: {len(updated_unscaled)}")
                print(f"   Number of NaNs after feature engineering: {updated_unscaled.isna().sum().sum()}")

            except Exception as e:
                print(f" - Error at Day {day}: {e}")
                break  # Exit the loop if an error occurs

        # Verify the number of predictions
        num_predictions = len(forecast_results[stock])
        print(f"{stock}: Number of predictions = {num_predictions}, Expected = {forecast_days}")

        # Truncate excess predictions if any
        if num_predictions > forecast_days:
            print(f"{stock}: Truncating {num_predictions - forecast_days} excess predictions.")
            forecast_results[stock] = forecast_results[stock][:forecast_days]
        elif num_predictions < forecast_days:
            print(f"{stock}: Missing {forecast_days - num_predictions} predictions.")
            # Optionally, handle missing predictions
            # For now, we'll skip saving forecasts for this stock
            continue

        # Create Forecast DataFrame
        forecast_df = pd.DataFrame({
            'Day': range(1, forecast_days + 1),
            f'{stock}_Predicted': forecast_results[stock]
        })

        # Save Forecast
        forecast_save_path = os.path.join(forecast_save_dir, f'future_forecasts_{stock}.csv')
        forecast_df.to_csv(forecast_save_path, index=False)
        print(f" - Forecast saved at '{forecast_save_path}'")

    except Exception as e:
        print(f" - Error forecasting for {stock}: {e}")

# Display a sample of forecast_results
for stock in models_per_stock.keys():
    preds = forecast_results.get(stock, [])
    if preds:
        print(f"\nSample Predictions for {stock}: {preds[:5]} ...")
    else:
        print(f"\nNo predictions available for {stock}.")


In [20]:
# Cell 15: Loading Forecasts and Historical Data

def load_forecast(stock, forecast_dir):
    forecast_path = os.path.join(forecast_dir, f'future_forecasts_{stock}.csv')
    if not os.path.exists(forecast_path):
        print(f" - Forecast file for {stock} not found at '{forecast_path}'.")
        return None
    try:
        df = pd.read_csv(forecast_path)
        print(f" - Loaded forecast for {stock} from '{forecast_path}'. Shape: {df.shape}")
        return df
    except Exception as e:
        print(f" - Error loading forecast for {stock}: {e}")
        return None

def load_historical_data(stock, raw_data_dir, validation_days=30):
    historical_path = os.path.join(raw_data_dir, f"{stock}_daily.csv")
    if not os.path.exists(historical_path):
        print(f" - Historical data for {stock} not found at '{historical_path}'.")
        return None, None
    try:
        df = pd.read_csv(historical_path)
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df.dropna(subset=['Date'], inplace=True)
        df.sort_values('Date', inplace=True)
        df.reset_index(drop=True, inplace=True)
        
        # Split into training and validation sets
        if len(df) < validation_days + 1:
            print(f" - Not enough data for {stock} to perform backtesting. Required: {validation_days + 1}, Available: {len(df)}")
            return None, None
        
        training_df = df.iloc[:-validation_days].copy()
        validation_df = df.iloc[-validation_days:].copy()
        
        print(f" - Loaded historical data for {stock} from '{historical_path}'. Training Shape: {training_df.shape}, Validation Shape: {validation_df.shape}")
        return training_df, validation_df
    except Exception as e:
        print(f" - Error loading historical data for {stock}: {e}")
        return None, None

# Define directories
forecast_dir = '../models/future_forecasts'
raw_data_dir = '../data/stock_data'  # Ensure this path is correct relative to your notebook

# List of stocks
stocks = ['AAPL', 'MSFT', 'GOOGL', 'AMZN']

# Initialize dictionaries to store forecasts and historical data
forecasts = {}
historicals = {}

# Load forecasts and historical data
for stock in stocks:
    print(f"\nLoading data for {stock}:")
    forecasts[stock] = load_forecast(stock, forecast_dir)
    historicals[stock] = load_historical_data(stock, raw_data_dir, validation_days=30)

# Inspect loaded data (Optional but recommended)
for stock in stocks:
    print(f"\nInspecting data for {stock}:")
    forecast_df = forecasts.get(stock)
    historical_data = historicals.get(stock)
    
    if historical_data is not None:
        training_df, validation_df = historical_data
    else:
        training_df, validation_df = None, None
    
    if forecast_df is not None:
        print("Forecast DataFrame Head:")
        print(forecast_df.head())
    
    if training_df is not None:
        print("Training DataFrame Tail:")
        print(training_df.tail())
    
    if validation_df is not None:
        print("Validation DataFrame Tail:")
        print(validation_df.tail())



Loading data for AAPL:
 - Forecast file for AAPL not found at '../models/future_forecasts/future_forecasts_AAPL.csv'.
 - Loaded historical data for AAPL from '../data/stock_data/AAPL_daily.csv'. Training Shape: (2484, 6), Validation Shape: (30, 6)

Loading data for MSFT:
 - Forecast file for MSFT not found at '../models/future_forecasts/future_forecasts_MSFT.csv'.
 - Loaded historical data for MSFT from '../data/stock_data/MSFT_daily.csv'. Training Shape: (2484, 6), Validation Shape: (30, 6)

Loading data for GOOGL:
 - Forecast file for GOOGL not found at '../models/future_forecasts/future_forecasts_GOOGL.csv'.
 - Loaded historical data for GOOGL from '../data/stock_data/GOOGL_daily.csv'. Training Shape: (2484, 6), Validation Shape: (30, 6)

Loading data for AMZN:
 - Forecast file for AMZN not found at '../models/future_forecasts/future_forecasts_AMZN.csv'.
 - Loaded historical data for AMZN from '../data/stock_data/AMZN_daily.csv'. Training Shape: (2484, 6), Validation Shape: (30, 6)

In [None]:
# Define directories (adjust paths as necessary)
forecast_dir = '../models/future_forecasts'
raw_data_dir = '../data/stock_data'  # Ensure this path is correct relative to your notebook

# Function to load forecasts
def load_forecast(stock, forecast_dir):
    forecast_path = os.path.join(forecast_dir, f'future_forecasts_{stock}.csv')
    if not os.path.exists(forecast_path):
        print(f" - Forecast file for {stock} not found at '{forecast_path}'.")
        return None
    try:
        df = pd.read_csv(forecast_path)
        print(f" - Loaded forecast for {stock} from '{forecast_path}'. Shape: {df.shape}")
        return df
    except Exception as e:
        print(f" - Error loading forecast for {stock}: {e}")
        return None

# Function to load historical data
def load_historical_data(stock, raw_data_dir, validation_days=30):
    historical_path = os.path.join(raw_data_dir, f"{stock}_daily.csv")
    if not os.path.exists(historical_path):
        print(f" - Historical data for {stock} not found at '{historical_path}'.")
        return None, None
    try:
        df = pd.read_csv(historical_path)
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df.dropna(subset=['Date'], inplace=True)
        df.sort_values('Date', inplace=True)
        df.reset_index(drop=True, inplace=True)
        
        # Split into training and validation sets
        if len(df) < validation_days + 1:
            print(f" - Not enough data for {stock} to perform backtesting. Required: {validation_days + 1}, Available: {len(df)}")
            return None, None
        
        training_df = df.iloc[:-validation_days].copy()
        validation_df = df.iloc[-validation_days:].copy()
        
        print(f" - Loaded historical data for {stock} from '{historical_path}'. Training Shape: {training_df.shape}, Validation Shape: {validation_df.shape}")
        return training_df, validation_df
    except Exception as e:
        print(f" - Error loading historical data for {stock}: {e}")
        return None, None

# Initialize dictionaries to store forecasts and historical data
forecasts = {}
historicals = {}

# Load forecasts and historical data
for stock in stocks:
    print(f"\nLoading data for {stock}:")
    forecasts[stock] = load_forecast(stock, forecast_dir)
    historicals[stock] = load_historical_data(stock, raw_data_dir, validation_days=30)

# Set display options for better readability
pd.set_option('display.max_rows', None)       # Display all rows
pd.set_option('display.max_columns', None)    # Display all columns
pd.set_option('display.width', None)          # No wrapping in output
pd.set_option('display.float_format', '{:.6f}'.format)  # Format floats

# Plotting the forecasts alongside historical data
for stock in stocks:
    forecast_df = forecasts.get(stock)
    historical_data = historicals.get(stock)
    
    if historical_data is not None and forecast_df is not None:
        training_df, validation_df = historical_data
        
        # Combine training and validation data
        historical_df = pd.concat([training_df, validation_df], ignore_index=True)
        historical_df['Date'] = pd.to_datetime(historical_df['Date'])
        
        # Get the last date from the historical data
        last_historical_date = historical_df['Date'].max()
        
        # Generate future dates for forecasts
        forecast_days = forecast_df.shape[0]
        
        # Use custom business days to account for weekends and US Federal Holidays
        us_bd = CustomBusinessDay(calendar=USFederalHolidayCalendar())
        future_dates = pd.date_range(start=last_historical_date + pd.Timedelta(days=1), periods=forecast_days, freq=us_bd)
        
        # Add dates to forecast_df
        forecast_df['Date'] = future_dates
        forecast_df.rename(columns={f'{stock}_Predicted': 'Predicted_Close'}, inplace=True)
        
        # Merge historical and forecast data
        combined_df = pd.merge(historical_df[['Date', 'Close']], forecast_df[['Date', 'Predicted_Close']], on='Date', how='outer')
        combined_df.sort_values('Date', inplace=True)
        combined_df.reset_index(drop=True, inplace=True)
        
        # Focus on the last year of data
        one_year_ago = last_historical_date - pd.DateOffset(years=1)
        mask = combined_df['Date'] >= one_year_ago
        combined_df_last_year = combined_df.loc[mask].reset_index(drop=True)
        
        # Print out the combined DataFrame for the last year
        print(f"\nCombined Data for {stock} - Last Year:")
        print(combined_df_last_year[['Date', 'Close', 'Predicted_Close']].to_string(index=False))
        
        # Plot the data
        plt.figure(figsize=(14,7))
        plt.plot(combined_df_last_year['Date'], combined_df_last_year['Close'], label='Actual Close Prices')
        plt.plot(combined_df_last_year['Date'], combined_df_last_year['Predicted_Close'], label='Predicted Close Prices', linestyle='--')
        plt.xlabel('Date')
        plt.ylabel('Price')
        plt.title(f'Stock Price Prediction for {stock} - Last Year')
        plt.axvline(x=last_historical_date, color='grey', linestyle='--', label='Forecast Start')
        plt.legend()
        plt.show()
    else:
        print(f"Data not available for {stock}.")


In [None]:
import os
import joblib

# Set the model directory path
model_dir = '../models/meta_model'

# Define the model path
stock_symbol = 'AAPL'
model_path = os.path.join(model_dir, f'stacking_meta_model_{stock_symbol}.pkl')

# Try loading the model
try:
    loaded_model = joblib.load(model_path)
    print(f"Meta model for {stock_symbol} loaded successfully.")
except Exception as e:
    print(f"Error loading meta model for {stock_symbol}: {e}")


In [None]:
import os
import joblib
import numpy as np
import pandas as pd

# Set the model directory path
model_dir = '../models/meta_model'

# Define the model path
stock_symbol = 'AAPL'
model_path = os.path.join(model_dir, f'stacking_meta_model_{stock_symbol}.pkl')

# Try loading the model
try:
    loaded_model = joblib.load(model_path)
    print(f"Meta model for {stock_symbol} loaded successfully.")
except Exception as e:
    print(f"Error loading meta model for {stock_symbol}: {e}")

# Example input features for testing
gru_prediction = 0.5   # Example GRU model prediction
lstm_prediction = 0.6  # Example LSTM model prediction
rf_prediction = 0.4    # Example Random Forest model prediction
xgb_prediction = 0.7   # Example XGBoost model prediction

# Combine predictions into a DataFrame to match the trained model's feature names
predictions_df = pd.DataFrame({
    'XGB_Pred': [xgb_prediction],
    'RF_Pred': [rf_prediction],
    'LSTM_Pred': [lstm_prediction],
    'GRU_Pred': [gru_prediction]
})

# Make a prediction using the meta-model
try:
    final_prediction = loaded_model.predict(predictions_df)[0]
    print(f"Meta-model final prediction: {final_prediction}")
except Exception as e:
    print(f"Error making prediction with the meta model: {e}")

# (Optional) If you have a scaler to inverse transform the prediction, include it as well
try:
    from sklearn.preprocessing import MinMaxScaler
    scaler_y = joblib.load('../models/scalers/minmax_scaler_y_AAPL.joblib')
    final_prediction_rescaled = scaler_y.inverse_transform([[final_prediction]])[0][0]
    print(f"Final prediction rescaled to original scale: {final_prediction_rescaled}")
except Exception as e:
    print(f"Error with inverse scaling: {e}")


In [None]:
# Cell 16: Calculating and Summarizing Evaluation Metrics via Backtesting

import pandas as pd
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from pandas.tseries.offsets import BDay

def calculate_metrics(actual, predicted):
    """Calculate evaluation metrics between actual and predicted values."""
    rmse = mean_squared_error(actual, predicted, squared=False)
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    mape = (abs((actual - predicted) / actual).mean()) * 100
    return {'RMSE': rmse, 'MAE': mae, 'R2': r2, 'MAPE': mape}

def inverse_scale_prediction(scaler_y, prediction_scaled):
    """Inverse scale the prediction using scaler_y."""
    return scaler_y.inverse_transform([[prediction_scaled]])[0][0]

def evaluate_forecasts_backtesting(models_per_stock, forecast_days=30):
    """Evaluate forecasts against actual data using backtesting."""
    metrics_summary = {}
    comparison_tables = {}

    for stock, data in models_per_stock.items():
        print(f"\nEvaluating Forecast for {stock}:")
        training_df, validation_df = data['training_validation']

        if training_df is None or validation_df is None:
            print(f" - Insufficient data for {stock}. Skipping evaluation.")
            continue

        try:
            # Extract models and scalers
            xgb_model, rf_model, lstm_model, gru_model, meta_model, scaler_X, scaler_y = data['models']
            
            # Prepare initial input (last 'timesteps' days from training data)
            timesteps = 60
            initial_input_unscaled = training_df.tail(timesteps).copy()
            updated_unscaled = initial_input_unscaled.copy()
            
            predicted_prices = []
            
            for day in range(forecast_days):
                # Generate features
                xgb_rf_features_scaled, lstm_gru_features_scaled, latest_features = generate_next_day_features(
                    updated_unscaled, scaler_X, timesteps=timesteps
                )
                
                # Base model predictions
                xgb_pred_scaled = xgb_model.predict(xgb_rf_features_scaled)[0]
                rf_pred_scaled = rf_model.predict(xgb_rf_features_scaled)[0]
                lstm_pred_scaled = lstm_model.predict(lstm_gru_features_scaled)[0][0]
                gru_pred_scaled = gru_model.predict(lstm_gru_features_scaled)[0][0]
                
                # Meta-model prediction
                meta_features = pd.DataFrame([[xgb_pred_scaled, rf_pred_scaled, lstm_pred_scaled, gru_pred_scaled]])
                meta_pred_scaled = meta_model.predict(meta_features)[0]
                meta_pred = inverse_scale_prediction(scaler_y, meta_pred_scaled)
                
                # Append prediction
                predicted_prices.append(meta_pred)
                
                # Update the data with the new prediction
                new_row = latest_features.copy()
                new_row['Close'] = meta_pred
                new_row['Date'] = updated_unscaled['Date'].max() + BDay(1)
                new_row_df = pd.DataFrame([new_row])
                updated_unscaled = pd.concat([updated_unscaled, new_row_df], ignore_index=True)
                
                # Recalculate derived features
                updated_unscaled = add_close_price_features(updated_unscaled)
                updated_unscaled = updated_unscaled.tail(timesteps).reset_index(drop=True)
            
            # Actual vs Predicted
            actual = validation_df['Close'].values
            predicted = predicted_prices
            
            # Calculate metrics
            metrics = calculate_metrics(actual, predicted)
            metrics_summary[stock] = metrics
            
            # Create comparison table
            comparison_df = pd.DataFrame({
                'Date': validation_df['Date'],
                'Actual_Close': actual,
                'Predicted_Close': predicted
            })
            comparison_tables[stock] = comparison_df
            
            # Display metrics and comparison
            print(f" - RMSE: {metrics['RMSE']:.4f}")
            print(f" - MAE: {metrics['MAE']:.4f}")
            print(f" - R²: {metrics['R2']:.4f}")
            print(f" - MAPE: {metrics['MAPE']:.2f}%")
            
            print(f"\nComparison of Actual vs. Predicted Close Prices for {stock}:")
            display(comparison_df.head())
            display(comparison_df.tail())
            
        except Exception as e:
            print(f" - Error evaluating forecast for {stock}: {e}")
    
    if metrics_summary:
        # Summary DataFrame
        metrics_df = pd.DataFrame(metrics_summary).T
        metrics_df = metrics_df[['RMSE', 'MAE', 'R2', 'MAPE']]
        print("\nOverall Evaluation Metrics for All Stocks:")
        print(metrics_df)
        
        # Save metrics summary
        summary_save_path = os.path.join(forecast_dir, 'overall_evaluation_metrics_backtesting.csv')
        metrics_df.to_csv(summary_save_path)
        print(f"\n - Overall Evaluation Metrics table saved as '{summary_save_path}'.")
        
        # Save comparison tables
        for stock, comp_df in comparison_tables.items():
            comp_save_path = os.path.join(forecast_dir, f'comparison_actual_predicted_{stock}.csv')
            comp_df.to_csv(comp_save_path, index=False)
            print(f" - Comparison table for {stock} saved at '{comp_save_path}'.")
        
        return metrics_df, comparison_tables
    else:
        print("\nNo evaluation metrics to summarize.")
        return pd.DataFrame(), {}

# Placeholder Functions

def generate_next_day_features(updated_unscaled, scaler_X, timesteps=60):
    """Generate features for the next day based on updated data."""
    # Example: Using only 'Close' price for simplicity
    xgb_rf_features = updated_unscaled['Close'].values[-timesteps:].reshape(1, -1)
    xgb_rf_features_scaled = scaler_X.transform(xgb_rf_features)
    
    lstm_gru_features = updated_unscaled['Close'].values[-timesteps:].reshape(1, timesteps, 1)
    lstm_gru_features_scaled = scaler_X.transform(lstm_gru_features.reshape(-1, 1)).reshape(1, timesteps, 1)
    
    latest_features = updated_unscaled.iloc[-1].to_dict()
    
    return xgb_rf_features_scaled, lstm_gru_features_scaled, latest_features

def add_close_price_features(df):
    """Add or recalculate any derived features based on the 'Close' price."""
    # Example: Adding moving averages
    df['MA_5'] = df['Close'].rolling(window=5).mean()
    df['MA_10'] = df['Close'].rolling(window=10).mean()
    
    # Handle NaN values
    df.fillna(method='bfill', inplace=True)
    
    return df

# Execute Backtesting
metrics_summary_df, comparison_tables = evaluate_forecasts_backtesting(models_per_stock, forecast_days=30)