In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import datasets, ensemble

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost as xgb
from auto_ts import auto_timeseries
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
# Render floats in DataFrame/Series output with thousands separators and two decimals
pd.set_option('display.float_format', '{:,.2f}'.format)

# Make every top-level expression in a cell display its value, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML
from pandas.tseries.offsets import BDay
import yfinance as yf

# Fetch S&P 500 index (^GSPC) quotes from Yahoo Finance for the chosen date window
sp500_data = yf.download("^GSPC", start="2023-04-03", end="2025-04-15")

# Re-label the index as MM/DD/YYYY strings before exporting
formatted_dates = pd.to_datetime(sp500_data.index).strftime('%m/%d/%Y')
sp500_data.index = formatted_dates

# Persist the download as a CSV file in the working directory
sp500_data.to_csv("SP500_original_data.csv")

Imported auto_timeseries version:0.0.92. Call by using:
model = auto_timeseries(score_type='rmse',
        time_interval='M', non_seasonal_pdq=None, seasonality=False,
        seasonal_period=12, model_type=['best'], verbose=2, dask_xgboost_flag=0)
model.fit(traindata, ts_column,target)
model.predict(testdata, model='best')

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


In [2]:
# Load the prepared dataset, parsing 'Date' and using it as the index
stock_data = pd.read_csv(
    'Kaggle_Data/SP500_data.csv',
    index_col='Date',
    parse_dates=['Date'],
)

# Preview the first rows and list the available columns
stock_data.head()
stock_data.columns

Unnamed: 0_level_0,S&P500_Close,S&P500_High,S&P500_Low,S&P500_Open,S&P500_Volume,S&P500_%Change,VIX_Open,VIX_High,VIX_Low,VIX_Close,VFIX_Open,VFIX_High,VFIX_Low,VFIX_Close,OilFutures_Price,OilFutures_Open,OilFutures_High,OilFutures_Low,OilFutures_Vol,OilFutures_Change%
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-09-11,5554.13,5560.41,5406.96,5496.42,3839450000,0.02,19.41,21.41,17.55,17.69,513.72,513.72,513.72,513.72,66.6,65.73,67.14,64.99,237.2,0.02
2024-09-12,5595.76,5600.71,5535.5,5557.48,3655070000,0.06,17.62,18.59,16.89,17.07,517.58,517.58,517.58,517.58,68.15,66.68,68.93,66.55,221.59,0.02
2024-09-13,5626.02,5636.27,5601.65,5603.34,3500790000,0.14,17.03,17.18,16.23,16.56,520.46,520.46,520.46,520.46,67.75,68.35,69.35,67.58,230.3,-0.01
2024-09-16,5633.09,5636.05,5604.53,5615.21,3437070000,-0.19,17.16,17.69,16.91,17.14,521.21,521.21,521.21,521.21,69.02,68.22,69.61,67.7,235.88,0.02
2024-09-17,5634.58,5670.81,5614.05,5655.51,3443600000,0.4,17.16,18.08,16.67,17.61,521.36,521.36,521.36,521.36,69.96,69.35,70.65,68.51,289.93,0.01


Index(['S&P500_Close', 'S&P500_High', 'S&P500_Low', 'S&P500_Open',
       'S&P500_Volume', 'S&P500_%Change', 'VIX_Open', 'VIX_High', 'VIX_Low',
       'VIX_Close', 'VFIX_Open', 'VFIX_High', 'VFIX_Low', 'VFIX_Close',
       'OilFutures_Price', 'OilFutures_Open', 'OilFutures_High',
       'OilFutures_Low', 'OilFutures_Vol', 'OilFutures_Change%'],
      dtype='object')

In [3]:
# Number of rows in the loaded dataset before lag features are added
len(stock_data)

# Build lag-1..lag-4 copies of every column. Each shifted frame gets a
# '_lag_<k>' suffix, so the resulting names are '<column>_lag_<k>', ordered
# lag by lag and, within a lag, in the column order of `stock_data`.
# Everything is joined in a single concat onto a NEW frame, so `stock_data`
# itself is left untouched and the cell can be re-run safely.
n_lags = 4
lagged_frames = [
    stock_data.shift(lag).add_suffix(f'_lag_{lag}')
    for lag in range(1, n_lags + 1)
]
subset_data = pd.concat([stock_data] + lagged_frames, axis=1)

# Drop every row that contains a NaN; the first `n_lags` rows always do
# because their lagged values do not exist.
subset_data = subset_data.dropna()
subset_data.head()

len(subset_data)

147

Unnamed: 0_level_0,S&P500_Close,S&P500_High,S&P500_Low,S&P500_Open,S&P500_Volume,S&P500_%Change,VIX_Open,VIX_High,VIX_Low,VIX_Close,...,VFIX_Open_lag_4,VFIX_High_lag_4,VFIX_Low_lag_4,VFIX_Close_lag_4,OilFutures_Price_lag_4,OilFutures_Open_lag_4,OilFutures_High_lag_4,OilFutures_Low_lag_4,OilFutures_Vol_lag_4,OilFutures_Change%_lag_4
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-09-17,5634.58,5670.81,5614.05,5655.51,3443600000,0.4,17.16,18.08,16.67,17.61,...,513.72,513.72,513.72,513.72,66.6,65.73,67.14,64.99,237.2,0.02
2024-09-18,5618.26,5689.75,5615.08,5641.68,3691390000,0.13,17.58,19.39,17.11,18.23,...,517.58,517.58,517.58,517.58,68.15,66.68,68.93,66.55,221.59,0.02
2024-09-19,5713.64,5733.57,5686.42,5702.63,4024530000,1.5,17.21,17.27,16.21,16.33,...,520.46,520.46,520.46,520.46,67.75,68.35,69.35,67.58,230.3,-0.01
2024-09-20,5702.55,5715.14,5674.49,5709.64,7867260000,-0.07,16.35,16.68,15.81,16.15,...,521.21,521.21,521.21,521.21,69.02,68.22,69.61,67.7,235.88,0.02
2024-09-23,5718.57,5725.36,5704.22,5711.9,3529550000,0.16,16.71,16.95,15.75,15.89,...,521.36,521.36,521.36,521.36,69.96,69.35,70.65,68.51,289.93,0.01


143

In [4]:
# Count of missing values per column after the dropna step
subset_data.isnull().sum()
# Full list of base and lagged column names
subset_data.columns

S&P500_Close                0
S&P500_High                 0
S&P500_Low                  0
S&P500_Open                 0
S&P500_Volume               0
                           ..
OilFutures_Open_lag_4       0
OilFutures_High_lag_4       0
OilFutures_Low_lag_4        0
OilFutures_Vol_lag_4        0
OilFutures_Change%_lag_4    0
Length: 100, dtype: int64

Index(['S&P500_Close', 'S&P500_High', 'S&P500_Low', 'S&P500_Open',
       'S&P500_Volume', 'S&P500_%Change', 'VIX_Open', 'VIX_High', 'VIX_Low',
       'VIX_Close', 'VFIX_Open', 'VFIX_High', 'VFIX_Low', 'VFIX_Close',
       'OilFutures_Price', 'OilFutures_Open', 'OilFutures_High',
       'OilFutures_Low', 'OilFutures_Vol', 'OilFutures_Change%',
       'S&P500_Close_lag_1', 'S&P500_High_lag_1', 'S&P500_Low_lag_1',
       'S&P500_Open_lag_1', 'S&P500_Volume_lag_1', 'S&P500_%Change_lag_1',
       'VIX_Open_lag_1', 'VIX_High_lag_1', 'VIX_Low_lag_1', 'VIX_Close_lag_1',
       'VFIX_Open_lag_1', 'VFIX_High_lag_1', 'VFIX_Low_lag_1',
       'VFIX_Close_lag_1', 'OilFutures_Price_lag_1', 'OilFutures_Open_lag_1',
       'OilFutures_High_lag_1', 'OilFutures_Low_lag_1', 'OilFutures_Vol_lag_1',
       'OilFutures_Change%_lag_1', 'S&P500_Close_lag_2', 'S&P500_High_lag_2',
       'S&P500_Low_lag_2', 'S&P500_Open_lag_2', 'S&P500_Volume_lag_2',
       'S&P500_%Change_lag_2', 'VIX_Open_lag_2', 'VIX_High_l

In [5]:
# Base series whose lags 1-4 are used as predictors (notes from feature experiments)
selected_series = [
    'S&P500_Close',        # always include!
    'S&P500_High',         # useful!
    'S&P500_Volume',       # useful!
    'OilFutures_Vol',      # looks good!
    'OilFutures_Change%',  # looks good!
]
# Lagged series that were tried and are currently left out:
#   bad:        S&P500_Low, S&P500_Open, VIX_High, VIX_Low, VIX_Close
#   not great:  S&P500_%Change, VIX_Open, OilFutures_Price
#   seems good: VFIX_Open, VFIX_High, VFIX_Low, VFIX_Close
#   seems okay: OilFutures_Open, OilFutures_High, OilFutures_Low

# Columns are grouped per series, lag 1 through lag 4 within each group
feature_columns = [
    f'{series}_lag_{lag}'
    for series in selected_series
    for lag in range(1, 5)
]
# The unlagged open of the row's own date is the final feature column
feature_columns.append('S&P500_Open')

X = subset_data[feature_columns].copy()

# Target: the close on the row's own date
y = subset_data['S&P500_Close']

In [6]:
# Ordered split (no shuffling): the first 80% of rows train, the last 20% test
split_parts = train_test_split(X, y, shuffle=False, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Display all four pieces for inspection
X_train
X_test
y_train
y_test

Unnamed: 0_level_0,S&P500_Close_lag_1,S&P500_Close_lag_2,S&P500_Close_lag_3,S&P500_Close_lag_4,S&P500_High_lag_1,S&P500_High_lag_2,S&P500_High_lag_3,S&P500_High_lag_4,S&P500_Volume_lag_1,S&P500_Volume_lag_2,...,S&P500_Volume_lag_4,OilFutures_Vol_lag_1,OilFutures_Vol_lag_2,OilFutures_Vol_lag_3,OilFutures_Vol_lag_4,OilFutures_Change%_lag_1,OilFutures_Change%_lag_2,OilFutures_Change%_lag_3,OilFutures_Change%_lag_4,S&P500_Open
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-09-17,5633.09,5626.02,5595.76,5554.13,5636.05,5636.27,5600.71,5560.41,3437070000.00,3500790000.00,...,3839450000.00,235.88,230.30,221.59,237.20,0.02,-0.01,0.02,0.02,5655.51
2024-09-18,5634.58,5633.09,5626.02,5595.76,5670.81,5636.05,5636.27,5600.71,3443600000.00,3437070000.00,...,3655070000.00,289.93,235.88,230.30,221.59,0.01,0.02,-0.01,0.02,5641.68
2024-09-19,5618.26,5634.58,5633.09,5626.02,5689.75,5670.81,5636.05,5636.27,3691390000.00,3443600000.00,...,3500790000.00,367.19,289.93,235.88,230.30,-0.00,0.01,0.02,-0.01,5702.63
2024-09-20,5713.64,5618.26,5634.58,5633.09,5733.57,5689.75,5670.81,5636.05,4024530000.00,3691390000.00,...,3437070000.00,354.93,367.19,289.93,235.88,0.02,-0.00,0.01,0.02,5709.64
2024-09-23,5702.55,5713.64,5618.26,5634.58,5715.14,5733.57,5689.75,5670.81,7867260000.00,4024530000.00,...,3443600000.00,307.00,354.93,367.19,289.93,-0.00,0.02,-0.00,0.01,5711.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-02-25,5983.25,6013.13,6117.52,6144.15,6043.65,6114.82,6134.50,6147.43,4990120000.00,5434560000.00,...,4562330000.00,205.47,268.47,26.71,69.01,0.00,-0.03,0.00,0.01,5982.73
2025-02-26,5955.25,5983.25,6013.13,6117.52,5992.65,6043.65,6114.82,6134.50,5374690000.00,4990120000.00,...,4813690000.00,267.31,205.47,268.47,26.71,-0.03,0.00,-0.03,0.00,5970.87
2025-02-27,5956.06,5955.25,5983.25,6013.13,6009.82,5992.65,6043.65,6114.82,4869580000.00,5374690000.00,...,5434560000.00,255.60,267.31,205.47,268.47,-0.00,-0.03,0.00,-0.03,5981.88
2025-02-28,5861.57,5956.06,5955.25,5983.25,5993.69,6009.82,5992.65,6043.65,5057680000.00,4869580000.00,...,4990120000.00,265.93,255.60,267.31,205.47,0.03,-0.00,-0.03,0.00,5856.74


Unnamed: 0_level_0,S&P500_Close_lag_1,S&P500_Close_lag_2,S&P500_Close_lag_3,S&P500_Close_lag_4,S&P500_High_lag_1,S&P500_High_lag_2,S&P500_High_lag_3,S&P500_High_lag_4,S&P500_Volume_lag_1,S&P500_Volume_lag_2,...,S&P500_Volume_lag_4,OilFutures_Vol_lag_1,OilFutures_Vol_lag_2,OilFutures_Vol_lag_3,OilFutures_Vol_lag_4,OilFutures_Change%_lag_1,OilFutures_Change%_lag_2,OilFutures_Change%_lag_3,OilFutures_Change%_lag_4,S&P500_Open
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-03-04,5849.72,5954.5,5861.57,5956.06,5986.09,5959.4,5993.69,6009.82,5613850000.0,6441140000.0,...,4869580000.0,332.75,250.07,265.93,255.6,-0.02,-0.01,0.03,-0.0,5811.98
2025-03-05,5778.15,5849.72,5954.5,5861.57,5865.08,5986.09,5959.4,5993.69,6138110000.0,5613850000.0,...,5057680000.0,386.75,332.75,250.07,265.93,-0.0,-0.02,-0.01,0.03,5781.36
2025-03-06,5842.63,5778.15,5849.72,5954.5,5860.59,5865.08,5986.09,5959.4,5285970000.0,6138110000.0,...,6441140000.0,382.49,386.75,332.75,250.07,-0.03,-0.0,-0.02,-0.01,5785.87
2025-03-07,5738.52,5842.63,5778.15,5849.72,5812.08,5860.59,5865.08,5986.09,5165080000.0,5285970000.0,...,5613850000.0,341.63,382.49,386.75,332.75,0.0,-0.03,-0.0,-0.02,5726.01
2025-03-10,5770.2,5738.52,5842.63,5778.15,5783.01,5812.08,5860.59,5865.08,5705140000.0,5165080000.0,...,6138110000.0,329.71,341.63,382.49,386.75,0.01,0.0,-0.03,-0.0,5705.37
2025-03-11,5614.56,5770.2,5738.52,5842.63,5705.37,5783.01,5812.08,5860.59,6409370000.0,5705140000.0,...,5285970000.0,173.46,329.71,341.63,382.49,-0.02,0.01,0.0,-0.03,5603.65
2025-03-12,5572.07,5614.56,5770.2,5738.52,5636.3,5705.37,5783.01,5812.08,6221240000.0,6409370000.0,...,5165080000.0,134.71,173.46,329.71,341.63,0.0,-0.02,0.01,0.0,5624.84
2025-03-13,5599.3,5572.07,5614.56,5770.2,5642.19,5636.3,5705.37,5783.01,5219830000.0,6221240000.0,...,5705140000.0,175.05,134.71,173.46,329.71,0.02,0.0,-0.02,0.01,5594.45
2025-03-14,5521.52,5599.3,5572.07,5614.56,5597.78,5642.19,5636.3,5705.37,5018980000.0,5219830000.0,...,6409370000.0,212.01,175.05,134.71,173.46,-0.02,0.02,0.0,-0.02,5563.85
2025-03-17,5638.94,5521.52,5599.3,5572.07,5645.27,5597.78,5642.19,5636.3,4863180000.0,5018980000.0,...,6221240000.0,161.84,212.01,175.05,134.71,0.01,-0.02,0.02,0.0,5635.6


Date
2024-09-17   5,634.58
2024-09-18   5,618.26
2024-09-19   5,713.64
2024-09-20   5,702.55
2024-09-23   5,718.57
               ...   
2025-02-25   5,955.25
2025-02-26   5,956.06
2025-02-27   5,861.57
2025-02-28   5,954.50
2025-03-03   5,849.72
Name: S&P500_Close, Length: 114, dtype: float64

Date
2025-03-04   5,778.15
2025-03-05   5,842.63
2025-03-06   5,738.52
2025-03-07   5,770.20
2025-03-10   5,614.56
2025-03-11   5,572.07
2025-03-12   5,599.30
2025-03-13   5,521.52
2025-03-14   5,638.94
2025-03-17   5,675.12
2025-03-18   5,614.66
2025-03-19   5,675.29
2025-03-20   5,662.89
2025-03-21   5,667.56
2025-03-24   5,767.57
2025-03-25   5,776.65
2025-03-26   5,712.20
2025-03-27   5,693.31
2025-03-28   5,580.94
2025-03-31   5,611.85
2025-04-01   5,633.07
2025-04-02   5,670.97
2025-04-03   5,396.52
2025-04-04   5,074.08
2025-04-07   5,062.25
2025-04-08   4,982.77
2025-04-09   5,456.90
2025-04-10   5,268.05
2025-04-11   5,363.36
Name: S&P500_Close, dtype: float64

In [7]:
# Ordinary least squares baseline with an intercept term.
# Alternative that was tried: Ridge(alpha = 0.25, fit_intercept = True)
closelast_model = LinearRegression(fit_intercept=True)
closelast_model.fit(X=X_train, y=y_train)

# R-squared of the fit on the training data
closelast_model.score(X=X_train, y=y_train)

# Slope coefficients, one per column of X_train (in column order)
closelast_model.coef_

# Intercept term (Beta_0)
closelast_model.intercept_

0.9144761850825593

array([-9.74506240e-02, -6.54206394e-02, -3.27515279e-01,  1.43663704e-01,
        1.57119970e-01,  2.97191770e-01,  5.00184011e-02, -9.93157847e-02,
       -9.94780666e-09,  4.60749689e-09,  3.96966353e-09,  7.03754471e-10,
        9.96445407e-03, -9.94618082e-03,  9.80741738e-02, -5.58382433e-02,
        9.27829764e+01,  1.94836903e+02,  4.17319673e+01,  9.18950125e+01,
        9.24151391e-01])

85.23378071671505

In [8]:
# Residuals of the linear model on the training set (actual minus fitted)
linear_train_fit = closelast_model.predict(X_train)
training_residuals = y_train - linear_train_fit

In [9]:
# Residual model selector: 0 -> random forest, any other value -> gradient boosting
treemodelchoice = 0

if treemodelchoice == 0:
    treemodel = RandomForestRegressor(
        n_estimators=500,
        random_state=50,
        min_samples_leaf=2,
        max_features="sqrt",
    )
else:
    treemodel = GradientBoostingRegressor(
        n_estimators=500,
        random_state=50,
        min_samples_leaf=2,
        max_depth=10,
    )

# The tree ensemble is trained on the linear model's training residuals
treemodel = treemodel.fit(X_train, training_residuals)

# Feature names followed by their importances (same order)
X_train.columns
treemodel.feature_importances_


Index(['S&P500_Close_lag_1', 'S&P500_Close_lag_2', 'S&P500_Close_lag_3',
       'S&P500_Close_lag_4', 'S&P500_High_lag_1', 'S&P500_High_lag_2',
       'S&P500_High_lag_3', 'S&P500_High_lag_4', 'S&P500_Volume_lag_1',
       'S&P500_Volume_lag_2', 'S&P500_Volume_lag_3', 'S&P500_Volume_lag_4',
       'OilFutures_Vol_lag_1', 'OilFutures_Vol_lag_2', 'OilFutures_Vol_lag_3',
       'OilFutures_Vol_lag_4', 'OilFutures_Change%_lag_1',
       'OilFutures_Change%_lag_2', 'OilFutures_Change%_lag_3',
       'OilFutures_Change%_lag_4', 'S&P500_Open'],
      dtype='object')

array([0.03468289, 0.0387282 , 0.0379721 , 0.04368424, 0.03286702,
       0.0408431 , 0.04115811, 0.05410377, 0.0653072 , 0.04270291,
       0.05031294, 0.09785318, 0.0514831 , 0.04056183, 0.0414207 ,
       0.05132923, 0.04997813, 0.03942621, 0.05294997, 0.05604001,
       0.03659517])

In [10]:
# Test-set prediction = tree-predicted residual + linear model prediction
pred_residuals = treemodel.predict(X_test)
linear_test_pred = closelast_model.predict(X_test)
y_pred = pred_residuals + linear_test_pred

In [11]:
# Put the predictions next to the actual closes, aligned on the date index
test_output = pd.DataFrame(y_pred, index = X_test.index, columns = ['pred_S&P500_Close'])
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()

test_output.tail()

# Mean absolute error on the test window. The result is stored as `test_mae`
# so that the `mean_absolute_error` function imported from sklearn.metrics at
# the top of the notebook is not overwritten by a float.
absolute_errors = (test_output['pred_S&P500_Close'] - test_output['S&P500_Close']).abs()
test_mae = absolute_errors.mean()
print('Mean absolute error is ')
print(test_mae)

# MAE as a fraction of the mean actual close over the test window
test_mae / test_output['S&P500_Close'].mean()


Unnamed: 0_level_0,pred_S&P500_Close,S&P500_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-03-04,5858.43,5778.15
2025-03-05,5775.02,5842.63
2025-03-06,5799.93,5738.52
2025-03-07,5731.72,5770.2
2025-03-10,5674.21,5614.56


Unnamed: 0_level_0,pred_S&P500_Close,S&P500_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-04-07,4917.42,5062.25
2025-04-08,5203.67,4982.77
2025-04-09,5069.92,5456.9
2025-04-10,5389.2,5268.05
2025-04-11,5393.75,5363.36


Mean absolute error is 
82.77434440205751


0.014870695916179181

In [12]:
# Forecast the feature matrix a few business days ahead with a vector
# autoregression, then feed each forecast row through the linear model and the
# residual tree model to obtain a predicted S&P 500 close per day.
from statsmodels.tsa.api import VAR
# The VAR is fitted on the full feature matrix X (training and test rows alike),
# with fit() called using its default arguments.
model = VAR(X)
model_fit = model.fit()
# Use the VAR to project the next-day values of the variables that feed the S&P 500 close models
days = 4 # how many business days to predict for
var_forecast = model_fit.forecast(X.values, steps=days)
# One forecast row per future day, labelled with the model feature names
latest_data = pd.DataFrame(var_forecast, columns=X_train.columns)
latest_data = latest_data.fillna(0)
predictions = []
# Dates of the next `days` business days after the last row of subset_data
predicted_dates = [subset_data.index[-1] + BDay(i) for i in range(1, (days + 1))]

for i in range(days):
    # Single-row DataFrame (keeps column names); NaNs, if any, are replaced by 0
    current_features = latest_data.iloc[[i]].fillna(0)  # Fill any remaining NaNs

    # Get predictions from both models: the tree model was fitted on the linear
    # model's training residuals, the linear model predicts the close itself
    decision_pred = treemodel.predict(current_features)[0]
    lr_pred = closelast_model.predict(current_features)[0]
    
    # Final prediction is the SUM of the linear prediction and the predicted
    # residual (not a weighted average)
    next_day_prediction = decision_pred + lr_pred
    predictions.append(next_day_prediction)

    if i < (days - 1):  # Only update if we have more predictions to make
        # Overwrites the LAST column of the next forecast row with this prediction.
        # With X as built in In [5] the last column is 'S&P500_Open', so the next
        # day's open is replaced by today's predicted close.
        # NOTE(review): presumably intentional (open ~ previous close); confirm that
        # 'S&P500_Close_lag_1' was not the intended target, and note the positional
        # -1 silently changes meaning if the feature list is reordered.
        latest_data.iloc[i+1, -1] = next_day_prediction  # Set the latest prediction


print("Predicted Closing Prices for the Next",days,"Business Days:")
for i, pred in enumerate(predictions):
    print(f"{predicted_dates[i].date()}: {pred}")

Predicted Closing Prices for the Next 4 Business Days:
2025-04-14: 5338.246686893906
2025-04-15: 5400.101081730278
2025-04-16: 5309.642337035574
2025-04-17: 5310.208604698536


In [13]:
# Actual closing prices for the forecast window; keep exactly one list active.
# Only list days whose actual close is known: a 0 placeholder would be scored
# as a real price and inflate the error.
#correct_predictions = [5776.65,5712.2,5693.31,5580.94] #for march 25,26,27,28 
#correct_predictions = [5633.07,5670.97,0,0] #for april 1,2,3,4 
#correct_predictions = [4982.77,5456.90,5268.05,5363.36]  #for april 8,9,10,11 
correct_predictions = [5405.97,5396.63,5275.70,5350]  #for april 14,15,16,17 
#correct_predictions = [5411,0,0,0]

# Fail early with a clear message instead of an IndexError or a wrong average
if not correct_predictions:
    raise ValueError("correct_predictions is empty; nothing to compare")
if len(correct_predictions) > len(predictions):
    raise ValueError("more actual prices than forecasted days")

# Print the absolute error for each day and accumulate the total
error_sum = 0
for actual, predicted in zip(correct_predictions, predictions):
    error = abs(actual - predicted)
    print(error)
    error_sum += error

# Average over the number of compared days rather than a hard-coded 4
print("MAE Error is: ",error_sum/len(correct_predictions))




67.72331310609388
3.4710817302775467
33.94233703557438
39.791395301464036
MAE Error is:  36.23203179335246


In [14]:
#best prediction is close, high, volume MAE 36
#other good predictions are:
#close, high, volume MAE 41