In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import datasets, ensemble

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost as xgb
from auto_ts import auto_timeseries
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML
from pandas.tseries.offsets import BDay
import yfinance as yf

# Download data for S&P500 for a specific date range
sp500_data = yf.download("^GSPC", start="2023-04-03", end="2025-04-15")

sp500_data.index = pd.to_datetime(sp500_data.index).strftime('%m/%d/%Y')

# Convert the data to a CSV file
sp500_data.to_csv("SP500_original_data.csv")

Imported auto_timeseries version:0.0.92. Call by using:
model = auto_timeseries(score_type='rmse',
        time_interval='M', non_seasonal_pdq=None, seasonality=False,
        seasonal_period=12, model_type=['best'], verbose=2, dask_xgboost_flag=0)
model.fit(traindata, ts_column,target)
model.predict(testdata, model='best')

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


In [2]:
stock_data = pd.read_csv('Kaggle_Data/SP500_data.csv',parse_dates=['Date'],index_col='Date')

stock_data.head()
stock_data.columns

Unnamed: 0_level_0,S&P500_Close,S&P500_High,S&P500_Low,S&P500_Open,S&P500_Volume,S&P500_%Change,VIX_Open,VIX_High,VIX_Low,VIX_Close,VFIX_Open,VFIX_High,VFIX_Low,VFIX_Close,OilFutures_Price,OilFutures_Open,OilFutures_High,OilFutures_Low,OilFutures_Vol,OilFutures_Change%
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-09-11,5554.13,5560.41,5406.96,5496.42,3839450000.0,0.02,19.41,21.41,17.55,17.69,513.72,513.72,513.72,513.72,66.6,65.73,67.14,64.99,237.2,0.02
2024-09-12,5595.76,5600.71,5535.5,5557.48,3655070000.0,0.06,17.62,18.59,16.89,17.07,517.58,517.58,517.58,517.58,68.15,66.68,68.93,66.55,221.59,0.02
2024-09-13,5626.02,5636.27,5601.65,5603.34,3500790000.0,0.14,17.03,17.18,16.23,16.56,520.46,520.46,520.46,520.46,67.75,68.35,69.35,67.58,230.3,-0.01
2024-09-16,5633.09,5636.05,5604.53,5615.21,3437070000.0,-0.19,17.16,17.69,16.91,17.14,521.21,521.21,521.21,521.21,69.02,68.22,69.61,67.7,235.88,0.02
2024-09-17,5634.58,5670.81,5614.05,5655.51,3443600000.0,0.4,17.16,18.08,16.67,17.61,521.36,521.36,521.36,521.36,69.96,69.35,70.65,68.51,289.93,0.01


Index(['S&P500_Close', 'S&P500_High', 'S&P500_Low', 'S&P500_Open',
       'S&P500_Volume', 'S&P500_%Change', 'VIX_Open', 'VIX_High', 'VIX_Low',
       'VIX_Close', 'VFIX_Open', 'VFIX_High', 'VFIX_Low', 'VFIX_Close',
       'OilFutures_Price', 'OilFutures_Open', 'OilFutures_High',
       'OilFutures_Low', 'OilFutures_Vol', 'OilFutures_Change%'],
      dtype='object')

In [3]:
len(stock_data)

subset_data = stock_data
for i in range(1, 5):
    subset_data[f'S&P500_Close_lag_{i}'] = subset_data['S&P500_Close'].shift(i)
    subset_data[f'S&P500_High_lag_{i}'] = subset_data['S&P500_High'].shift(i)
    subset_data[f'S&P500_Low_lag_{i}'] = subset_data['S&P500_Low'].shift(i)
    subset_data[f'S&P500_Open_lag_{i}'] = subset_data['S&P500_Open'].shift(i)
    subset_data[f'S&P500_Volume_lag_{i}'] = subset_data['S&P500_Volume'].shift(i)
    subset_data[f'S&P500_%Change_lag_{i}'] = subset_data['S&P500_%Change'].shift(i)
    subset_data[f'VIX_Open_lag_{i}'] = subset_data['VIX_Open'].shift(i)
    subset_data[f'VIX_High_lag_{i}'] = subset_data['VIX_High'].shift(i)
    subset_data[f'VIX_Low_lag_{i}'] = subset_data['VIX_Low'].shift(i)
    subset_data[f'VIX_Close_lag_{i}'] = subset_data['VIX_Close'].shift(i)
    subset_data[f'VFIX_Open_lag_{i}'] = subset_data['VFIX_Open'].shift(i)
    subset_data[f'VFIX_High_lag_{i}'] = subset_data['VFIX_High'].shift(i)
    subset_data[f'VFIX_Low_lag_{i}'] = subset_data['VFIX_Low'].shift(i)
    subset_data[f'VFIX_Close_lag_{i}'] = subset_data['VFIX_Close'].shift(i)
    subset_data[f'OilFutures_Price_lag_{i}'] = subset_data['OilFutures_Price'].shift(i)
    subset_data[f'OilFutures_Open_lag_{i}'] = subset_data['OilFutures_Open'].shift(i)
    subset_data[f'OilFutures_High_lag_{i}'] = subset_data['OilFutures_High'].shift(i)
    subset_data[f'OilFutures_Low_lag_{i}'] = subset_data['OilFutures_Low'].shift(i)
    subset_data[f'OilFutures_Vol_lag_{i}'] = subset_data['OilFutures_Vol'].shift(i)
    subset_data[f'OilFutures_Change%_lag_{i}'] = subset_data['OilFutures_Change%'].shift(i)

subset_data = subset_data.dropna()
subset_data.head()

len(subset_data)

148

Unnamed: 0_level_0,S&P500_Close,S&P500_High,S&P500_Low,S&P500_Open,S&P500_Volume,S&P500_%Change,VIX_Open,VIX_High,VIX_Low,VIX_Close,...,VFIX_Open_lag_4,VFIX_High_lag_4,VFIX_Low_lag_4,VFIX_Close_lag_4,OilFutures_Price_lag_4,OilFutures_Open_lag_4,OilFutures_High_lag_4,OilFutures_Low_lag_4,OilFutures_Vol_lag_4,OilFutures_Change%_lag_4
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-09-17,5634.58,5670.81,5614.05,5655.51,3443600000.0,0.4,17.16,18.08,16.67,17.61,...,513.72,513.72,513.72,513.72,66.6,65.73,67.14,64.99,237.2,0.02
2024-09-18,5618.26,5689.75,5615.08,5641.68,3691390000.0,0.13,17.58,19.39,17.11,18.23,...,517.58,517.58,517.58,517.58,68.15,66.68,68.93,66.55,221.59,0.02
2024-09-19,5713.64,5733.57,5686.42,5702.63,4024530000.0,1.5,17.21,17.27,16.21,16.33,...,520.46,520.46,520.46,520.46,67.75,68.35,69.35,67.58,230.3,-0.01
2024-09-20,5702.55,5715.14,5674.49,5709.64,7867260000.0,-0.07,16.35,16.68,15.81,16.15,...,521.21,521.21,521.21,521.21,69.02,68.22,69.61,67.7,235.88,0.02
2024-09-23,5718.57,5725.36,5704.22,5711.9,3529550000.0,0.16,16.71,16.95,15.75,15.89,...,521.36,521.36,521.36,521.36,69.96,69.35,70.65,68.51,289.93,0.01


143

In [4]:
subset_data.isna().sum()
subset_data.columns

S&P500_Close                0
S&P500_High                 0
S&P500_Low                  0
S&P500_Open                 0
S&P500_Volume               0
                           ..
OilFutures_Open_lag_4       0
OilFutures_High_lag_4       0
OilFutures_Low_lag_4        0
OilFutures_Vol_lag_4        0
OilFutures_Change%_lag_4    0
Length: 100, dtype: int64

Index(['S&P500_Close', 'S&P500_High', 'S&P500_Low', 'S&P500_Open',
       'S&P500_Volume', 'S&P500_%Change', 'VIX_Open', 'VIX_High', 'VIX_Low',
       'VIX_Close', 'VFIX_Open', 'VFIX_High', 'VFIX_Low', 'VFIX_Close',
       'OilFutures_Price', 'OilFutures_Open', 'OilFutures_High',
       'OilFutures_Low', 'OilFutures_Vol', 'OilFutures_Change%',
       'S&P500_Close_lag_1', 'S&P500_High_lag_1', 'S&P500_Low_lag_1',
       'S&P500_Open_lag_1', 'S&P500_Volume_lag_1', 'S&P500_%Change_lag_1',
       'VIX_Open_lag_1', 'VIX_High_lag_1', 'VIX_Low_lag_1', 'VIX_Close_lag_1',
       'VFIX_Open_lag_1', 'VFIX_High_lag_1', 'VFIX_Low_lag_1',
       'VFIX_Close_lag_1', 'OilFutures_Price_lag_1', 'OilFutures_Open_lag_1',
       'OilFutures_High_lag_1', 'OilFutures_Low_lag_1', 'OilFutures_Vol_lag_1',
       'OilFutures_Change%_lag_1', 'S&P500_Close_lag_2', 'S&P500_High_lag_2',
       'S&P500_Low_lag_2', 'S&P500_Open_lag_2', 'S&P500_Volume_lag_2',
       'S&P500_%Change_lag_2', 'VIX_Open_lag_2', 'VIX_High_l

In [5]:
X = subset_data[[f'S&P500_Close_lag_{i}' for i in range(1, 5)] + 
                [f'S&P500_High_lag_{i}' for i in range(1, 5)] + 
                [f'S&P500_Low_lag_{i}' for i in range(1, 5)] + 
                [f'S&P500_Open_lag_{i}' for i in range(1, 5)] + 
                [f'S&P500_Volume_lag_{i}' for i in range(1, 5)] + 
                [f'S&P500_%Change_lag_{i}' for i in range(1, 5)] + 
                [f'VIX_Open_lag_{i}' for i in range(1, 5)] + 
                [f'VIX_High_lag_{i}' for i in range(1, 5)] + 
                [f'VIX_Low_lag_{i}' for i in range(1, 5)] + 
                [f'VIX_Close_lag_{i}' for i in range(1, 5)] + 
                [f'VFIX_Open_lag_{i}' for i in range(1, 5)] +
                [f'VFIX_High_lag_{i}' for i in range(1, 5)] + 
                [f'VFIX_Low_lag_{i}' for i in range(1, 5)] + 
                [f'VFIX_Close_lag_{i}' for i in range(1, 5)] + 
                [f'OilFutures_Price_lag_{i}' for i in range(1, 5)] + 
                [f'OilFutures_Open_lag_{i}' for i in range(1, 5)] + 
                [f'OilFutures_High_lag_{i}' for i in range(1, 5)] + 
                [f'OilFutures_Low_lag_{i}' for i in range(1, 5)] + 
                [f'OilFutures_Vol_lag_{i}' for i in range(1, 5)] + 
                [f'OilFutures_Change%_lag_{i}' for i in range(1, 5)] +
                
                ['S&P500_Open']].copy()

y = subset_data['S&P500_Close']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

X_train
X_test
y_train
y_test

Unnamed: 0_level_0,S&P500_Close_lag_1,S&P500_Close_lag_2,S&P500_Close_lag_3,S&P500_Close_lag_4,S&P500_High_lag_1,S&P500_High_lag_2,S&P500_High_lag_3,S&P500_High_lag_4,S&P500_Low_lag_1,S&P500_Low_lag_2,...,OilFutures_Low_lag_4,OilFutures_Vol_lag_1,OilFutures_Vol_lag_2,OilFutures_Vol_lag_3,OilFutures_Vol_lag_4,OilFutures_Change%_lag_1,OilFutures_Change%_lag_2,OilFutures_Change%_lag_3,OilFutures_Change%_lag_4,S&P500_Open
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-09-17,5633.09,5626.02,5595.76,5554.13,5636.05,5636.27,5600.71,5560.41,5604.53,5601.65,...,64.99,235.88,230.30,221.59,237.20,0.02,-0.01,0.02,0.02,5655.51
2024-09-18,5634.58,5633.09,5626.02,5595.76,5670.81,5636.05,5636.27,5600.71,5614.05,5604.53,...,66.55,289.93,235.88,230.30,221.59,0.01,0.02,-0.01,0.02,5641.68
2024-09-19,5618.26,5634.58,5633.09,5626.02,5689.75,5670.81,5636.05,5636.27,5615.08,5614.05,...,67.58,367.19,289.93,235.88,230.30,-0.00,0.01,0.02,-0.01,5702.63
2024-09-20,5713.64,5618.26,5634.58,5633.09,5733.57,5689.75,5670.81,5636.05,5686.42,5615.08,...,67.70,354.93,367.19,289.93,235.88,0.02,-0.00,0.01,0.02,5709.64
2024-09-23,5702.55,5713.64,5618.26,5634.58,5715.14,5733.57,5689.75,5670.81,5674.49,5686.42,...,68.51,307.00,354.93,367.19,289.93,-0.00,0.02,-0.00,0.01,5711.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-02-25,5983.25,6013.13,6117.52,6144.15,6043.65,6114.82,6134.50,6147.43,5977.83,6008.56,...,71.71,205.47,268.47,26.71,69.01,0.00,-0.03,0.00,0.01,5982.73
2025-02-26,5955.25,5983.25,6013.13,6117.52,5992.65,6043.65,6114.82,6134.50,5908.49,5977.83,...,71.85,267.31,205.47,268.47,26.71,-0.03,0.00,-0.03,0.00,5970.87
2025-02-27,5956.06,5955.25,5983.25,6013.13,6009.82,5992.65,6043.65,6114.82,5932.69,5908.49,...,70.17,255.60,267.31,205.47,268.47,-0.00,-0.03,0.00,-0.03,5981.88
2025-02-28,5861.57,5956.06,5955.25,5983.25,5993.69,6009.82,5992.65,6043.65,5858.78,5932.69,...,69.80,265.93,255.60,267.31,205.47,0.03,-0.00,-0.03,0.00,5856.74


Unnamed: 0_level_0,S&P500_Close_lag_1,S&P500_Close_lag_2,S&P500_Close_lag_3,S&P500_Close_lag_4,S&P500_High_lag_1,S&P500_High_lag_2,S&P500_High_lag_3,S&P500_High_lag_4,S&P500_Low_lag_1,S&P500_Low_lag_2,...,OilFutures_Low_lag_4,OilFutures_Vol_lag_1,OilFutures_Vol_lag_2,OilFutures_Vol_lag_3,OilFutures_Vol_lag_4,OilFutures_Change%_lag_1,OilFutures_Change%_lag_2,OilFutures_Change%_lag_3,OilFutures_Change%_lag_4,S&P500_Open
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-03-04,5849.72,5954.5,5861.57,5956.06,5986.09,5959.4,5993.69,6009.82,5810.91,5837.66,...,68.36,332.75,250.07,265.93,255.6,-0.02,-0.01,0.03,-0.0,5811.98
2025-03-05,5778.15,5849.72,5954.5,5861.57,5865.08,5986.09,5959.4,5993.69,5732.59,5810.91,...,68.61,386.75,332.75,250.07,265.93,-0.0,-0.02,-0.01,0.03,5781.36
2025-03-06,5842.63,5778.15,5849.72,5954.5,5860.59,5865.08,5986.09,5959.4,5742.35,5732.59,...,69.14,382.49,386.75,332.75,250.07,-0.03,-0.0,-0.02,-0.01,5785.87
2025-03-07,5738.52,5842.63,5778.15,5849.72,5812.08,5860.59,5865.08,5986.09,5711.64,5742.35,...,67.89,341.63,382.49,386.75,332.75,0.0,-0.03,-0.0,-0.02,5726.01
2025-03-10,5770.2,5738.52,5842.63,5778.15,5783.01,5812.08,5860.59,5865.08,5666.29,5711.64,...,66.77,329.71,341.63,382.49,386.75,0.01,0.0,-0.03,-0.0,5705.37
2025-03-11,5614.56,5770.2,5738.52,5842.63,5705.37,5783.01,5812.08,5860.59,5564.02,5666.29,...,65.22,173.46,329.71,341.63,382.49,-0.02,0.01,0.0,-0.03,5603.65
2025-03-12,5572.07,5614.56,5770.2,5738.52,5636.3,5705.37,5783.01,5812.08,5528.41,5564.02,...,65.59,134.71,173.46,329.71,341.63,0.0,-0.02,0.01,0.0,5624.84
2025-03-13,5599.3,5572.07,5614.56,5770.2,5642.19,5636.3,5705.37,5783.01,5546.09,5528.41,...,66.12,175.05,134.71,173.46,329.71,0.02,0.0,-0.02,0.01,5594.45
2025-03-14,5521.52,5599.3,5572.07,5614.56,5597.78,5642.19,5636.3,5705.37,5504.65,5546.09,...,65.45,212.01,175.05,134.71,173.46,-0.02,0.02,0.0,-0.02,5563.85
2025-03-17,5638.94,5521.52,5599.3,5572.07,5645.27,5597.78,5642.19,5636.3,5563.85,5504.65,...,65.0,161.84,212.01,175.05,134.71,0.01,-0.02,0.02,0.0,5635.6


Date
2024-09-17   5,634.58
2024-09-18   5,618.26
2024-09-19   5,713.64
2024-09-20   5,702.55
2024-09-23   5,718.57
               ...   
2025-02-25   5,955.25
2025-02-26   5,956.06
2025-02-27   5,861.57
2025-02-28   5,954.50
2025-03-03   5,849.72
Name: S&P500_Close, Length: 114, dtype: float64

Date
2025-03-04   5,778.15
2025-03-05   5,842.63
2025-03-06   5,738.52
2025-03-07   5,770.20
2025-03-10   5,614.56
2025-03-11   5,572.07
2025-03-12   5,599.30
2025-03-13   5,521.52
2025-03-14   5,638.94
2025-03-17   5,675.12
2025-03-18   5,614.66
2025-03-19   5,675.29
2025-03-20   5,662.89
2025-03-21   5,667.56
2025-03-24   5,767.57
2025-03-25   5,776.65
2025-03-26   5,712.20
2025-03-27   5,693.31
2025-03-28   5,580.94
2025-03-31   5,611.85
2025-04-01   5,633.07
2025-04-02   5,670.97
2025-04-03   5,396.52
2025-04-04   5,074.08
2025-04-07   5,062.25
2025-04-08   4,982.77
2025-04-09   5,456.90
2025-04-10   5,268.05
2025-04-11   5,363.36
Name: S&P500_Close, dtype: float64

In [7]:
closelast_model = LinearRegression(fit_intercept = True)
#closelast_model = Ridge(alpha = 0.25, fit_intercept = True)
closelast_model.fit(X_train, y_train) 

# The following gives the R-square score
closelast_model.score(X_train, y_train) 

# This is the coefficient Beta_1, ..., Beta_7
closelast_model.coef_

# This is the coefficient Beta_0
closelast_model.intercept_

0.9518427523719258

array([ 1.87722840e-01, -5.75126681e-01,  1.60175051e+01,  1.11274587e+01,
        7.26942029e-01, -2.69388525e-01,  1.06769201e+00, -3.10664734e-01,
       -3.51257781e-01,  3.04141192e-01, -2.94047773e-01,  2.94704835e-01,
       -2.42767861e+00, -1.38298667e+01, -1.16150732e+01, -1.21254729e-01,
       -2.72054770e-08, -4.71376538e-09, -3.76497056e-10, -1.22632038e-09,
        1.32867222e+02,  7.83256633e+02,  6.49949213e+02,  4.44090210e+00,
       -1.76805677e+01, -2.34000866e+00,  6.17602762e+00, -7.94927288e+00,
       -1.56683061e+00, -1.75238551e+01, -1.77192026e+01,  3.84378328e+00,
        1.33802865e+01, -2.72846184e+01,  2.71188162e+00, -3.23137551e+00,
       -1.12854458e+01,  3.78943964e+01,  2.49151133e+01,  1.52953375e+01,
       -1.75621310e+00,  8.31645825e+00, -7.43403300e+00,  9.22885677e-01,
       -1.75621315e+00,  8.31645825e+00, -7.43403299e+00,  9.22885677e-01,
       -1.75621315e+00,  8.31645824e+00, -7.43403299e+00,  9.22885675e-01,
       -1.75621315e+00,  

-202.16188436102584

In [8]:
training_residuals = y_train - closelast_model.predict(X_train)

In [9]:
#set to 0 for random forest or 1 for gradient boosting
treemodelchoice = 0
treemodel = None
if treemodelchoice == 0:
    treemodel = RandomForestRegressor(n_estimators=500, random_state=50, min_samples_leaf = 2, max_features = "sqrt")
    treemodel = treemodel.fit(X_train, training_residuals) 
    X_train.columns
    treemodel.feature_importances_
else:
    treemodel = GradientBoostingRegressor(n_estimators=500,random_state=50, min_samples_leaf = 2, max_depth = 10)
    treemodel = treemodel.fit(X_train, training_residuals)
    X_train.columns
    treemodel.feature_importances_


Index(['S&P500_Close_lag_1', 'S&P500_Close_lag_2', 'S&P500_Close_lag_3',
       'S&P500_Close_lag_4', 'S&P500_High_lag_1', 'S&P500_High_lag_2',
       'S&P500_High_lag_3', 'S&P500_High_lag_4', 'S&P500_Low_lag_1',
       'S&P500_Low_lag_2', 'S&P500_Low_lag_3', 'S&P500_Low_lag_4',
       'S&P500_Open_lag_1', 'S&P500_Open_lag_2', 'S&P500_Open_lag_3',
       'S&P500_Open_lag_4', 'S&P500_Volume_lag_1', 'S&P500_Volume_lag_2',
       'S&P500_Volume_lag_3', 'S&P500_Volume_lag_4', 'S&P500_%Change_lag_1',
       'S&P500_%Change_lag_2', 'S&P500_%Change_lag_3', 'S&P500_%Change_lag_4',
       'VIX_Open_lag_1', 'VIX_Open_lag_2', 'VIX_Open_lag_3', 'VIX_Open_lag_4',
       'VIX_High_lag_1', 'VIX_High_lag_2', 'VIX_High_lag_3', 'VIX_High_lag_4',
       'VIX_Low_lag_1', 'VIX_Low_lag_2', 'VIX_Low_lag_3', 'VIX_Low_lag_4',
       'VIX_Close_lag_1', 'VIX_Close_lag_2', 'VIX_Close_lag_3',
       'VIX_Close_lag_4', 'VFIX_Open_lag_1', 'VFIX_Open_lag_2',
       'VFIX_Open_lag_3', 'VFIX_Open_lag_4', 'VFIX_High_lag

array([0.01053993, 0.0055349 , 0.01143415, 0.00838669, 0.00669131,
       0.01414644, 0.01129515, 0.01221001, 0.01493348, 0.01136949,
       0.01139966, 0.01158218, 0.0115138 , 0.0131973 , 0.01018482,
       0.01703406, 0.02448367, 0.01102425, 0.01693263, 0.02285001,
       0.02681222, 0.01499076, 0.01748237, 0.0165912 , 0.00945288,
       0.01230454, 0.01633569, 0.01311865, 0.00905817, 0.01350615,
       0.01132449, 0.013966  , 0.01035681, 0.01564497, 0.01699889,
       0.01253664, 0.01052359, 0.00921983, 0.01193912, 0.01230621,
       0.01028727, 0.00687384, 0.00986754, 0.01164418, 0.00796851,
       0.01014323, 0.01274808, 0.01217234, 0.01023331, 0.0094328 ,
       0.01098775, 0.00811968, 0.01066537, 0.00794429, 0.01104977,
       0.00980195, 0.00835269, 0.00762731, 0.01512093, 0.00835066,
       0.0109312 , 0.01489791, 0.00946762, 0.01924004, 0.01048794,
       0.01074487, 0.01137531, 0.01295465, 0.0072436 , 0.01204507,
       0.01178751, 0.01066446, 0.01908735, 0.01346174, 0.01132

In [10]:
pred_residuals = treemodel.predict(X_test)
y_pred = pred_residuals + closelast_model.predict(X_test)

In [11]:
test_output = pd.DataFrame(y_pred, index = X_test.index, columns = ['pred_S&P500_Close'])
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()

test_output.tail()
mean_absolute_error = abs(test_output['pred_S&P500_Close'] - test_output['S&P500_Close']).mean()
print('Mean absolute error is ')
print(mean_absolute_error)
abs(test_output['pred_S&P500_Close'] - test_output['S&P500_Close']).mean()/test_output['S&P500_Close'].mean()


Unnamed: 0_level_0,pred_S&P500_Close,S&P500_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-03-04,5812.28,5778.15
2025-03-05,5715.26,5842.63
2025-03-06,5704.05,5738.52
2025-03-07,5664.44,5770.2
2025-03-10,5649.85,5614.56


Unnamed: 0_level_0,pred_S&P500_Close,S&P500_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-04-07,4705.79,5062.25
2025-04-08,4716.23,4982.77
2025-04-09,4332.31,5456.9
2025-04-10,5569.69,5268.05
2025-04-11,5006.77,5363.36


Mean absolute error is 
148.26205265048495


0.026635788139435942

In [12]:
from statsmodels.tsa.api import VAR
model = VAR(X)
model_fit = model.fit()
#using VAR in order to predict the next 4 day variables that will affect the closing price of S&P500 
var_forecast = model_fit.forecast(X.values, steps=4)
latest_data = pd.DataFrame(var_forecast, columns=X_train.columns)
latest_data = latest_data.fillna(0)
predictions = []
#predict for only the next 4 buisness days
predicted_dates = [subset_data.index[-1] + BDay(i) for i in range(1, 5)]

for i in range(4):
    current_features = latest_data.iloc[[i]].fillna(0)  # Fill any remaining NaNs

    # Get predictions from models
    decision_pred = treemodel.predict(current_features)[0]
    lr_pred = closelast_model.predict(current_features)[0]
    
    # Apply a weighted average to stabilize predictions
    next_day_prediction = decision_pred + lr_pred
    predictions.append(next_day_prediction)

    if i < 3:  # Only update if we have more predictions to make
        latest_data.iloc[i+1, -1] = next_day_prediction  # Set the latest prediction


print("Predicted Closing Prices for the Next 4 Business Days:")
for i, pred in enumerate(predictions):
    print(f"{predicted_dates[i].date()}: {pred}")

Predicted Closing Prices for the Next 4 Business Days:
2025-04-14: 5388.392323532627
2025-04-15: 5104.650912580664
2025-04-16: 5206.011753790418
2025-04-17: 5329.509783647027


In [13]:
#correct_predictions = [5776.65,5712.2,5693.31,5580.94] #for march 25,26,27,28 
#correct_predictions = [5633.07,5670.97,0,0] #for april 1,2,3,4 
#correct_predictions = [4982.77,5456.90,5268.05,5363.36]  #for april 8,9,10,11 
correct_predictions = [5411,0,0,0]
error_sum = 0
for i in range(0,len(correct_predictions)):
    error = abs(correct_predictions[i] - predictions[i])
    print(error)
    error_sum += error
print("MAE Error is: ",error_sum/4)




22.607676467372585
5104.650912580664
5206.011753790418
5329.509783647027
MAE Error is:  3915.6950316213706
