In [1]:
import pandas as pd

original_data_path = "../../data/SP500_all_time_more_data.csv"
#original_data_path = "../../data/SP500_all_time.csv"
#original_data_path = "../../data/main.csv"
df = pd.read_csv(original_data_path)
# turn all columns into lowercase
df.columns = df.columns.str.lower()
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,date,open,high,low,close,adj close,volume,dff,dtb3,dgs10,dfii10,dgs1,dgs2,dgs5,dfii5,bamlh0a0hym2
0,2003-01-02,879.820007,909.030029,879.820007,909.030029,909.030029,1229200000,1.3,1.2,4.07,2.43,1.42,1.8,3.05,1.75,8.65
1,2003-01-03,909.030029,911.25,903.070007,908.590027,908.590027,1130800000,1.12,1.2,4.05,2.43,1.41,1.79,3.03,1.75,8.57
2,2003-01-06,908.590027,931.77002,908.590027,929.01001,929.01001,1435900000,1.22,1.19,4.09,2.46,1.44,1.84,3.1,1.79,8.41
3,2003-01-07,929.01001,930.809998,919.929993,922.929993,922.929993,1545200000,1.2,1.17,4.04,2.42,1.4,1.77,3.04,1.76,8.26
4,2003-01-08,922.929993,922.929993,908.320007,909.929993,909.929993,1467600000,1.29,1.17,4.0,2.29,1.36,1.71,3.01,1.68,8.18


In [2]:
import plotly.graph_objects as go
# Line plot on the closing prices only
fig = go.Figure(data=go.Scatter(x=df.index, y=df['close'], mode='lines'))
fig.update_layout(title='Closing Prices', xaxis_title='Time', yaxis_title='Price')
fig.show()

In [3]:
import statsmodels.api as sm
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import ta

# Assuming your DataFrame is named 'df' and contains 'close', 'volume', 'high', and 'low' columns
df['index'] = df.index.astype(int)
df['log_volume'] = np.log(df['volume'] + 1)
df['pct_change'] = df['close'].pct_change()
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# introduce seasonal features
df['sin_day'] = np.sin(2 * np.pi * df['day']/365)
df['cos_day'] = np.cos(2 * np.pi * df['day']/365)

# add indicators
df['sma_10'] = ta.trend.sma_indicator(df['close'], window=10)
df['sma_30'] = ta.trend.sma_indicator(df['close'], window=30)
df['ema_10'] = ta.trend.ema_indicator(df['close'], window=10)
df['ema_30'] = ta.trend.ema_indicator(df['close'], window=30)
df['rsi'] = ta.momentum.rsi(df['close'], window=14)
df['macd'] = ta.trend.macd(df['close'], window_slow=26, window_fast=12)
df['macd_signal'] = ta.trend.macd_signal(df['close'], window_slow=26, window_fast=12)
df['macd_diff'] = ta.trend.macd_diff(df['close'], window_slow=26, window_fast=12)
df['bollinger_high'] = ta.volatility.bollinger_hband(df['close'], window=20, window_dev=2)
df['bollinger_low'] = ta.volatility.bollinger_lband(df['close'], window=20, window_dev=2)
df['stoch'] = ta.momentum.stoch(df['high'], df['low'], df['close'], window=14, smooth_window=3)
df['stoch_signal'] = ta.momentum.stoch_signal(df['high'], df['low'], df['close'], window=14, smooth_window=3)
df['adx'] = ta.trend.adx(df['high'], df['low'], df['close'], window=14)

# get the target
df['next_close'] = df['close'].shift(-1)
df['next_pct_change'] = df['pct_change'].shift(-1)

# 1_day_decision
df['1_day_decision'] = df['next_close'] > df['close']
# turn the boolean into 1 and 0
df['1_day_decision'] = df['1_day_decision'].astype(int)


#columns_to_ignore = ['index', 'date', 'next_close', 'close', 'open', 'high', 'low', 'average']
columns_to_ignore = ['index', 'date', 'next_close', 'next_pct_change', '1_day_decision']
target_column = 'next_close'

# Drop any missing values from the DataFrame
data = df.dropna()

In [4]:
# scroll to the right to see the new columns
pd.set_option('display.max_columns', None)

data.head()

Unnamed: 0,date,open,high,low,close,adj close,volume,dff,dtb3,dgs10,dfii10,dgs1,dgs2,dgs5,dfii5,bamlh0a0hym2,index,log_volume,pct_change,day,month,year,sin_day,cos_day,sma_10,sma_30,ema_10,ema_30,rsi,macd,macd_signal,macd_diff,bollinger_high,bollinger_low,stoch,stoch_signal,adx,next_close,next_pct_change,1_day_decision
33,2003-02-20,845.130005,849.369995,836.559998,837.099976,837.099976,1194100000,1.25,1.17,3.85,1.91,1.29,1.59,2.82,1.13,8.51,33,20.900659,-0.009502,20,2,2003,0.337523,0.941317,833.734998,867.748665,839.341877,859.777311,41.256527,-15.046713,-17.885887,2.839174,877.863376,813.025625,52.80202,63.436596,33.867578,848.169983,0.013224,1
34,2003-02-21,837.099976,852.280029,831.47998,848.169983,848.169983,1398200000,1.21,1.17,3.9,1.93,1.31,1.63,2.86,1.13,8.42,34,21.058452,0.013224,21,2,2003,0.353676,0.935368,834.736994,865.689998,840.946987,859.028451,46.466427,-13.357362,-16.980182,3.62282,869.679112,817.292885,71.773742,63.398332,32.679394,832.580017,-0.018381,0
35,2003-02-24,848.169983,848.169983,832.159973,832.580017,832.580017,1229200000,1.25,1.19,3.86,1.87,1.28,1.6,2.82,1.04,8.41,35,20.929629,-0.018381,24,2,2003,0.401488,0.915864,835.025995,862.523665,839.42572,857.322101,40.957297,-13.12522,-16.20919,3.08397,867.291362,816.798634,47.506372,57.360711,31.576081,838.570007,0.007194,1
36,2003-02-25,832.580017,839.549988,818.539978,838.570007,838.570007,1483700000,1.28,1.18,3.81,1.84,1.27,1.58,2.77,1.0,8.45,36,21.117805,0.007194,25,2,2003,0.417194,0.908818,835.285999,859.556999,839.270136,856.112288,43.718359,-12.315933,-15.430539,3.114606,866.760825,816.438174,58.330346,59.203486,31.358383,827.549988,-0.013141,0
37,2003-02-26,838.570007,840.099976,826.679993,827.549988,827.549988,1374400000,1.28,1.19,3.78,1.84,1.27,1.57,2.75,1.01,8.44,37,21.041283,-0.013141,26,2,2003,0.432776,0.901502,835.120996,856.266665,837.1392,854.269559,40.011269,-12.420614,-14.828554,2.40794,864.658338,815.441662,45.641911,50.492876,31.102072,837.280029,0.011758,1


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Prepare the data for linear regression
# everything except date and next_close
X = data.drop(columns=columns_to_ignore)
print(X.columns)
y = data[target_column]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Index(['open', 'high', 'low', 'close', 'adj close', 'volume', 'dff', 'dtb3',
       'dgs10', 'dfii10', 'dgs1', 'dgs2', 'dgs5', 'dfii5', 'bamlh0a0hym2',
       'log_volume', 'pct_change', 'day', 'month', 'year', 'sin_day',
       'cos_day', 'sma_10', 'sma_30', 'ema_10', 'ema_30', 'rsi', 'macd',
       'macd_signal', 'macd_diff', 'bollinger_high', 'bollinger_low', 'stoch',
       'stoch_signal', 'adx'],
      dtype='object')


In [24]:
# ridge regression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Create a Ridge regression model
model = Ridge(alpha=0.5)

# Train the model
model.fit(X_train_scaled, y_train)

In [25]:
# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Print explained variance on the test data
print("Explained Variance (R-squared) on Test Data:", model.score(X_test_scaled, y_test))

# Print mean squared error on the test data
mse = np.mean((y_pred - y_test)**2)
print("Mean Squared Error (MSE) on Test Data:", mse)

# Plot the predicted vs. actual values
# Create traces for actual and predicted values
trace_actual = go.Scatter(
    x=np.arange(len(y_test)),
    y=y_test,
    mode='lines',
    name='Actual'
)

trace_predicted = go.Scatter(
    x=np.arange(len(y_pred)),
    y=y_pred,
    mode='lines',
    name='Predicted'
)

# Create the layout for the figure
layout = go.Layout(
    title='Predicted vs. Actual Values',
    xaxis=dict(title='Data Points'),
    yaxis=dict(title='Values')
)

# Create the figure
fig = go.Figure(data=[trace_actual, trace_predicted], layout=layout)

# Display the figure
fig.show()

Explained Variance (R-squared) on Test Data: 0.9917536011269358
Mean Squared Error (MSE) on Test Data: 2534.2776019173452
