In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error

# Render matplotlib figures inline in the notebook output.
%matplotlib inline                 
# Emit inline figures in SVG format.
%config InlineBackend.figure_format = 'svg'

# Apply the "seaborn-v0_8-dark" matplotlib style sheet to all subsequent plots.
plt.style.use("seaborn-v0_8-dark")  

In [None]:
# Download the stock price CSV into the working directory.
# wget's -nc (no-clobber) flag skips the download when the file already exists.

!wget -nc https://lazyprogrammer.me/course_files/sp500sub.csv

In [4]:
# Load the downloaded price file; the first CSV column is used as the index
# and pandas is asked to parse it as dates.
df = pd.read_csv(
    'sp500sub.csv',
    index_col=0,
    parse_dates=True,
)

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,9.22,9.51,9.17,9.5,9.5,1865400.0,INCY
2010-01-05,9.51,10.29,9.45,10.27,10.27,7608900.0,INCY
2010-01-06,10.38,11.09,10.35,11.0,11.0,8046700.0,INCY
2010-01-07,11.0,11.06,10.62,10.82,10.82,3680300.0,INCY
2010-01-08,10.82,11.0,10.75,10.94,10.94,1529300.0,INCY


In [5]:
# Keep only IBM's closing prices (as an independent copy, so adding columns
# below does not touch `df`).
is_ibm = df['Name'] == 'IBM'
ibm = df.loc[is_ibm, ['Close']].copy()

# Log price, then its first difference: the daily log return.
# The first row has no previous value, so its LogReturn is NaN.
ibm['LogClose'] = np.log(ibm['Close'])
ibm['LogReturn'] = ibm['LogClose'].diff()

ibm.head()

Unnamed: 0_level_0,Close,LogClose,LogReturn
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-04,132.449997,4.886205,
2010-01-05,130.850006,4.874052,-0.012154
2010-01-06,130.0,4.867534,-0.006517
2010-01-07,129.550003,4.864067,-0.003468
2010-01-08,130.850006,4.874052,0.009985


In [6]:
# Chronological hold-out: the last Ntest rows of the IBM frame form the test
# period and everything before them is the training period.
# Ntest is reused further down to split the windowed feature arrays.
Ntest = 252

train = ibm.iloc[:-Ntest]
test = ibm.iloc[-Ntest:]

In [7]:
# Build a supervised dataset from the differenced log series: each row of X
# holds T consecutive log returns, and the matching entry of y says whether
# the return immediately following that window was positive (1) or not (0).

series = ibm['LogReturn'].to_numpy()[1:]  # first entry is NaN due to differencing

target = (series > 0) * 1  # 1 where the log return is strictly positive, else 0

T = 21  # window length: number of past returns used as features

# sliding_window_view yields every length-T window of the series, including
# the final one, which has no following value to predict. Dropping that last
# window leaves len(series) - T rows: windows starting at t = 0, 1, 2, ...
# copy() turns the read-only strided view into an ordinary writable array.
X = np.lib.stride_tricks.sliding_window_view(series, T)[:-1].copy()

# The label of the window starting at t is target[t + T], i.e. the direction
# of the return right after the window, so y is simply target shifted by T.
y = target[T:].copy()

N = len(X)  # number of observations

print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (2241, 21), y shape: (2241,)


In [8]:
# Split the windowed data chronologically: the final Ntest observations are
# held out for testing, all earlier ones are used for training.
X_train, X_test = X[:-Ntest], X[-Ntest:]
y_train, y_test = y[:-Ntest], y[-Ntest:]

X_train.shape, y_train.shape

((1989, 21), (1989,))

In [13]:

# Fit a logistic regression with default settings on the training windows
# (fit() returns the estimator itself), then show its training-set score.
log_reg = LogisticRegression().fit(X_train, y_train)

log_reg.score(X_train, y_train)

0.5093011563599799

In [14]:
# Score of the logistic regression on the held-out test windows
# (for scikit-learn classifiers, score() is the mean accuracy).
log_reg.score(X_test, y_test)

0.49603174603174605

In [15]:
# Fit a support vector classifier with default settings on the training
# windows (fit() returns the estimator itself), then show its training score.
svc = SVC().fit(X_train, y_train)

svc.score(X_train, y_train)

0.7551533433886375

In [16]:
# Score of the SVC on the held-out test windows. The model clearly overfits:
# its training score is far higher than what it achieves here.
svc.score(X_test,y_test)

0.49603174603174605

In [18]:
# Fit a random forest on the training windows and show its training score.
# A random forest is stochastic (bootstrap samples and random feature
# subsets), so random_state is fixed to make the fitted model, and therefore
# the train/test scores, reproducible across Restart & Run All.
rfc = RandomForestClassifier(random_state=0)

rfc.fit(X_train,y_train)
rfc.score(X_train,y_train)

1.0

In [19]:
# Score of the random forest on the held-out test windows. Again a severe
# overfit: the training score is far higher than the score obtained here.
rfc.score(X_test,y_test)

0.4880952380952381