In [None]:
# Run the project bootstrap script inside this kernel's namespace.
# NOTE(review): later cells reference DATA_DIR and `os` before any cell defines
# or imports them - presumably this script provides both; confirm.
%run init_notebookspace.py

In [None]:
!pip install yfinance

In [None]:
!pip install pandas_datareader

In [None]:
!pip install yahoo_fin

In [None]:
import pandas as pd
import numpy as np

import yfinance as yf
import pandas_datareader as pdread
from pandas_datareader import fred

import matplotlib.pyplot as plt

from statsmodels.tsa.arima.model import ARIMA
from yahoo_fin import stock_info as ysi

from itertools import chain

In [None]:
from src.utils import apply_datetime_format

In [None]:
# Get the S&P 500 constituents and their market capitalisation.
sp500_tickers = ysi.tickers_sp500()

# One quote request per ticker; only the first 'marketCap' value of each reply is kept.
market_caps = []
for ticker in sp500_tickers:
    quote = pdread.get_quote_yahoo(ticker)
    market_caps.append(quote['marketCap'].values[0])

df = pd.DataFrame(data=market_caps, index=sp500_tickers, columns=['market_cap'])

# Five largest constituents by market cap, plus the ticker of the index itself.
ranked_by_cap = df.sort_values('market_cap', ascending=False)
sp500_largest = ranked_by_cap.index[:5].values
sp500_ticker = ['^GSPC']

In [None]:
# Sample window, as ISO date strings, used by the price download below.
start, end = '2020-01-01', '2022-12-31'

In [None]:
# Empty frame on a calendar-day index: one row per day, starting at `start`, for
# as many days as lie between `start` and `end`.
window_start = apply_datetime_format(start, '%Y-%m-%d')
window_end = apply_datetime_format(end, '%Y-%m-%d')
df_prices = pd.DataFrame()
df_prices.index = pd.date_range(start, periods=(window_end - window_start).days)

# Download each ticker and prefix its columns with the ticker so the frames can
# be joined side by side on the date index.
for item in chain(sp500_largest, sp500_ticker):
    data = yf.download(item, start, end)
    data.columns = [f'{item}_{x}' for x in data.columns]
    df_prices = df_prices.join(data)

# get closing price
adj_close_cols = [col for col in df_prices.columns if 'Adj Close' in col]
df_c = df_prices[adj_close_cols].copy()
# Drop the trailing 10 characters ("_Adj Close") so each column is named by ticker only.
df_c.columns = [col[:-10] for col in df_c.columns]
# Rows with any missing price are removed.
df_c.dropna(inplace=True)

In [None]:
# get log returns: r_t = ln(P_t / P_{t-1}); the first row has no predecessor and is dropped
df_rets = np.log(df_c / df_c.shift(1)).dropna()
df_rets.sort_index(inplace=True)

# shift(1) puts the previous row's ^GSPC return on the current row.
# NOTE(review): that is a one-step lag even though the column is named "_lead" -
# confirm which direction is intended.
df_rets['^GSPC_lead'] = df_rets['^GSPC'].shift(1)

# The shift leaves a NaN in the first row; remove it.
df_rets.dropna(inplace=True)

# The former `df_rets.asfreq = "D"` and `df_rets.index.asfreq = 'd'` lines were
# removed: they only rebound attributes (the first one hiding the
# DataFrame.asfreq method on this object) and never set a frequency. No
# frequency is imposed here because rows with missing prices were dropped
# upstream, so the remaining dates are not evenly spaced.


In [None]:
# Save the returns table as CSV under DATA_DIR.
returns_path = os.path.join(DATA_DIR, 'returns.csv')
df_rets.to_csv(returns_path)

In [None]:
# load data from file
# The first CSV column holds the dates that to_csv wrote from the index. Restore
# it as a parsed date index so the reloaded frame matches the one that was saved;
# without this the dates come back as an extra, unnamed string column.
df_rets = pd.read_csv(os.path.join(DATA_DIR, 'returns.csv'), index_col=0, parse_dates=True)

In [None]:
# Stationarity check: Augmented Dickey-Fuller test on the saved returns.
from statsmodels.tsa.stattools import adfuller
from numpy import log
import pandas as pd

# NOTE(review): only one column name is supplied for a multi-column file, so
# 'value' presumably ends up holding the last CSV column - confirm.
returns_csv = os.path.join(DATA_DIR, 'returns.csv')
mydata = pd.read_csv(returns_csv, names = ['value'], header = 0)

res = adfuller(mydata.value.dropna())
# The first two entries of the result are reported as the statistic and p-value.
adf_statistic, adf_pvalue = res[0], res[1]

print('Augmented Dickey-Fuller Statistic: %f' % adf_statistic)
print('p-value: %f' % adf_pvalue)

In [None]:
# determining the ADF for the 'd' value of ARIMA 


In [None]:
# Next: autocorrelation plots
import numpy as np, pandas as pd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt

# Default size and resolution for the figures drawn from here on.
plt.rcParams['figure.figsize'] = (9, 7)
plt.rcParams['figure.dpi'] = 120
plt.show()

In [None]:
# Load the saved returns; the single data column is exposed as `value`.
returns_csv = os.path.join(DATA_DIR, 'returns.csv')
df = pd.read_csv(returns_csv, names = ['value'], header = 0)

In [None]:
#print (df)

In [None]:
# Series and its first/second differences (left), each next to its ACF (right).
#
# Everything is drawn in ONE cell: the figure was previously created in one cell
# and filled in by two later ones, but under the default inline backend a figure
# is rendered when the cell that created it finishes, so the later panels and
# the final plt.show() had nothing left to display.
#
# sharex='col' instead of sharex=True: the left column is indexed by observation
# number and the right column by lag, so only panels in the same column may
# share an x-axis.
fig, axes = plt.subplots(3, 2, sharex = 'col')

panels = [
    ('The Genuine Series', df.value),
    ('Order of Differencing: First', df.value.diff()),
    ('Order of Differencing: Second', df.value.diff().diff()),
]
for row, (title, series) in enumerate(panels):
    # Plot positional values so the x-axis is the observation number regardless
    # of how read_csv indexed the frame.
    axes[row, 0].plot(series.to_numpy())
    axes[row, 0].set_title(title)
    # Differencing leaves leading NaNs, which the ACF cannot take.
    plot_acf(series.dropna(), ax = axes[row, 1])

plt.show()


In [None]:
# Determine the differencing above. Weak/ strong stationarity? 
# Next, looking at how to make the data stationary via 3 tests. 

In [None]:
!pip install pmdarima

In [None]:
from pmdarima.arima.utils import ndiffs  
import pandas as pd
import os

In [None]:
# Reload the returns for the differencing-order tests. The previous line nested
# a second pd.read_csv( call without closing it, which is a SyntaxError.
df = pd.read_csv(os.path.join(DATA_DIR, 'returns.csv'), names = ['value'], header = 0)
X = df.value

In [None]:
# Number of differences suggested by the Augmented Dickey-Fuller test.
# NOTE(review): the whole frame is passed rather than the `value` series - confirm
# this is what ndiffs should receive.
adftest = ndiffs(
    df,
    test = 'adf',
)

In [None]:
# Number of differences suggested by the KPSS test.
kpsstest = ndiffs(
    df,
    test = 'kpss',
)

In [None]:
# Number of differences suggested by the PP test.
pptest = ndiffs(
    df,
    test = 'pp',
)

In [None]:
# Report the differencing order suggested by each test, one per line.
for label, n_diffs in (("ADF Test =", adftest), ("KPSS Test =", kpsstest), ("PP Test =", pptest)):
    print(label, n_diffs)

In [None]:
# Next, find the 'p' term for AR via a partial autocorrelation graph

In [None]:
import numpy as np, pandas as pd  
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf  
import matplotlib.pyplot as plt  

In [None]:
# Wider, shorter default figure for the side-by-side plots that follow.
plt.rcParams['figure.figsize'] = (9, 3)
plt.rcParams['figure.dpi'] = 120

In [None]:
import os

In [None]:
# Load the saved returns; the single data column is exposed as `value`.
df = pd.read_csv(
    os.path.join(DATA_DIR, 'returns.csv'),
    names = ['value'],
    header = 0,
)

In [None]:
 # Quick look at the loaded series, plotted against observation number.
 # NOTE(review): the data comes from returns.csv, so the 'Close' label looks
 # stale; it is not shown anyway because no legend is drawn.
 series_values = df.to_numpy()
 plt.plot(series_values, label='Close')

In [None]:
# NumPy view of the frame's values for the plotting calls below, echoed for inspection.
ndarray = df.to_numpy()
print(ndarray)

In [None]:
# Series (left) next to its partial autocorrelation function (right), used to
# pick the AR order p.
#
# The panels no longer share an x-axis: the left one is indexed by observation
# number and the right one by lag, so sharing stretched the PACF over the full
# length of the series - presumably why the significance band could not be seen.
fig, axes = plt.subplots(1, 2)
axes[0].plot(ndarray); axes[0].set_title('Order of Differencing: First')
# No fixed y-limits: the former (0, 5) range excluded every negative partial
# autocorrelation and the lower half of the confidence band.
plot_pacf(ndarray, ax = axes[1])
plt.show()

In [None]:
# Look at the blue region to determine whether a lag is significant
# In the PACF graph, there should be a light blue band showing the significance -- need to fix this
# Otherwise, for the PACF I would say there is an order of 1 or 2 at the moment

In [None]:
# Next, finding the 'q' - Moving Averages
import numpy as np, pandas as pd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt

# Default size and resolution for the side-by-side plots that follow.
plt.rcParams['figure.figsize'] = (9, 3)
plt.rcParams['figure.dpi'] = 120

In [None]:
# Load the saved returns; the single data column is exposed as `value`.
returns_csv = os.path.join(DATA_DIR, 'returns.csv')
mydata = pd.read_csv(returns_csv, names = ['value'], header = 0)

In [None]:
 # Quick look at the series held in `df`, plotted against observation number.
 # NOTE(review): this plots `df`, not the `mydata` frame loaded just before - confirm.
 series_values = df.to_numpy()
 plt.plot(series_values, label='Close')

In [None]:
# Replace `mydata` with the NumPy values of `df` and echo them.
# NOTE(review): this discards whatever `mydata` held before and is built from `df` - confirm intended.
mydata = df.to_numpy()
print(mydata)

In [None]:
# Series (left) next to its autocorrelation function (right), used to pick the
# MA order q.
#
# The panels no longer share an x-axis: the left one is indexed by observation
# number and the right one by lag, so sharing stretched the ACF over the full
# length of the series - presumably why the significance band could not be seen.
fig, axes = plt.subplots(1, 2)
axes[0].plot(mydata); axes[0].set_title('Order of Differencing: First')
# No fixed y-limits: the former (0, 1.2) range excluded every negative
# autocorrelation and the lower half of the confidence band.
plot_acf(mydata, ax = axes[1])
plt.show()

In [None]:
# Looking at the autocorrelation graph, we can determine the number needed for q
# Again, the blue significance band is missing
# Otherwise, I would say that we have an order of 1 here

In [None]:
# Now building the ARIMA model 

import numpy as np, pandas as pd  
import statsmodels.api as sm

In [None]:
# Load the saved returns for model fitting; the single data column is exposed as `value`.
mydata = pd.read_csv(
    os.path.join(DATA_DIR, 'returns.csv'),
    names = ['value'],
    header = 0,
)

In [None]:
# Fit the ARIMA model on the `value` series and print the fit summary.
# Set the order to the (p, d, q) values read off the diagnostics above.
arima_order = (1, 0, 1)
mymodel = sm.tsa.arima.ARIMA(mydata.value, order = arima_order)
modelfit = mymodel.fit()

model_summary = modelfit.summary()
print(model_summary)

In [None]:
# These are the results. If the choice of order is a close call, we can revisit it by comparing AIC/BIC.

# Now we plot the residuals to ensure that there is no non-constant mean or variance to worry about

In [None]:
import numpy as np, pandas as pd  
import statsmodels.api as sm
import matplotlib.pyplot as plt  

In [None]:
# Residual diagnostics: residuals over the sample (left) and their estimated density (right).
myresiduals = pd.DataFrame(modelfit.resid)
fig, ax = plt.subplots(1,2)
residual_ax, density_ax = ax[0], ax[1]
myresiduals.plot(title = "Residuals", ax = residual_ax)
myresiduals.plot.kde(title = 'Density', ax = density_ax)
plt.show()

In [None]:
# Finally, graphing our predicted versus actual values for trend analysis 

In [None]:
import numpy as np, pandas as pd
# Import ARIMA from statsmodels.tsa.arima.model, the same path the notebook's
# first import cell uses. The old statsmodels.tsa.arima_model path refers to the
# retired implementation and, being imported later, silently replaced the
# working ARIMA name.
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_predict

In [None]:
# Default size and resolution for the prediction plots that follow.
plt.rc('figure', figsize = (9, 3), dpi = 120)


In [None]:
# Load the saved returns; the single data column is exposed as `value`.
returns_csv = os.path.join(DATA_DIR, 'returns.csv')
mydata = pd.read_csv(returns_csv, names = ['value'], header = 0)

In [None]:
# Refit the chosen ARIMA specification on the `value` series.
# Set the order to the (p, d, q) values read off the diagnostics above.
arima_order = (1, 0, 1)
mymodel = sm.tsa.arima.ARIMA(mydata.value, order = arima_order)
modelfit = mymodel.fit()

model_summary = modelfit.summary()
print(model_summary)

In [None]:
# Actual vs Fitted

import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_predict
from statsmodels.tsa.arima.model import ARIMA

df = pd.read_csv(os.path.join(DATA_DIR, 'returns.csv'), names = ['value'], header = 0)

# Work on a plain 0..n-1 index. The previous code assigned an annual date_range
# (a handful of dates) as the index, which cannot match the number of rows, and
# then fitted and plotted an undefined name `dta`.
actual = df['value'].reset_index(drop=True)

res = ARIMA(actual, order=(1,0,1)).fit()

# Observed series with the model's in-sample predictions drawn on the same axes;
# start/end are positions in `actual`.
fig, ax = plt.subplots()
ax = actual.plot(ax=ax)
plot_predict(res, start=0, end=len(actual) - 1, ax=ax)
plt.show()

In [None]:
# Working on making these into a matrix 

In [81]:
# Load the saved returns; the single data column is exposed as `value`.
mydata = pd.read_csv(
    os.path.join(DATA_DIR, 'returns.csv'),
    names = ['value'],
    header = 0,
)

In [86]:
# Fit the chosen ARIMA specification, keeping the order in named variables so
# later cells can reuse p, d and q.
# Set these to the values read off the diagnostics above.
p, d, q = 1, 0, 1
mymodel = sm.tsa.arima.ARIMA(mydata.value, order = (p, d, q))
modelfit = mymodel.fit()

model_summary = modelfit.summary()
print(model_summary)

                               SARIMAX Results                                
Dep. Variable:                  value   No. Observations:                  754
Model:                 ARIMA(1, 0, 1)   Log Likelihood                2062.924
Date:                Thu, 06 Apr 2023   AIC                          -4117.847
Time:                        11:02:05   BIC                          -4099.346
Sample:                             0   HQIC                         -4110.720
                                - 754                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0002      0.001      0.405      0.686      -0.001       0.001
ar.L1         -0.5698      0.058     -9.858      0.000      -0.683      -0.457
ma.L1          0.3767      0.065      5.814      0.0

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [87]:
# Transition matrix of the fitted model's state-space form, taken at index 0 of
# the trailing axis.
A = modelfit.model.ssm.transition[:, :, 0]
print(A)

[[-0.56981815  1.        ]
 [ 0.          0.        ]]


In [88]:
# Show the fitted parameters.
fitted_params = modelfit.params
print(fitted_params)

const     0.000217
ar.L1    -0.569818
ma.L1     0.376662
sigma2    0.000246
dtype: float64


In [89]:
# Write p, d and q as Python assignments to arima_values.py so the order can be
# imported elsewhere.
assignment_lines = [f'p = {p}\n', f'd = {d}\n', f'q = {q}\n']
with open('arima_values.py', 'w') as f:
    f.writelines(assignment_lines)