In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
from pmdarima.arima import auto_arima
import scipy
import seaborn as sns
import sklearn
import statsmodels.api as sm
import yfinance 
import warnings 
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Apply seaborn's default plot styling.
sns.set()


# Load data 

In [2]:
# Download daily price history for the four index tickers in one request.
# group_by="ticker" puts the ticker symbol on the outer column level and the
# Open/High/Low/Close/Volume fields on the inner level.
raw_data = yfinance.download(tickers="^GSPC ^FTSE ^N225 ^GDAXI",
                             start="1994-01-07", end="2018-01-29",
                             interval="1d", group_by="ticker",
                             auto_adjust=True,
                             threads=True)  # keyword was misspelled as "treads"

[*********************100%***********************]  4 of 4 completed


In [3]:
# Work on an independent (deep) copy so the downloaded frame stays untouched.
df_comp = raw_data.copy(deep=True)

In [4]:
# Peek at the first five rows of the multi-level (ticker, field) frame.
df_comp.head(n=5)

Unnamed: 0_level_0,^GDAXI,^GDAXI,^GDAXI,^GDAXI,^GDAXI,^FTSE,^FTSE,^FTSE,^FTSE,^FTSE,^N225,^N225,^N225,^N225,^N225,^GSPC,^GSPC,^GSPC,^GSPC,^GSPC
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
1994-01-07,2218.959961,2227.639893,2201.820068,2224.949951,0.0,3401.399902,3446.800049,3398.699951,3446.0,0.0,17842.980469,18131.410156,17787.480469,18124.009766,0.0,467.089996,470.26001,467.029999,469.899994,324920000.0
1994-01-10,2231.840088,2238.01001,2222.0,2225.0,0.0,3465.699951,3468.100098,3430.0,3440.600098,0.0,18186.519531,18567.060547,18186.519531,18443.439453,0.0,469.899994,475.269989,469.549988,475.269989,319490000.0
1994-01-11,2225.429932,2235.610107,2225.179932,2228.100098,0.0,3442.5,3442.5,3413.5,3413.800049,0.0,18481.849609,18671.669922,18373.039062,18485.25,0.0,475.269989,475.279999,473.269989,474.130005,305490000.0
1994-01-12,2227.120117,2227.790039,2182.060059,2182.060059,0.0,3394.800049,3402.399902,3372.0,3372.0,0.0,18447.339844,18807.080078,18301.929688,18793.880859,0.0,474.130005,475.059998,472.140015,474.170013,310690000.0
1994-01-13,2171.5,2183.709961,2134.100098,2142.370117,0.0,3380.699951,3383.300049,3356.899902,3360.0,0.0,18770.380859,18823.380859,18548.75,18577.259766,0.0,474.170013,474.170013,471.799988,472.470001,277970000.0


In [5]:
# Pull each index's closing price out of the (ticker, field) column hierarchy
# into a short, flat column name.
df_comp['spx'] = df_comp[('^GSPC', 'Close')]
df_comp['dax'] = df_comp[('^GDAXI', 'Close')]
df_comp['ftse'] = df_comp[('^FTSE', 'Close')]
df_comp['nikkei'] = df_comp[('^N225', 'Close')]

In [6]:
# Drop the first row, remove the per-ticker OHLCV column groups (only the
# spx/dax/ftse/nikkei close-price columns remain), re-index to a business-day
# calendar, and forward-fill the gaps that re-indexing introduces.
# Written as one non-mutating chain: the original called drop(inplace=True) on
# a sliced frame, and fillna(method='ffill') is deprecated in favour of .ffill().
df_comp = (
    df_comp.iloc[1:]
    .drop(columns=['^GSPC', '^GDAXI', '^FTSE', '^N225'])
    .asfreq('b')
    .ffill()
)

# Creating Returns

In [7]:
# One-period percentage change of each index level, scaled from a fraction to percent.
df_comp['ret_spx'] = df_comp['spx'].pct_change(periods=1).mul(100)
df_comp['ret_ftse'] = df_comp['ftse'].pct_change(periods=1).mul(100)
df_comp['ret_dax'] = df_comp['dax'].pct_change(periods=1).mul(100)
df_comp['ret_nikkei'] = df_comp['nikkei'].pct_change(periods=1).mul(100)

In [8]:
# First five rows: price levels plus the new return columns (first return is NaN).
df_comp.head(n=5)

Unnamed: 0_level_0,spx,dax,ftse,nikkei,ret_spx,ret_ftse,ret_dax,ret_nikkei
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1994-01-10,475.269989,2225.0,3440.600098,18443.439453,,,,
1994-01-11,474.130005,2228.100098,3413.800049,18485.25,-0.23986,-0.778935,0.13933,0.226696
1994-01-12,474.170013,2182.060059,3372.0,18793.880859,0.008438,-1.224443,-2.066336,1.669606
1994-01-13,472.470001,2142.370117,3360.0,18577.259766,-0.358524,-0.355872,-1.818921,-1.152615
1994-01-14,474.910004,2151.050049,3400.600098,18973.699219,0.516435,1.208336,0.405156,2.134004


# Splitting the Data

In [9]:
# Chronological split: the first 80% of rows go to `df` (used for fitting),
# the remaining rows to `df_test`. Both are copies, independent of df_comp.
size = int(len(df_comp)*0.8)
df = df_comp.iloc[:size].copy()
df_test = df_comp.iloc[size:].copy()

# Fitting a Model

In [10]:
# Let auto_arima pick the model orders for FTSE returns using its default settings.
# The first observation is skipped because its return is NaN.
model_auto = auto_arima(df['ret_ftse'].iloc[1:])

In [12]:
# Display the repr of the model object returned by auto_arima.
model_auto

      with_intercept=False)

In [11]:
# Show the summary tables (selected orders, coefficients, diagnostics) of model_auto.
model_auto.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,5019.0
Model:,"SARIMAX(2, 0, 5)",Log Likelihood,-7885.535
Date:,"Mon, 15 Feb 2021",AIC,15787.07
Time:,22:50:38,BIC,15839.238
Sample:,0,HQIC,15805.351
,- 5019,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.1761,0.039,4.535,0.000,0.100,0.252
ar.L2,-0.8130,0.035,-22.993,0.000,-0.882,-0.744
ma.L1,-0.1996,0.038,-5.225,0.000,-0.274,-0.125
ma.L2,0.7659,0.037,20.461,0.000,0.693,0.839
ma.L3,-0.0947,0.011,-8.399,0.000,-0.117,-0.073
ma.L4,0.0115,0.009,1.266,0.205,-0.006,0.029
ma.L5,-0.1108,0.008,-13.112,0.000,-0.127,-0.094
sigma2,1.3558,0.014,94.484,0.000,1.328,1.384

0,1,2,3
Ljung-Box (L1) (Q):,0.01,Jarque-Bera (JB):,6576.07
Prob(Q):,0.91,Prob(JB):,0.0
Heteroskedasticity (H):,1.99,Skew:,-0.18
Prob(H) (two-sided):,0.0,Kurtosis:,8.6


Although the model summary says it is SARIMAX, the resulting model is actually ARMA(2, 5). 

We know that a seasonal model contains four more orders (P, D, Q, s), which we do not see here, so this is not a seasonal model. Also, the order (2, 0, 5) means that d=0, so there is no integration. Lastly, we did not pass any exogenous variables and there are no coefficients for them, so there is no "X" (exogenous) component in the model either. 

ARMA(2, 5) was not the model we chose in our own analysis because not all of its coefficients are significant (the p-value for ma.L4 is 0.205). The rules of model selection are "rules of thumb" rather than fixed laws, so we can attribute the difference to different standards in model selection. Also, by default auto ARIMA compares models on a single criterion, the AIC, regardless of the significance of individual coefficients. This can be perceived as a flaw of the method. However, empirical research has sometimes shown that omitting certain lags can be beneficial to model estimation when clustering is apparent. 

## Important arguments

In [13]:
# Search for a model of FTSE returns with the other three indices' returns as
# exogenous regressors, an explicit seasonal period (m=5) and explicit bounds
# on the orders to try.
# The regressors are passed as `X`: the older `exogenous` keyword was deprecated
# in pmdarima 1.8 and removed in 2.0.
# The leading row is skipped in both y and X because its returns are NaN.
# NOTE(review): per the pmdarima docs n_jobs only applies to a grid search
# (stepwise=False); with the default stepwise search it has no effect — confirm
# whether a grid search was intended.
model_auto = auto_arima(df.ret_ftse[1:], X=df[['ret_spx', 'ret_dax', 'ret_nikkei']][1:],
                        m=5, max_order=None, max_p=7, max_q=7, max_d=2, max_P=4, max_Q=4, max_D=2,
                        maxiter=50, alpha=0.05, n_jobs=-1)

"""
m: The number of periods in each season
max_order: The maximum value of p+q+P+Q. If None, no constraints. 
max_p: The maximum number of AR components 
max_q: The maximum number of MA components 
max_d: The maximum value of integration
max_P: The maximum number of seasonal AR components 
max_Q: The maximum number of seasonal MA components 
max_D: The maximum value of seasonal integration
maxiter: The maximum number of function evaluations. Default is 50. (when trying to converge)
alpha: testing significance. Default is 0.05
n_jobs: The number of models to fit in parallel in the case of a grid search. -1 is as many as possible.
"""