In [13]:
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import statsmodels.api as sm
import sklearn
from statsmodels.tsa.arima.model import ARIMA
from arch import arch_model
import warnings
warnings.filterwarnings("ignore")
sns.set()

In [14]:
df_gspc = yf.download('^GSPC', start="1994-01-07", end="2018-01-29", interval="1d")
df_gspc = df_gspc[["Close"]]
df_ftse = yf.download("^FTSE", start="1994-01-07", end="2018-01-29", interval="1d")
df_ftse = df_ftse[["Close"]]
df_n225 = yf.download("^N225", start="1994-01-07", end="2018-01-29", interval="1d")
df_n225 = df_n225[["Close"]]
df_gdaxi = yf.download("^GDAXI", start="1994-01-07", end="2018-01-29", interval="1d")
df_gdaxi = df_gdaxi[["Close"]]

#merge all dataframes
df = df_gspc.join(df_ftse, how="outer", lsuffix="_gspc", rsuffix="_ftse")
df = df.join(df_n225, how="outer", rsuffix="_n225")
df = df.join(df_gdaxi, how="outer", rsuffix="_gdaxi")
df.columns = ["gspc", "ftse", "n225", "gdaxi"]

# columns names spx, dax, ftse, nikkei
df = df.rename(columns={"gspc": "spx", "gdaxi": "dax", "ftse": "ftse", "n225": "nikkei"})
df = df.asfreq("b")  # Business day frequency
df = df.fillna(method="ffill")  # Forward fill to handle missing values
df = df.dropna()


#returns
df['ret_spx'] = df['spx'].pct_change(1)*100
df['ret_dax'] = df['dax'].pct_change(1)*100
df['ret_ftse'] = df['ftse'].pct_change(1)*100
df['ret_nikkei'] = df['nikkei'].pct_change(1)*100

df_full = df.copy()
df_full = df_full[1:]
# splitting the data into training and testing sets
train_size = int(len(df) * 0.8)
df, df_test = df[:train_size], df[train_size:]
df = df[1:]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [15]:
df.head()

Unnamed: 0_level_0,spx,ftse,nikkei,dax,ret_spx,ret_dax,ret_ftse,ret_nikkei
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1994-01-10,475.269989,3440.600098,18443.439453,2225.0,1.142795,0.002249,-0.156701,1.762467
1994-01-11,474.130005,3413.800049,18485.25,2228.100098,-0.23986,0.13933,-0.778935,0.226696
1994-01-12,474.170013,3372.0,18793.880859,2182.060059,0.008438,-2.066336,-1.224443,1.669606
1994-01-13,472.470001,3360.0,18577.259766,2142.370117,-0.358524,-1.818921,-0.355872,-1.152615
1994-01-14,474.910004,3400.600098,18973.699219,2151.050049,0.516435,0.405156,1.208336,2.134004


In [16]:
len(df), len(df_test)

(5019, 1256)

### Fitting a model

In [17]:
from pmdarima.arima import auto_arima

In [18]:
model_auto = auto_arima(df.ret_ftse[1:], seasonal=True, stepwise=True, trace=True,)

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=15835.532, Time=3.44 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=15860.468, Time=0.22 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=15859.751, Time=0.35 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=15859.435, Time=0.46 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=15859.794, Time=0.17 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=15831.466, Time=1.73 sec
 ARIMA(0,0,2)(0,0,0)[0] intercept   : AIC=15847.793, Time=0.57 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=15835.131, Time=1.74 sec
 ARIMA(1,0,3)(0,0,0)[0] intercept   : AIC=15799.945, Time=1.33 sec
 ARIMA(0,0,3)(0,0,0)[0] intercept   : AIC=15812.871, Time=0.74 sec
 ARIMA(2,0,3)(0,0,0)[0] intercept   : AIC=15799.595, Time=3.30 sec
 ARIMA(3,0,3)(0,0,0)[0] intercept   : AIC=15797.384, Time=5.07 sec
 ARIMA(3,0,2)(0,0,0)[0] intercept   : AIC=15799.926, Time=5.07 sec
 ARIMA(4,0,3)(0,0,0)[0] intercept   : AIC=15795.034, Time=5.03 sec
 ARIMA(4,0,2)(0,0,0

In [19]:
model_auto.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,5018.0
Model:,"SARIMAX(4, 0, 5)",Log Likelihood,-7880.905
Date:,"Thu, 07 Aug 2025",AIC,15781.81
Time:,16:14:43,BIC,15847.018
Sample:,01-11-1994,HQIC,15804.66
,- 04-04-2013,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.0172,0.082,0.210,0.833,-0.143,0.178
ar.L2,-0.6505,0.078,-8.366,0.000,-0.803,-0.498
ar.L3,-0.1568,0.071,-2.199,0.028,-0.297,-0.017
ar.L4,0.2034,0.074,2.731,0.006,0.057,0.349
ma.L1,-0.0410,0.081,-0.503,0.615,-0.201,0.119
ma.L2,0.6027,0.079,7.671,0.000,0.449,0.757
ma.L3,0.0568,0.069,0.828,0.407,-0.078,0.191
ma.L4,-0.1951,0.073,-2.664,0.008,-0.339,-0.052
ma.L5,-0.1047,0.010,-10.952,0.000,-0.123,-0.086

0,1,2,3
Ljung-Box (L1) (Q):,0.0,Jarque-Bera (JB):,6373.37
Prob(Q):,0.95,Prob(JB):,0.0
Heteroskedasticity (H):,1.99,Skew:,-0.2
Prob(H) (two-sided):,0.0,Kurtosis:,8.51


### Important Arguments


In [20]:
model_auto = auto_arima(df_full.ret_ftse, exogenous=df_full[['ret_spx', 'ret_dax', 'ret_nikkei']] ,stepwise=True,m = 5, max_order = None, max_p = 7,max_q = 7, 
                        max_d = 2,max_P = 4,max_Q = 4, max_D = 2 , maxiter = 50, alpha=0.05, n_jobs = 8,trend="ct", information_criterion="oob", out_of_sample_size= int(len(df_full)*0.2))

# m -> seasonal cycle length
# max_order -> maximum order of AR and MA terms
# max_p -> maximum AR components
# max_q -> maximum MA components
# max_d -> maximum differencing order
# maxiter -> maximum number of iterations
# return_valid_fits -> whether or not the method should validate the results
# alpha -> level of significance, default is 0.05
# n_jobs -> number of parallel jobs to run, default is 1 (-1 indicates "as many as possible")
# trend -> "ct" usually
# information_criterion -> "aic", "bic","aicc", "hqic", "oob" 
# opt_of_sample -> validates the model selection (pass the entire dataset and set 20% to be out-of-sample)

In [21]:
print(type(model_auto))

<class 'pmdarima.arima.arima.ARIMA'>


In [22]:
model_auto.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,6275.0
Model:,"SARIMAX(0, 0, 3)x(2, 0, [1, 2], 5)",Log Likelihood,-9582.181
Date:,"Thu, 07 Aug 2025",AIC,19184.363
Time:,16:23:04,BIC,19251.806
Sample:,0,HQIC,19207.731
,- 6275,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.0287,0.040,0.724,0.469,-0.049,0.106
drift,-2.482e-06,1.06e-05,-0.234,0.815,-2.33e-05,1.83e-05
ma.L1,-0.0242,0.009,-2.756,0.006,-0.041,-0.007
ma.L2,-0.0502,0.008,-6.343,0.000,-0.066,-0.035
ma.L3,-0.0840,0.008,-10.739,0.000,-0.099,-0.069
ar.S.L5,-0.0962,0.729,-0.132,0.895,-1.525,1.332
ar.S.L10,-0.1823,0.202,-0.903,0.367,-0.578,0.213
ma.S.L5,0.0434,0.729,0.060,0.953,-1.385,1.472
ma.S.L10,0.1660,0.230,0.721,0.471,-0.286,0.618

0,1,2,3
Ljung-Box (L1) (Q):,0.14,Jarque-Bera (JB):,9010.62
Prob(Q):,0.71,Prob(JB):,0.0
Heteroskedasticity (H):,0.86,Skew:,-0.23
Prob(H) (two-sided):,0.0,Kurtosis:,8.85
