In [61]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import arch
from arch.univariate import Normal as ARCHNormal
# np.set_printoptions(precision=2, suppress=True, linewidth=100, floatmode='fixed')

# Set seed

In [62]:
seed = 42

# Random number generator

$\sigma_t^2 = \omega + \alpha \epsilon_{t-1}^2$  
Below we have $y_t = 1$ for all $t$, a zero mean model and $\omega = 0$ and $\alpha = 1$ so $\sigma_t^2 = 1$ for all $t$  

In [63]:
# get the generated normal random numbers from the arch.arch_model.forecast function with the same seed
length = 10
test_df = pd.DataFrame(np.ones(length), columns=['y'])
test_model = arch.arch_model(y=test_df['y'], x=None, mean='Zero', vol='ARCH', p=1, rescale=False)
test_model.distribution = ARCHNormal(seed=seed)
sims = test_model.forecast(params=[0,1], horizon=1, start=0, method='simulation', simulations=1, x=None, align='origin')
normal_rvs = sims.simulations.values.squeeze()
normal_rvs

array([ 0.305, -1.04 ,  0.75 ,  0.941, -1.951, -1.302,  0.128, -0.316,
       -0.017, -0.853])

# Test vol='ARCH'

In [4]:
# create artificial data to test the arch.arch_model.forecast function
# data will contain integer values so that it is easier to compare the results
test_df = pd.DataFrame(np.arange(10), columns=['y'])
y = test_df['y'].values
y

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [5]:
# only one lag on residual with vol='ARCH'
test_model = arch.arch_model(y=test_df['y'], x=None, mean='Zero', vol='ARCH', p=1, rescale=False)
test_model.fit()

Iteration:      1,   Func. Count:      4,   Neg. LLF: 28.43922788393828
Iteration:      2,   Func. Count:      7,   Neg. LLF: 28.427130663050598
Iteration:      3,   Func. Count:     10,   Neg. LLF: 28.423630703038214
Iteration:      4,   Func. Count:     13,   Neg. LLF: 28.42333130448614
Iteration:      5,   Func. Count:     16,   Neg. LLF: 28.423323726474315
Iteration:      6,   Func. Count:     18,   Neg. LLF: 28.42332373612137
Optimization terminated successfully    (Exit mode 0)
            Current function value: 28.423323726474315
            Iterations: 6
            Function evaluations: 18
            Gradient evaluations: 6


                        Zero Mean - ARCH Model Results                        
Dep. Variable:                      y   R-squared:                       0.000
Mean Model:                 Zero Mean   Adj. R-squared:                  0.100
Vol Model:                       ARCH   Log-Likelihood:               -28.4233
Distribution:                  Normal   AIC:                           60.8466
Method:            Maximum Likelihood   BIC:                           61.4518
                                        No. Observations:                   10
Date:                Thu, Oct 31 2024   Df Residuals:                       10
Time:                        07:10:30   Df Model:                            0
                            Volatility Model                            
                 coef    std err          t      P>|t|  95.0% Conf. Int.
------------------------------------------------------------------------
omega          1.9896      1.452      1.370      0.171 [ -0.856,  4.83

## Example 1
$\sigma_t^2 = \omega + \alpha_1 \epsilon_{t-1}^2$  
Below we have $y_t = t$ for $t=0,\ldots,9$, a zero mean model, $\omega = 0$ and $\alpha_1 = 1$  
So $\sigma_t^2 = y_{t-1}^2$ for $t=1,\ldots,10$  
e.g. $\sigma_5^2 = y_4^2 = 4^2 = 16$  
Note that vars[i] $= \text{VAR}[\hat{y}_{i+1}]$ for $i=0,\ldots,9$ since start=0

In [6]:
test_model.distribution = ARCHNormal(seed=seed)
sims = test_model.forecast(params=[0,1], horizon=1, start=0, method='simulation', simulations=1, x=None, align='origin')
series = sims.simulations.values.squeeze()
means = sims.mean.values.squeeze()
vars = sims.variance.values.squeeze()
print(means, vars)
np.allclose(vars, y**2)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0.  1.  4.  9. 16. 25. 36. 49. 64. 81.]


True

## Example 2
$\sigma_t^2 = \omega + \alpha_1 \epsilon_{t-1}^2 + \alpha_2 \epsilon_{t-2}^2$  
Below we have $y_t = t$ for $t=0,\ldots,9$, a zero mean model, $\omega = 0$ and $\alpha_1 = \alpha_2 = 1$  
So $\sigma_t^2 = y_{t-1}^2 + y_{t-2}^2$ for $t=1,\ldots,10$  
e.g. $\sigma_5^2 = y_4^2 + y_3^2 = 4^2 + 3^2 = 16 + 9 = 25$  
Note that vars[i] $= \text{VAR}[\hat{y}_{i+2}]$ for $i=0,\ldots,8$ since start=1

In [7]:
# two lags on residual with vol='ARCH' and start=1
test_model = arch.arch_model(y=test_df['y'], x=None, mean='Zero', vol='ARCH', lags=0, p=2, rescale=False)
test_model.distribution = ARCHNormal(seed=seed)
sims = test_model.forecast(params=[0,1,1], horizon=1, start=1, method='simulation', simulations=2, x=None, align='origin')
series = sims.simulations.values.squeeze()
# print(f'series.shape: {series.shape}')
means = sims.mean.values.squeeze()
vars = sims.variance.values.squeeze()
print(means, vars)
np.allclose(vars, y[1:]**2 + y[:-1]**2)

[0. 0. 0. 0. 0. 0. 0. 0. 0.] [  1.   5.  13.  25.  41.  61.  85. 113. 145.]


True

## Example 3
When start=None (default), the forecast starts from the last available observation.  
Therefore, vars[0] $= \text{VAR}[\hat{y}_{10}] = 9^2 + 8^2 = 145$ as in Example 2  
vars[1] $ = \text{VAR}[\hat{y}_{11}] = y_{10}^2 + y_{9}^2$  
vars[2] $ = \text{VAR}[\hat{y}_{12}] = y_{11}^2 + y_{10}^2$

In [8]:
# two lags on residual with vol='ARCH' and start=None
test_model = arch.arch_model(y=test_df['y'], x=None, mean='Zero', vol='ARCH', lags=0, p=2, rescale=False)
test_model.distribution = ARCHNormal(seed=seed)
sims = test_model.forecast(params=[0,1,1], horizon=3, method='simulation', simulations=1, x=None, align='origin')
values = sims.simulations.values.squeeze()
means = sims.mean.values.squeeze()
vars = sims.variance.values.squeeze()
print(f'values: {values}')
print(f'means: {means}')
print(f'vars: {vars}')
print(f'y_9^2 + y_8^2: {values[0]**2 + 9**2}')
print(f'y_10^2 + y_9^2: {values[1]**2 + values[0]**2}')

values: [  3.66927954 -10.10785438   8.06978747]
means: [0. 0. 0.]
vars: [145.          94.46361231 115.63233254]
y_9^2 + y_8^2: 94.46361231063993
y_10^2 + y_9^2: 115.63233253526381


# Show vol='ARCH' == vol='GARCH' with q=0

## Example 4
Below is the same as Example 1 but with vol='GARCH' and q=0  
Therefore values are expected to be the same as in Example 1


In [9]:
# one lag on residual with vol='GARCH'
test_model = arch.arch_model(y=test_df['y'], x=None, mean='Zero', vol='GARCH', lags=1, p=1, q=0, rescale=False)
test_model.distribution = ARCHNormal(seed=seed)
sims = test_model.forecast(params=[0,1], horizon=1, start=0, method='simulation', simulations=1, x=None, align='origin')
series = sims.simulations.values.squeeze()
vars = sims.variance.values.squeeze()
print(vars)
np.allclose(vars, y**2)

[ 0.  1.  4.  9. 16. 25. 36. 49. 64. 81.]


True

## Example 5
Below is the same as Example 2 but with vol='GARCH' and q=0  
Therefore values are expected to be the same as in Example 2

In [10]:
# two lags on residual with vol='GARCH' and start=1
test_model = arch.arch_model(y=test_df['y'], x=None, mean='Zero', vol='GARCH', lags=1, p=2, q=0, rescale=False)
test_model.distribution = ARCHNormal(seed=seed)
sims = test_model.forecast(params=[0,1,1], horizon=1, start=1, method='simulation', simulations=1, x=None, align='origin')
series = sims.simulations.values.squeeze()
vars = sims.variance.values.squeeze()
print(vars)
np.allclose(vars, y[1:]**2 + y[:-1]**2)

[  1.   5.  13.  25.  41.  61.  85. 113. 145.]


True

# Start=0 with insufficient lags

## Example 6

**<span style='color:red'>When there are insufficient lags to compute the variance, the code is drawing lag residuals from unknown source hence the values should not be used.</span>**  
For unknown reasons, the lag residuals that are not supplied by the user appear to be the same number for all unavailable lags as shown below.   
Constant value of (24.03524054)**0.5 is used regardless of the seed. It is unclear where this value is coming from.

In [11]:
# coefficient is one only for t-2 residual
test_model = arch.arch_model(y=test_df['y'], x=None, mean='Zero', vol='ARCH', lags=1, p=4, rescale=False)
test_model.distribution = ARCHNormal(seed=seed)
sims = test_model.forecast(params=[0,0,1,0,0], horizon=1, start=0, method='simulation', simulations=1, x=None, align='origin')
series = sims.simulations.values.squeeze()
vars = sims.variance.values.squeeze()
print(vars)

[24.03524054  0.          1.          4.          9.         16.
 25.         36.         49.         64.        ]


In [12]:
# coefficient is one only for t-3 residual
test_model = arch.arch_model(y=test_df['y'], x=None, mean='Zero', vol='ARCH', lags=1, p=4, rescale=False)
test_model.distribution = ARCHNormal(seed=seed)
sims = test_model.forecast(params=[0,0,0,1,0], horizon=1, start=0, method='simulation', simulations=1, x=None, align='origin')
series = sims.simulations.values.squeeze()
vars = sims.variance.values.squeeze()
print(vars)

[24.03524054 24.03524054  0.          1.          4.          9.
 16.         25.         36.         49.        ]


In [13]:
# coefficient is one only for t-4 residual
test_model = arch.arch_model(y=test_df['y'], x=None, mean='Zero', vol='ARCH', lags=1, p=4, rescale=False)
test_model.distribution = ARCHNormal(seed=seed)
sims = test_model.forecast(params=[0,0,0,0,1], horizon=1, start=0, method='simulation', simulations=1, x=None, align='origin')
series = sims.simulations.values.squeeze()
vars = sims.variance.values.squeeze()
print(vars)

[24.03524054 24.03524054 24.03524054  0.          1.          4.
  9.         16.         25.         36.        ]


# ARX model for mean with GARCH for volatility

In [14]:
# create artificial data to test the arch.arch_model.forecast function
# data will contain integer values so that it is easier to compare the results
# test_df = pd.DataFrame({'y':np.arange(10)*0.2, 'x':np.arange(10)})
test_df = pd.DataFrame({'y':np.array([0,1,1,2,3,5,8,13,21,34,55]).astype(float), 'x':np.arange(11).astype(float)})
y = test_df['y'].values
x = test_df.x.values
test_df

Unnamed: 0,y,x
0,0.0,0.0
1,1.0,1.0
2,1.0,2.0
3,2.0,3.0
4,3.0,4.0
5,5.0,5.0
6,8.0,6.0
7,13.0,7.0
8,21.0,8.0
9,34.0,9.0


## Example 7

In [15]:
large_df = pd.DataFrame({'y':np.arange(100).astype(float)})
test_model = arch.arch_model(y=large_df['y'], x=None, mean='ARX', vol='GARCH', lags=10, p=10, q=0, rescale=False)
param_res = test_model.fit()
param_res.params[:] = 0.
param_res.params[1] = 1.
param_res.params[12] = 1.
param_res.params

Iteration:      1,   Func. Count:     23,   Neg. LLF: -2656.8083929322147
Inequality constraints incompatible    (Exit mode 4)
            Current function value: -2656.808392273039
            Iterations: 1
            Function evaluations: 23
            Gradient evaluations: 1


Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.



Const        0.0
y[1]         1.0
y[2]         0.0
y[3]         0.0
y[4]         0.0
y[5]         0.0
y[6]         0.0
y[7]         0.0
y[8]         0.0
y[9]         0.0
y[10]        0.0
omega        0.0
alpha[1]     1.0
alpha[2]     0.0
alpha[3]     0.0
alpha[4]     0.0
alpha[5]     0.0
alpha[6]     0.0
alpha[7]     0.0
alpha[8]     0.0
alpha[9]     0.0
alpha[10]    0.0
Name: params, dtype: float64

**Need to pad the df with values else the test_model.fix() will fail due to insufficient data.**

In [16]:
temp_df = pd.concat([large_df, test_df], ignore_index=True)
temp_df

Unnamed: 0,y,x
0,0.0,
1,1.0,
2,2.0,
3,3.0,
4,4.0,
...,...,...
106,8.0,6.0
107,13.0,7.0
108,21.0,8.0
109,34.0,9.0


Let time $t$ be the last available observation.  
$E[y_{t+1}] = y_t = 55.$  
$\text{VAR}[y_{t+1}] = \epsilon_{t}^2 = (y_{t} - E[y_{t}])^2 = (55 - 34)^2 = 441$  
$E[y_{t+2}] = y_{t+1} = $ values[0]  
$\text{VAR}[y_{t+2}] = \epsilon_{t+1}^2 = (y_{t+1} - E[y_{t+1}])^2 = (\text{values[0]} - 55)^2$  
$E[y_{t+3}] = y_{t+2} = $ values[1]  
$\text{VAR}[y_{t+3}] = \epsilon_{t+2}^2 = (y_{t+2} - E[y_{t+2}])^2 = (\text{values[1]} - \text{values[0]})^2$  
$E[y_{t+4}] = y_{t+3} = $ values[2]  
$\text{VAR}[y_{t+4}] = \epsilon_{t+3}^2 = (y_{t+3} - E[y_{t+3}])^2 = (\text{values[2]} - \text{values[1]})^2$  

<span style='color:red'>**The sims.variance and sims.means values here are not correct beyond the first value. Do not use them.**</span>

We can calculate the effective variance of the residuals using the normal_rvs values derived from the seed in the first few cells:  
$y_{t+1} = \sigma_{t+1} z_{t+1} + E[y_{t+1}]$  

Therefore

$\sigma_{t+1} = \frac{(y_{t+1} - E[y_{t+1}])}{z_{t+1}} = \frac{\epsilon_{t+1}}{z_{t+1}}$  

$\sigma_{t+1}^2 = (\frac{\epsilon_{t+1}}{z_{t+1}})^2$  



In [17]:
np.set_printoptions(linewidth=100, suppress=True, precision=3)
test_model = arch.arch_model(y=temp_df['y'], mean='ARX', vol='GARCH', lags=10, p=10, q=0, rescale=False)
test_model.distribution = ARCHNormal(seed=seed)
test_res = test_model.fix(param_res.params)
sims = test_res.forecast(horizon=3, method='simulation', simulations=1, x=None, align='origin')
values = sims.simulations.values.squeeze()
means = sims.mean.values.squeeze()
vars = sims.variance.values.squeeze()
residuals = sims._sim.residuals.squeeze()
print(f'y: {y}')
print('-----Means-----')
print(f'means: {means}')
print(f'y_t+1: {y[-1]}')
print(f'y_t+2: {values[0]}')
print(f'y_t+3: {values[1]}')
print('-----Residuals and variances-----')
print(f'values: {values}')
print(f'residuals: {residuals}')
print(f'eps_t+1: {values[0] - 55}')
print(f'eps_t+2: {values[1] - values[0]}')
print(f'eps_t+3: {values[2] - values[1]}')
print(f'vars: {vars}')
print(f'residuals**2: {residuals**2}')
print(f'vars based on residuals / normal_rvs: {(residuals / normal_rvs[:len(residuals)])**2}')
print(f'(y_t - E[y_t])^2: {(y[-1] - y[-2])**2}')
print(f'(y_t+1 - E[y_t+1])^2: {(values[0] - 55)**2}')
print(f'(y_t+2 - E[y_t+2])^2: {(values[1] - values[0])**2}')

y: [ 0.  1.  1.  2.  3.  5.  8. 13. 21. 34. 55.]
-----Means-----
means: [55. 55. 55.]
y_t+1: 55.0
y_t+2: 61.39905867484306
y_t+3: 54.74413935810591
-----Residuals and variances-----
values: [61.399 54.744 59.738]
residuals: [ 6.399 -6.655  4.994]
eps_t+1: 6.399058674843062
eps_t+2: -6.65491931673715
eps_t+3: 4.994192159240882
vars: [441.    481.948 526.236]
residuals**2: [40.948 44.288 24.942]
vars based on residuals / normal_rvs: [441.     40.948  44.288]
(y_t - E[y_t])^2: 441.0
(y_t+1 - E[y_t+1])^2: 40.94795192408424
(y_t+2 - E[y_t+2])^2: 44.287951112281256


## Adding exogenuous variable to mean equation

lag variable controls number of autoregressive y terms to predict mean while p is the number of residual lag terms to predict residual variance

In [18]:
# only one lag on residual with vol='ARCH'
test_model = arch.arch_model(y=test_df['y'], x=test_df['x'], mean='ARX', vol='GARCH', lags=1, p=1, q=0, rescale=False)
res = test_model.fit()
res.summary()

Iteration:      1,   Func. Count:      7,   Neg. LLF: 91.61411266529774
Iteration:      2,   Func. Count:     19,   Neg. LLF: 50.75017327614292
Iteration:      3,   Func. Count:     28,   Neg. LLF: 36.44096870845732
Iteration:      4,   Func. Count:     37,   Neg. LLF: 365.57387348210733
Iteration:      5,   Func. Count:     44,   Neg. LLF: 6.151970278719317
Iteration:      6,   Func. Count:     51,   Neg. LLF: 6.43027931491611
Iteration:      7,   Func. Count:     58,   Neg. LLF: 6.905866135869735
Iteration:      8,   Func. Count:     65,   Neg. LLF: 7.741695093707387
Iteration:      9,   Func. Count:     72,   Neg. LLF: 1322.8599913432877
Iteration:     10,   Func. Count:     83,   Neg. LLF: 2040.7559862722892
Iteration:     11,   Func. Count:     93,   Neg. LLF: 240.66942940093227
Iteration:     12,   Func. Count:    101,   Neg. LLF: -3.4950800976081555
Iteration:     13,   Func. Count:    107,   Neg. LLF: 0.3790684524240998
Iteration:     14,   Func. Count:    118,   Neg. LLF: 4298

0,1,2,3
Dep. Variable:,y,R-squared:,0.999
Mean Model:,AR-X,Adj. R-squared:,0.999
Vol Model:,ARCH,Log-Likelihood:,4.62669
Distribution:,Normal,AIC:,0.746623
Method:,Maximum Likelihood,BIC:,2.25955
,,No. Observations:,10.0
Date:,"Thu, Oct 31 2024",Df Residuals:,7.0
Time:,07:10:31,Df Model:,3.0

0,1,2,3,4,5
,coef,std err,t,P>|t|,95.0% Conf. Int.
Const,0.0685,4.833e-02,1.417,0.156,"[-2.624e-02, 0.163]"
y[1],1.6187,5.025e-04,3221.476,0.000,"[ 1.618, 1.620]"
x,-9.3301e-03,6.372e-03,-1.464,0.143,"[-2.182e-02,3.159e-03]"

0,1,2,3,4,5
,coef,std err,t,P>|t|,95.0% Conf. Int.
omega,1.4182e-09,3.217e-04,4.408e-06,1.000,"[-6.306e-04,6.306e-04]"
alpha[1],0.8825,0.465,1.897,5.787e-02,"[-2.945e-02, 1.794]"


## Example 8
start=0 results in first variance of residuals to be nan as mean cannot be computed with missing autoregressive y value when lag>0

In [19]:
y_coeff = 0 # auto-regressive coefficient for mean equation
x_coeff = 0 # coefficient for exogenous variable for mean equation
res_coeff = 2 # coefficient for residual variance equation
test_model.distribution = ARCHNormal(seed=seed)
test_res = test_model.fix([0,y_coeff,x_coeff,0,res_coeff])
sims = test_res.forecast(horizon=1, start=0, method='simulation', simulations=1, x=x[:,np.newaxis], align='origin')
means = sims.mean.values.squeeze()
vars = sims.variance.values.squeeze()
print(means, vars)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [  nan    2.    2.    8.   18.   50.  128.  338.  882. 2312. 6050.]


## Example 9
start=1 results in first mean and variance of residuals to be computed

In [20]:
y_coeff = 0 # auto-regressive coefficient for mean equation
x_coeff = 0 # coefficient for exogenous variable for mean equation
res_coeff = 2 # coefficient for residual variance equation
test_model.distribution = ARCHNormal(seed=seed)
test_res = test_model.fix([0,y_coeff,x_coeff,0,res_coeff])
sims = test_res.forecast(horizon=1, start=1, method='simulation', simulations=1, x=x[:,np.newaxis], align='origin')
means = sims.mean.values.squeeze()
vars = sims.variance.values.squeeze()
print(f'y: {y}')
print(f'means: {means}')
print(f'vars: {vars}')
print(f'res_coeff*y[1:]**2: {res_coeff*y[1:]**2}')
np.allclose(vars, res_coeff*y[1:]**2)

y: [ 0.  1.  1.  2.  3.  5.  8. 13. 21. 34. 55.]
means: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
vars: [   2.    2.    8.   18.   50.  128.  338.  882. 2312. 6050.]
res_coeff*y[1:]**2: [   2.    2.    8.   18.   50.  128.  338.  882. 2312. 6050.]


True

## Example 10

**Testing the alignment of x and y variables when fitting vs forecasting with start not None which produces forecasts 1 time step ahead of each y value**

means[i] = $E[\hat{y}_{t+1}] = \phi y_{t} + \beta x_{t}$ so that we get true out of sample forecasted means where $x$ at time $t$ is used  

However, for the variance of the out of sample forecasted values e.g. $\hat{y}_{t+1}$, the in sample residuals are used to compute the variance and the in sample residuals are calculated as $y_t - (y_{t-1} + x_{t})$    

So vars[i] = VAR[$\hat{y}_{t+1}$] $= \alpha \epsilon_{t}^2 = \alpha (y_{t} - E[y_{t}])^2 = \alpha (y_{t} - (y_{t-1} + x_{t}))^2$

In other words, the $x_t$ is used both for the in sample residual at time $t$ and also for the out of sample forecasted mean at time $t+1$ if we do not use the shift in the $x$ variable

See [github issue](https://github.com/bashtage/arch/issues/749) and [docs](https://bashtage.github.io/arch/devel//univariate/univariate_forecasting_with_exogenous_variables.html#Conditional-Mean-Alignment-vs.-Forecast-Alignment)

In [112]:
y_coeff = 1.0
x_coeff = 1.0
res_coeff = 1.0
start = 0

test_df = pd.DataFrame({'y':np.array([0,7,2,5,9,3,8]).astype(float), 'x':np.array([1,2,4,2,1,9,6]).astype(float)})
y = test_df['y'].values
x = test_df['x'].values

test_model = arch.arch_model(y=test_df['y'], x=test_df['x'], mean='ARX', vol='GARCH', lags=1, p=1, q=0, rescale=False)
test_model.distribution = ARCHNormal(seed=seed)
test_res = test_model.fix([0., y_coeff, x_coeff, 0., res_coeff])
sims = test_res.forecast(horizon=1, start=start, method='simulation', simulations=1, x=x[:,np.newaxis], align='origin')

cal_means = y_coeff*y[:-1] + x_coeff*x[1:]
eps = y[1:] - cal_means
sim_eps = values - means

print(f'y:                          {y[start:]} with length {len(y[start:])}')
print(f'x:                          {x[start:]} with length {len(x[start:])}')

print('\n-----Expected means and residuals for fitting-----\n')
print(f'y[:-1] + x[1:]:             {cal_means} with length {len(cal_means)}')
print(f'y[1:] - (y[:-1] + x[1:]):   {eps} with length {len(eps)}')
print(f'test_res._resid:            {test_res._resid} with length {len(test_res._resid)}')
print('test_res._resid are in sample residuals calculated as y_t - (y_{t-1} + x_{t})')

print('\n-----Expected means for forecasting-----\n')
print(f'y + x:                                      {y + x} with length {len(y + x)}')
print(f'sims.mean:                                  {sims.mean.values.squeeze()}')
print(f'sims.simulations.values - sims.mean:        {sims.simulations.values.squeeze() - sims.mean.values.squeeze()}')
print(f'sims._sim.residuals:                        {sims._sim.residuals.squeeze()}')
print(f'sims._sim.residuals are out of sample residuals')

print('\n-----Expected variances-----\n')
print(f'test_res._resid^2:              {test_res._resid**2}')
print(f'sims._sim.residuals/normal_rvs: {(sims._sim.residuals.squeeze()[1:] / normal_rvs[:len(sims._sim.residuals.squeeze())-1])**2}')
print(f'sims.variance:                  {sims.variance.values.squeeze()}')
print('The variance of the out of sample forecast are based on the in sample residuals [y_t - (y_{t-1} + x_{t})]^2')


y:                          [0. 7. 2. 5. 9. 3. 8.] with length 7
x:                          [1. 2. 4. 2. 1. 9. 6.] with length 7

-----Expected means and residuals for fitting-----

y[:-1] + x[1:]:             [ 2. 11.  4.  6. 18.  9.] with length 6
y[1:] - (y[:-1] + x[1:]):   [  5.  -9.   1.   3. -15.  -1.] with length 6
test_res._resid:            [  5.  -9.   1.   3. -15.  -1.] with length 6
test_res._resid are in sample residuals calculated as y_t - (y_{t-1} + x_{t})

-----Expected means for forecasting-----

y + x:                                      [ 1.  9.  6.  7. 10. 12. 14.] with length 7
sims.mean:                                  [ 1.  9.  6.  7. 10. 12. 14.]
sims.simulations.values - sims.mean:        [    nan   1.524  -9.36    0.75    2.822 -29.266  -1.302]
sims._sim.residuals:                        [    nan   1.524  -9.36    0.75    2.822 -29.266  -1.302]
sims._sim.residuals are out of sample residuals

-----Expected variances-----

test_res._resid^2:              [ 2

## Example 11

**Using start=None i.e. forecast starts from the last available observation**

<span style='color:red'>**The sims.variance and sims.means values here are not correct beyond the first value. Do not use them.**</span>

In [47]:
test_df = pd.DataFrame({'y':np.array([0,1,1,2,3,5,8,13,21,34,55]).astype(float), 'x':np.arange(11).astype(float)})
y = test_df['y'].values
x = test_df.x.values
test_df

Unnamed: 0,y,x
0,0.0,0.0
1,1.0,1.0
2,1.0,2.0
3,2.0,3.0
4,3.0,4.0
5,5.0,5.0
6,8.0,6.0
7,13.0,7.0
8,21.0,8.0
9,34.0,9.0


In [24]:
# only one lag on residual with vol='ARCH'
test_model = arch.arch_model(y=test_df['y'], x=test_df['x'], mean='ARX', vol='GARCH', lags=1, p=1, q=0, rescale=False)
res = test_model.fit()
res.summary()

Iteration:      1,   Func. Count:      7,   Neg. LLF: 91.61411266529774
Iteration:      2,   Func. Count:     19,   Neg. LLF: 50.75017327614292
Iteration:      3,   Func. Count:     28,   Neg. LLF: 36.44096870845732
Iteration:      4,   Func. Count:     37,   Neg. LLF: 365.57387348210733
Iteration:      5,   Func. Count:     44,   Neg. LLF: 6.151970278719317
Iteration:      6,   Func. Count:     51,   Neg. LLF: 6.43027931491611
Iteration:      7,   Func. Count:     58,   Neg. LLF: 6.905866135869735
Iteration:      8,   Func. Count:     65,   Neg. LLF: 7.741695093707387
Iteration:      9,   Func. Count:     72,   Neg. LLF: 1322.8599913432877
Iteration:     10,   Func. Count:     83,   Neg. LLF: 2040.7559862722892
Iteration:     11,   Func. Count:     93,   Neg. LLF: 240.66942940093227
Iteration:     12,   Func. Count:    101,   Neg. LLF: -3.4950800976081555
Iteration:     13,   Func. Count:    107,   Neg. LLF: 0.3790684524240998
Iteration:     14,   Func. Count:    118,   Neg. LLF: 4298

0,1,2,3
Dep. Variable:,y,R-squared:,0.999
Mean Model:,AR-X,Adj. R-squared:,0.999
Vol Model:,ARCH,Log-Likelihood:,4.62669
Distribution:,Normal,AIC:,0.746623
Method:,Maximum Likelihood,BIC:,2.25955
,,No. Observations:,10.0
Date:,"Wed, Oct 30 2024",Df Residuals:,7.0
Time:,14:26:31,Df Model:,3.0

0,1,2,3,4,5
,coef,std err,t,P>|t|,95.0% Conf. Int.
Const,0.0685,4.833e-02,1.417,0.156,"[-2.624e-02, 0.163]"
y[1],1.6187,5.025e-04,3221.476,0.000,"[ 1.618, 1.620]"
x,-9.3301e-03,6.372e-03,-1.464,0.143,"[-2.182e-02,3.159e-03]"

0,1,2,3,4,5
,coef,std err,t,P>|t|,95.0% Conf. Int.
omega,1.4182e-09,3.217e-04,4.408e-06,1.000,"[-6.306e-04,6.306e-04]"
alpha[1],0.8825,0.465,1.897,5.787e-02,"[-2.945e-02, 1.794]"


In [25]:
y_coeff = 0.3
x_coeff = 0.7
res_coeff = 0.5

x_forecast = [10,20,30,2,3]
test_model.distribution = ARCHNormal(seed=seed)
test_res = test_model.fix([0., y_coeff, x_coeff, 0., res_coeff])
sims = test_res.forecast(horizon=5, method='simulation', simulations=1, x=x_forecast, align='origin')
values = sims.simulations.values.squeeze()
residuals = sims._sim.residuals.squeeze()
means = sims.mean.values.squeeze()
vars = sims.variance.values.squeeze()

values = sims.simulations.values.squeeze()
print(f'y:                     {y[start:]} with length {len(y[start:])}')
print(f'x:                     {x[start:]} with length {len(x[start:])}')
print(f'values:                {values} with length {len(values)}')
print('-----Means-----')
print(f'means:                 {means} with length {len(means)}')
print(f'means from residuals:  {values - residuals}')
print(f'E[y_t+1] =             {y_coeff*y[-1] + x_coeff*x_forecast[0]}')
print(f'E[y_t+2] =             {y_coeff*values[0] + x_coeff*x_forecast[1]}')
print(f'E[y_t+3] =             {y_coeff*values[1] + x_coeff*x_forecast[2]}')
print(f'E[y_t+4] =             {y_coeff*values[2] + x_coeff*x_forecast[3]}')
print(f'E[y_t+5] =             {y_coeff*values[3] + x_coeff*x_forecast[4]}')
print('-----Residuals and variances-----')
print(f'residuals:             {residuals} with length {len(residuals)}')
print(f'eps_t+1:               {values[0] - (y_coeff*y[-1] + x_coeff*x_forecast[0])}')
print(f'eps_t+2:               {values[1] - (y_coeff*values[0] + x_coeff*x_forecast[1])}')
print(f'eps_t+3:               {values[2] - (y_coeff*values[1] + x_coeff*x_forecast[2])}')
print(f'eps_t+4:               {values[3] - (y_coeff*values[2] + x_coeff*x_forecast[3])}')
print(f'eps_t+5:               {values[4] - (y_coeff*values[3] + x_coeff*x_forecast[4])}')
print(f'vars:                  {vars}')
print(f'var from residuals:    {res_coeff*residuals**2}')
print(f'residuals/normal_rvs:  {(residuals / normal_rvs[:len(residuals)])**2}')
print(f'(y_t - E[y_t])^2:      {res_coeff*(y[-1] - (y_coeff*y[-2] + x_coeff*x[-1]))**2}')
print(f'(y_t+1 - E[y_t+1])^2:  {res_coeff*(values[0] - (y_coeff*y[-1] + x_coeff*x_forecast[0]))**2}')
print(f'(y_t+2 - E[y_t+2])^2:  {res_coeff*(values[1] - (y_coeff*values[0] + x_coeff*x_forecast[1]))**2}')
print(f'(y_t+3 - E[y_t+3])^2:  {res_coeff*(values[2] - (y_coeff*values[1] + x_coeff*x_forecast[2]))**2}')
print(f'(y_t+4 - E[y_t+4])^2:  {res_coeff*(values[3] - (y_coeff*values[2] + x_coeff*x_forecast[3]))**2}')
print(f'(y_t+5 - E[y_t+5])^2:  {res_coeff*(values[4] - (y_coeff*values[3] + x_coeff*x_forecast[4]))**2}')


y:                     [ 1.  1.  2.  3.  5.  8. 13. 21. 34. 55.] with length 10
x:                     [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.] with length 10
values:                [31.645 17.504 29.429 12.343  2.887] with length 5
-----Means-----
means:                 [23.5   21.05  27.315  9.595  4.978] with length 5
means from residuals:  [23.5   23.493 26.251 10.229  5.803]
E[y_t+1] =             23.5
E[y_t+2] =             23.49340160238375
E[y_t+3] =             26.251192265196096
E[y_t+4] =             10.228843007992765
E[y_t+5] =             5.8027966281988785
-----Residuals and variances-----
residuals:             [ 8.145 -5.989  3.178  2.114 -2.916] with length 5
eps_t+1:               8.144672007945832
eps_t+2:               -5.989427385063433
eps_t+3:               3.1782844281131233
eps_t+4:               2.1138124193368313
eps_t+5:               -2.9161949242087943
vars:                  [714.42   97.466  26.709   7.455   2.905]
var from residuals:    [33.168 17.937 