In [2]:
import torch
import numpy as np
import yfinance as yf
import statsmodels as sm
import pandas as pd
from pytorch_forecasting import TimeSeriesDataSet
from torch.utils.data import random_split
from statsmodels.tsa.stattools import adfuller
import pytorch_lightning as pl 
from arch import arch_model

In [3]:
tickers = ['^GSPC', '^DJI', 'NQ=F', 'EURUSD=X', 'GC=F']
data = yf.download(tickers, start="2015-01-01", end="2025-01-01", group_by='ticker')

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  5 of 5 completed


In [4]:
closing_price = pd.DataFrame()

In [5]:
for ticker in tickers:
    closing_price[ticker] = data[ticker]['Close']

In [6]:
closing_price

Unnamed: 0_level_0,^GSPC,^DJI,NQ=F,EURUSD=X,GC=F
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-01,,,,1.209863,
2015-01-02,2058.199951,17832.990234,4214.25,1.208941,1186.000000
2015-01-05,2020.579956,17501.650391,4161.75,1.194643,1203.900024
2015-01-06,2002.609985,17371.640625,4102.25,1.193902,1219.300049
2015-01-07,2025.900024,17584.519531,4151.50,1.187536,1210.599976
...,...,...,...,...,...
2024-12-25,,,,1.040258,
2024-12-26,6037.589844,43325.800781,22008.00,1.039955,2638.800049
2024-12-27,5970.839844,42992.210938,21698.50,1.042318,2617.199951
2024-12-30,5906.939941,42573.730469,21416.25,1.042938,2606.100098


In [7]:
closing_price.dropna(inplace=True)

In [8]:
log_returns = np.log(closing_price/closing_price.shift(-1))

In [9]:
log_returns.dropna(inplace=True)

In [10]:
log_returns

Unnamed: 0_level_0,^GSPC,^DJI,NQ=F,EURUSD=X,GC=F
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-02,0.018447,0.018755,0.012536,0.011897,-0.014980
2015-01-05,0.008933,0.007456,0.014400,0.000621,-0.012711
2015-01-06,-0.011563,-0.012180,-0.011934,0.005346,0.007161
2015-01-07,-0.017730,-0.018221,-0.019264,0.003320,0.001819
2015-01-08,0.008439,0.009567,0.007471,0.003379,-0.006270
...,...,...,...,...,...
2024-12-23,-0.010982,-0.009050,-0.012574,0.002615,-0.002943
2024-12-24,0.000406,-0.000664,0.000931,0.000603,-0.007150
2024-12-26,0.011117,0.007729,0.014163,-0.002270,0.008219
2024-12-27,0.010760,0.009782,0.013093,-0.000594,0.004250


In [11]:
def getVariance(windowsize):
    volatility = ((log_returns**2).rolling(windowsize).sum())
    return volatility

In [12]:
volatility = getVariance(5)

In [13]:
volatility

Unnamed: 0_level_0,^GSPC,^DJI,NQ=F,EURUSD=X,GC=F
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-02,,,,,
2015-01-05,,,,,
2015-01-06,,,,,
2015-01-07,,,,,
2015-01-08,0.000939,0.000979,0.000934,0.000193,0.000480
...,...,...,...,...,...
2024-12-23,0.001187,0.000907,0.003024,0.000249,0.000539
2024-12-24,0.000291,0.000222,0.001668,0.000245,0.000582
2024-12-26,0.000414,0.000281,0.001844,0.000057,0.000362
2024-12-27,0.000413,0.000240,0.001954,0.000056,0.000185


In [14]:
data = []
for i, ticker in enumerate(tickers):
    temp_df = pd.DataFrame({
        "time_idx": np.arange(len(log_returns)),  
        "group": i,  
        "log_returns": log_returns[ticker].values,  
        "volatility": volatility[ticker].values,  
    })
    data.append(temp_df)


data = pd.concat(data)


In [15]:
data.fillna(0,inplace=True)

In [16]:
data.isnull().sum()

time_idx       0
group          0
log_returns    0
volatility     0
dtype: int64

In [17]:
data = data.reset_index(drop=True)

In [18]:
data['log_returns'] = data['log_returns'] * 100
data['volatility'] = data['volatility'] * 100

In [19]:
data

Unnamed: 0,time_idx,group,log_returns,volatility
0,0,0,1.844721,0.000000
1,1,0,0.893325,0.000000
2,2,0,-1.156274,0.000000
3,3,0,-1.773017,0.000000
4,4,0,0.843932,0.093938
...,...,...,...,...
12540,2504,4,-0.294324,0.053944
12541,2505,4,-0.714997,0.058161
12542,2506,4,0.821926,0.036202
12543,2507,4,0.425014,0.018457


In [20]:
data_len =data.time_idx.max()

In [21]:
training_dataset = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= data_len*0.8],
    group_ids=["group"],  
    target="volatility",   
    time_idx="time_idx",  
    max_encoder_length=5,  
    max_prediction_length=1,  
    time_varying_known_reals=["log_returns"], 
    time_varying_unknown_reals=["volatility"],
    target_normalizer=None,
)

In [22]:
training_dataset

TimeSeriesDataSet[length=10010](
	time_idx='time_idx',
	target='volatility',
	group_ids=['group'],
	weight=None,
	max_encoder_length=5,
	min_encoder_length=5,
	min_prediction_idx=0,
	min_prediction_length=1,
	max_prediction_length=1,
	static_categoricals=[],
	static_reals=[],
	time_varying_known_categoricals=[],
	time_varying_known_reals=['log_returns'],
	time_varying_unknown_categoricals=[],
	time_varying_unknown_reals=['volatility'],
	variable_groups={},
	constant_fill_strategy={},
	allow_missing_timesteps=False,
	lags={},
	add_relative_time_idx=False,
	add_target_scales=False,
	add_encoder_length=False,
	target_normalizer=TorchNormalizer(method='identity', center=True, transformation=None, method_kwargs={}),
	categorical_encoders={'__group_id__group': NaNLabelEncoder(add_nan=False, warn=True)},
	scalers={'log_returns': StandardScaler()},
	randomize_length=None,
	predict_mode=False
)

In [23]:
training_dataset = TimeSeriesDataSet(
    data[lambda x: (x.time_idx > data_len * 0.8) & (x.time_idx <= data_len *0.9)],
    group_ids=["group"],  
    target="volatility",   
    time_idx="time_idx",  
    max_encoder_length=5,  
    max_prediction_length=1,  
    time_varying_known_reals=["log_returns"], 
    time_varying_unknown_reals=["volatility"],
    target_normalizer=None,
)

In [24]:
validation_dataset = TimeSeriesDataSet(
    data[lambda x: x.time_idx>= data_len*0.9],
    group_ids=["group"],  
    target="volatility",   
    time_idx="time_idx",  
    max_encoder_length=5,  
    max_prediction_length=1,  
    time_varying_known_reals=["log_returns"], 
    time_varying_unknown_reals=["volatility"],
    target_normalizer=None,
)

In [25]:
summary_statistics = []

for group, group_data in data.groupby('group'):
    group_length = group_data['log_returns'].count()
    mean_volatility = group_data['log_returns'].mean()
    sd_volatility = group_data['log_returns'].std()

    adf_res = adfuller(group_data['log_returns'].dropna())
    adf_statistic = adf_res[0]
    adf_pvalue = adf_res[1]

    summary_statistics.append({
        'Ticker': tickers[group],
        'Length': group_length,
        'Mean': mean_volatility,
        'SD': sd_volatility,
        'ADF': adf_statistic,
        'p-value': adf_pvalue
    })

summary_statisticsdf = pd.DataFrame(summary_statistics)

In [26]:
summary_statisticsdf

Unnamed: 0,Ticker,Length,Mean,SD,ADF,p-value
0,^GSPC,2509,-0.041849,1.1284,-15.694171,1.421814e-28
1,^DJI,2509,-0.034655,1.110641,-15.728378,1.298412e-28
2,NQ=F,2509,-0.064439,1.382714,-16.34945,2.942271e-29
3,EURUSD=X,2509,0.005976,0.507394,-21.836893,0.0
4,GC=F,2509,-0.03173,0.925881,-51.323933,0.0


In [27]:
# ADF statistic << p-value -> time series are non-stationary

In [None]:
def GARCH_model(data, window_len, pred_len):
    