In [1]:
# Bootstrap cell: get a fresh checkout of the thesis repo and move into its src/ directory.
# Always start from /content (unconditionally), so that re-running this cell after a
# previous `%cd bsc-thesis/src` does not clone into a nested directory.
%cd /content
import os
if os.path.exists("bsc-thesis"):
  # a checkout from an earlier run exists; remove it completely so the clone below starts clean
  !rm -rf bsc-thesis

# clone the branch named in `branch` ($branch is interpolated by IPython into the shell command)
branch = "main"
!git clone --branch $branch https://github.com/maviddoerdijk/bsc-thesis.git

# move into the project source dir so that the project-local imports (models, preprocessing, ...) resolve,
# then list its contents as a quick sanity check
%cd bsc-thesis/src
%ls

/content
Cloning into 'bsc-thesis'...
remote: Enumerating objects: 1231, done.[K
remote: Counting objects: 100% (154/154), done.[K
remote: Compressing objects: 100% (92/92), done.[K
remote: Total 1231 (delta 106), reused 89 (delta 62), pack-reused 1077 (from 1)[K
Receiving objects: 100% (1231/1231), 31.14 MiB | 15.92 MiB/s, done.
Resolving deltas: 100% (763/763), done.
Filtering content: 100% (33/33), 1.75 GiB | 65.95 MiB/s, done.
/content/bsc-thesis/src
[0m[01;34mbacktesting[0m/  [01;34mdata[0m/      main.ipynb  [01;34mmodels[0m/         [01;34mutils[0m/
[01;34mconfig[0m/       [01;34mexternal[0m/  main.py     [01;34mpreprocessing[0m/


In [2]:
!pip install numpy==1.26.3 # necessary for bug fix
!pip install peft==0.10.0
!pip install pykalman
!pip install ta
!pip install scikit-optimize

## specific packages for time moe
# need a different version of accelerate because of bug "ImportError: cannot import name 'clear_device_cache' from 'accelerate.utils.memory'"
!pip install -U accelerate==0.32.0 # standard google colab version is 1.6.0 (apr 1, 2025), but for stability, we use time moe's 0.28.0 (mar 12, 2024)
!pip install transformers==4.40.1 # standard google colab version is 4.51.3, but time moe repo requirements mention/prefer 4.40.1 for stability
!pip install datasets==2.18.0

Collecting peft==0.10.0
  Using cached peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft==0.10.0)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.13.0->peft==0.10.0)
  Using cached nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Using cached peft-0.10.0-py3-none-any.whl (199 kB)
Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)
Using cached nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)
Installing collected packages: nvidia-cudnn-cu12, nvidia-cusolver-cu12, peft
  Attempting uninstall: nvidia-cudnn-cu12
    Found existing installation: nvidia-cudnn-cu12 9.3.0.75
    Uninstalling nvidia-cudnn-cu12-9.3.0.75:
      Successfully uninstalled nvidia-cudnn-cu12-9.3.0.75
  Attempting uninstall: nvidia-cusolver-cu12
    Found existing ins

In [7]:
# bunch of the initialization code #

### RESULTS IMPORTS ###
# Module imports
import pandas as pd
import numpy as np
from typing import Optional, Callable, Dict, Any, Sequence
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm # note: using tqdm.auto usually automatically chooses the right import based on whether you're in CLI, notebook or somewhere else
import torch.nn as nn
import itertools
from pykalman import KalmanFilter
import ast
import re
from tabulate import tabulate
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoConfig
from torch.utils.data import DataLoader, TensorDataset

# Custom Imports
from models.statistical_models import kalman_filter_average, kalman_filter_regression
from models.transformer_model import TimeSeriesTransformerv1, get_cosine_schedule_with_warmup_and_min_lr
from preprocessing.cointegration import find_cointegrated_pairs
from preprocessing.data_preprocessing import filter_pairs_data
from preprocessing.technical_indicators import combine_pairs_data
from preprocessing.filters import step_1_filter_remove_nans, step_2_filter_liquidity
from backtesting.trading_strategy import trade, get_gt_yoy_returns_test_dev
from backtesting.utils import calculate_return_uncertainty
from utils.visualization import plot_return_uncertainty, plot_comparison
from utils.helpers import _get_train_dev_frac
from external.time_moe_repo.training_wrapper import train_time_moe
from backtesting.trading_strategy import get_gt_yoy_returns_test_dev
from backtesting.utils import calculate_return_uncertainty

## semi-custom
from external.time_moe_repo.time_moe.models.modeling_time_moe import TimeMoeForPrediction

# important for time moe
# Side effect inside the import cell: wandb.login() is executed every time this cell runs.
# NOTE(review): presumably this prompts for (or reads) a Weights & Biases API key - confirm the key
# is supplied interactively or via the environment and is never hard-coded in the notebook.
import wandb
wandb.login()

## workflow imports
from models.statistical_models import execute_kalman_workflow
from models.transformer_model import execute_transformer_workflow
from models.time_moe_model import execute_timemoe_workflow

## specific caching imports (should be changed in case you want to gather data live)
from data.scraper import load_cached_etf_tickers
from data.data_collection_cache import gather_data_cached, _get_filename, gather_pairs_data_cached, gather_data_cached_using_truncate

# Any other changes to be made throughout the entire notebook
# Use the 'seaborn-v0_8' matplotlib style sheet for every plot created after this cell.
plt.style.use('seaborn-v0_8')

# Debug toggle: set inspect_func to True to print the source code of execute_kalman_workflow
# as it was actually imported into this kernel.
inspect_func = False
if inspect_func:
  import inspect
  print(inspect.getsource(execute_kalman_workflow)) # in this case, check whether the new trade function is imported
### RESULTS IMPORTS ###


### HYPERPARAM OPTIMIZATION IMPORTS ###
## data gathering imports
from utils.helpers import _get_train_dev_frac
from preprocessing.filters import step_1_filter_remove_nans, step_2_filter_liquidity
from preprocessing.cointegration import find_cointegrated_pairs
from preprocessing.data_preprocessing import filter_pairs_data
from preprocessing.technical_indicators import combine_pairs_data
## specific caching imports (should be changed in case you want to gather data live)
from data.scraper import load_cached_etf_tickers
from data.data_collection_cache import gather_data_cached, gather_data_cached_using_truncate, gather_pairs_data_cached, save_pairs_data_filtered

## workflow imports
from models.statistical_models import execute_kalman_workflow

## optimize-specific imports
from skopt import gp_minimize
from skopt import plots as skplots
from skopt.space import Integer, Real, Categorical
from skopt.utils import use_named_args
import numpy as np
from typing import Callable, Any, List, Dict, Tuple
import time
import random
from sklearn.metrics import mean_squared_error
from functools import partial


from utils.helpers import return_score
from utils.visualization import results_to_latex
from utils.optimization import bayesian_optimize_workflow
### HYPERPARAM OPTIMIZATION IMPORTS ###

# 2. Results

In [8]:
### Unchanged variables ###
# Flags that stay fixed for the whole Results section; both are forwarded as keyword
# arguments to execute_kalman_workflow in the Kalman Filter cell.
verbose = True
return_datasets = True
### Unchanged variables ###

## Kalman Filter

In [9]:
# Kalman-filter results for every evaluation year.
#
# For each end year: load the cached price data, apply the two filter steps, load (or compute
# and cache) the cointegration-filtered pair list, then run the Kalman workflow on the
# top-ranked pairs. Results are collected per year in:
#   results_all_years[end_year] -> list of (pair, cointegration_score, val_mse, test_mse, yoy_str, gt_yoy, returns_score)
#   outputs_all_years[end_year] -> list of the raw dicts returned by execute_kalman_workflow

# Hyperparameters are hard-coded from an earlier hyperparameter optimization run
hyperparam_kwargs = {'delta': 0.014784091621725818, 'obs_cov_reg': 2.5600560664086465, 'trans_cov_avg': 0.09999999999999999, 'obs_cov_avg': 3.4796803986835676}

# Settings shared by all years, named once instead of being repeated as literals in the loop
END_YEARS = [2020, 2021, 2022, 2023, 2024]
CACHE_DIR = '../src/data/cache'
MAX_PAIRS_PER_YEAR = 10  # evaluate at most this many pairs (taken from the front of the filtered list) per year
LOOK_BACK = 20  # forwarded as look_back to get_gt_yoy_returns_test_dev

# Loop-invariant inputs, hoisted so they are not recomputed for every year.
# NOTE(review): assumes load_cached_etf_tickers() returns the same ticker list on every call
# and that the list is not mutated by the data-gathering helpers - confirm.
startDateStr = '2008-01-01'
instrumentIdsNASDAQandNYSE = load_cached_etf_tickers()

results_all_years = {}
outputs_all_years = {}
for end_year in END_YEARS:
  ### Year-specific data ###
  # The data window ends on Dec 31 of end_year; the test period passed to the split helper is end_year itself
  endDateStr = f'{end_year}-12-31'
  startDateStrTest = f'{end_year}-01-01'
  endDateStrTest = f'{end_year}-12-31'
  train_frac, dev_frac = _get_train_dev_frac(startDateStr, endDateStr, startDateStrTest, endDateStrTest)

  data = gather_data_cached_using_truncate(startDateStr, endDateStr, instrumentIdsNASDAQandNYSE, cache_dir=CACHE_DIR)
  data_close_filtered_1, data_open_filtered_1, data_high_filtered_1, data_low_filtered_1, data_vol_filtered_1, data_original_format_filtered_1 = step_1_filter_remove_nans(data['close'], data['open'], data['high'], data['low'], data['vol'], data)
  data_close_filtered_2, data_open_filtered_2, data_high_filtered_2, data_low_filtered_2, data_vol_filtered_2, data_original_format_filtered_2 = step_2_filter_liquidity(data_close_filtered_1, data_open_filtered_1, data_high_filtered_1, data_low_filtered_1, data_vol_filtered_1, data_original_format_filtered_1)

  # Prefer the cached pair list; only fall back to the cointegration search on a cache miss (None)
  pairs_data_filtered = gather_pairs_data_cached(startDateStr, endDateStr, instrumentIdsNASDAQandNYSE, cache_dir=CACHE_DIR)
  if pairs_data_filtered is None:
    scores, pvalues, pairs = find_cointegrated_pairs(data_original_format_filtered_2)
    pairs_data = {key:value[1]  for (key, value) in pairs.items()}
    # sort ascending by the stored per-pair value, so the pairs to evaluate sit at the front of the list
    pairs_data = sorted(pairs_data.items(), key=lambda x: x[1])
    pairs_data_filtered = filter_pairs_data(pairs_data)
    # store the freshly computed list so the next run hits the cache
    save_pairs_data_filtered(pairs_data_filtered, startDateStr, endDateStr, instrumentIdsNASDAQandNYSE, cache_dir=CACHE_DIR)
  ### Year-specific data ###

  # Gather results for current_year
  results_kalman_current_year = []
  all_outputs_kalman_current_year = []
  num_results = min(len(pairs_data_filtered), MAX_PAIRS_PER_YEAR)
  for i in tqdm(range(num_results), desc = "Gathering [...]"):
    # each entry is indexed as ((ticker_a, ticker_b), cointegration_score)
    ticker_a, ticker_b = pairs_data_filtered[i][0][0], pairs_data_filtered[i][0][1]
    pair_tup_str_current = f"({ticker_a},{ticker_b})"
    pairs_timeseries_df = combine_pairs_data(data_close_filtered_2, data_open_filtered_2, data_high_filtered_2, data_low_filtered_2, data_vol_filtered_2, ticker_a, ticker_b)

    # ground-truth year-over-year returns for the test and dev splits
    output_returns = get_gt_yoy_returns_test_dev(pairs_timeseries_df, dev_frac, train_frac, look_back=LOOK_BACK)
    gt_yoy, gt_yoy_for_dev_dataset = output_returns['gt_yoy_test'], output_returns['gt_yoy_dev']

    output_model = execute_kalman_workflow(pairs_timeseries_df, verbose=verbose, pair_tup_str=pair_tup_str_current, train_frac=train_frac, dev_frac=dev_frac, return_datasets=return_datasets, **hyperparam_kwargs)
    yoy_str = f"{output_model['yoy_mean'] * 100:.2f}% +- {output_model['yoy_std'] * 100:.2f}%"
    returns_score = return_score(output_model['yoy_mean'], gt_yoy)
    cointegration_score = pairs_data_filtered[i][1]
    results_kalman_current_year.append((pair_tup_str_current, cointegration_score, output_model['val_mse'], output_model['test_mse'], yoy_str, gt_yoy, returns_score)) # (pair, cointegration_score, val, test, yoy_str, gt_yoy, returns_score)
    all_outputs_kalman_current_year.append(output_model)
  results_all_years[end_year] = results_kalman_current_year
  outputs_all_years[end_year] = all_outputs_kalman_current_year

Gathering [...]:   0%|          | 0/10 [00:00<?, ?it/s]

Split sizes — train: 2769, dev: 251, test: 253

Validation MSE: 3.6243235576283137
Test MSE: 6.248608346968268
YOY Returns: 4.96%
YOY Std: +- 0.35%
GT Yoy: 65.74%
Plot filepath parent dir: data/results
pair_tup_str: (SHV,SMH)
  
Split sizes — train: 2769, dev: 251, test: 253

Validation MSE: 14.379142325299703
Test MSE: 7.3101386444566705
YOY Returns: 3.23%
YOY Std: +- 0.22%
GT Yoy: 31.61%
Plot filepath parent dir: data/results
pair_tup_str: (SHV,ONEQ)
  
Split sizes — train: 2769, dev: 251, test: 253

Validation MSE: 12.98751330204443
Test MSE: 8.910647524611038
YOY Returns: 1.14%
YOY Std: +- 0.06%
GT Yoy: 14.34%
Plot filepath parent dir: data/results
pair_tup_str: (SHV,PHO)
  
Split sizes — train: 2769, dev: 251, test: 253

Validation MSE: 12.110528150339471
Test MSE: 7.176724105279637
YOY Returns: 3.39%
YOY Std: +- 0.39%
GT Yoy: 41.27%
Plot filepath parent dir: data/results
pair_tup_str: (SHV,PDP)
  
Split sizes — train: 2769, dev: 251, test: 253

Validation MSE: 14.020448995193819


Gathering [...]:   0%|          | 0/10 [00:00<?, ?it/s]

Split sizes — train: 3023, dev: 250, test: 252

Validation MSE: 3.276339668783482
Test MSE: 1.2331741475643883
YOY Returns: 0.68%
YOY Std: +- 0.07%
GT Yoy: 0.33%
Plot filepath parent dir: data/results
pair_tup_str: (PFF,EMB)
  
Split sizes — train: 3023, dev: 250, test: 252

Validation MSE: 3.256431723728658
Test MSE: 6.624486722255925
YOY Returns: 0.74%
YOY Std: +- 0.16%
GT Yoy: -0.26%
Plot filepath parent dir: data/results
pair_tup_str: (IFGL,EMB)
  
Split sizes — train: 3023, dev: 250, test: 252

Validation MSE: 1.733148824864574
Test MSE: 4.478434762535083
YOY Returns: 1.17%
YOY Std: +- 0.16%
GT Yoy: 0.29%
Plot filepath parent dir: data/results
pair_tup_str: (IFGL,SHV)
  
Split sizes — train: 3023, dev: 250, test: 252

Validation MSE: 0.9912803315607415
Test MSE: 31.002053380921172
YOY Returns: 0.32%
YOY Std: +- 0.09%
GT Yoy: 0.17%
Plot filepath parent dir: data/results
pair_tup_str: (IGSB,BND)
  
Split sizes — train: 3023, dev: 250, test: 252

Validation MSE: 0.8147759282791353
Te

Gathering [...]:   0%|          | 0/10 [00:00<?, ?it/s]

Split sizes — train: 3274, dev: 250, test: 252

Validation MSE: 0.5062403084695127
Test MSE: 20.345084909459995
YOY Returns: 4.49%
YOY Std: +- 0.77%
GT Yoy: 0.67%
Plot filepath parent dir: data/results
pair_tup_str: (PFF,EMB)
  
Split sizes — train: 3274, dev: 250, test: 252

Validation MSE: 0.8722456379597389
Test MSE: 16.964695926437166
YOY Returns: 0.87%
YOY Std: +- 0.04%
GT Yoy: 0.42%
Plot filepath parent dir: data/results
pair_tup_str: (IFGL,EMB)
  
Split sizes — train: 3274, dev: 250, test: 252

Validation MSE: 0.8776255393343122
Test MSE: 31.513802084091083
YOY Returns: 5.10%
YOY Std: +- 0.26%
GT Yoy: 0.83%
Plot filepath parent dir: data/results
pair_tup_str: (IGSB,BND)
  
Split sizes — train: 3274, dev: 250, test: 252

Validation MSE: 14.206236713368304
Test MSE: 2.4906654682924234
YOY Returns: 0.73%
YOY Std: +- 0.14%
GT Yoy: 1.23%
Plot filepath parent dir: data/results
pair_tup_str: (USIG,IEI)
  
Split sizes — train: 3274, dev: 250, test: 252

Validation MSE: 12.19977655050616

Gathering [...]:   0%|          | 0/10 [00:00<?, ?it/s]

Split sizes — train: 3524, dev: 250, test: 252

Validation MSE: 0.4435697120512965
Test MSE: 2.444152392968234
YOY Returns: 0.26%
YOY Std: +- 0.04%
GT Yoy: 0.15%
Plot filepath parent dir: data/results
pair_tup_str: (PFF,EMB)
  
Split sizes — train: 3524, dev: 250, test: 252

Validation MSE: 2.6791170509341202
Test MSE: 7.371186352659213
YOY Returns: 0.33%
YOY Std: +- 0.07%
GT Yoy: 0.59%
Plot filepath parent dir: data/results
pair_tup_str: (IFGL,EMB)
  
Split sizes — train: 3524, dev: 250, test: 252

Validation MSE: 77.92793095038874
Test MSE: 38.257958657280234
YOY Returns: 0.95%
YOY Std: +- 0.04%
GT Yoy: -100.00%
Plot filepath parent dir: data/results
pair_tup_str: (IGF,DVY)
  
Split sizes — train: 3524, dev: 250, test: 252

Validation MSE: 1.3839556754768192
Test MSE: 56.774184332990856
YOY Returns: 1.34%
YOY Std: +- 0.22%
GT Yoy: 0.51%
Plot filepath parent dir: data/results
pair_tup_str: (IGIB,IEI)
  
Split sizes — train: 3524, dev: 250, test: 252

Validation MSE: 1.0545163146385517

Gathering [...]:   0%|          | 0/10 [00:00<?, ?it/s]

Split sizes — train: 3774, dev: 251, test: 252

Validation MSE: 1.6259023499503462
Test MSE: 47.79868058495247
YOY Returns: 0.77%
YOY Std: +- 0.13%
GT Yoy: 0.11%
Plot filepath parent dir: data/results
pair_tup_str: (PFF,EMB)
  
Split sizes — train: 3774, dev: 251, test: 252

Validation MSE: 38.033343183556184
Test MSE: 55.19024050447379
YOY Returns: 1.20%
YOY Std: +- 0.14%
GT Yoy: -100.00%
Plot filepath parent dir: data/results
pair_tup_str: (IFGL,MBB)
  
Split sizes — train: 3774, dev: 251, test: 252

Validation MSE: 16.15571509514102
Test MSE: 2.2328734036422064
YOY Returns: 1.09%
YOY Std: +- 0.12%
GT Yoy: 0.70%
Plot filepath parent dir: data/results
pair_tup_str: (IFGL,EMB)
  
Split sizes — train: 3774, dev: 251, test: 252

Validation MSE: 12.008639902956745
Test MSE: 52.11113049758312
YOY Returns: 1.13%
YOY Std: +- 0.15%
GT Yoy: 0.04%
Plot filepath parent dir: data/results
pair_tup_str: (IGIB,IEI)
  
Split sizes — train: 3774, dev: 251, test: 252

Validation MSE: 11.49596844605826


In [11]:
# Print one LaTeX table per evaluated year, each preceded by the year it belongs to.
for year in results_all_years:
  result_current = results_all_years[year]
  print(year)
  print(results_to_latex(result_current))

2020
\begin{table}[h]
\centering
\small
\resizebox{\textwidth}{!}{
\begin{tabular}{lcccccc}
\toprule
Pair & Cointegration Score & val MSE & test MSE & YoY Returns (std) & \makecell{Theoretical Return\\Under Perfect\\Information} & Return Score \\
\midrule
1. (SHV,SMH) & $2.46\times 10^{-4}$ & 3.62432 & 6.24861 & $4.96\% \pm 0.35\%$ & TLOE* & nan \\
2. (SHV,ONEQ) & $4.04\times 10^{-4}$ & 14.37914 & 7.31014 & $3.23\% \pm 0.22\%$ & 33.16\% & 0.78 \\
3. (SHV,PHO) & $4.13\times 10^{-4}$ & 12.98751 & 8.91065 & $1.14\% \pm 0.06\%$ & 14.67\% & 0.88 \\
4. (SHV,PDP) & $9.15\times 10^{-4}$ & 12.11053 & 7.17672 & $3.39\% \pm 0.39\%$ & 43.99\% & 0.72 \\
5. (DVY,PEY) & $1.41\times 10^{-3}$ & 14.02045 & 22.46558 & $0.03\% \pm 0.02\%$ & 0.10\% & 1.00 \\
6. (PFF,EMB) & $1.45\times 10^{-3}$ & 4.11138 & 10.39903 & $1.36\% \pm 0.24\%$ & 0.71\% & 1.01 \\
7. (IGSB,BND) & $1.56\times 10^{-3}$ & 0.47129 & 3.46026 & $0.10\% \pm 0.16\%$ & -0.36\% & 1.00 \\
8. (IFGL,SHV) & $3.95\times 10^{-3}$ & 0.84092 & 4.1739