In [1]:
# Always start from /content so the clone below lands in the same place, even when
# this cell is re-run after the working directory was changed to the project dir
%cd /content
import os
if os.path.exists("bsc-thesis"):
  # a previous clone exists; remove it completely so the clone below starts fresh
  !rm -rf bsc-thesis

# clone the selected branch of the thesis repository
branch = "main"
!git clone --branch $branch https://github.com/maviddoerdijk/bsc-thesis.git

# move into the project's source dir (the project imports in later cells, e.g. `models.*`, live here) and list its contents
%cd bsc-thesis/src
%ls

/content
Cloning into 'bsc-thesis'...
remote: Enumerating objects: 1195, done.[K
remote: Counting objects: 100% (118/118), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 1195 (delta 78), reused 68 (delta 48), pack-reused 1077 (from 1)[K
Receiving objects: 100% (1195/1195), 30.73 MiB | 16.42 MiB/s, done.
Resolving deltas: 100% (735/735), done.
Filtering content: 100% (33/33), 1.75 GiB | 27.87 MiB/s, done.
/content/bsc-thesis/src
[0m[01;34mbacktesting[0m/  [01;34mdata[0m/      main.ipynb  [01;34mmodels[0m/         [01;34mutils[0m/
[01;34mconfig[0m/       [01;34mexternal[0m/  main.py     [01;34mpreprocessing[0m/


In [2]:
# Environment setup: install the extra packages this notebook needs.
# NOTE(review): pykalman, ta and scikit-optimize are installed without a version pin, unlike the other packages - consider pinning them for reproducibility.
!pip install numpy==1.26.3 # necessary for bug fix
!pip install peft==0.10.0
!pip install pykalman
!pip install ta
!pip install scikit-optimize

## packages pinned specifically for time moe
# a different version of accelerate is needed because of the bug "ImportError: cannot import name 'clear_device_cache' from 'accelerate.utils.memory'"
!pip install -U accelerate==0.32.0 # standard google colab version is 1.6.0 (apr 1, 2025); NOTE(review): this comment used to say time moe's 0.28.0 (mar 12, 2024) is used, but the command pins 0.32.0 - confirm which version is intended
!pip install transformers==4.40.1 # standard google colab version is 4.51.3, but time moe repo requirements mention/prefer 4.40.1 for stability
!pip install datasets==2.18.0

Collecting peft==0.10.0
  Using cached peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft==0.10.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft==0.10.0)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft==0.10.0)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft==0.10.0)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft==0.10.0)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1

In [3]:
# bunch of the initialization code #

### RESULTS IMPORTS ###
# Module imports
import pandas as pd
import numpy as np
from typing import Optional, Callable, Dict, Any, Sequence
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm # note: using tqdm.auto usually automatically chooses the right import based on whether you're in CLI, notebook or somewhere else
import torch.nn as nn
import itertools
from pykalman import KalmanFilter
import ast
import re
from tabulate import tabulate
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoConfig
from torch.utils.data import DataLoader, TensorDataset

# Custom Imports
from models.statistical_models import kalman_filter_average, kalman_filter_regression
from models.transformer_model import TimeSeriesTransformerv1, get_cosine_schedule_with_warmup_and_min_lr
from preprocessing.cointegration import find_cointegrated_pairs
from preprocessing.data_preprocessing import filter_pairs_data
from preprocessing.technical_indicators import combine_pairs_data
from preprocessing.filters import step_1_filter_remove_nans, step_2_filter_liquidity
from backtesting.trading_strategy import trade, get_gt_yoy_returns_test_dev
from backtesting.utils import calculate_return_uncertainty
from utils.visualization import plot_return_uncertainty, plot_comparison
from utils.helpers import _get_train_dev_frac
from external.time_moe_repo.training_wrapper import train_time_moe
from backtesting.trading_strategy import get_gt_yoy_returns_test_dev
from backtesting.utils import calculate_return_uncertainty

## semi-custom
from external.time_moe_repo.time_moe.models.modeling_time_moe import TimeMoeForPrediction

# important for time moe
import wandb
wandb.login()

## workflow imports
from models.statistical_models import execute_kalman_workflow
from models.transformer_model import execute_transformer_workflow
from models.time_moe_model import execute_timemoe_workflow

## specific caching imports (should be changed in case you want to gather data live)
from data.scraper import load_cached_etf_tickers
from data.data_collection_cache import gather_data_cached, _get_filename, gather_pairs_data_cached, gather_data_cached_using_truncate

# Notebook-wide settings that apply to every cell below
plt.style.use('seaborn-v0_8') # matplotlib style used for all plots

# Debugging switch: set to True to print the source code of the imported `trade`
# function, e.g. to verify that the freshly cloned version is the one in use
inspect_func = False
if inspect_func:
  import inspect
  print(inspect.getsource(trade)) # in this case, check whether the new trade function is imported
### RESULTS IMPORTS ###


### HYPERPARAM OPTIMIZATION IMPORTS ###
## data gathering imports
from utils.helpers import _get_train_dev_frac
from preprocessing.filters import step_1_filter_remove_nans, step_2_filter_liquidity
from preprocessing.cointegration import find_cointegrated_pairs
from preprocessing.data_preprocessing import filter_pairs_data
from preprocessing.technical_indicators import combine_pairs_data
## specific caching imports (should be changed in case you want to gather data live)
from data.scraper import load_cached_etf_tickers
from data.data_collection_cache import gather_data_cached, gather_data_cached_using_truncate, gather_pairs_data_cached, save_pairs_data_filtered

## workflow imports
from models.statistical_models import execute_kalman_workflow

## optimize-specific imports
from skopt import gp_minimize
from skopt import plots as skplots
from skopt.space import Integer, Real, Categorical
from skopt.utils import use_named_args
import numpy as np
from typing import Callable, Any, List, Dict, Tuple
import time
import random
from sklearn.metrics import mean_squared_error
from functools import partial


from utils.helpers import return_score
from utils.visualization import results_to_latex
from utils.optimization import bayesian_optimize_workflow
### HYPERPARAM OPTIMIZATION IMPORTS ###

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdavidmoerdijk[0m ([33mdavidmoerdijk-smart-backoffice[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# 2. Results

In [5]:
### Unchanged variables ###
# Flags that are passed as-is (never modified) to the workflow calls further down.
verbose, return_datasets = True, True
### Unchanged variables ###

## Kalman Filter

In [10]:
# Kalman filter hyperparameters, hard-coded from a hyperparameter search.
# NOTE(review): the original note said "based on results above", but no such results are visible in this part of the notebook - confirm these are still the intended values.
hyperparam_kwargs = {'delta': 0.1, 'obs_cov_reg': 1.580094597096321, 'trans_cov_avg': 0.1, 'obs_cov_avg': 6.246725340878335}

# Experiment settings, named once instead of being repeated as magic values inside the loop
TEST_YEARS = [2020, 2021, 2022, 2023, 2024] # every year is evaluated as its own test period
CACHE_DIR = '../src/data/cache' # handed to all cached loaders/savers below
MAX_PAIRS_PER_YEAR = 10 # at most this many of the first-ranked pairs are evaluated per year
GT_LOOK_BACK = 20 # look_back handed to get_gt_yoy_returns_test_dev
startDateStr = '2008-01-01' # identical for every year, so it is set once outside the loop

results_all_years = {} # end_year -> list of result tuples (one per evaluated pair)
outputs_all_years = {} # end_year -> list of raw workflow outputs (one per evaluated pair)
for end_year in TEST_YEARS:
  ### Year-specific data ###
  # the data window runs from startDateStr until the end of `end_year`; the test period is the calendar year `end_year` itself
  endDateStr = f'{end_year}-12-31'
  startDateStrTest = f'{end_year}-01-01'
  endDateStrTest = f'{end_year}-12-31'
  train_frac, dev_frac = _get_train_dev_frac(startDateStr, endDateStr, startDateStrTest, endDateStrTest)

  # NOTE(review): the ticker list does not depend on end_year; it stays inside the loop because it is not visible here whether the loader is free of side effects
  instrumentIdsNASDAQandNYSE = load_cached_etf_tickers()
  data = gather_data_cached_using_truncate(startDateStr, endDateStr, instrumentIdsNASDAQandNYSE, cache_dir=CACHE_DIR)

  # two filtering passes; every pass returns the close/open/high/low/vol frames plus the data in its original format
  (data_close_filtered_1, data_open_filtered_1, data_high_filtered_1,
   data_low_filtered_1, data_vol_filtered_1, data_original_format_filtered_1) = step_1_filter_remove_nans(
      data['close'], data['open'], data['high'], data['low'], data['vol'], data)
  (data_close_filtered_2, data_open_filtered_2, data_high_filtered_2,
   data_low_filtered_2, data_vol_filtered_2, data_original_format_filtered_2) = step_2_filter_liquidity(
      data_close_filtered_1, data_open_filtered_1, data_high_filtered_1,
      data_low_filtered_1, data_vol_filtered_1, data_original_format_filtered_1)

  pairs_data_filtered = gather_pairs_data_cached(startDateStr, endDateStr, instrumentIdsNASDAQandNYSE, cache_dir=CACHE_DIR)
  if pairs_data_filtered is None:
    # nothing was returned by the cached loader (presumably a cache miss): run the cointegration search and save the filtered result
    scores, pvalues, pairs = find_cointegrated_pairs(data_original_format_filtered_2)
    pairs_data = {key:value[1]  for (key, value) in pairs.items()}
    pairs_data = sorted(pairs_data.items(), key=lambda x: x[1]) # ascending by the value taken from index 1
    pairs_data_filtered = filter_pairs_data(pairs_data) # filter based on cointegration in such a way that we can simply pick the highest pair of stocks in the list.
    save_pairs_data_filtered(pairs_data_filtered, startDateStr, endDateStr, instrumentIdsNASDAQandNYSE, cache_dir=CACHE_DIR)
  ### Year-specific data ###

  # Gather results for current_year
  results_kalman_current_year = []
  all_outputs_kalman_current_year = []
  num_results = min(len(pairs_data_filtered), MAX_PAIRS_PER_YEAR)
  for i in tqdm(range(num_results), desc = "Gathering [...]"):
    # each entry is indexed as ((ticker_a, ticker_b), score)
    ticker_a, ticker_b = pairs_data_filtered[i][0][0], pairs_data_filtered[i][0][1]
    pair_tup_str_current = f"({ticker_a},{ticker_b})"
    pairs_timeseries_df = combine_pairs_data(data_close_filtered_2, data_open_filtered_2, data_high_filtered_2, data_low_filtered_2, data_vol_filtered_2, ticker_a, ticker_b)

    # ground-truth returns used as the reference for the return score
    # NOTE(review): dev_frac is passed before train_frac here (positional) - confirm this matches the function signature
    output_returns = get_gt_yoy_returns_test_dev(pairs_timeseries_df, dev_frac, train_frac, look_back=GT_LOOK_BACK)
    gt_yoy, gt_yoy_for_dev_dataset = output_returns['gt_yoy_test'], output_returns['gt_yoy_dev'] # the dev value is not used further in this cell

    output_model = execute_kalman_workflow(pairs_timeseries_df, verbose=verbose, pair_tup_str=pair_tup_str_current, train_frac=train_frac, dev_frac=dev_frac, return_datasets=return_datasets, **hyperparam_kwargs)
    yoy_str = f"{output_model['yoy_mean'] * 100:.2f}% +- {output_model['yoy_std'] * 100:.2f}%"
    returns_score = return_score(output_model['yoy_mean'], gt_yoy)
    cointegration_score = pairs_data_filtered[i][1]
    results_kalman_current_year.append((pair_tup_str_current, cointegration_score, output_model['val_mse'], output_model['test_mse'], yoy_str, gt_yoy, returns_score)) # (pair, cointegration_score, val, test, yoy_str, gt_yoy, returns_score)
    all_outputs_kalman_current_year.append(output_model)
  results_all_years[end_year] = results_kalman_current_year
  outputs_all_years[end_year] = all_outputs_kalman_current_year

Gathering [...]:   0%|          | 0/10 [00:00<?, ?it/s]

Split sizes — train: 2769, dev: 251, test: 253

Validation MSE: 5.309847840365585
Test MSE: 7.839353298053407
YOY Returns: 4.65%
YOY Std: +- 0.33%
GT Yoy: 65.74%
Plot filepath parent dir: data/results
pair_tup_str: (SHV,SMH)
  
Split sizes — train: 2769, dev: 251, test: 253

Validation MSE: 19.796463442082967
Test MSE: 10.182800289445067
YOY Returns: 2.85%
YOY Std: +- 0.14%
GT Yoy: 31.61%
Plot filepath parent dir: data/results
pair_tup_str: (SHV,ONEQ)
  
Split sizes — train: 2769, dev: 251, test: 253

Validation MSE: 18.610652509065876
Test MSE: 10.649262778707833
YOY Returns: 0.63%
YOY Std: +- 0.04%
GT Yoy: 14.34%
Plot filepath parent dir: data/results
pair_tup_str: (SHV,PHO)
  
Split sizes — train: 2769, dev: 251, test: 253

Validation MSE: 16.394093778180768
Test MSE: 9.24490852738189
YOY Returns: 3.05%
YOY Std: +- 0.34%
GT Yoy: 41.27%
Plot filepath parent dir: data/results
pair_tup_str: (SHV,PDP)
  
Split sizes — train: 2769, dev: 251, test: 253

Validation MSE: 14.083587642922394


Gathering [...]:   0%|          | 0/10 [00:00<?, ?it/s]

Split sizes — train: 3023, dev: 250, test: 252

Validation MSE: 15.043930744996526
Test MSE: 3.595305552997576
YOY Returns: 0.47%
YOY Std: +- 0.04%
GT Yoy: 0.33%
Plot filepath parent dir: data/results
pair_tup_str: (PFF,EMB)
  
Split sizes — train: 3023, dev: 250, test: 252

Validation MSE: 13.770335837612205
Test MSE: 1.9874565089508875
YOY Returns: 0.68%
YOY Std: +- 0.10%
GT Yoy: -0.26%
Plot filepath parent dir: data/results
pair_tup_str: (IFGL,EMB)
  
Split sizes — train: 3023, dev: 250, test: 252

Validation MSE: 6.252508825352912
Test MSE: 1.901641925004035
YOY Returns: 1.24%
YOY Std: +- 0.10%
GT Yoy: 0.29%
Plot filepath parent dir: data/results
pair_tup_str: (IFGL,SHV)
  
Split sizes — train: 3023, dev: 250, test: 252

Validation MSE: 38.14637663119086
Test MSE: 9.046789883014904
YOY Returns: 0.29%
YOY Std: +- 0.06%
GT Yoy: 0.17%
Plot filepath parent dir: data/results
pair_tup_str: (IGSB,BND)
  
Split sizes — train: 3023, dev: 250, test: 252

Validation MSE: 9.541112332625445
Tes

Gathering [...]:   0%|          | 0/10 [00:00<?, ?it/s]

Split sizes — train: 3274, dev: 250, test: 252

Validation MSE: 1.9656597706512327
Test MSE: 118.88822405209706
YOY Returns: 5.35%
YOY Std: +- 0.81%
GT Yoy: 0.67%
Plot filepath parent dir: data/results
pair_tup_str: (PFF,EMB)
  
Split sizes — train: 3274, dev: 250, test: 252

Validation MSE: 4.315951523646572
Test MSE: 13.819617944769172
YOY Returns: 1.18%
YOY Std: +- 0.37%
GT Yoy: 0.42%
Plot filepath parent dir: data/results
pair_tup_str: (IFGL,EMB)
  
Split sizes — train: 3274, dev: 250, test: 252

Validation MSE: 21.349051003795473
Test MSE: 129.56107923907433
YOY Returns: 6.64%
YOY Std: +- 0.36%
GT Yoy: 0.83%
Plot filepath parent dir: data/results
pair_tup_str: (IGSB,BND)
  
Split sizes — train: 3274, dev: 250, test: 252

Validation MSE: 1.9825463921730828
Test MSE: 6.534869608537596
YOY Returns: 2.12%
YOY Std: +- 0.60%
GT Yoy: 1.23%
Plot filepath parent dir: data/results
pair_tup_str: (USIG,IEI)
  
Split sizes — train: 3274, dev: 250, test: 252

Validation MSE: 34.9506400444119
Te

Gathering [...]:   0%|          | 0/10 [00:00<?, ?it/s]

Split sizes — train: 3524, dev: 250, test: 252

Validation MSE: 25.903021620103964
Test MSE: 4.19535062416549
YOY Returns: 0.13%
YOY Std: +- 0.02%
GT Yoy: 0.15%
Plot filepath parent dir: data/results
pair_tup_str: (PFF,EMB)
  
Split sizes — train: 3524, dev: 250, test: 252

Validation MSE: 1.3661809719776001
Test MSE: 15.847920097667725
YOY Returns: 0.24%
YOY Std: +- 0.04%
GT Yoy: 0.59%
Plot filepath parent dir: data/results
pair_tup_str: (IFGL,EMB)
  
Split sizes — train: 3524, dev: 250, test: 252

Validation MSE: 194.79981843683504
Test MSE: 16.588137338108115
YOY Returns: 0.74%
YOY Std: +- 0.07%
GT Yoy: -100.00%
Plot filepath parent dir: data/results
pair_tup_str: (IGF,DVY)
  
Split sizes — train: 3524, dev: 250, test: 252

Validation MSE: 1.503736305579093
Test MSE: 2.887380004568335
YOY Returns: 0.64%
YOY Std: +- 0.07%
GT Yoy: 0.51%
Plot filepath parent dir: data/results
pair_tup_str: (IGIB,IEI)
  
Split sizes — train: 3524, dev: 250, test: 252

Validation MSE: 1.0856496331774108


Gathering [...]:   0%|          | 0/10 [00:00<?, ?it/s]

Split sizes — train: 3774, dev: 251, test: 252

Validation MSE: 17.602242945800246
Test MSE: 21.840578462379867
YOY Returns: 0.87%
YOY Std: +- 0.27%
GT Yoy: 0.11%
Plot filepath parent dir: data/results
pair_tup_str: (PFF,EMB)
  
Split sizes — train: 3774, dev: 251, test: 252

Validation MSE: 64.24083603968337
Test MSE: 41.03698333055748
YOY Returns: 0.80%
YOY Std: +- 0.11%
GT Yoy: -100.00%
Plot filepath parent dir: data/results
pair_tup_str: (IFGL,MBB)
  
Split sizes — train: 3774, dev: 251, test: 252

Validation MSE: 5.0234116516375575
Test MSE: 76.31407146762565
YOY Returns: 1.06%
YOY Std: +- 0.18%
GT Yoy: 0.70%
Plot filepath parent dir: data/results
pair_tup_str: (IFGL,EMB)
  
Split sizes — train: 3774, dev: 251, test: 252

Validation MSE: 5.219634894039263
Test MSE: 9.67679099567134
YOY Returns: 0.78%
YOY Std: +- 0.08%
GT Yoy: 0.04%
Plot filepath parent dir: data/results
pair_tup_str: (IGIB,IEI)
  
Split sizes — train: 3774, dev: 251, test: 252

Validation MSE: 28.27240666917558
Te

In [13]:
# Print, per test year, the year followed by the LaTeX table built from that year's results.
for year in results_all_years:
  result_current = results_all_years[year]
  print(year)
  latex_table = results_to_latex(result_current)
  print(latex_table)

2020
\begin{table}[h]
\centering
\small
\resizebox{\textwidth}{!}{
\begin{tabular}{lcccccc}
\toprule
Pair & Cointegration Score & val MSE & test MSE & YoY Returns (std) & \makecell{Theoretical Return\\Under Perfect\\Information} & Return Score \\
\midrule
1. (SHV,SMH) & $2.46\times 10^{-4}$ & 5.30985 & 7.83935 & $4.65\% \pm 0.33\%$ & TLOE* & nan \\
2. (SHV,ONEQ) & $4.04\times 10^{-4}$ & 19.79646 & 10.18280 & $2.85\% \pm 0.14\%$ & 33.16\% & 0.77 \\
3. (SHV,PHO) & $4.13\times 10^{-4}$ & 18.61065 & 10.64926 & $0.63\% \pm 0.04\%$ & 14.67\% & 0.88 \\
4. (SHV,PDP) & $9.15\times 10^{-4}$ & 16.39409 & 9.24491 & $3.05\% \pm 0.34\%$ & 43.99\% & 0.72 \\
5. (DVY,PEY) & $1.41\times 10^{-3}$ & 14.08359 & 23.33083 & $0.04\% \pm 0.02\%$ & 0.10\% & 1.00 \\
6. (PFF,EMB) & $1.45\times 10^{-3}$ & 45.18233 & 14.91943 & $0.93\% \pm 0.30\%$ & 0.71\% & 1.00 \\
7. (IGSB,BND) & $1.56\times 10^{-3}$ & 5.08239 & 41.93097 & $1.06\% \pm 0.12\%$ & -0.36\% & 1.01 \\
8. (IFGL,SHV) & $3.95\times 10^{-3}$ & 1.40722 & 33