In [1]:
# Colab bootstrap cell: always return to /content first, so the cell can be
# re-run even after an earlier run changed into the cloned repository.
%cd /content
import os
if os.path.exists("bsc-thesis"):
  # a checkout from a previous run exists; delete it so the clone below starts clean
  !rm -rf bsc-thesis

# clone the selected branch of the thesis repository ($branch is expanded by IPython)
branch = "main"
!git clone --branch $branch https://github.com/maviddoerdijk/bsc-thesis.git

# move into the source directory and list its contents
# NOTE(review): the later `from models...` / `from preprocessing...` imports
# presumably rely on this being the working directory — confirm
%cd bsc-thesis/src
%ls

/content
Cloning into 'bsc-thesis'...
remote: Enumerating objects: 805, done.[K
remote: Counting objects: 100% (185/185), done.[K
remote: Compressing objects: 100% (152/152), done.[K
remote: Total 805 (delta 98), reused 93 (delta 33), pack-reused 620 (from 1)[K
Receiving objects: 100% (805/805), 26.86 MiB | 28.13 MiB/s, done.
Resolving deltas: 100% (452/452), done.
Filtering content: 100% (32/32), 1.75 GiB | 149.02 MiB/s, done.
/content/bsc-thesis/src
[0m[01;34mbacktesting[0m/  [01;34mdata[0m/      main.ipynb  [01;34mmodels[0m/         [01;34mutils[0m/
[01;34mconfig[0m/       [01;34mexternal[0m/  main.py     [01;34mpreprocessing[0m/


In [2]:
# Goal: plot test MSE vs profitability
!pip install ta
!pip install pykalman
!pip install PyWavelets

FLASH_ATTN = False # set to true if using this
if FLASH_ATTN:
  !pip install flash-attn==2.6.3 # optional but recommended by the repo

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=a2a3507dddca2cae5a6c14c534ade4b53b5d4365550981f1f6277c671f9815ff
  Stored in directory: /root/.cache/pip/wheels/a1/d7/29/7781cc5eb9a3659d032d7d15bdd0f49d07d2b24fec29f44bc4
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0
Collecting pykalman
  Downloading pykalman-0.10.1-py2.py3-none-any.whl.metadata (9.5 kB)
Collecting scikit-base<0.13.0 (from pykalman)
  Downloading scikit_base-0.12.2-py3-none-any.whl.metadata (8.8 kB)
Downloading pykalman-0.10.1-py2.py3-none-any.whl (248 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.5/248.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_base-0.12.2-py3-none-any.whl (142 kB)
[2K   [90m

In [5]:
# Module imports
import pandas as pd
import numpy as np
from typing import Optional, Callable, Dict, Any
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm # note: using tqdm.auto usually automatically chooses the right import based on whether you're in CLI, notebook or somewhere else
import torch.nn as nn
import itertools
from pykalman import KalmanFilter
import ast
import re
from tabulate import tabulate
from datetime import datetime

# Custom Imports
from models.statistical_models import create_dataset, default_normalize, rmse_metric, acc_metric, kalman_filter_average, kalman_filter_regression, kalman_filter_regression_multivariate
from preprocessing.cointegration import find_cointegrated_pairs
from preprocessing.data_preprocessing import filter_pairs_data
from preprocessing.technical_indicators import combine_pairs_data
from preprocessing.wavelet_denoising import wav_den
from preprocessing.filters import step_1_filter_remove_nans, step_2_filter_liquidity
from backtesting.trading_strategy import trade, get_gt_yoy_returns_test_dev
from backtesting.utils import calculate_return_uncertainty
from utils.visualization import plot_return_uncertainty, plot_comparison
from utils.helpers import _get_train_dev_frac

## workflow imports
from models.statistical_models import execute_kalman_workflow
from models.transformer_model import execute_transformer_workflow
# from models.time_moe_model import execute_timemoe_workflow

## specific caching imports (should be changed in case you want to gather data live)
from data.scraper import load_cached_etf_tickers
from data.data_collection_cache import gather_data_cached, _get_filename, gather_pairs_data_cached, gather_data_cached_using_truncate

# Notebook-wide settings
# Use the seaborn-v0_8 matplotlib style sheet for every figure drawn below.
plt.style.use('seaborn-v0_8')

# Debug switch: flip to True to print the source of the imported `trade`
# function (e.g. to check that the expected version of it was picked up).
inspect_func = False
if inspect_func:
  import inspect
  trade_source = inspect.getsource(trade) # in this case, check whether tqdm was actually added
  print(trade_source)

In [11]:
# Goal: plot test MSE vs profitability

### DATA GATHER ###
# Sample window: 1 Jan of start_year through 31 Dec of end_year.
# The test window is the whole final calendar year (end_year).
start_year, end_year = 2008, 2024
startDateStr = f'{start_year}-01-01'
endDateStr = f'{end_year}-12-31'
startDateStrTest = f'{end_year}-01-01'
endDateStrTest = f'{end_year}-12-31'
train_frac, dev_frac = _get_train_dev_frac(
  startDateStr, endDateStr, startDateStrTest, endDateStrTest
)

# Load the cached ticker universe and the cached price data for it.
instrumentIdsNASDAQandNYSE = load_cached_etf_tickers()
data = gather_data_cached_using_truncate(
  startDateStr,
  endDateStr,
  instrumentIdsNASDAQandNYSE,
  cache_dir='../src/data/cache',
)

# Filter step 1 (NaN removal, going by the helper's name) on every price/volume field.
(
  data_close_filtered_1,
  data_open_filtered_1,
  data_high_filtered_1,
  data_low_filtered_1,
  data_vol_filtered_1,
  data_original_format_filtered_1,
) = step_1_filter_remove_nans(
  data['close'], data['open'], data['high'], data['low'], data['vol'], data
)

# Filter step 2 (liquidity, going by the helper's name) on the output of step 1.
(
  data_close_filtered_2,
  data_open_filtered_2,
  data_high_filtered_2,
  data_low_filtered_2,
  data_vol_filtered_2,
  data_original_format_filtered_2,
) = step_2_filter_liquidity(
  data_close_filtered_1,
  data_open_filtered_1,
  data_high_filtered_1,
  data_low_filtered_1,
  data_vol_filtered_1,
  data_original_format_filtered_1,
)

# Prefer the cached pair list; recompute only when the cache yields a falsy result.
pairs_data_filtered = gather_pairs_data_cached(
  startDateStr,
  endDateStr,
  instrumentIdsNASDAQandNYSE,
  cache_dir='../src/data/cache',
)
if not pairs_data_filtered:
  scores, pvalues, pairs = find_cointegrated_pairs(data_original_format_filtered_2)
  # Keep (pair, second field) tuples, sorted ascending by that second field.
  # NOTE(review): the second field is presumably the cointegration p-value — confirm
  pairs_data = sorted(
    ((pair, stats[1]) for pair, stats in pairs.items()),
    key=lambda item: item[1],
  )
  # filter based on cointegration in such a way that we can simply pick the highest pair of stocks in the list.
  pairs_data_filtered = filter_pairs_data(pairs_data)


### DATA GATHER ###

### EXTRA VARS ###

verbose = False

### EXTRA VARS ###

In [None]:
### Definition of trade functions ###


### Definition of trade functions ###

In [15]:
### Gathering of results ###

# Loop-invariant settings, hoisted so they are defined once and easy to tune.
n_top_pairs = 5   # how many leading entries of pairs_data_filtered to evaluate
burn_in = 30      # leading rows dropped from every pair's frame
                  # NOTE(review): presumably the warm-up of the technical indicators — confirm
std_dev = 0.5     # standard deviation of the artificial Gaussian noise
noise_seed = 42   # fixed seed so the noisy baseline is reproducible across runs
noise_rng = np.random.default_rng(noise_seed)

# Evaluate the first n_top_pairs entries of the filtered pair list
# (expected to be ordered most-cointegrated first — TODO confirm).
for pair_entry in pairs_data_filtered[:n_top_pairs]:
  # each entry's first element holds the two tickers of the pair
  ticker_a, ticker_b = pair_entry[0][0], pair_entry[0][1]
  pairs_timeseries_df = combine_pairs_data(data_close_filtered_2, data_open_filtered_2, data_high_filtered_2, data_low_filtered_2, data_vol_filtered_2, ticker_a, ticker_b)

  pairs_timeseries_df_burned_in = pairs_timeseries_df.iloc[burn_in:].copy()

  # chronological train / dev / test split; test receives the remainder so
  # rounding in the two int() truncations never drops rows
  total_len = len(pairs_timeseries_df_burned_in)
  train_size = int(total_len * train_frac)
  dev_size   = int(total_len * dev_frac)
  test_size  = total_len - train_size - dev_size # not used, but for clarity

  train = pairs_timeseries_df_burned_in.iloc[:train_size]
  dev   = pairs_timeseries_df_burned_in.iloc[train_size:train_size + dev_size]
  test  = pairs_timeseries_df_burned_in.iloc[train_size + dev_size:]

  ## Getting timeseries ##
  # 0. ground truth
  pairs_timeseries_df_spread_close_test = test['Spread_Close']

  # 1. artificial noise (drawn from the seeded generator, one draw per test row)
  noise = noise_rng.normal(0, std_dev, size=len(test))
  pairs_timeseries_df_spread_close_test_added_noise = test['Spread_Close'] + noise

  # 2. kalman predictions

  # 3. Transformer predictions

  # 4. Time-MoE predictions

  ## Getting timeseries ##

  # Result labels; the values are still to be filled in for each prediction source.
  print("Profits ground truth: ")
  print(f"Profits GT + artificial noise ({std_dev} std dev):")
  print("Profits Kalman Predictions (TA): ")
  print("Profits Kalman Predictions: (no TA)")
  print("Profits Transformer Predictions (TA): ")
  print("Profits Transformer Predictions (no TA): ")
  print("Profits TimeMoE Predictions: (no TA)")

### Gathering of results ###

GT YOY: -0.14205346835406496
Yoy Returns: 111.47630065356755 +- 4.151886878592863
TEST MSE: 4.550297884171458
GT YOY: -0.14205346835406496
Yoy Returns: 22.647256732075952 +- 1.842858730199155
TEST MSE: 9.174108958976943
GT YOY: -0.14205346835406507
Yoy Returns: 0.30034669394934466 +- 1.0174518594979556
TEST MSE: 4.04371371905866
GT YOY: -0.14205346835406507
Yoy Returns: -0.14205346835406463 +- 1.7290725485624172e-16
TEST MSE: 2.568115548357022
GT YOY: -0.14205346835406496
Yoy Returns: 5.801870150956745 +- 6.8502763685531916
TEST MSE: 0.08738206185959528
GT YOY: -0.14205346835406496
Yoy Returns: 263.47623474934926 +- 8.7237585852514
TEST MSE: 8.385346618107237
GT YOY: -0.14205346835406496
Yoy Returns: 17.82638268073145 +- 1.457368089849135
TEST MSE: 7.933472195539503
GT YOY: 12.7514979444988
Yoy Returns: 1.6100151302435912 +- 1.5399841657690125
TEST MSE: 3.0309693268915456
GT YOY: 8.202220572424721
Yoy Returns: -0.08113264050853308 +- 0.1173835460619129
TEST MSE: 2.169843550245091
GT YO

KeyboardInterrupt: 