In [12]:
# Environment setup: start from /content, fetch a fresh copy of the thesis repository and
# move into its `src` directory so that the project-local imports further down resolve.
# Always return to /content first, so that re-running this cell from inside the cloned
# repository does not clone a nested copy.
%cd /content
import os
if os.path.exists("bsc-thesis"):
  # a previous checkout exists: remove it completely so the clone below starts clean
  !rm -rf bsc-thesis

# Git LFS is installed before cloning; per the original note this makes sure cached files
# are readily available (for calling e.g. `gather_data_cached`)
!apt-get install git-lfs
!git lfs install

# clone the selected branch (`$branch` is interpolated from the Python variable below)
branch = "main"
!git clone --branch $branch https://github.com/maviddoerdijk/bsc-thesis.git

# move into the project source dir and list its contents as a quick sanity check
%cd bsc-thesis/src
%ls

/content
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Git LFS initialized.
Cloning into 'bsc-thesis'...
remote: Enumerating objects: 319, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 319 (delta 28), reused 37 (delta 13), pack-reused 256 (from 1)[K
Receiving objects: 100% (319/319), 10.69 MiB | 15.98 MiB/s, done.
Resolving deltas: 100% (175/175), done.
/content/bsc-thesis/src
[0m[01;34mbacktesting[0m/  [01;34mdata[0m/       main.py  [01;34mpreprocessing[0m/
[01;34mconfig[0m/       main.ipynb  [01;34mmodels[0m/  [01;34mutils[0m/


In [13]:
!pip install ta
!pip install prophet
!pip install pykalman
!pip install PyWavelets
!pip install curl-cffi



In [None]:
## Packages specific to Time-MoE.
## The three pinned installs below are currently disabled (commented out); the notes next to
## them record which versions the Time-MoE repo prefers versus the Colab defaults.
# !pip install accelerate==0.28.0 # standard google colab version is 1.6.0 (apr 1, 2025), but for stability, we use time moe's 0.28.0 (mar 12, 2024)
# !pip install transformers==4.40.1 # standard google colab version is 4.51.3, but time moe repo requirements mention/prefer 4.40.1 for stability
# !pip install datasets==2.18.0
## flash-attn is downloaded as a source tarball and built into a wheel (see the recorded
## output of this cell), so this step can take a while.
!pip install flash-attn==2.6.3 # optional but recommended by the repo

Collecting flash-attn==2.6.3
  Downloading flash_attn-2.6.3.tar.gz (2.6 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/2.6 MB[0m [31m32.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn


In [19]:
# Module imports
import pandas as pd
import numpy as np
from typing import Optional, Callable, Dict, Any
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import torch.nn as nn
import itertools
from transformers import AutoModelForCausalLM # contains Time MoE model

# Custom Imports
from models.statistical_models import create_dataset
from preprocessing.cointegration import find_cointegrated_pairs
from preprocessing.data_preprocessing import filter_pairs_data
from preprocessing.technical_indicators import combine_pairs_data
from models.statistical_models import default_normalize
from preprocessing.wavelet_denoising import wav_den
from preprocessing.filters import step_1_filter_remove_nans, step_2_filter_liquidity
from preprocessing.sliding_window import create_sliding_dataset, SlidingWindowDataset

## specific caching imports (should be changed in case you want to gather data live)
from data.scraper import load_cached_etf_tickers
from data.data_collection_cache import gather_data_cached


# Notebook-wide settings: every figure below uses the same matplotlib style sheet.
plt.style.use('seaborn-v0_8')

# OPTIONAL debugging aid: set `inspect_func` to True to print the source of an imported
# helper and check that the expected version of it was loaded
# (here: whether tqdm was actually added to `find_cointegrated_pairs`).
inspect_func = False
if inspect_func:
    import inspect
    source_text = inspect.getsource(find_cointegrated_pairs)
    print(source_text)

In [16]:
### Configs - change these to the desired values to LOAD FROM cache as wanted
startDateStr = '2010-10-01'
# documentation said that endDateStr is exclusive for both yahoofinance and the original
# code, but actually printing the shapes showed otherwise..
endDateStr = '2024-10-02'
instrumentIdsNASDAQandNYSE = load_cached_etf_tickers()
data = gather_data_cached(
    startDateStr,
    endDateStr,
    instrumentIdsNASDAQandNYSE,
    cache_dir='../src/data/cache',
)

# Filter step 1 (by its name: NaN removal), applied to every price/volume table and to
# the data in its original format.
(
    data_close_filtered_1,
    data_open_filtered_1,
    data_high_filtered_1,
    data_low_filtered_1,
    data_vol_filtered_1,
    data_original_format_filtered_1,
) = step_1_filter_remove_nans(
    data['close'], data['open'], data['high'], data['low'], data['vol'], data
)

# Filter step 2 (by its name: liquidity), fed with the outputs of step 1.
(
    data_close_filtered_2,
    data_open_filtered_2,
    data_high_filtered_2,
    data_low_filtered_2,
    data_vol_filtered_2,
    data_original_format_filtered_2,
) = step_2_filter_liquidity(
    data_close_filtered_1,
    data_open_filtered_1,
    data_high_filtered_1,
    data_low_filtered_1,
    data_vol_filtered_1,
    data_original_format_filtered_1,
)

scores, pvalues, pairs = find_cointegrated_pairs(data_original_format_filtered_2)
# Keep, per pair key, only element [1] of its value and sort ascending on that element
# (presumably the p-value — verify against `find_cointegrated_pairs`).
pairs_data = sorted(
    ((key, value[1]) for (key, value) in pairs.items()),
    key=lambda item: item[1],
)
# filter based on cointegration in such a way that we can simply pick the highest pair of
# stocks in the list.
pairs_data_filtered = filter_pairs_data(pairs_data)

# Extract the most highly cointegrated pair: the key of the first entry holds both tickers
top_pair_key = pairs_data_filtered[0][0]
ticker_a, ticker_b = top_pair_key[0], top_pair_key[1]
pairs_timeseries_df = combine_pairs_data(
    data_close_filtered_2,
    data_open_filtered_2,
    data_high_filtered_2,
    data_low_filtered_2,
    data_vol_filtered_2,
    ticker_a,
    ticker_b,
)
# Note about pairs_timeseries_df: the timeseries output on which we should train are found in the key "Spread_Close"
# But, also the input features are the following keys: ['S1_rsi', 'S2_rsi', 'S1_mfi', 'S2_mfi', 'S1_adi', 'S2_adi', 'S1_vpt', 'S2_vpt', 'S1_atr', 'S2_atr', 'S1_bb_ma', 'S2_bb_ma', 'S1_adx', 'S2_adx', 'S1_ema', 'S2_ema', 'S1_macd', 'S2_macd', 'S1_dlr', 'S2_dlr']

Completed 1711 pairs


  alpha_c = -sm.OLS(df['S1_close'], df['S2_close']).fit().params[0]
  alpha_o = -sm.OLS(df['S1_open'], df['S2_open']).fit().params[0]
  alpha_h = -sm.OLS(df['S1_high'], df['S2_high']).fit().params[0]
  alpha_l = -sm.OLS(df['S1_low'], df['S2_low']).fit().params[0]


In [9]:
# Workflow settings, modelled on the parameters of the existing functions
# `execute_kalman_workflow` and `execute_transformer_workflow` (Note: Some are changed already)

# --- input data and prediction target ---
pairs_timeseries: pd.DataFrame = pairs_timeseries_df
target_col: str = "Spread_Close"

# --- chronological split ---
# The first `burn_in` rows are removed because of the windowed technical indicators
# (the largest indicator window is not stated here — TODO confirm it is at most 30).
burn_in: int = 30
train_frac: float = 0.90
dev_frac: float = 0.05  # whatever remains after train and dev is the test part

# --- windowing and batching ---
look_back: int = 20
batch_size: int = 64

# --- preprocessing hooks ---
denoise_fn: Optional[Callable[[pd.Series], np.ndarray]] = wav_den
scaler_factory: Callable[..., MinMaxScaler] = MinMaxScaler
scaler_kwargs: Optional[Dict[str, Any]] = {"feature_range": (0, 1)}
normalise_fn: Callable[[pd.Series], pd.Series] = default_normalize

# --- numeric settings carried over from the existing workflows
# (presumably Kalman-filter related, judging by the names — verify) ---
delta: float = 1e-3
obs_cov_reg: float = 2.0
trans_cov_avg: float = 0.01
obs_cov_avg: float = 1.0

# --- output control ---
return_datasets: bool = False
verbose: bool = True

In [10]:
# def execute_timemoe_workflow(...):
# Inline version of the (not yet function-wrapped) Time-MoE workflow. Steps:
#   1. validate the target column and drop the burn-in rows,
#   2. split chronologically into train / dev / test,
#   3. optionally denoise the training part,
#   4. build sliding-window datasets and DataLoaders,
#   5. load the pretrained Time-MoE model.
# Reads the settings defined in the settings cell (target_col, burn_in, train_frac, ...).
if target_col not in pairs_timeseries.columns:
    raise KeyError(f"pairs_timeseries must contain {target_col}")

# drop the first `burn_in` rows; .copy() keeps the original frame untouched
pairs_timeseries_burned = pairs_timeseries.iloc[burn_in:].copy()

total_len = len(pairs_timeseries_burned)
train_size = int(total_len * train_frac)
dev_size   = int(total_len * dev_frac)
test_size  = total_len - train_size - dev_size # not used, but for clarity

# chronological (unshuffled) split: train first, then dev, then test
train = pairs_timeseries_burned[:train_size]
dev   = pairs_timeseries_burned[train_size:train_size+dev_size] # aka validation
test  = pairs_timeseries_burned[train_size+dev_size:]

if verbose:
    print(f"Split sizes — train: {len(train)}, dev: {len(dev)}, test: {len(test)}")

if denoise_fn is not None: # denoise using wavelet denoising
    # Each column is denoised independently; the rebuilt frame gets a fresh default index.
    train = pd.DataFrame({col: denoise_fn(train[col]) for col in train.columns}) # TODO: unsure whether dev and test should also be denoised?

# important: the scaler learns parameters, so separate objects must be created for x and y.
# `scaler_kwargs` is annotated Optional, so fall back to no keyword arguments when it is None.
x_scaler = scaler_factory(**(scaler_kwargs or {}))
y_scaler = scaler_factory(**(scaler_kwargs or {}))

# We want a sliding window in our dataset.
# Per the original note the scaled X is laid out as [(t - look_back), look_back, features],
# e.g. (2219, 20, 34) in an earlier run — TODO re-check against the current split sizes.
# NOTE(review): the same scaler objects are passed to all three calls; whether the helper
# re-fits them on dev/test is not visible here — confirm to rule out leakage.
trainX_raw, trainX_scaled, trainY_raw, trainY_scaled = create_sliding_dataset(
    train.values, x_scaler=x_scaler, y_scaler=y_scaler, look_back=look_back)
devX_raw,   devX_scaled,   devY_raw,   devY_scaled   = create_sliding_dataset(
    dev.values,  x_scaler=x_scaler, y_scaler=y_scaler, look_back=look_back)
testX_raw,  testX_scaled,  testY_raw,  testY_scaled  = create_sliding_dataset(
    test.values, x_scaler=x_scaler, y_scaler=y_scaler, look_back=look_back)

train_ds = SlidingWindowDataset(trainX_scaled, trainY_scaled)
dev_ds   = SlidingWindowDataset(devX_scaled, devY_scaled)
test_ds  = SlidingWindowDataset(testX_scaled, testY_scaled)

train_loader = DataLoader(train_ds, batch_size=batch_size,
                          shuffle=True,  drop_last=True,  num_workers=0)
dev_loader   = DataLoader(dev_ds,   batch_size=batch_size,
                          shuffle=False, drop_last=False, num_workers=0)
test_loader  = DataLoader(test_ds,  batch_size=batch_size,
                          shuffle=False, drop_last=False, num_workers=0) # extra note: shuffling is turned off for these datasets, because we want to be able to plot over the testing time period

# import torch
# from transformers import AutoModelForCausalLM
if verbose:
    # f-string so the shape is actually interpolated instead of printing the braces literally
    print(f"Single tensor shape: {next(iter(train_loader))[0].shape}")   # torch.Size([64, 20, 34]) //  (batch_size, look_back, features)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(
    'Maple728/TimeMoE-50M',
    device_map=DEVICE,  # use "cpu" for CPU inference, and "cuda" for GPU inference.
    trust_remote_code=True, # allows the custom model code shipped in the model repository to be executed locally
)

Split sizes — train: 3143, dev: 174, test: 176
