This notebook is meant for the *development* of the hyperparameter optimization. The final hyperparameter-optimization metrics can be found in *total_results.ipynb*.

In [None]:
# Bootstrap cell: fetch a fresh copy of the repository and switch into its src folder.
# Always return to /content first, so that re-running this cell after the
# `%cd bsc-thesis/src` at the bottom resolves the relative paths from the same place.
%cd /content
import os
if os.path.exists("bsc-thesis"):
  # a checkout from an earlier run exists; remove it completely so the clone below starts clean
  !rm -rf bsc-thesis

# clone the selected branch of the repository
branch = "main"
!git clone --branch $branch https://github.com/maviddoerdijk/bsc-thesis.git

# move into the project's src directory and list its contents
# (presumably so the project-local imports in later cells resolve - TODO confirm)
%cd bsc-thesis/src
%ls

In [None]:
# Install the extra third-party packages, one pip invocation per package.
# NOTE(review): presumably these are the packages missing from the default runtime - TODO confirm.
!pip install ta
!pip install pykalman
!pip install PyWavelets
!pip install curl-cffi

In [None]:
# Only prints the installed scikit-optimize metadata; this does NOT install the package.
# NOTE(review): `skopt` is imported in the next cell, so it has to be present already - verify the output here.
!pip show scikit-optimize

In [None]:
## standard library imports
from typing import Any, Callable, List

## optimize-specific imports (third-party)
import numpy as np
import pandas as pd
from skopt import gp_minimize
from skopt.space import Real
from skopt.utils import use_named_args

## data gathering imports
from utils.helpers import _get_train_dev_frac
from preprocessing.filters import step_1_filter_remove_nans, step_2_filter_liquidity
from preprocessing.cointegration import find_cointegrated_pairs
from preprocessing.data_preprocessing import filter_pairs_data
from preprocessing.technical_indicators import combine_pairs_data
## specific caching imports (should be changed in case you want to gather data live)
from data.scraper import load_cached_etf_tickers
from data.data_collection_cache import gather_data_cached, gather_data_cached_using_truncate

## workflow imports
from models.statistical_models import execute_kalman_workflow

In [None]:
### Standard Data Gathering Code ###
# NOTE(review): startDateStr / endDateStr are only assigned inside the rolling
# loop at the end of this file, so that loop has to run before this cell.
# NOTE(review): gather_pairs_data_cached is not among the names imported in the
# import cell of this file - confirm where it is supposed to come from.
instrumentIdsNASDAQandNYSE = load_cached_etf_tickers()
data = gather_data_cached_using_truncate(
    startDateStr,
    endDateStr,
    instrumentIdsNASDAQandNYSE,
    cache_dir='../src/data/cache',
)

# First filter pass over the raw close/open/high/low/vol frames.
(
    data_close_filtered_1,
    data_open_filtered_1,
    data_high_filtered_1,
    data_low_filtered_1,
    data_vol_filtered_1,
    data_original_format_filtered_1,
) = step_1_filter_remove_nans(
    data['close'],
    data['open'],
    data['high'],
    data['low'],
    data['vol'],
    data,
)

# Second filter pass, fed with the outputs of the first one.
(
    data_close_filtered_2,
    data_open_filtered_2,
    data_high_filtered_2,
    data_low_filtered_2,
    data_vol_filtered_2,
    data_original_format_filtered_2,
) = step_2_filter_liquidity(
    data_close_filtered_1,
    data_open_filtered_1,
    data_high_filtered_1,
    data_low_filtered_1,
    data_vol_filtered_1,
    data_original_format_filtered_1,
)

pairs_data_filtered = gather_pairs_data_cached(
    startDateStr,
    endDateStr,
    instrumentIdsNASDAQandNYSE,
    cache_dir='../src/data/cache',
)
if pairs_data_filtered is None:
    # Nothing came back from the cached lookup (presumably a cache miss):
    # compute the pairs from the filtered data instead.
    scores, pvalues, pairs = find_cointegrated_pairs(data_original_format_filtered_2)
    # Keep element [1] of every pair entry and order the pairs ascending by it.
    pairs_data = sorted(
        ((pair, stats[1]) for pair, stats in pairs.items()),
        key=lambda item: item[1],
    )
    pairs_data_filtered = filter_pairs_data(pairs_data)

# The first entry of the filtered list supplies the two tickers to combine.
ticker_a = pairs_data_filtered[0][0][0]
ticker_b = pairs_data_filtered[0][0][1]
pairs_timeseries_df = combine_pairs_data(
    data_close_filtered_2,
    data_open_filtered_2,
    data_high_filtered_2,
    data_low_filtered_2,
    data_vol_filtered_2,
    ticker_a,
    ticker_b,
)
### Standard Data Gathering Code ###

In [None]:
# Hyperparameter ranges handed to the optimizer. Every dimension uses a
# log-uniform prior, and its 'name' is used directly as a kwarg.
search_space = [
    Real(low=1e-5, high=1e-3, prior='log-uniform', name='delta'),
    Real(low=0.5, high=2.0, prior='log-uniform', name='obs_cov_reg'),
    Real(low=0.01, high=0.1, prior='log-uniform', name='trans_cov_avg'),
    Real(low=0.1, high=1.0, prior='log-uniform', name='obs_cov_avg'),
]

def bayesian_optimize_workflow(
    execute_workflow_fn: Callable,
    pairs_timeseries_df: pd.Series,
    train_frac: float,
    dev_frac: float,
    search_space: List[Real],
    n_calls: int = 30,
    n_initial_points: int = 10,
    random_state: int = 42,
):
    """Tune a workflow's hyperparameters with Gaussian-process Bayesian optimization.

    Each evaluation calls ``execute_workflow_fn(pairs_timeseries_df, **params,
    verbose=False)`` and minimizes the ``'test_mse'`` entry of its output.

    Args:
        execute_workflow_fn: Workflow to tune. It must accept the time series
            as first positional argument, one keyword argument per dimension
            name in ``search_space`` plus ``verbose``, and return a mapping
            containing ``'test_mse'``.
        pairs_timeseries_df: Time series passed unchanged to every evaluation.
        train_frac: Currently unused by this function.
        dev_frac: Currently unused by this function.
        search_space: Named skopt dimensions; each ``name`` becomes a kwarg.
        n_calls: Total number of workflow evaluations.
        n_initial_points: Number of evaluations made before the Gaussian
            process is used to propose points.
        random_state: Seed that makes the optimization reproducible.

    Returns:
        A ``(best_params, best_mse)`` tuple: the best hyperparameters as a
        ``{name: value}`` dict and the lowest MSE that was observed.
    """
    # NOTE(review): train_frac / dev_frac are accepted but never forwarded to
    # execute_workflow_fn, so the workflow runs with its own split settings.
    # Presumably they should be passed along - confirm against the workflow's signature.
    param_names = [dim.name for dim in search_space]

    # objective function to minimize; use_named_args turns the list of values
    # proposed by gp_minimize into keyword arguments named after the dimensions
    @use_named_args(search_space)
    def objective(**params):
        output = execute_workflow_fn(
            pairs_timeseries_df,
            **params,
            verbose=False,  # for speed
        )
        # test MSE gathered from a workflow is the MSE as gathered from the last year.
        # In the context inside the workflow, it is test MSE.
        # In the context of our hyperparameter optimization algorithm, it is val MSE.
        return output['test_mse']

    # run Bayesian optimization
    res = gp_minimize(
        func=objective,
        dimensions=search_space,
        n_calls=n_calls,
        # n_initial_points replaces the deprecated n_random_starts argument
        n_initial_points=n_initial_points,
        random_state=random_state,
    )

    # report the best hyperparameters and hand them back to the caller
    best_params = dict(zip(param_names, res.x))
    best_mse = res.fun
    print("Best hyperparameters:", best_params)
    print("Best validation MSE:", best_mse)
    return best_params, best_mse

# time series cross validation: gather metrics over first 6 periods
# TODO: average across 5 pairs
# NOTE(review): pairs_timeseries_df is built once in the data gathering cell and
# is not rebuilt per rolling_end_year; once max_end_year is raised, every
# iteration would reuse the same data - confirm whether it should be re-gathered.
start_year = 2008
min_end_year = 2016
max_end_year = 2016 # actually: 2021
for rolling_end_year in range(min_end_year, max_end_year + 1):
    # the full window runs from the fixed start year to the end of the rolling end year
    startDateStr = f"{start_year}-01-01"
    endDateStr = f"{rolling_end_year}-12-31"
    # the test window is the last calendar year of the full window
    # (this previously referenced the undefined name `end_year`, which raised a NameError)
    startDateStrTest = f"{rolling_end_year}-01-01"
    endDateStrTest = endDateStr
    train_frac, dev_frac = _get_train_dev_frac(startDateStr, endDateStr, startDateStrTest, endDateStrTest)
    bayesian_optimize_workflow(execute_kalman_workflow, pairs_timeseries_df, train_frac, dev_frac, search_space)