This notebook is meant for *development* of the hyperparameter optimization. The final hyperparameter-optimization metrics can be found in *total_results.ipynb*.

In [None]:
# Environment setup: get a fresh checkout of the project and move into its src/ directory.
# Always start from /content (unconditionally) so the relative paths below resolve the
# same way on every re-run of this cell.
%cd /content
import os
if os.path.exists("bsc-thesis"):
  # a previous checkout exists: remove it completely so the clone below starts clean
  !rm -rf bsc-thesis

# clone the selected branch of the repository
branch = "main"
!git clone --branch $branch https://github.com/maviddoerdijk/bsc-thesis.git

# move into the project's source directory (the project imports in later cells are
# presumably resolved relative to it — confirm) and list its contents as a sanity check
%cd bsc-thesis/src
%ls

In [None]:
# Install third-party packages into the runtime (presumably required by the project
# modules imported in later cells — confirm against the repository's requirements).
!pip install ta
!pip install pykalman
!pip install PyWavelets
!pip install curl-cffi

In [None]:
# Only reports the installed scikit-optimize distribution (it does not install it).
# NOTE(review): the `skopt` imports in this notebook need this package; no cell shown
# here installs it — confirm it is available in the runtime.
!pip show scikit-optimize

In [None]:
## standard library imports
from typing import Any, Callable, Dict, List, Tuple

## optimize-specific imports (third-party)
import numpy as np
import pandas as pd
from skopt import gp_minimize
from skopt.space import Real
from skopt.utils import use_named_args

## data gathering imports
from utils.helpers import _get_train_dev_frac
from preprocessing.filters import step_1_filter_remove_nans, step_2_filter_liquidity
from preprocessing.cointegration import find_cointegrated_pairs
from preprocessing.data_preprocessing import filter_pairs_data
from preprocessing.technical_indicators import combine_pairs_data
## specific caching imports (should be changed in case you want to gather data live)
from data.scraper import load_cached_etf_tickers
from data.data_collection_cache import gather_data_cached, gather_data_cached_using_truncate

## workflow imports
from models.statistical_models import execute_kalman_workflow

In [None]:
### Standard Data Gathering Code ###
# Loads cached ETF price data, applies two filtering passes, obtains the list of
# filtered pairs (from cache, or computed when the cache returns None) and builds the
# combined time-series frame for the first pair in that list.
# NOTE(review): startDateStr and endDateStr are not defined anywhere in the visible
# notebook; they must be set before this cell runs — confirm the intended date range.
instrumentIdsNASDAQandNYSE = load_cached_etf_tickers()
data = gather_data_cached_using_truncate(startDateStr, endDateStr, instrumentIdsNASDAQandNYSE, cache_dir='../src/data/cache')
# Filtering pass 1 and 2 over the close/open/high/low/volume frames (NaN removal, then
# liquidity, going by the helper names); each pass returns the same six-part layout.
data_close_filtered_1, data_open_filtered_1, data_high_filtered_1, data_low_filtered_1, data_vol_filtered_1, data_original_format_filtered_1 = step_1_filter_remove_nans(data['close'], data['open'], data['high'], data['low'], data['vol'], data)
data_close_filtered_2, data_open_filtered_2, data_high_filtered_2, data_low_filtered_2, data_vol_filtered_2, data_original_format_filtered_2 = step_2_filter_liquidity(data_close_filtered_1, data_open_filtered_1, data_high_filtered_1, data_low_filtered_1, data_vol_filtered_1, data_original_format_filtered_1)

# NOTE(review): gather_pairs_data_cached is not among the imports visible in this
# notebook — confirm it is imported (presumably from data.data_collection_cache).
pairs_data_filtered = gather_pairs_data_cached(startDateStr, endDateStr, instrumentIdsNASDAQandNYSE, cache_dir='../src/data/cache')
if pairs_data_filtered is None:
  # no cached result: compute the pairs from the filtered data instead
  scores, pvalues, pairs = find_cointegrated_pairs(data_original_format_filtered_2)
  # keep element [1] of each pair's value and sort ascending by it
  # (presumably a p-value, so the best pairs come first — confirm)
  pairs_data = {key:value[1]  for (key, value) in pairs.items()}
  pairs_data = sorted(pairs_data.items(), key=lambda x: x[1])
  pairs_data_filtered = filter_pairs_data(pairs_data)

# the key of the first entry holds the two tickers of the top pair
ticker_a, ticker_b = pairs_data_filtered[0][0][0], pairs_data_filtered[0][0][1]
pairs_timeseries_df = combine_pairs_data(data_close_filtered_2, data_open_filtered_2, data_high_filtered_2, data_low_filtered_2, data_vol_filtered_2, ticker_a, ticker_b)
### Standard Data Gathering Code ###

In [None]:
# Hyperparameter search space: one log-uniform Real dimension per tunable parameter.
# Each dimension's name is forwarded verbatim as a keyword argument to the workflow
# function, so it must match that function's parameter names.
search_space = [
    Real(lower, upper, name=param_name, prior='log-uniform')
    for param_name, lower, upper in (
        ('delta', 1e-5, 1e-3),
        ('obs_cov_reg', 0.5, 2.0),
        ('trans_cov_avg', 0.01, 0.1),
        ('obs_cov_avg', 0.1, 1.0),
    )
]

def bayesian_optimize_workflow(
    execute_workflow_fn: Callable,
    pairs_data_filtered: List,
    data_close_filtered_2: pd.DataFrame,
    data_open_filtered_2: pd.DataFrame,
    data_high_filtered_2: pd.DataFrame,
    data_low_filtered_2: pd.DataFrame,
    data_vol_filtered_2: pd.DataFrame,
    top_pair_count: int,
    start_year: int,
    min_end_year: int,
    max_end_year: int,
    search_space: List[Real],
    n_calls: int = 30,
    n_random_starts: int = 10,
    random_state: int = 42,
) -> Dict[int, Dict[int, Dict[str, Any]]]:
    """Run Bayesian hyperparameter optimization per rolling period and per pair.

    For every end year in ``[min_end_year, max_end_year]`` and for each of the
    first ``top_pair_count`` entries of ``pairs_data_filtered``, the pair's
    combined time series is built with ``combine_pairs_data`` and
    ``gp_minimize`` searches ``search_space`` for the parameters that minimize
    the ``'test_mse'`` entry of the dict returned by ``execute_workflow_fn``.

    Args:
        execute_workflow_fn: Called as
            ``execute_workflow_fn(pairs_timeseries_df, **params, verbose=False)``
            where ``params`` is keyed by the dimension names of
            ``search_space``; must return a mapping containing ``'test_mse'``.
        pairs_data_filtered: Sequence whose items carry the two tickers of a
            pair at ``item[0][0]`` and ``item[0][1]``.
        data_close_filtered_2: Passed through to ``combine_pairs_data``.
        data_open_filtered_2: Passed through to ``combine_pairs_data``.
        data_high_filtered_2: Passed through to ``combine_pairs_data``.
        data_low_filtered_2: Passed through to ``combine_pairs_data``.
        data_vol_filtered_2: Passed through to ``combine_pairs_data``.
        top_pair_count: Number of leading pairs to optimize for each period.
        start_year: First year of every period (periods start on Jan 1st).
        min_end_year: End year of the first period (inclusive).
        max_end_year: End year of the last period (inclusive).
        search_space: Named skopt dimensions to optimize over.
        n_calls: Number of objective evaluations per ``gp_minimize`` run.
        n_random_starts: Number of random initial evaluations per run.
        random_state: Seed handed to ``gp_minimize`` for reproducibility.

    Returns:
        Nested dict ``end_year -> pair_idx -> result`` where ``result`` holds
        ``"best_params"``, ``"best_val_mse"``, ``"period"``, ``"pair_idx"``
        and ``"pair"``.

    Raises:
        IndexError: If ``top_pair_count`` exceeds the number of available
            pairs.
    """
    available_pairs = len(pairs_data_filtered)
    if top_pair_count > available_pairs:
        # Fail before any optimization starts instead of after the earlier
        # pairs have already been optimized (same exception type as the
        # out-of-range lookup would raise later).
        raise IndexError(
            f"top_pair_count={top_pair_count} exceeds the number of "
            f"available pairs ({available_pairs})"
        )

    param_names = [dim.name for dim in search_space]
    all_results: Dict[int, Dict[int, Dict[str, Any]]] = {}

    # time series cross validation: go over all periods
    for rolling_end_year in range(min_end_year, max_end_year + 1):
        period_results: Dict[int, Dict[str, Any]] = {}
        startDateStr = f"{start_year}-01-01"
        endDateStr = f"{rolling_end_year}-12-31"
        # the final calendar year of the period is the designated test window
        startDateStrTest = f"{rolling_end_year}-01-01"
        endDateStrTest = endDateStr

        # NOTE(review): train_frac / dev_frac are computed but never handed to
        # execute_workflow_fn, and pairs_timeseries_df is not restricted to
        # [startDateStr, endDateStr]. As written, every period therefore runs
        # the optimization on identical inputs — confirm how the split should
        # be forwarded to the workflow.
        train_frac, dev_frac = _get_train_dev_frac(
            startDateStr, endDateStr, startDateStrTest, endDateStrTest
        )

        # for more balanced results, optimize over the top `top_pair_count`
        # pairs rather than a single pair
        for pair_idx in range(top_pair_count):
            pair_key = pairs_data_filtered[pair_idx][0]
            ticker_a, ticker_b = pair_key[0], pair_key[1]
            pairs_timeseries_df = combine_pairs_data(
                data_close_filtered_2,
                data_open_filtered_2,
                data_high_filtered_2,
                data_low_filtered_2,
                data_vol_filtered_2,
                ticker_a,
                ticker_b,
            )

            # The objective is defined per pair because it closes over
            # pairs_timeseries_df; it is consumed by gp_minimize within this
            # same iteration, so the closure always sees the current frame.
            # NOTE(review): it minimizes the workflow's 'test_mse' although
            # the result is stored as "best_val_mse"; if the workflow also
            # reports a validation metric, tuning on that would avoid
            # selecting hyperparameters on test data — confirm.
            @use_named_args(search_space)
            def objective(**params):
                output = execute_workflow_fn(
                    pairs_timeseries_df,
                    **params,
                    verbose=False
                )
                return output['test_mse']

            res = gp_minimize(
                func=objective,
                dimensions=search_space,
                n_calls=n_calls,
                n_random_starts=n_random_starts,
                random_state=random_state
            )

            period_results[pair_idx] = {
                # res.x is ordered like search_space, hence like param_names
                "best_params": dict(zip(param_names, res.x)),
                "best_val_mse": res.fun,
                "period": rolling_end_year,
                "pair_idx": pair_idx,
                "pair": (ticker_a, ticker_b)
            }
        all_results[rolling_end_year] = period_results

    # TODO: the end goal of the TSCV is to find the best parameters: for each
    # parameter set, average the score over all periods and pick the set with
    # the best average. That aggregation is not implemented yet; only the
    # per-period, per-pair results are returned.
    return all_results