This notebook is meant for *developing* the hyperparameter optimization. The final hyperparameter-optimization metrics can be found in *total_results.ipynb*.

In [1]:
# Always start from /content so the clone below lands in the same place,
# no matter which directory a previous run of this cell left us in.
%cd /content
import os
if os.path.exists("bsc-thesis"):
  # a checkout from a previous run exists: delete it entirely so the clone below starts clean
  !rm -rf bsc-thesis

# clone the selected branch of the repository
branch = "main"
!git clone --branch $branch https://github.com/maviddoerdijk/bsc-thesis.git

# move into the project's source directory and list its contents
%cd bsc-thesis/src
%ls

/content
Cloning into 'bsc-thesis'...
remote: Enumerating objects: 1010, done.[K
remote: Counting objects: 100% (150/150), done.[K
remote: Compressing objects: 100% (102/102), done.[K
remote: Total 1010 (delta 100), reused 71 (delta 48), pack-reused 860 (from 2)[K
Receiving objects: 100% (1010/1010), 39.84 MiB | 12.42 MiB/s, done.
Resolving deltas: 100% (584/584), done.
Filtering content: 100% (32/32), 1.75 GiB | 43.57 MiB/s, done.
/content/bsc-thesis/src
[0m[01;34mbacktesting[0m/  [01;34mdata[0m/      main.ipynb  [01;34mmodels[0m/         [01;34mutils[0m/
[01;34mconfig[0m/       [01;34mexternal[0m/  main.py     [01;34mpreprocessing[0m/


In [2]:
# Install the third-party packages this notebook needs.
# NOTE(review): none of the versions are pinned, so a re-run may pick up different releases.
!pip install ta # remove after updated kalman workflow
!pip install pykalman
!pip install PyWavelets # remove after updated kalman workflow
!pip install curl-cffi # remove after updated kalman workflow
!pip install scikit-optimize



In [3]:
# NOTE(review): scikit-optimize is already installed by the last line of the previous cell; this cell appears to be redundant.
!pip install scikit-optimize



In [7]:
## data gathering imports
from utils.helpers import _get_train_dev_frac
from preprocessing.filters import step_1_filter_remove_nans, step_2_filter_liquidity
from preprocessing.cointegration import find_cointegrated_pairs
from preprocessing.data_preprocessing import filter_pairs_data
from preprocessing.technical_indicators import combine_pairs_data
## specific caching imports (should be changed in case you want to gather data live)
from data.scraper import load_cached_etf_tickers
from data.data_collection_cache import gather_data_cached, gather_data_cached_using_truncate, gather_pairs_data_cached, save_pairs_data_filtered

## workflow imports
from models.statistical_models import execute_kalman_workflow

## optimize-specific imports
from skopt import gp_minimize
from skopt.space import Real
from skopt.utils import use_named_args
import numpy as np
from typing import Callable, Any, List, Dict, Tuple

In [9]:
# Ranges explored by the optimizer. The `name` of each dimension is used
# directly as a keyword argument, and every range is sampled log-uniformly.
# NOTE(review): in the run recorded at the bottom of this notebook all four
# best values sit on a bound of their range (delta and obs_cov_avg on the
# lower bound, obs_cov_reg and trans_cov_avg on the upper bound) - consider
# whether the ranges should be widened.
search_space = [
    Real(low=1e-5, high=0.1, prior='log-uniform', name='delta'),
    Real(low=0.5, high=4, prior='log-uniform', name='obs_cov_reg'),
    Real(low=0.001, high=0.1, prior='log-uniform', name='trans_cov_avg'),
    Real(low=0.1, high=10, prior='log-uniform', name='obs_cov_avg'),
]

# Seed handed to the optimizer as its random state.
SEED = 3178749

# Directory holding the cached price data and cached pair selections.
_CACHE_DIR = '../src/data/cache'


def _build_period_pair_frames(
    start_year: int,
    rolling_end_year: int,
    instrument_ids: Any,
    top_pair_count: int
) -> List[Any]:
    """Build the per-pair time series for one cross-validation period.

    Nothing in here depends on the hyperparameters being tuned, so the result
    can be computed once and reused for every evaluation of the objective.

    Args:
        start_year: First year of the period (the period starts on January 1st).
        rolling_end_year: Last year of the period (the period ends on December 31st).
        instrument_ids: Ticker collection as returned by `load_cached_etf_tickers`.
        top_pair_count: Number of leading entries of the filtered pair list to use.

    Returns:
        One `combine_pairs_data` result per selected pair, in pair-list order.

    Raises:
        IndexError: If fewer than `top_pair_count` pairs are available for the period.
    """
    startDateStr = f"{start_year}-01-01"
    endDateStr = f"{rolling_end_year}-12-31"
    startDateStrTest = f"{rolling_end_year}-01-01"
    endDateStrTest = endDateStr

    # NOTE(review): these fractions are computed but never handed to the
    # workflow function, so the workflow uses whatever split it defaults to.
    # Confirm whether they should be forwarded to `execute_workflow_fn`.
    train_frac, dev_frac = _get_train_dev_frac(startDateStr, endDateStr, startDateStrTest, endDateStrTest)

    # every period has its own date range, so it needs its own filtered data and pair list
    data = gather_data_cached_using_truncate(startDateStr, endDateStr, instrument_ids, cache_dir=_CACHE_DIR)
    close_1, open_1, high_1, low_1, vol_1, original_format_1 = step_1_filter_remove_nans(
        data['close'], data['open'], data['high'], data['low'], data['vol'], data
    )
    close_2, open_2, high_2, low_2, vol_2, original_format_2 = step_2_filter_liquidity(
        close_1, open_1, high_1, low_1, vol_1, original_format_1
    )

    pairs_data_filtered = gather_pairs_data_cached(startDateStr, endDateStr, instrument_ids, cache_dir=_CACHE_DIR)
    if pairs_data_filtered is None:
        _, _, pairs = find_cointegrated_pairs(original_format_2)
        pairs_data = {key: value[1] for (key, value) in pairs.items()}
        pairs_data = sorted(pairs_data.items(), key=lambda x: x[1])
        pairs_data_filtered = filter_pairs_data(pairs_data)
        # it could not be retrieved from the cache, so make sure it is saved for later
        save_pairs_data_filtered(pairs_data_filtered, startDateStr, endDateStr, instrument_ids, cache_dir=_CACHE_DIR)

    # fail fast, before any optimization time is spent, if the period has too few pairs
    if len(pairs_data_filtered) < top_pair_count:
        raise IndexError(
            f"Only {len(pairs_data_filtered)} pairs are available for {startDateStr} to {endDateStr}, "
            f"but top_pair_count={top_pair_count} were requested."
        )

    pair_frames = []
    for pair_idx in range(top_pair_count):
        ticker_a, ticker_b = pairs_data_filtered[pair_idx][0][0], pairs_data_filtered[pair_idx][0][1]
        pair_frames.append(
            combine_pairs_data(close_2, open_2, high_2, low_2, vol_2, ticker_a, ticker_b)
        )
    return pair_frames


def bayesian_optimize_workflow(
    execute_workflow_fn: Callable,
    top_pair_count: int,
    start_year: int,
    min_end_year: int,
    max_end_year: int,
    search_space: List[Real],
    seed: int,
    verbose: bool,
    n_calls: int = 30,
    n_initial_points: int = 10
) -> Tuple[
    Dict[str, Any], # best_params
    float # best_mean_mse
]:
    """Tune a workflow's hyperparameters with Gaussian-process Bayesian optimization.

    The objective is the mean of `output['test_mse']` over every combination of
    period and pair, where the periods run from `start_year` to each end year
    in `min_end_year..max_end_year` (inclusive) and the pairs are the first
    `top_pair_count` entries of each period's filtered pair list.

    The period data is prepared once up front; each evaluation of the
    objective only runs the workflow itself.

    NOTE(review): the objective minimizes `output['test_mse']`. If that is a
    held-out test metric, tuning on it leaks test information into the chosen
    hyperparameters - confirm whether a validation metric should be used.

    Args:
        execute_workflow_fn: Called as `fn(pair_frame, **params, verbose=False)`;
            must return a mapping with a 'test_mse' entry.
        top_pair_count: Number of pairs evaluated per period (at least 1).
        start_year: First year of every period.
        min_end_year: End year of the shortest period.
        max_end_year: End year of the longest period (included).
        search_space: Named dimensions; each name is passed to the workflow as a kwarg.
        seed: Random state for the optimizer.
        verbose: Forwarded to `gp_minimize` (the workflow itself always runs non-verbose).
        n_calls: Total number of objective evaluations.
        n_initial_points: Number of random evaluations before the surrogate model
            is used. Replaces the deprecated `n_random_starts` argument.

    Returns:
        `(best_params, best_mean_mse)`: the best parameter values keyed by
        dimension name, and the objective value they achieved.

    Raises:
        ValueError: If `top_pair_count < 1` or `min_end_year > max_end_year`
            (both would leave the objective with nothing to average).
        IndexError: If a period offers fewer than `top_pair_count` pairs.
    """
    import copy

    if top_pair_count < 1:
        raise ValueError(f"top_pair_count must be at least 1, got {top_pair_count}")
    if min_end_year > max_end_year:
        raise ValueError(
            f"min_end_year ({min_end_year}) must not be later than max_end_year ({max_end_year})"
        )

    param_names = [dim.name for dim in search_space]
    instrumentIds = load_cached_etf_tickers()

    # time series cross validation: prepare all periods once, because none of
    # this depends on the hyperparameters (+1 such that end year is actually included!)
    pair_frames = []
    for rolling_end_year in range(min_end_year, max_end_year + 1):
        pair_frames.extend(
            _build_period_pair_frames(start_year, rolling_end_year, instrumentIds, top_pair_count)
        )

    @use_named_args(search_space)
    def objective(**params):
        total_mse_list = []
        for pairs_timeseries_df in pair_frames:
            output = execute_workflow_fn(
                # hand over a private copy so a workflow that modifies its input
                # cannot influence later evaluations
                copy.deepcopy(pairs_timeseries_df),
                **params,
                verbose=False
            )
            total_mse_list.append(output['test_mse'])
        # mean across time periods and pairs for the current choice of hyperparameters
        return np.mean(total_mse_list)

    # gather results
    res = gp_minimize(
        func=objective,
        dimensions=search_space,
        n_calls=n_calls,
        n_initial_points=n_initial_points,
        random_state=seed,
        verbose=verbose
    )

    best_params = dict(zip(param_names, res.x))
    best_mean_mse = res.fun

    return best_params, best_mean_mse # TODO: we might want to create some plots across timeperiods or something like that, implement a way to look at all results? But first draw what I would want the plot to look like

# Tune the Kalman workflow on a single period (2008 through 2016, since
# min_end_year == max_end_year) using the first three pairs of that period.
best_params, best_mean_mse = bayesian_optimize_workflow(
    # what to optimize, and over which ranges
    execute_workflow_fn=execute_kalman_workflow,
    search_space=search_space,
    # data selection
    start_year=2008,
    min_end_year=2016,
    max_end_year=2016,
    top_pair_count=3,
    # optimizer settings
    seed=SEED,
    verbose=True,
)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 11.1969
Function value obtained: 21.0202
Current minimum: 21.0202
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 6.8861
Function value obtained: 22.5121
Current minimum: 21.0202
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 8.2081
Function value obtained: 22.4492
Current minimum: 21.0202
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 7.8497
Function value obtained: 5.2494
Current minimum: 5.2494
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 6.8010
Function value obtained: 23.4910
Current minimum: 5.2494
Iteration No: 6 started. Ev



Iteration No: 19 ended. Search finished for the next optimal point.
Time taken: 7.5523
Function value obtained: 22.0593
Current minimum: 3.7692
Iteration No: 20 started. Searching for the next optimal point.




Iteration No: 20 ended. Search finished for the next optimal point.
Time taken: 8.8848
Function value obtained: 21.5859
Current minimum: 3.7692
Iteration No: 21 started. Searching for the next optimal point.
Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 8.4003
Function value obtained: 3.7692
Current minimum: 3.7692
Iteration No: 22 started. Searching for the next optimal point.




Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 7.9583
Function value obtained: 9.9656
Current minimum: 3.7692
Iteration No: 23 started. Searching for the next optimal point.




Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 8.7458
Function value obtained: 19.6827
Current minimum: 3.7692
Iteration No: 24 started. Searching for the next optimal point.




Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 9.2539
Function value obtained: 20.3000
Current minimum: 3.7692
Iteration No: 25 started. Searching for the next optimal point.




Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 7.9373
Function value obtained: 23.4925
Current minimum: 3.7692
Iteration No: 26 started. Searching for the next optimal point.




Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 9.3063
Function value obtained: 23.5144
Current minimum: 3.7692
Iteration No: 27 started. Searching for the next optimal point.




Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 9.2828
Function value obtained: 13.9292
Current minimum: 3.7692
Iteration No: 28 started. Searching for the next optimal point.




Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 7.4994
Function value obtained: 23.4789
Current minimum: 3.7692
Iteration No: 29 started. Searching for the next optimal point.
Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 8.9414
Function value obtained: 3.7692
Current minimum: 3.7692
Iteration No: 30 started. Searching for the next optimal point.




Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 8.6631
Function value obtained: 3.7834
Current minimum: 3.7692


In [10]:
# Display the best hyperparameters found and the mean MSE they achieved.
best_params, best_mean_mse

({'delta': 1e-05,
  'obs_cov_reg': 3.999999999999999,
  'trans_cov_avg': 0.1,
  'obs_cov_avg': 0.1},
 np.float64(3.769194848453971))