In [23]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

# custom code for collecting data
from src.data.scraper import load_cached_etf_tickers
from src.data.data_collection import gather_data

# custom code for collecting cached data
from src.data.data_collection_cache import gather_data_cached, save_data, _tickers_to_hash, _get_filename

### Configs - change these values to choose which date range / ticker universe gets cached
startDateStr = '2010-10-01'
# NOTE: the yahoofinance documentation and the original code both describe the end
# date as exclusive, but printing the resulting shapes showed otherwise.
endDateStr = '2024-10-02'
# Ticker universe, read from the scraper's cached ETF ticker list.
instrumentIdsNASDAQandNYSE = load_cached_etf_tickers()
###
# Uncomment to fetch fresh (non-cached) data into `live_data`, which later cells read.
# live_data = gather_data(startDateStr, endDateStr, instrumentIdsNASDAQandNYSE)

In [13]:
# `live_data` is only bound when the (normally commented-out) gather_data call in
# the config cell has been executed in this kernel session. Fetch it here when it
# is missing so the cell also works after "Restart Kernel -> Run All" instead of
# failing with a NameError.
if 'live_data' not in globals():
  live_data = gather_data(startDateStr, endDateStr, instrumentIdsNASDAQandNYSE)

# Persist the freshly gathered data under the cache directory, keyed by the
# request (date range + ticker list) that is passed alongside it.
save_data(live_data, startDateStr, endDateStr, instrumentIdsNASDAQandNYSE, cache_dir='../src/data/cache')

# Last expression: display the gathered data for visual inspection.
live_data

{'close': Empty DataFrame
 Columns: [MEDX, DFGX]
 Index: [],
 'open': Empty DataFrame
 Columns: [MEDX, DFGX]
 Index: [],
 'high': Empty DataFrame
 Columns: [MEDX, DFGX]
 Index: [],
 'low': Empty DataFrame
 Columns: [MEDX, DFGX]
 Index: [],
 'vol': Empty DataFrame
 Columns: [MEDX, DFGX]
 Index: [],
 'yfinance_formatted': Empty DataFrame
 Columns: [(MEDX, Open), (MEDX, High), (MEDX, Low), (MEDX, Close), (MEDX, Adj Close), (MEDX, Volume), (DFGX, Open), (DFGX, High), (DFGX, Low), (DFGX, Close), (DFGX, Adj Close), (DFGX, Volume)]
 Index: []}

In [None]:
# Read the same request (dates + tickers) back through the cache layer.
cached_data = gather_data_cached(
  startDateStr,
  endDateStr,
  instrumentIdsNASDAQandNYSE,
  cache_dir='../src/data/cache',
)

# NOTE(review): both objects are displayed as dicts, so len() only compares the
# number of keys -- this is a coarse sanity check; confirm whether a per-frame
# comparison was intended.
assert len(cached_data) == len(live_data), "The lengths of live and cached data do not match."

# Last expression: display what came back from the cache.
cached_data

{'close': Empty DataFrame
 Columns: [MEDX, DFGX]
 Index: [],
 'open': Empty DataFrame
 Columns: [MEDX, DFGX]
 Index: [],
 'high': Empty DataFrame
 Columns: [MEDX, DFGX]
 Index: [],
 'low': Empty DataFrame
 Columns: [MEDX, DFGX]
 Index: [],
 'vol': Empty DataFrame
 Columns: [MEDX, DFGX]
 Index: [],
 'yfinance_formatted': Empty DataFrame
 Columns: [(MEDX, Open), (MEDX, High), (MEDX, Low), (MEDX, Close), (MEDX, Adj Close), (MEDX, Volume), (DFGX, Open), (DFGX, High), (DFGX, Low), (DFGX, Close), (DFGX, Adj Close), (DFGX, Volume)]
 Index: []}

In [None]:
# Name returned by the cache module's helper for this request
# (presumably the cache file name -- verify against data_collection_cache).
filename = _get_filename(
  startDateStr,
  endDateStr,
  instrumentIdsNASDAQandNYSE,
)

# Load the data for the same request through the cache layer.
cached_data = gather_data_cached(
  startDateStr,
  endDateStr,
  instrumentIdsNASDAQandNYSE,
  cache_dir='../src/data/cache',
)

In [2]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))


In [7]:
from src.preprocessing.filters import step_1_filter_remove_nans, step_2_filter_liquidity
from src.preprocessing.cointegration import find_cointegrated_pairs
from src.preprocessing.data_preprocessing import filter_pairs_data
from src.data.data_collection_cache import gather_pairs_data_cached, save_pairs_data_filtered
from src.data.scraper import load_cached_etf_tickers
from src.data.data_collection_cache import gather_data_cached

# Period handled by this cell.
startDateStr = '2008-10-01'
endDateStr = '2018-10-02'
instrumentIdsNASDAQandNYSE = load_cached_etf_tickers()

# Cached price data for the period; indexed below by 'close', 'open', 'high', 'low', 'vol'.
data = gather_data_cached(
  startDateStr,
  endDateStr,
  instrumentIdsNASDAQandNYSE,
  cache_dir='../src/data/cache',
)

# Filter step 1 (NaN removal, per the helper's name): takes the five price/volume
# frames plus the full data object and returns six filtered counterparts.
(
  data_close_filtered_1,
  data_open_filtered_1,
  data_high_filtered_1,
  data_low_filtered_1,
  data_vol_filtered_1,
  data_original_format_filtered_1,
) = step_1_filter_remove_nans(
  data['close'],
  data['open'],
  data['high'],
  data['low'],
  data['vol'],
  data,
)

# Filter step 2 (liquidity, per the helper's name): consumes the step-1 outputs in the same order.
(
  data_close_filtered_2,
  data_open_filtered_2,
  data_high_filtered_2,
  data_low_filtered_2,
  data_vol_filtered_2,
  data_original_format_filtered_2,
) = step_2_filter_liquidity(
  data_close_filtered_1,
  data_open_filtered_1,
  data_high_filtered_1,
  data_low_filtered_1,
  data_vol_filtered_1,
  data_original_format_filtered_1,
)

# Original author's note: of the 820 candidate pairs for this period only 95 are
# returned, because of the 0.05 cut-off applied inside find_cointegrated_pairs.
scores, pvalues, pairs = find_cointegrated_pairs(data_original_format_filtered_2)

# Keep element [1] of each pair's value and order the (pair, value) tuples ascending by it.
pairs_data = sorted(
  ((pair, stats[1]) for pair, stats in pairs.items()),
  key=lambda item: item[1],
)
pairs_data_filtered = filter_pairs_data(pairs_data)

# Round-trip through the cache; the final expression displays whether the reloaded
# pairs equal the freshly computed ones.
save_pairs_data_filtered(
  pairs_data_filtered,
  startDateStr,
  endDateStr,
  instrumentIdsNASDAQandNYSE,
  cache_dir='../src/data/cache',
)
pairs_data_from_cache = gather_pairs_data_cached(
  startDateStr,
  endDateStr,
  instrumentIdsNASDAQandNYSE,
  cache_dir='../src/data/cache',
)
pairs_data_from_cache == pairs_data_filtered

Processing pairs: 100%|██████████| 820/820 [02:13<00:00,  6.15it/s]

Completed 820 pairs





True

In [9]:
# Now do it for all other time periods as well
from datetime import datetime
def _get_train_dev_frac(startDateStr, endDateStr, startDateStrTest, endDateStrTest, verbose=False):
  """
  For certain periods, we want a specific testing period, and must therefore calculate the train/dev split based on that.
  """
  # convert all 4 dates to datetime
  startDateStr = datetime.strptime(startDateStr, '%Y-%m-%d')
  endDateStr = datetime.strptime(endDateStr, '%Y-%m-%d')
  startDateStrTest = datetime.strptime(startDateStrTest, '%Y-%m-%d')
  endDateStrTest = datetime.strptime(endDateStrTest, '%Y-%m-%d')

  total_days = (endDateStr - startDateStr).days
  test_days = (endDateStrTest - startDateStrTest).days
  train_days = total_days - 2 * test_days

  train_frac = train_days / total_days
  test_frac = test_days / total_days
  dev_frac = test_frac

  if verbose:
    print(f"train_frac: {train_frac}")
    print(f"dev_frac: {dev_frac}")
    print(f"test_frac: {test_frac}")

  return train_frac, dev_frac

## PERIOD 1 - fixed split: 90% train, 5% dev (remaining 5% presumably test - TODO confirm)
startDateStr1 = '2010-10-01'
endDateStr1 = '2024-10-02'
train_frac1 = 0.90
dev_frac1 = 0.05

## PERIOD 2 - same fixed 90% / 5% split as period 1
startDateStr2 = '2008-10-01'
endDateStr2 = '2018-10-02'
train_frac2 = 0.90
dev_frac2 = 0.05

## PERIOD 3 - fractions derived from a fixed test window
startDateStr3 = '2007-01-01'
endDateStr3 = '2022-12-31'
# wanted test window: 2022-01-01 to 2022-12-31
startDateStrTest3 = '2022-01-01'
endDateStrTest3 = '2022-12-31'
train_frac3, dev_frac3 = _get_train_dev_frac(startDateStr3, endDateStr3, startDateStrTest3, endDateStrTest3)

## PERIOD 4 - fractions derived from a fixed test window
startDateStr4 = '2007-01-01'
endDateStr4 = '2024-12-31'
# wanted test window: 2024-01-01 to 2024-12-31
startDateStrTest4 = '2024-01-01'
endDateStrTest4 = '2024-12-31'
train_frac4, dev_frac4 = _get_train_dev_frac(startDateStr4, endDateStr4, startDateStrTest4, endDateStrTest4)

## PERIOD 5 - fractions derived from a fixed test window
startDateStr5 = '2007-01-01'
endDateStr5 = '2016-12-31'
# wanted test window: 2014-07-01 to 2016-12-31
startDateStrTest5 = '2014-07-01'
endDateStrTest5 = '2016-12-31'
train_frac5, dev_frac5 = _get_train_dev_frac(startDateStr5, endDateStr5, startDateStrTest5, endDateStrTest5)

## PERIOD 6 - fractions derived from a fixed test window
startDateStr6 = '2007-01-01'
endDateStr6 = '2021-12-31'
# wanted test window: 2020-01-01 to 2021-12-31
startDateStrTest6 = '2020-01-01'
endDateStrTest6 = '2021-12-31'
train_frac6, dev_frac6 = _get_train_dev_frac(startDateStr6, endDateStr6, startDateStrTest6, endDateStrTest6)

# One entry per period to process: (start date, end date, train fraction, dev fraction).
all_inputs = [
    (startDateStr1, endDateStr1, train_frac1, dev_frac1),
    # Period 2 is left out. NOTE(review): its dates match the single-period cell
    # earlier in the notebook, so it looks like it was already processed there - confirm.
    # (startDateStr2, endDateStr2, train_frac2, dev_frac2),
    (startDateStr3, endDateStr3, train_frac3, dev_frac3),
    (startDateStr4, endDateStr4, train_frac4, dev_frac4),
    (startDateStr5, endDateStr5, train_frac5, dev_frac5),
    (startDateStr6, endDateStr6, train_frac6, dev_frac6)
]

# Run the filter -> cointegration -> cache round-trip pipeline once per period.
# train_frac / dev_frac are unpacked from each entry but not used inside this loop body.
for startDateStr, endDateStr, train_frac, dev_frac in all_inputs:
  # Cached price data for the period; indexed below by 'close', 'open', 'high', 'low', 'vol'.
  data = gather_data_cached(
    startDateStr,
    endDateStr,
    instrumentIdsNASDAQandNYSE,
    cache_dir='../src/data/cache',
  )

  # Filter step 1 (NaN removal, per the helper's name).
  (
    data_close_filtered_1,
    data_open_filtered_1,
    data_high_filtered_1,
    data_low_filtered_1,
    data_vol_filtered_1,
    data_original_format_filtered_1,
  ) = step_1_filter_remove_nans(
    data['close'],
    data['open'],
    data['high'],
    data['low'],
    data['vol'],
    data,
  )

  # Filter step 2 (liquidity, per the helper's name), fed with the step-1 outputs in the same order.
  (
    data_close_filtered_2,
    data_open_filtered_2,
    data_high_filtered_2,
    data_low_filtered_2,
    data_vol_filtered_2,
    data_original_format_filtered_2,
  ) = step_2_filter_liquidity(
    data_close_filtered_1,
    data_open_filtered_1,
    data_high_filtered_1,
    data_low_filtered_1,
    data_vol_filtered_1,
    data_original_format_filtered_1,
  )

  # The number of candidate pairs differs per period (see the progress output of this cell).
  scores, pvalues, pairs = find_cointegrated_pairs(data_original_format_filtered_2)

  # Keep element [1] of each pair's value and order the (pair, value) tuples ascending by it.
  pairs_data = sorted(
    ((pair, stats[1]) for pair, stats in pairs.items()),
    key=lambda item: item[1],
  )
  pairs_data_filtered = filter_pairs_data(pairs_data)

  # Save, reload and report whether the reloaded pairs equal the computed ones.
  save_pairs_data_filtered(
    pairs_data_filtered,
    startDateStr,
    endDateStr,
    instrumentIdsNASDAQandNYSE,
    cache_dir='../src/data/cache',
  )
  pairs_data_from_cache = gather_pairs_data_cached(
    startDateStr,
    endDateStr,
    instrumentIdsNASDAQandNYSE,
    cache_dir='../src/data/cache',
  )
  print(f"Checking for time period {startDateStr} {endDateStr}, equal is {pairs_data_from_cache == pairs_data_filtered}")

Processing pairs: 100%|██████████| 1711/1711 [07:07<00:00,  4.00it/s]


Completed 1711 pairs
Checking for time period 2010-10-01 2024-10-02, equal is True


Processing pairs: 100%|██████████| 253/253 [01:14<00:00,  3.41it/s]


Completed 253 pairs
Checking for time period 2007-01-01 2022-12-31, equal is True


Processing pairs: 100%|██████████| 253/253 [01:26<00:00,  2.91it/s]


Completed 253 pairs
Checking for time period 2007-01-01 2024-12-31, equal is True


Processing pairs: 100%|██████████| 210/210 [00:32<00:00,  6.41it/s]


Completed 210 pairs
Checking for time period 2007-01-01 2016-12-31, equal is True


Processing pairs: 100%|██████████| 253/253 [01:05<00:00,  3.85it/s]

Completed 253 pairs
Checking for time period 2007-01-01 2021-12-31, equal is True



