# ETF Pipeline

> ETF flow decompositions pipeline.

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# | default_exp etf_pipeline

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
# | hide
import sys
# Put the parent directory on the import path (presumably so the local
# `lobster_tools` package resolves without being installed - TODO confirm).
sys.path.append("../")

In [None]:
# | export
import os

import click
from arcticdb import Arctic, LibraryOptions
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
from pathlib import Path
from lobster_tools.config import MainConfig, register_configs
from lobster_tools.preprocessing import *
from lobster_tools.querying import *
from lobster_tools.flow_decomposition import *
import pandas as pd
from sklearn.linear_model import LinearRegression
from itertools import product
import datetime
from dataclasses import dataclass
from functools import partial
import json
import numpy as np
from pprint import pprint

In [None]:
# | export

# Default-constructed config, reachable through a plain Python import. It is
# replaced just below by the Hydra-composed config.
cfg = MainConfig()
# Register the configs before composing (presumably this is what makes the
# "config" name resolvable by Hydra - TODO confirm against `register_configs`).
register_configs()
with initialize(version_base=None, config_path=None):
    # Compose exactly once: `cfg_omega` keeps the OmegaConf representation and
    # `cfg` is the object built from that very same composition, so the two
    # cannot diverge and the composition work is not repeated.
    cfg_omega = compose(config_name="config")
    cfg = OmegaConf.to_object(cfg_omega)
    print(cfg)
    print(cfg.universe.equities)

MainConfig(hydra=CustomHydraConf(defaults=[{'output': 'default'}, {'launcher': 'basic'}, {'sweeper': 'basic'}, {'help': 'default'}, {'hydra_help': 'default'}, {'hydra_logging': 'default'}, {'job_logging': 'default'}, {'callbacks': None}, {'env': 'default'}], mode=None, searchpath=[], run=RunDir(dir='../custom_directory/${now:%Y-%m-%d}/${now:%H-%M-%S}'), sweep=SweepDir(dir='???', subdir='???'), hydra_logging='???', job_logging='???', sweeper='???', launcher='???', callbacks={}, help=HelpConf(app_name='???', header='???', footer='???', template='???'), hydra_help=HydraHelpConf(hydra_help='???', template='???'), output_subdir='.hydra', overrides=OverridesConf(hydra=[], task=[]), job=JobConf(name='???', chdir=None, override_dirname='???', id='???', num='???', config_name='???', env_set={}, env_copy=[], config=JobConf.JobConfig(override_dirname=JobConf.JobConfig.OverrideDirname(kv_sep='=', item_sep=',', exclude_keys=[]))), runtime=RuntimeConf(version='???', version_base='???', cwd='???', co

#### Note to self
Environment variables might be a good way to set options such as the clip times and the other settings: default each option to `None`, fall back to the corresponding environment variable, and pass a default value to the lookup call (e.g. `os.environ.get(name, default)`).

In [None]:
# def set_environment_variables() -> None:
#     os.environ["LOBSTER_DATA_PATH"] = "/nfs/home/nicolasp/home/data/tmp"
#     os.environ["DEFAULT_TICKER"] = "OKE"

# set_environment_variables()

In [None]:
# Data location and instrument universe, read from the composed config.
directory_path = cfg.data_paths.csv_files_path
etfs, equities = cfg.universe.etfs, cfg.universe.equities

# Hyperparameters of the analysis.
date_range = cfg.hyperparameters.date_range
markouts, max_markout = cfg.hyperparameters.markouts, cfg.hyperparameters.max_markout
finest_resample = cfg.hyperparameters.finest_resample

# Options forwarded to every `Data(...)` constructor call in this notebook.
load, clip_trading_hours, add_ticker_column = "both", True, True

In [None]:
# Local overrides of the config-derived settings, for a quick run on a small
# universe and a single day.
# The data directory can be redirected through the LOBSTER_DATA_PATH
# environment variable; when it is unset, the previous hard-coded location is
# used, so existing behaviour is unchanged.
directory_path = os.environ.get("LOBSTER_DATA_PATH", "/home/petit/Documents/data/lobster/csv")
ticker = "AIG"
date_range = ("2019-01-02", "2019-01-02")
etfs = ["SPY"]
equities = ['GE', 'AIG']

### Single day

In [None]:
# | eval: false
# Load a single ticker over `date_range` and wrap the result in a `Lobster`.
equity_data = Data(
    directory_path=directory_path,
    ticker="AIG",
    date_range=date_range,
    load=load,
    clip_trading_hours=clip_trading_hours,
    add_ticker_column=add_ticker_column,
)
equity_lobsters = Lobster(equity_data)

### Multi-day

In [None]:
# | eval: false
# One `Data` object per equity ticker, all built with the same options.
equity_data = [
    Data(
        directory_path=directory_path,
        ticker=ticker,
        date_range=date_range,
        load=load,
        clip_trading_hours=clip_trading_hours,
        add_ticker_column=add_ticker_column,
    )
    for ticker in equities
]

equity_lobsters = list(map(Lobster, equity_data))

# Stack the executions of every equity into one frame, ordered by its index.
equity_executions = pd.concat(
    [get_executions(lobster.messages) for lobster in equity_lobsters]
).sort_index()

In [None]:
# | eval: false
# One `Data` object per ETF ticker, all built with the same options.
etf_data = [
    Data(
        directory_path=directory_path,
        ticker=ticker,
        date_range=date_range,
        load=load,
        clip_trading_hours=clip_trading_hours,
        add_ticker_column=add_ticker_column,
    )
    for ticker in etfs
]

etf_lobsters = list(map(Lobster, etf_data))

# Stack the executions of every ETF into one frame, ordered by its index.
etf_executions = pd.concat(
    [get_executions(lobster.messages) for lobster in etf_lobsters]
).sort_index()

In [None]:
# | eval: false
# Persist both execution frames with IPython's %store so that a later kernel
# session can restore them instead of reloading the raw data.
%store etf_executions
%store equity_executions

In [None]:
# | eval: true
# Restore every variable previously saved with %store (this notebook saves
# `etf_executions` and `equity_executions`).
%store -r

In [None]:
# | eval: false
# Apply `ofi` (presumably order-flow imbalance - TODO confirm) to the ETF
# executions on "5T" bars. The displayed output shows that the frequency and
# the suffix end up in the column names, e.g. `_SPY_5T_ofi_all`.
ofi_all = ofi(etf_executions, resample_freq="5T", suffix="all")
ofi_all

Unnamed: 0_level_0,_SPY_5T_ofi_all,_GE_5T_ofi_all
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-02 09:35:00,0.123398,0.108859
2019-01-02 09:40:00,0.118186,0.068916
2019-01-02 09:45:00,0.074114,-0.037293
2019-01-02 09:50:00,0.065886,-0.113141
2019-01-02 09:55:00,0.240658,-0.165625
...,...,...
2019-01-02 15:40:00,0.304968,-0.313950
2019-01-02 15:45:00,0.224770,-0.297484
2019-01-02 15:50:00,-0.039497,-0.283260
2019-01-02 15:55:00,0.071907,0.235449


In [None]:
# | eval: false
# One column per markout horizon, indexed like `ofi_all`. In the displayed
# output each value is the row's timestamp shifted forward by that horizon.
markout_times = markout_returns(ofi_all, markouts=markouts)
markout_times

Unnamed: 0_level_0,_30S,_1min,_2min,_5min
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-02 09:35:00,2019-01-02 09:35:30,2019-01-02 09:36:00,2019-01-02 09:37:00,2019-01-02 09:40:00
2019-01-02 09:40:00,2019-01-02 09:40:30,2019-01-02 09:41:00,2019-01-02 09:42:00,2019-01-02 09:45:00
2019-01-02 09:45:00,2019-01-02 09:45:30,2019-01-02 09:46:00,2019-01-02 09:47:00,2019-01-02 09:50:00
2019-01-02 09:50:00,2019-01-02 09:50:30,2019-01-02 09:51:00,2019-01-02 09:52:00,2019-01-02 09:55:00
2019-01-02 09:55:00,2019-01-02 09:55:30,2019-01-02 09:56:00,2019-01-02 09:57:00,2019-01-02 10:00:00
...,...,...,...,...
2019-01-02 15:40:00,2019-01-02 15:40:30,2019-01-02 15:41:00,2019-01-02 15:42:00,2019-01-02 15:45:00
2019-01-02 15:45:00,2019-01-02 15:45:30,2019-01-02 15:46:00,2019-01-02 15:47:00,2019-01-02 15:50:00
2019-01-02 15:50:00,2019-01-02 15:50:30,2019-01-02 15:51:00,2019-01-02 15:52:00,2019-01-02 15:55:00
2019-01-02 15:55:00,2019-01-02 15:55:30,2019-01-02 15:56:00,2019-01-02 15:57:00,2019-01-02 16:00:00


In [None]:
# | eval: false
markout_times = markout_returns(ofi_all, markouts=markouts)

# One resampled mid series per ETF, named after its ticker and placed side by
# side as the columns of a single frame.
mids = pd.concat(
    [
        resample_mid(lobster.book, resample_freq=finest_resample).rename(lobster.data.ticker)
        for lobster in etf_lobsters
    ],
    axis=1,
)
mids

Unnamed: 0_level_0,SPY,GE
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-02 09:30:30,246.060,7.445
2019-01-02 09:31:00,246.125,7.465
2019-01-02 09:31:30,246.510,7.455
2019-01-02 09:32:00,246.515,7.455
2019-01-02 09:32:30,246.520,7.445
...,...,...
2019-01-02 15:58:00,250.070,8.035
2019-01-02 15:58:30,250.005,8.025
2019-01-02 15:59:00,249.895,8.025
2019-01-02 15:59:30,250.115,8.035


In [None]:
# | eval: false
def compute_returns(resample_freq="5T"):
    """Compute markout and contemporaneous mid returns for every ETF.

    Reads the notebook-level variables `etf_executions`, `mids`, `etfs`,
    `markouts` and `max_markout`.

    Parameters
    ----------
    resample_freq : str, default "5T"
        Pandas offset alias of the bars. It is used both for the right-labelled
        execution bars that define the row index and for the contemporaneous
        return. Previously this name was never defined, so the function raised
        a NameError, while the contemporaneous return hard-coded "5T".

    Returns
    -------
    dict
        Maps each ticker in `etfs` to a DataFrame indexed by bar timestamp,
        with one column `_<markout>` per entry of `markouts` (mid at
        t + markout divided by mid at t, minus 1) and a `contemp` column
        (percentage change of the first mid of each bar).
    """
    # Bar timestamps, clipped by `clip_for_markout` using `max_markout`
    # (presumably so every markout lookup stays inside the data - TODO confirm).
    index = clip_for_markout(
        etf_executions.resample(resample_freq, label="right").last(),
        max_markout=max_markout,
    ).index

    returns = {}
    for ticker in etfs:
        df = pd.DataFrame(index=index)
        # Mid at each bar timestamp ("0S") and at every markout horizon after it.
        for markout in ["0S"] + markouts:
            df[f"_{markout}"] = mids.loc[df.index + pd.Timedelta(markout), ticker].values

        # Simple return from the bar timestamp to each horizon.
        for markout in markouts:
            df[f"return_{markout}"] = (df[f"_{markout}"] / df["_0S"]) - 1

        # Assignment aligns on the index, so only bars present in `index` are kept.
        df["return_contemp"] = mids[ticker].resample(resample_freq).first().pct_change()

        df_returns = df.filter(regex="return")
        # Drop the "return_" prefix; names that would then start with a digit
        # get a leading underscore (e.g. "return_30S" -> "_30S").
        stripped = [column.replace("return_", "") for column in df_returns.columns]
        df_returns.columns = [
            ("_" + column if column[0].isdigit() else column) for column in stripped
        ]
        returns[ticker] = df_returns
    return returns

returns = compute_returns()

NameError: name 'clip_times' is not defined

In [None]:
# | hide
import nbdev

# Run nbdev's export step; this notebook's target module is named by the
# `# | default_exp etf_pipeline` directive near the top.
nbdev.nbdev_export()