In [1]:
# imports
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import time
import pickle
import traceback

In [2]:
# initial data clean: drop tickers with no data at all and normalise the column names
do_this = True
if do_this:
    fnames = ['closeRussell1000', 'closeWilshire5000']
    close_suffix = '.Close'

    for fname in fnames:
        # read in data; the first CSV column is used as the index
        df = pd.read_csv(f'./data/{fname}.csv', index_col=0)
        # drop columns with no data
        df = df.dropna(axis=1, how='all')
        # format column names: remove ONLY a trailing '.Close'.
        # str.strip('.Close') treats its argument as a set of characters and strips
        # any of '.', 'C', 'l', 'o', 's', 'e' from BOTH ends, which mangles names
        # (e.g. 'CSCO.Close' -> 'SCO', 'C.Close' -> '').
        df.columns = [
            x[:-len(close_suffix)] if x.endswith(close_suffix) else x
            for x in df.columns
        ]
        # write clean data; index=False leaves the index column out of the cleaned file
        df.to_csv(f'./data/{fname}_clean.csv', index=False)
        # release the frame before loading the next file
        df = None

In [3]:
def time_the_dump(obj, fname: str, ndec=4) -> bool:
    '''Pickle `obj` into the file `fname` and print how long the write took.

    Parameters
    ----------
    obj   : object handed to pickle.dump().
    fname : path of the output file; it is opened in binary write mode.
    ndec  : number of decimals the printed duration is rounded to.

    Returns
    -------
    True when the dump (and the timing print) completed, False when any
    Exception was raised along the way. In the failure case the traceback
    is printed instead of being propagated to the caller.
    '''
    try:
        start = time.time()
        with open(fname, 'wb') as sink:
            pickle.dump(obj, sink)
        print(f'Took { round(time.time() - start, ndec) } secs.')
    except Exception:
        # best-effort helper: report the problem and signal failure via the return value
        print(traceback.format_exc())
        return False
    else:
        return True

In [4]:
# data processing to compute the delta array + other relevant objects. trying with only Russell 1000 for now
#
# For every unordered pair of columns (i < j) the "delta" series is
#     returns[:, j] - returns[:, i]      (long column j, short column i)
# Pairs are laid out block by block: first all pairs whose short leg is column 0,
# then those whose short leg is column 1, and so on. pair_names[k] labels column k
# of delta / delta_centered and entry k of delta_bar as '<long>-<short>'.
fnames = ['closeRussell1000']

for fname in fnames:
    start = time.time()  # recorded per file; not read again within this cell

    df = pd.read_csv(f'./data/{fname}_clean.csv')
    n, p = df.shape  # n rows, p columns

    # compute returns from close prices.
    # The forward fill is spelled out and fill_method=None is passed because relying on
    # pct_change's implicit fill_method='pad' default is deprecated (pandas >= 2.1);
    # this yields the same returns without depending on that default.
    df = df.ffill().pct_change(fill_method=None)
    # filling NA returns from the get-go to avoid downstream problems
    # (e.g. the first row, which has no previous price) with each column's mean return
    df = df.fillna(df.mean())

    # predefine empty delta arrays: one column per unordered pair of input columns
    d = p * (p-1) // 2
    delta = np.zeros((n, d))            # pairwise return differences
    delta_bar = np.zeros(d)             # per-pair mean of delta over rows (NaNs ignored)
    delta_centered = np.zeros((n, d))   # delta with its per-pair mean removed
    pair_names = list()

    # filling up delta array according to definition
    k = 0   # index of the first free pair column
    for i in tqdm(range(p-1), leave=True):
        short_col = df.iloc[:, i]       # this is the column we're going to subtract (short position)
        long_cols = df.iloc[:, i+1:]    # these are the remaining columns we are subtracting from (long position)

        new_pairs = [long_col + '-' + short_col.name for long_col in long_cols.columns] # keeping track of the pair name
        num_new_pairs = len(new_pairs)
        pair_names += new_pairs

        # subtract the short column from every long column, row by row
        difference = long_cols.sub(short_col, axis=0).values
        delta[:, k: k+num_new_pairs] = difference
        delta_bar[k: k+num_new_pairs] = np.nanmean(difference, axis=0)
        difference = None   # drop the temporary block

        # the 1-D mean slice broadcasts across the rows of the 2-D delta slice
        delta_centered[:, k: k+num_new_pairs] = delta[:, k: k+num_new_pairs] - delta_bar[k: k+num_new_pairs]
        k += num_new_pairs
        short_col = long_cols = new_pairs = None

    # export everything; each object is released right after its dump.
    # time_the_dump prints a traceback and returns False on failure; that
    # return value is not checked here.
    time_the_dump(delta, f'./data/{fname}_delta.pkl')
    delta = None
    time_the_dump(delta_bar, f'./data/{fname}_delta_bar.pkl')
    delta_bar = None
    time_the_dump(delta_centered, f'./data/{fname}_delta_centered.pkl')
    delta_centered = None
    time_the_dump(pair_names, f'./data/{fname}_pair_names.pkl')
    pair_names = None

  0%|          | 0/958 [00:00<?, ?it/s]