In [2]:
# imports
import numpy as np
import pandas as pd
import pickle
import time

In [3]:
# initial data clean to get rid of na values
def strip_close_suffix(name, suffix='.Close'):
    """Return `name` with one trailing `suffix` removed, or unchanged if absent.

    This replaces `name.strip('.Close')`. `str.strip` interprets its argument
    as a *set of characters* and removes any of them from BOTH ends, so it
    corrupts tickers made of those letters: 'CSCO.Close' -> 'SCO',
    'CL.Close' -> 'L', 'C.Close' -> ''. Here only the exact suffix at the
    end of the string is dropped.

    Parameters
    ----------
    name : str
        Raw column header, e.g. 'AAPL.Close'.
    suffix : str, default '.Close'
        Exact trailing text to remove.

    Returns
    -------
    str
        `name` without the trailing suffix.
    """
    if suffix and name.endswith(suffix):
        return name[:-len(suffix)]
    return name


# One-off preprocessing step; flip to True to regenerate the *_clean.csv files.
do_this = False
if do_this:
    fnames = ['closeRussell1000', 'closeWilshire5000']

    for fname in fnames:
        # read in data (first CSV column becomes the index)
        df = pd.read_csv(f'./data/{fname}.csv', index_col=0)
        # drop columns with no data
        df = df.dropna(axis=1, how='all')
        # format column names: keep the ticker, drop only the exact '.Close' suffix
        df.columns = [strip_close_suffix(x) for x in df.columns]
        # write clean data; the index is intentionally not written, so the
        # clean file contains price columns only
        df.to_csv(f'./data/{fname}_clean.csv', index=False)
        # release the frame before loading the next file
        df = None

In [4]:
# data processing to compute the delta array. trying with only Russell 1000 for now
def compute_pairwise_deltas(returns):
    """Build the pairwise difference array for every unordered pair of columns.

    Parameters
    ----------
    returns : pandas.DataFrame
        Frame with n rows and p columns; column labels must be strings
        because they are concatenated into pair names.

    Returns
    -------
    delta : numpy.ndarray, shape (n, p*(p-1)//2)
        For each pair (i, j) with i < j, one column holding
        returns.iloc[:, j] - returns.iloc[:, i]. NaNs in the input propagate.
    delta_bar : numpy.ndarray, shape (p*(p-1)//2,)
        Column means of `delta`, ignoring NaNs.
    pair_names : list of str
        Labels of the form 'name_i+name_j', in the same order as the
        columns of `delta`.
    """
    n, p = returns.shape

    # predefine empty delta array
    d = p * (p - 1) // 2
    delta = np.zeros((n, d))
    delta_bar = np.zeros(d)
    pair_names = []

    # k is the write offset into the pair axis; column i contributes one
    # contiguous slab of (p - 1 - i) pairs, one per later column.
    k = 0
    for i in range(p - 1):
        col = returns.iloc[:, i]
        subset = returns.iloc[:, i + 1:]
        num_new_pairs = subset.shape[1]

        pair_names += [col.name + '+' + sub_col for sub_col in subset.columns]

        # (later column) minus (column i), row by row
        diff = subset.sub(col, axis=0).values
        delta[:, k: k + num_new_pairs] = diff
        delta_bar[k: k + num_new_pairs] = np.nanmean(diff, axis=0)
        k += num_new_pairs

    return delta, delta_bar, pair_names


def dump_pickle_timed(obj, fname, label):
    """Pickle `obj` to ./data/{fname}_{label}.pkl and print how long it took.

    Parameters
    ----------
    obj : object
        Any picklable object.
    fname : str
        Dataset stem used in the output path and in the printed message.
    label : str
        Artifact name ('delta', 'delta_bar', 'pair_names', ...).
    """
    start = time.time()
    with open(f'./data/{fname}_{label}.pkl', 'wb') as f:
        # With protocol 5 NumPy can expose the array buffer to the pickler
        # directly instead of first copying it into a bytes object, which
        # matters for the multi-GB delta array. The file is still read back
        # with plain pickle.load (Python 3.8+).
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
    print(f'Wrote {label}.pkl for {fname}.')
    print(f'Took {round(time.time() - start, 4)} secs.')


fnames = ['closeRussell1000']

for fname in fnames:
    start = time.time()

    df = pd.read_csv(f'./data/{fname}_clean.csv')
    n, p = df.shape

    # compute returns from close prices (the first row is NaN by construction)
    df = df.pct_change()

    # filling up delta array according to definition
    delta, delta_bar, pair_names = compute_pairwise_deltas(df)

    # release the returns frame before the large pickle writes
    df = None
    print(f'Made the delta and delta_bar arrays for {fname}. Shapes {delta.shape} and {delta_bar.shape}.')
    print(f'Took {round(time.time() - start, 4)} secs.')

    # export everything
    dump_pickle_timed(delta, fname, 'delta')
    dump_pickle_timed(delta_bar, fname, 'delta_bar')
    dump_pickle_timed(pair_names, fname, 'pair_names')

    # drop references so the kernel can reclaim the memory
    delta = delta_bar = pair_names = None

Made the delta and delta_bar arrays for closeRussell1000. Shapes (2769, 459361) and (459361,).
Took 8.0408 secs.
Wrote delta.pkl for closeRussell1000.
Took 64.1727 secs.
Wrote delta_bar.pkl for closeRussell1000.
Took 0.0216 secs.
Wrote pair_names.pkl for closeRussell1000.
Took 0.2139 secs.


In [5]:
# Example, in case we want to build r_diff later: r_diff = train.subtract(r_bar).values