In [1]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
import itertools
import random
from matplotlib import pyplot as plt
from pair_trading_foundations.data_generation import ExecutePairTrading, generate_training_data
random.seed(23)
import cProfile
import pstats
import pickle
import plotly.express as px
from time import time
import warnings
warnings.filterwarnings('ignore')

def chunker(seq, size):
    """Lazily split a sliceable sequence into consecutive pieces of at most `size` items.

    Args:
        seq: Any object supporting `len()` and slicing (list, tuple, str, ...).
        size: Maximum number of items per piece; the final piece may be shorter.

    Returns:
        A generator yielding `seq[start:start + size]` for start = 0, size, 2*size, ...
    """
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)

In [2]:
# Alternative input file, kept for reference:
# data = pd.read_csv('Data/sp500_full_20181231_to_20231229.csv')
data = pd.read_csv('Data/sp500_full_20150101_to_20191231.csv')

# Count rows per ticker, then keep only the tickers whose row count equals the
# maximum, so every retained ticker has the same number of rows.
value_count_tb = data.groupby('Ticker').size().reset_index(name='Count')
has_max_count = value_count_tb['Count'] == value_count_tb['Count'].max()
stock_to_keep = value_count_tb.loc[has_max_count, 'Ticker']
data = data[data['Ticker'].isin(stock_to_keep)]

# Generate training data for all ticker pairs

In [None]:
# Enumerate every unordered ticker pair and split the pairs into batches of 1000.
# The tickers are sorted so the pair order is stable: iterating a set of strings
# can differ between kernel sessions (string hashing is randomized per process),
# which would change which pairs land in which batch and which ticker comes
# first within a pair.
tickers = sorted(data['Ticker'].unique())
combinations = list(itertools.combinations(tickers, 2))
batches = list(chunker(combinations, 1000))
# A bare `len(...)` in the middle of a cell is not displayed, so report explicitly.
print(f'{len(combinations)} pairs split into {len(batches)} batches')

In [None]:
# Build the feature/label table for one batch of pairs at a time and write each
# batch to its own CSV (file names are numbered from 1).
for i, batch in enumerate(batches):
    batch_started = time()
    print(f'Getting {i+1}th out of {len(batches)} batches')
    features_tb, labels_tb, pnl_metadata_tb = generate_training_data(
        data=data,
        training_len=300,
        test_len=120,
        calculate_label=True,
        verbose=False,
        combinations=batch,
    )
    # Left-join the labels onto the features by date and pair.
    pair_keys = ['Date', 'Ticker_P1', 'Ticker_P2']
    combined = pd.merge(features_tb, labels_tb, how='left', on=pair_keys).reset_index(drop=True)
    # combined = pd.merge(combined, pnl_metadata_tb[['Date', 'Ticker_P1','Ticker_P2', 'trade_executions']], how='left', on=['Date', 'Ticker_P1','Ticker_P2'])
    # combined = combined[combined.pnls.notnull()].reset_index(drop=True)
    combined.to_csv(f'Data/Training/pair_features{i+1}_300_120.csv', index=False)
    batch_finished = time()
    print(f"Took {batch_finished - batch_started} seconds")
    # NOTE(review): this unconditional break stops after the first batch, so only
    # one CSV is written — presumably a smoke test; remove it for the full run.
    break

# Generate data for sampled pairs

In [3]:
# Draw 50 of the retained tickers at random (the `random` module is seeded in the
# import cell) and restrict the price data to those tickers.
sampled_tickers = random.sample(stock_to_keep.tolist(), 50)
# data_tech = data[data['GICS Sector'].isin(['Information Technology'])]
is_sampled = data['Ticker'].isin(sampled_tickers)
data_sampled = data[is_sampled]

In [4]:
# Generate features, labels and PnL metadata for the sampled tickers.
# No `combinations` argument is passed here, unlike the batched run.
# The recorded output of this cell reports 1225 pairs and about 5229 seconds.
sampled_run_kwargs = dict(
    training_len=300,
    test_len=20,
    calculate_label=True,
    verbose=False,
)
features_tb, labels_tb, pnl_metadata_tb = generate_training_data(
    data=data_sampled,
    **sampled_run_kwargs,
)

1225 stock pairs detected
Took 0.029092788696289062 to initilize. Entering ticker pair loop
Getting the 100th pair
Used 225.48755502700806 for the 100 pairs
Getting the 200th pair
Used 234.566015958786 for the 100 pairs
Getting the 300th pair
Used 240.5891661643982 for the 100 pairs
Getting the 400th pair
Used 280.18571400642395 for the 100 pairs
Getting the 500th pair
Used 332.15958523750305 for the 100 pairs
Getting the 600th pair
Used 379.5134711265564 for the 100 pairs
Getting the 700th pair
Used 433.6948826313019 for the 100 pairs
Getting the 800th pair
Used 409.1673150062561 for the 100 pairs
Getting the 900th pair
Used 540.499773979187 for the 100 pairs
Getting the 1000th pair
Used 605.3925397396088 for the 100 pairs
Getting the 1100th pair
Used 671.6702489852905 for the 100 pairs
Getting the 1200th pair
Used 513.206305027008 for the 100 pairs
Took 5228.9714179039 to finish


# Add SPY benchmark returns and write data out

In [5]:
# Load SPY prices and keep only the date and adjusted close, renamed to SPY_Close.
spy_df = (
    pd.read_csv('Data/Training/1999-12-01-2023-12-31_SPY.csv')[['Date', 'Adj Close']]
    .rename(columns={'Adj Close': 'SPY_Close'})
)

# Horizon, in rows of the SPY file, for the buy-and-hold benchmark.
# NOTE(review): presumably chosen to match test_len=20 of the sampled run — confirm.
look_forward_d = 20

# Percentage return from buying SPY at each row's close and selling
# `look_forward_d` rows later. shift(-k) lines each row up with the close k rows
# ahead, so the last `look_forward_d` rows have no future price and stay NaN.
# This replaces a row-by-row Python loop with the same arithmetic, vectorized.
future_close = spy_df['SPY_Close'].shift(-look_forward_d)
spy_df['SPY_return'] = 100 * (future_close - spy_df['SPY_Close']) / spy_df['SPY_Close']

In [8]:
# Left-join labels onto features by date and pair, then attach the SPY forward
# return for the same date.
pair_keys = ['Date', 'Ticker_P1', 'Ticker_P2']
combined = pd.merge(features_tb, labels_tb, how='left', on=pair_keys).reset_index(drop=True)
combined = combined.merge(spy_df[['Date', 'SPY_return']], how='left', on='Date')
# Scale pnls by 100; SPY_return is already expressed in percent.
# NOTE(review): presumably pnls arrives as a fraction — confirm.
combined['pnls'] = 100 * combined['pnls']
# combined['successful_pair_trading'] = (combined.both_legs_profited) & (combined.pnls > combined.SPY_return)

In [9]:
# Write the merged table to CSV without the index column.
combined_csv_path = 'Data/Training/pair_features_300_20_l300meanstd.csv'
combined.to_csv(combined_csv_path, index=False)

In [None]:
# combined = pd.merge(combined, pnl_metadata_tb[['Date', 'Ticker_P1','Ticker_P2', 'trade_executions']], how='left', on=['Date', 'Ticker_P1','Ticker_P2'])

In [None]:
# Display the column labels of the merged table for inspection.
combined.columns

In [None]:
# Scratch arithmetic. 1225 matches the pair count printed by the sampled run;
# 1539825 is presumably the row count of `combined`, making this rows per pair.
# TODO confirm, and compute both numbers from the data instead of hard-coding them.
1539825/1225

In [None]:
# Serialize the merged table to a pickle file.
spotcheck_pickle_path = 'Data/spotcheckout_output.pkl'
with open(spotcheck_pickle_path, 'wb') as pickle_out:
    pickle.dump(combined, pickle_out)