In [1]:
import cProfile
import itertools
import os
import pickle
import pstats
import random
from time import time

import numpy as np
import pandas as pd
import plotly.express as px
from matplotlib import pyplot as plt
from numpy.linalg import norm

from pair_trading_foundations.data_generation import ExecutePairTrading, generate_training_data

# Seed Python's global RNG so the ticker sample drawn later is repeatable on a fresh kernel.
random.seed(23)

def chunker(seq, size):
    """Lazily cut ``seq`` into consecutive slices of at most ``size`` items.

    Returns a generator of slices of ``seq``; the last slice is shorter when
    ``len(seq)`` is not a multiple of ``size``.
    """
    # The range is built at call time, so an invalid ``size`` (e.g. 0) fails
    # immediately rather than on first iteration.
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)

In [2]:
# Load the price-history CSV into a DataFrame. Per the preview further down it
# carries Date, OHLC / Adj Close / Volume, Ticker and GICS sector columns.
# NOTE(review): the 'Unnamed: 0' column in that preview suggests the CSV was
# written with its index — confirm before relying on or dropping that column.
data = pd.read_csv('Data/sp500_full_20181231_to_20231229.csv')

In [11]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,GICS Sector,GICS Sub-Industry
0,2018-12-31,190.339996,191.649994,188.5,190.539993,155.818451,1804400.0,MMM,Industrials,Industrial Conglomerates
1,2019-01-02,187.820007,190.990005,186.699997,190.949997,156.153717,2475200.0,MMM,Industrials,Industrial Conglomerates
2,2019-01-03,188.279999,188.279999,182.889999,183.759995,150.273972,3358200.0,MMM,Industrials,Industrial Conglomerates
3,2019-01-04,186.75,191.979996,186.029999,191.320007,156.456284,2995100.0,MMM,Industrials,Industrial Conglomerates
4,2019-01-07,191.360001,192.300003,188.660004,190.880005,156.096466,2162200.0,MMM,Industrials,Industrial Conglomerates


In [3]:
# Count rows per ticker, keep only the tickers that have as many rows as the
# best-covered ticker, then draw a 10-ticker sample for quick experiments.
value_count_tb = data.groupby('Ticker').size().reset_index(name='Count')
has_max_rows = value_count_tb['Count'] == value_count_tb['Count'].max()
stock_to_keep = value_count_tb.loc[has_max_rows, 'Ticker']
data = data.loc[data['Ticker'].isin(stock_to_keep)]
# The sample is drawn from Python's global `random` state.
sampled_tickers = random.sample(stock_to_keep.tolist(), 10)
# data_tech = data[data['GICS Sector'].isin(['Information Technology'])]
data_sampled = data.loc[data['Ticker'].isin(sampled_tickers)]

In [4]:
# Enumerate every unordered pair of tickers present in `data`.
# The tickers are sorted first: iterating a set of strings follows hash order,
# which changes between interpreter sessions (string hashing is randomised
# unless PYTHONHASHSEED is fixed). Without sorting, the pair order -- and so
# the contents of each numbered batch -- would differ from one kernel restart
# to the next.
tickers = sorted(set(data.Ticker.values))
combinations = list(itertools.combinations(tickers, 2))

In [5]:
# Number of unordered ticker pairs to process.
len(combinations)

119316

In [6]:
# Split the pair list into batches of at most 1000 pairs (the last batch may be shorter).
batches = list(chunker(combinations, 1000))

In [None]:
# Rough per-pair cost estimates: 15 and 30 divided by 45.
# NOTE(review): presumably wall-clock seconds measured on a 45-pair run — confirm
# where the 15 s / 30 s figures came from.
print(f"Used around {15/45}s per pair for features of full history")
print(f"Used around {30/45}s per pair for features and labels of full history")

# Generate data for all pairs

In [None]:
# Build features and labels for every ticker pair, one batch at a time. Each
# batch is written to its own numbered CSV as soon as it has been computed.
output_dir = 'Data/Training'
# Create the output folder before any expensive work starts; otherwise a missing
# folder only surfaces when the first batch is written, after it was computed.
os.makedirs(output_dir, exist_ok=True)

# Batch numbers are 1-based and are used both in the progress message and in
# the output file name.
for batch_no, batch in enumerate(batches, start=1):
    start_ts = time()
    print(f'Getting {batch_no}th out of {len(batches)} batches')
    features_tb, labels_tb, pnl_metadata_tb = generate_training_data(
        data=data,
        training_len=500,
        test_len=120,
        calculate_label=True,
        verbose=False,
        combinations=batch
    )
    # Left join: keep every feature row, attaching labels where they exist.
    combined = pd.merge(
        features_tb,
        labels_tb,
        how='left',
        on=['Date', 'Ticker_P1', 'Ticker_P2']
    ).reset_index(drop=True)
    # Disabled steps, kept for reference:
    # combined = pd.merge(combined, pnl_metadata_tb[['Date', 'Ticker_P1','Ticker_P2', 'trade_executions']], how='left', on=['Date', 'Ticker_P1','Ticker_P2'])
    # combined = combined[combined.pnls.notnull()].reset_index(drop=True)
    combined.to_csv(f'{output_dir}/pair_features{batch_no}.csv', index=False)
    end_ts = time()
    print(f"Took {end_ts - start_ts} seconds")

In [10]:
# Scratch arithmetic: 27 / 45.
# NOTE(review): presumably seconds per pair for the 45-pair sample run, which
# reports ~27.8 s in its output — confirm.
27/45

0.6

# Generate data for sample pairs

In [9]:
# Run the generator on the sampled tickers only. No explicit `combinations`
# list is passed for this run.
features_tb, labels_tb, pnl_metadata_tb = generate_training_data(
    data=data_sampled,
    training_len=500,
    test_len=120,
    calculate_label=True,
    verbose=False,
)

45 stock pairs detected
Took 0.014369010925292969 to initilize. Entering ticker pair loop


  features_tb = pd.concat(
  labels_tb = pd.concat(
  pnl_metadata_tb = pd.concat(


Took 27.82582712173462 to finish


In [None]:
# Attach labels to the feature rows. Left join: every feature row is kept even
# when no label matches.
pair_keys = ['Date', 'Ticker_P1', 'Ticker_P2']
combined = pd.merge(features_tb, labels_tb, on=pair_keys, how='left')
combined = combined.reset_index(drop=True)

In [None]:
# Attach the `trade_executions` column from the PnL metadata to each row.
# This cell reads and overwrites `combined`, so it must be safe to run twice:
# any `trade_executions` column left by a previous run is dropped first. Without
# that, a re-run would produce `trade_executions_x` / `trade_executions_y`.
combined = pd.merge(
    combined.drop(columns=['trade_executions'], errors='ignore'),
    pnl_metadata_tb[['Date', 'Ticker_P1', 'Ticker_P2', 'trade_executions']],
    how='left',
    on=['Date', 'Ticker_P1', 'Ticker_P2'],
)

In [None]:
# Persist the combined sample table as a pickle.
# The `with` block closes the file handle even if pickling fails.
with open('Data/spotcheckout_output.pkl','wb') as file:
    pickle.dump(combined, file)