In [1]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
import itertools
import random
from matplotlib import pyplot as plt
from pair_trading_foundations.data_generation import ExecutePairTrading, \
generate_training_data, calculate_beta_p1, calculate_beta_p2
from pair_trading_foundations.utils import GetSP500Data
import statsmodels.api as sm
from importlib import reload

random.seed(23)
import cProfile
import pstats
import pickle
import plotly.express as px
from time import time
import warnings
warnings.filterwarnings('ignore')

import os
if not os.path.exists('Generated'):
    os.makedirs('Generated')

def chunker(seq, size):
    # split a list into chunks
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

# Extract data from API

In [2]:
# data = GetSP500Data('1999-12-01', '2024-03-01').get_all_sp_tickers().get_consolidated_data()
# spy_df = GetSP500Data('1999-12-01', '2024-03-01').get_single_stock_history('SPY')
# spy_df.to_csv('../../Data/1999-12-01-2024-03-01_SPY.csv', index=False)

In [3]:
data = pd.read_csv('../../Data/sp500_full_19991201_to_20240301.csv')
spy_df = pd.read_csv('../../Data/1999-12-01-2024-03-01_SPY.csv')
spy_df['Close_SPY'] = spy_df['Adj Close']

In [4]:
# data = pd.read_csv('../sp500_full_20181231_to_20231229.csv')
value_count_tb = data[['Ticker']].groupby('Ticker').size().reset_index()
value_count_tb.columns = ['Ticker', 'Count']
stock_to_keep = value_count_tb['Ticker'][value_count_tb.Count==value_count_tb.Count.max()]
data = data[data.Ticker.isin(stock_to_keep)]

# Generate for all pairs

In [5]:
tickers = list(set(data.Ticker.values))
combinations = list(itertools.combinations(tickers, 2))
len(combinations)
batches = list(chunker(combinations, 1000))

In [6]:
len(data)

2183800

In [None]:
# This code allows you to reimport the module if changes happen in data_generation.py
from importlib import reload
# import data_generation
# reload(data_generation)
from data_generation import ExecutePairTrading, generate_training_data

i = 0
for batch in batches:
    start_ts=time()
    print(f'Getting {i+1}th out of {len(batches)} batches')
    max_pairs = 1
    features_tb, labels_tb = generate_training_data(
        data=data,
        moving_average=60,
        training_len=300,
        test_len=60,
        entry_signal=2,
        exit_signal=0.5,
        calculate_label=True,
        verbose=False,
        max_combinations=max_pairs,
        combinations=batch
    )
    combined = pd.merge(features_tb, labels_tb, how='left', on=['Date', 'Ticker_P1','Ticker_P2']).reset_index(drop=True)
    combined = combined[combined.pnls.notnull()].reset_index(drop=True)
    # combined.to_csv(f'./Generated/pair_features_{i+1}_pairs{max_pairs}_300_60.csv', index=False)
    end_ts = time()
    print(f"Took {end_ts - start_ts} seconds")
    i+=1
    break
    
len(combined)
combined

In [None]:
labels_tb
labels_tb = labels_tb.dropna()
labels_tb.pnls.plot.hist(bins=12, alpha=0.5)
labels_tb.pnls.describe()

# Playground

In [7]:
features_tb, labels_tb = generate_training_data(
        data=data,
        sp500_df=spy_df,
        moving_average=60,
        training_len=300,
        test_len=60,
        entry_signal=1.5,
        exit_signal=0.5,
        calculate_label=True,
        verbose=False,
        max_combinations=1,
        combinations=batches[0]
    )
combined = pd.merge(features_tb, labels_tb, how='left', on=['Date', 'Ticker_P1','Ticker_P2']).reset_index(drop=True)
combined = combined[combined.pnls.notnull()].reset_index(drop=True)

1000 stock pairs detected
Took 0.324368953704834 to initilize. Entering ticker pair loop
Max combination = 1
Took 7.116888046264648 to finish


In [8]:
combined

Unnamed: 0,Ticker_P1,Date,High_P1,Low_P1,Volume_P1,Close_P1,Ticker_P2,High_P2,Low_P2,Volume_P2,...,abs_spread,abs_spread_mean,abs_spread_std,abs_spread_mean_MA,abs_spread_std_MA,beta_P1,beta_P2,pnls,actual_abs_spread,actual_abs_spread_std
0,NUE,2001-02-09,10.792500,10.687500,1240400.0,5.699734,NKE,6.888750,6.755000,6783200.0,...,0.418893,1.326150,1.100732,0.409952,0.306847,0.059166,0.055671,-0.009049,1.996158,0.691445
1,NUE,2001-02-12,10.872500,10.622500,843600.0,5.767510,NKE,6.821250,6.750000,4188800.0,...,0.500249,1.319244,1.099387,0.409022,0.306481,0.059434,0.055949,-0.009736,2.033780,0.670110
2,NUE,2001-02-13,11.050000,10.837500,1787600.0,5.849899,NKE,7.125000,6.788750,6788800.0,...,0.410003,1.312448,1.098682,0.406229,0.305682,0.059717,0.056265,-0.010042,2.073609,0.642880
3,NUE,2001-02-14,11.052500,10.725000,1142000.0,5.726313,NKE,7.032500,6.782500,4241600.0,...,0.379521,1.305132,1.097559,0.405062,0.305647,0.060001,0.056561,-0.010549,2.115259,0.611985
4,NUE,2001-02-15,11.240000,10.762500,2069600.0,5.960200,NKE,6.826250,6.526250,14766400.0,...,0.890790,1.298581,1.094131,0.411782,0.311861,0.060301,0.056783,-0.009546,2.148116,0.598128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,NUE,2023-11-27,161.740005,156.380005,1349400.0,160.293243,NKE,108.220001,106.900002,5785000.0,...,53.078545,40.419964,15.709403,53.922384,9.249025,0.355203,0.230448,-0.010665,66.835873,12.356978
5735,NUE,2023-11-28,163.449997,160.300003,1472500.0,160.223648,NKE,109.900002,108.150002,7287400.0,...,52.224403,40.557916,15.630283,53.636758,9.026075,0.354699,0.230654,-0.012877,67.413616,12.470166
5736,NUE,2023-11-29,163.860001,161.130005,1252600.0,162.828415,NKE,112.150002,110.150002,9601800.0,...,53.220352,40.700773,15.549028,53.372612,8.795763,0.354372,0.230989,-0.013239,67.990060,12.599474
5737,NUE,2023-11-30,170.779999,164.179993,5483700.0,168.982468,NKE,110.599998,108.959999,8690900.0,...,59.473717,40.865704,15.484516,53.237457,8.634532,0.354294,0.231282,-0.013506,68.460110,12.797515


In [None]:
df

In [None]:
df['Close_P2'].rolling(300).apply(calculate_beta, args=(df,))