In [1]:
import pandas as pd
import datetime as dt
import os

# All relative paths used below are resolved against the stocks data directory.
# The guard keeps this cell idempotent: once the kernel is already inside
# "stocks_data", re-running the cell must not try to descend a second time
# (the relative path would no longer exist and os.chdir would raise).
if os.path.basename(os.getcwd()) != "stocks_data":
    os.chdir("./data/stocks_data")

# 1. Get all trading dates

In [2]:
# Load the price history of two reference tickers (AAPL and ABT).
appl, abt = (
    pd.read_parquet(parquet_path)
    for parquet_path in ("./history_data/AAPL.parquet", "./history_data/ABT.parquet")
)
appl.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Return_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1989-01-03,0.359375,0.361607,0.357143,0.360491,0.281993,100016000,0.040248
1989-01-04,0.363839,0.376116,0.361607,0.375,0.293342,239948800,0.005952
1989-01-05,0.375,0.386161,0.368304,0.377232,0.295088,307328000,0.008875
1989-01-06,0.377232,0.388393,0.377232,0.38058,0.297707,198665600,0.0088
1989-01-09,0.383929,0.385045,0.377232,0.383929,0.300327,79307200,-0.008723


In [3]:
# The S&P 500 constituents data begins in 1989-02, so earlier dates are dropped.
# The calendar is the union of the dates present in either reference ticker.
first_constituent_day = dt.datetime(1989, 2, 1)
traded_dates = appl.index.union(abt.index)
all_dates = traded_dates[traded_dates >= first_constituent_day]
all_dates[:5], all_dates[-5:]

(DatetimeIndex(['1989-02-01', '1989-02-02', '1989-02-03', '1989-02-06',
                '1989-02-07'],
               dtype='datetime64[ns]', name='Date', freq=None),
 DatetimeIndex(['2023-12-21', '2023-12-22', '2023-12-26', '2023-12-27',
                '2023-12-28'],
               dtype='datetime64[ns]', name='Date', freq=None))

# 2. Load the S&P 500 binary membership matrix

In [4]:
# Membership table: one row per month start (index "date"), one boolean column
# per ticker. True presumably marks the ticker as an index member in that
# month - confirm against the data source.
membership_path = "./stocks_list/SP500_binary_matrix.parquet"
SP500_binary_matrix = pd.read_parquet(membership_path)
SP500_binary_matrix.head()

ticker,A,AAL,AAP,AAPL,ABBV,ABI,ABK,ABMD,ABNB,ABS,...,YELL,YHOO,YNR,YUM,ZBH,ZBRA,ZE,ZION,ZRN,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1989-02-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False
1989-03-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False
1989-04-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False
1989-05-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False
1989-06-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False


# 3. Test: generate a single dataset

In [5]:
# Prototype on the first 1000 trading dates
date_range = all_dates[:1000]
month_periods = date_range.to_period('M').unique()
# month-start timestamps, the same form as the membership matrix index
uniq_months = month_periods.to_timestamp()
uniq_months[:10]

DatetimeIndex(['1989-02-01', '1989-03-01', '1989-04-01', '1989-05-01',
               '1989-06-01', '1989-07-01', '1989-08-01', '1989-09-01',
               '1989-10-01', '1989-11-01'],
              dtype='datetime64[ns]', name='Date', freq='MS')

In [6]:
# Map every ticker to the trading dates of the months in which its
# membership flag is True.
stock_dates = {}

# month of every trading date, computed once instead of once per month
date_months = date_range.to_period('M')

# iterate through each month
for month, stocks in SP500_binary_matrix.loc[uniq_months].iterrows():
    # all trading dates in the month
    month_dates = date_range[date_months == month.to_period('M')]

    # `stocks` is the boolean membership row; iterate over the flagged tickers only
    for stock in stocks.index[stocks]:
        # add the month dates to the stock's dates
        stock_dates.setdefault(stock, []).extend(month_dates)

# Collect one frame per ticker and concatenate once at the end: growing
# result_df with pd.concat inside the loop re-copies every accumulated row
# on each iteration.
selected_frames = []

# read the stock data and select the dates
for stock, dates in stock_dates.items():
    stock_file = f'./history_data/{stock}.parquet'

    # tickers without a history file are skipped
    if not os.path.exists(stock_file):
        continue

    # read the stock data
    stock_data = pd.read_parquet(stock_file)

    # drop Adj Close column if it exists
    stock_data = stock_data.drop(columns=['Adj Close'], errors='ignore')

    # select the stock data in the dates
    selected_data = stock_data.loc[stock_data.index.intersection(dates)]

    # assign() returns a new frame, so the ticker label is added without
    # writing into a selection of stock_data
    selected_frames.append(selected_data.assign(Stock=stock))

# pd.concat raises on an empty list, so fall back to an empty frame
result_df = pd.concat(selected_frames) if selected_frames else pd.DataFrame()

# check the result
result_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Return_tomorrow,Stock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1989-02-01,0.337054,0.353795,0.333705,0.350446,487558400.0,0.012741,AAPL
1989-02-02,0.352679,0.359375,0.350446,0.354911,473491200.0,-0.012581,AAPL
1989-02-03,0.357143,0.359375,0.348214,0.350446,178908800.0,-0.019107,AAPL
1989-02-06,0.352679,0.352679,0.341518,0.34375,116737600.0,0.012986,AAPL
1989-02-07,0.341518,0.350446,0.341518,0.348214,165155200.0,-0.01923,AAPL


In [7]:
# Rows kept per ticker. A count below the window length (1000) means the ticker
# was flagged as a member for only part of the window, or its history file
# lacks some of the dates.
result_df.groupby('Stock').count()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Return_tomorrow
Stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,1000,1000,1000,1000,1000,1000
ABT,1000,1000,1000,1000,1000,1000
ADM,1000,1000,1000,1000,1000,1000
ADP,1000,1000,1000,1000,1000,1000
ADSK,769,769,769,769,769,769
...,...,...,...,...,...,...
WY,1000,1000,1000,1000,1000,1000
X,411,411,411,411,411,411
XEL,1000,1000,1000,1000,1000,1000
XOM,1000,1000,1000,1000,1000,1000


# 4. Generate all datasets

In [8]:
def generate_single_dataset(date_range, membership=None, history_dir="./history_data"):
    """Build the dataset of member stocks for one window of trading dates.

    For every month touched by ``date_range`` the membership matrix is
    consulted; each ticker flagged True in a month contributes the trading
    dates of that month. The ticker's history file is then read and reduced
    to those dates.

    Parameters
    ----------
    date_range : pandas.DatetimeIndex
        Trading dates of the window.
    membership : pandas.DataFrame, optional
        Boolean matrix indexed by month-start timestamps with one column per
        ticker. Defaults to the notebook-level ``SP500_binary_matrix``.
    history_dir : str, optional
        Directory holding one ``<ticker>.parquet`` file per ticker.

    Returns
    -------
    pandas.DataFrame
        The selected rows of all tickers stacked on top of each other, without
        the ``Adj Close`` column and with an added ``Stock`` column. An empty
        DataFrame (no columns) is returned when no ticker contributes a frame.

    Raises
    ------
    KeyError
        If a month of ``date_range`` is missing from the membership index.

    Notes
    -----
    Tickers that have no history file in ``history_dir`` are skipped silently.
    """
    if membership is None:
        membership = SP500_binary_matrix

    # month of every trading date, computed once and reused for each month below
    date_periods = date_range.to_period('M')
    uniq_months = date_periods.unique().to_timestamp()

    # stock -> trading dates of the months in which its membership flag is True
    stock_dates = {}
    for month, stocks in membership.loc[uniq_months].iterrows():
        # all trading dates of the window that fall into this month
        month_dates = date_range[date_periods == month.to_period('M')]

        # `stocks` is the boolean membership row; iterate over the flagged tickers only
        for stock in stocks.index[stocks]:
            stock_dates.setdefault(stock, []).extend(month_dates)

    # Collect one frame per ticker and concatenate once at the end: calling
    # pd.concat inside the loop re-copies every accumulated row on each iteration.
    frames = []
    for stock, dates in stock_dates.items():
        stock_file = os.path.join(history_dir, f"{stock}.parquet")

        # tickers without a history file are skipped
        if not os.path.exists(stock_file):
            continue

        # read the stock data and drop Adj Close if it exists
        stock_data = pd.read_parquet(stock_file)
        stock_data = stock_data.drop(columns=['Adj Close'], errors='ignore')

        # keep only the rows whose date is among the ticker's selected dates
        selected_data = stock_data.loc[stock_data.index.intersection(dates)]

        # assign() returns a new frame, so the ticker label is added without
        # writing into a selection of stock_data
        frames.append(selected_data.assign(Stock=stock))

    # pd.concat raises on an empty list; keep the original "empty frame" result
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

# Smoke test: run the function on the first 1000 trading dates
date_range = all_dates[:1000]
result_df = generate_single_dataset(date_range=date_range)
result_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Return_tomorrow,Stock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1989-02-01,0.337054,0.353795,0.333705,0.350446,487558400.0,0.012741,AAPL
1989-02-02,0.352679,0.359375,0.350446,0.354911,473491200.0,-0.012581,AAPL
1989-02-03,0.357143,0.359375,0.348214,0.350446,178908800.0,-0.019107,AAPL
1989-02-06,0.352679,0.352679,0.341518,0.34375,116737600.0,0.012986,AAPL
1989-02-07,0.341518,0.350446,0.341518,0.348214,165155200.0,-0.01923,AAPL


In [9]:
# Rows kept per ticker in the dataset returned by generate_single_dataset (first five tickers)
result_df.groupby('Stock').count().head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Return_tomorrow
Stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,1000,1000,1000,1000,1000,1000
ABT,1000,1000,1000,1000,1000,1000
ADM,1000,1000,1000,1000,1000,1000
ADP,1000,1000,1000,1000,1000,1000
ADSK,769,769,769,769,769,769


In [10]:
# Number of trading dates before trimming
print(len(all_dates))

# Drop the oldest dates so that the number of dates is divisible by 250
# (slicing from 0 leaves the calendar unchanged when nothing has to be dropped).
n_drop = len(all_dates) % 250
all_dates = all_dates[n_drop:]

# Number of trading dates after trimming
print(len(all_dates))

8795
8750


In [11]:
# Sliding windows over the calendar: each 1000 dates long, advancing 250 dates at a time
window_starts = range(0, len(all_dates) - 1000 + 1, 250)
ranges = [all_dates[start:start + 1000] for start in window_starts]
ranges[0]

DatetimeIndex(['1989-04-07', '1989-04-10', '1989-04-11', '1989-04-12',
               '1989-04-13', '1989-04-14', '1989-04-17', '1989-04-18',
               '1989-04-19', '1989-04-20',
               ...
               '1993-03-08', '1993-03-09', '1993-03-10', '1993-03-11',
               '1993-03-12', '1993-03-15', '1993-03-16', '1993-03-17',
               '1993-03-18', '1993-03-19'],
              dtype='datetime64[ns]', name='Date', length=1000, freq=None)

In [12]:
# Output directory for the per-period datasets.
save_path = "./single_period_data"
# exist_ok=True makes the creation idempotent without a separate existence
# check (no window between checking and creating).
os.makedirs(save_path, exist_ok=True)

In [13]:
from tqdm import tqdm

# Number of distinct tickers in each period's dataset, keyed by period index.
stock_number = {}

# The progress bar already reports the current period, so no extra line is
# written per iteration.
for i, date_range in tqdm(enumerate(ranges), total=len(ranges), desc="Generating periods"):
    result_df = generate_single_dataset(date_range)
    # An empty dataset has no "Stock" column; record 0 instead of raising KeyError.
    stock_number[i] = result_df["Stock"].nunique() if "Stock" in result_df.columns else 0
    result_df.to_parquet(os.path.join(save_path, f"period_{i}.parquet"))

  0%|          | 0/32 [00:00<?, ?it/s]

Processing date range 0


  3%|▎         | 1/32 [00:10<05:13, 10.13s/it]

Processing date range 1


  6%|▋         | 2/32 [00:19<04:57,  9.92s/it]

Processing date range 2


  9%|▉         | 3/32 [00:29<04:43,  9.78s/it]

Processing date range 3


 12%|█▎        | 4/32 [00:39<04:37,  9.90s/it]

Processing date range 4


 16%|█▌        | 5/32 [00:49<04:28,  9.96s/it]

Processing date range 5


 19%|█▉        | 6/32 [01:00<04:23, 10.15s/it]

Processing date range 6


 22%|██▏       | 7/32 [01:11<04:20, 10.44s/it]

Processing date range 7


 25%|██▌       | 8/32 [01:22<04:18, 10.78s/it]

Processing date range 8


 28%|██▊       | 9/32 [01:34<04:15, 11.13s/it]

Processing date range 9


 31%|███▏      | 10/32 [01:46<04:12, 11.48s/it]

Processing date range 10


 34%|███▍      | 11/32 [01:58<04:03, 11.62s/it]

Processing date range 11


 38%|███▊      | 12/32 [02:10<03:52, 11.65s/it]

Processing date range 12


 41%|████      | 13/32 [02:22<03:40, 11.59s/it]

Processing date range 13


 44%|████▍     | 14/32 [02:33<03:28, 11.56s/it]

Processing date range 14


 47%|████▋     | 15/32 [02:45<03:18, 11.66s/it]

Processing date range 15


 50%|█████     | 16/32 [02:57<03:10, 11.88s/it]

Processing date range 16


 53%|█████▎    | 17/32 [03:10<03:03, 12.22s/it]

Processing date range 17


 56%|█████▋    | 18/32 [03:24<02:55, 12.52s/it]

Processing date range 18


 59%|█████▉    | 19/32 [03:37<02:44, 12.68s/it]

Processing date range 19


 62%|██████▎   | 20/32 [03:49<02:32, 12.73s/it]

Processing date range 20


 66%|██████▌   | 21/32 [04:02<02:19, 12.71s/it]

Processing date range 21


 69%|██████▉   | 22/32 [04:15<02:07, 12.73s/it]

Processing date range 22


 72%|███████▏  | 23/32 [04:28<01:55, 12.79s/it]

Processing date range 23


 75%|███████▌  | 24/32 [04:41<01:43, 12.92s/it]

Processing date range 24


 78%|███████▊  | 25/32 [04:55<01:31, 13.14s/it]

Processing date range 25


 81%|████████▏ | 26/32 [05:09<01:20, 13.36s/it]

Processing date range 26


 84%|████████▍ | 27/32 [05:23<01:07, 13.55s/it]

Processing date range 27


 88%|████████▊ | 28/32 [05:37<00:55, 13.76s/it]

Processing date range 28


 91%|█████████ | 29/32 [05:51<00:41, 13.90s/it]

Processing date range 29


 94%|█████████▍| 30/32 [06:05<00:27, 13.95s/it]

Processing date range 30


 97%|█████████▋| 31/32 [06:19<00:13, 13.94s/it]

Processing date range 31


100%|██████████| 32/32 [06:33<00:00, 12.29s/it]


In [16]:
# result_df still holds the dataset of the last period produced by the generation loop
result_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Return_tomorrow,Stock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-09,86.459999,87.699997,86.169998,87.269997,1912700.0,0.003667,A
2020-01-10,87.720001,88.239998,87.32,87.589996,1417000.0,-0.001484,A
2020-01-13,87.809998,88.32,86.739998,87.459999,1630200.0,0.00606,A
2020-01-14,87.269997,88.209999,86.699997,87.989998,1675200.0,0.00716,A
2020-01-15,87.629997,89.110001,87.550003,88.620003,1630400.0,0.009479,A


In [18]:
# Number of distinct tickers per period (index = period number)
pd.Series(stock_number)

0     210
1     216
2     223
3     240
4     253
5     275
6     290
7     317
8     342
9     353
10    362
11    366
12    368
13    378
14    395
15    428
16    456
17    476
18    479
19    480
20    481
21    489
22    497
23    511
24    531
25    543
26    557
27    571
28    576
29    570
30    565
31    561
dtype: int64