# 1. Convert into binary matrix

In [2]:
import pandas as pd
import os

# Make the stocks-list folder the working directory; the bare file names used
# by the following cells are resolved relative to it.
# NOTE(review): the path is relative, so running this cell twice in the same
# kernel would look for a nested data/ folder — confirm the expected start dir.
os.chdir("data/stocks_data/stocks_list")

In [3]:
# Load the raw table from parquet (relative to the current working directory)
# and preview its first rows.
sp_original = pd.read_parquet("SP500_full_original_data.parquet")
sp_original.head()

Unnamed: 0,date,permno,comnam,ncusip,shrcd,exchcd,hsiccd,ticker,gvkey,iid,start,ending,ret
1,1989-01-31,24678,WARNER LAMBERT CO,93448810,11,1,2834,WLA,11288,1,1957-03-01,2000-06-20,0.039872
3,1989-01-31,17750,KIMBERLY CLARK CORP,49436810,11,1,2676,KMB,6435,1,1957-03-01,2023-12-29,0.096567
4,1989-01-31,66114,TEMPLE INLAND INC,87986810,11,1,2631,TIN,10426,1,1986-12-18,2007-12-28,0.146982
5,1989-01-31,38914,CONTROL DATA CORP DE,21236310,11,1,8742,CDA,3480,1,1969-05-22,2001-03-30,0.031847
7,1989-01-31,19553,AMOCO CORP,3190510,11,1,2911,AN,1609,1,1957-03-01,1998-12-31,0.036667


In [4]:
# For every permno keep the row with the latest date, indexed by permno, so
# it can serve as a lookup table for that permno's most recent labels.
map1 = (
    sp_original[["permno", "comnam", "ticker", "date"]]
    .copy()
    .sort_values(by=["permno", "date"])
    .groupby("permno")
    .tail(1)
    .set_index("permno")
)

# Overwrite name and ticker on all rows with the latest value of their permno.
for label_col in ("comnam", "ticker"):
    sp_original[label_col] = sp_original["permno"].map(map1[label_col])

# Distinct counts of permno / company name / ticker after the overwrite.
unique_counts = [sp_original[col].nunique() for col in ("permno", "comnam", "ticker")]
print(*unique_counts)
sp_original.head()

1288 1282 1224


Unnamed: 0,date,permno,comnam,ncusip,shrcd,exchcd,hsiccd,ticker,gvkey,iid,start,ending,ret
1,1989-01-31,24678,WARNER LAMBERT CO,93448810,11,1,2834,WLA,11288,1,1957-03-01,2000-06-20,0.039872
3,1989-01-31,17750,KIMBERLY CLARK CORP,49436810,11,1,2676,KMB,6435,1,1957-03-01,2023-12-29,0.096567
4,1989-01-31,66114,TEMPLE INLAND INC,87986810,11,1,2631,TIN,10426,1,1986-12-18,2007-12-28,0.146982
5,1989-01-31,38914,CERIDIAN CORP,21236310,11,1,8742,CEN,3480,1,1969-05-22,2001-03-30,0.031847
7,1989-01-31,19553,AMOCO CORP,3190510,11,1,2911,AN,1609,1,1957-03-01,1998-12-31,0.036667


In [5]:
# Columns that are not needed downstream, and the friendlier names given to
# two of the remaining ones.
unused_columns = ["ncusip", "shrcd", "exchcd", "gvkey", "iid", "ret"]
new_names = {"comnam": "company_name", "hsiccd": "industry_code"}

# Renumber the rows 0..n-1, drop the unused columns and rename in one chain.
sp_original = (
    sp_original
    .reset_index(drop=True)
    .drop(columns=unused_columns)
    .rename(columns=new_names)
)

# Persist the cleaned table next to the raw one, then preview it.
sp_original.to_parquet("SP500_full_original_data_cleaned.parquet")

sp_original.head()

Unnamed: 0,date,permno,company_name,industry_code,ticker,start,ending
0,1989-01-31,24678,WARNER LAMBERT CO,2834,WLA,1957-03-01,2000-06-20
1,1989-01-31,17750,KIMBERLY CLARK CORP,2676,KMB,1957-03-01,2023-12-29
2,1989-01-31,66114,TEMPLE INLAND INC,2631,TIN,1986-12-18,2007-12-28
3,1989-01-31,38914,CERIDIAN CORP,8742,CEN,1969-05-22,2001-03-30
4,1989-01-31,19553,AMOCO CORP,2911,AN,1957-03-01,1998-12-31


In [6]:
df = sp_original.copy()

# First day of the month that follows each observation date: add one month,
# then truncate to month resolution.
df["next_month"] = (df["date"] + pd.DateOffset(months=1)).values.astype("datetime64[M]")

# A row counts as a member when next_month lies inside [start, ending]
# (both bounds inclusive).
df["is_member_next_month"] = df["next_month"].between(df["start"], df["ending"])

# One row per month, one column per ticker; several rows that land in the same
# cell are combined with max (True wins), and missing cells become False.
matrix = (
    df.pivot_table(
        values="is_member_next_month",
        index="next_month",
        columns="ticker",
        aggfunc="max",
        fill_value=False,
    )
    .infer_objects(copy=False)
    .rename_axis(index="date", columns="ticker")
)
print(matrix.shape)
matrix.head()

(420, 1224)


ticker,A,AAL,AAP,AAPL,ABBV,ABI,ABK,ABMD,ABNB,ABS,...,YELL,YHOO,YNR,YUM,ZBH,ZBRA,ZE,ZION,ZRN,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1989-02-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False
1989-03-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False
1989-04-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False
1989-05-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False
1989-06-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False


In [7]:
# Persist the month x ticker membership matrix (written to the current
# working directory) for the download step.
matrix.to_parquet("SP500_binary_matrix.parquet")

# 2. Download stocks data

In [1]:
import pandas as pd
import yfinance as yf
import akshare as ak
from tqdm import tqdm
import datetime as dt
import os

# Work inside the folder that holds one parquet file per ticker; later cells
# use bare "<ticker>.parquet" names and "../stocks_list/..." relative paths.
# NOTE(review): relative path — re-running this cell in the same kernel would
# presumably fail; confirm the expected start directory.
os.chdir("data/stocks_data/history_data")

In [2]:
# Reload the membership matrix written by the previous step (rows: month
# start dates, columns: tickers) and preview it.
SP500_binary_matrix = pd.read_parquet("../stocks_list/SP500_binary_matrix.parquet")
SP500_binary_matrix.head()

ticker,A,AAL,AAP,AAPL,ABBV,ABI,ABK,ABMD,ABNB,ABS,...,YELL,YHOO,YNR,YUM,ZBH,ZBRA,ZE,ZION,ZRN,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1989-02-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False
1989-03-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False
1989-04-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False
1989-05-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False
1989-06-01,False,True,False,True,False,True,False,False,False,True,...,True,False,False,False,False,False,True,False,True,False


In [6]:
# The matrix columns are the ticker symbols whose history is to be downloaded.
stock_ids = list(SP500_binary_matrix.columns)
print(len(stock_ids))
stock_ids[:5]

1224


['A', 'AAL', 'AAP', 'AAPL', 'ABBV']

In [7]:
# Names (without extension) of the parquet files already present in the
# working directory, i.e. the tickers that have been downloaded so far.
# os.path.splitext strips only the trailing ".parquet", so a ticker that
# itself contains a dot is kept whole instead of being cut at its first dot.
files = [os.path.splitext(x)[0] for x in os.listdir() if x.endswith(".parquet")]
print(len(files))
files[:5]

1086


['A', 'AAL', 'AAP', 'AAPL', 'ABBV']

In [5]:
# tickers in stock_ids that have no downloaded file yet
lst1 = list(set(stock_ids) - set(files))

# downloaded files whose ticker is not in stock_ids
lst2 = list(set(files) - set(stock_ids))

# sizes of the two differences (missing downloads, unexpected files)
print(len(lst1), len(lst2))

138 0


In [13]:
# Download the stock data using yfinance or akshare
# This will take one hour approximately

# ticker -> {"begin_date": ..., "end_date": ...} of the data obtained for it;
# both values are None when neither source returned usable rows.
onlist_stocks = {}

def download_stock(id):
    """Fetch the price history of one ticker and record its date coverage.

    yfinance is queried first (1989-01-01 .. 2023-12-31). When it returns an
    empty frame, akshare's ``stock_us_daily`` is tried instead; its columns
    are renamed to the yfinance-style ``Open/High/Low/Close/Volume`` with a
    ``Date`` index, and the rows are restricted to 1989-01-03 .. 2023-12-29.

    Side effect: ``onlist_stocks[id]`` is set to the covered date range, or to
    ``{"begin_date": None, "end_date": None}`` when no data is available.

    Parameters
    ----------
    id : str
        Ticker symbol. (The name shadows the built-in ``id``; it is kept so
        that existing keyword callers keep working.)

    Returns
    -------
    pandas.DataFrame or None
        The downloaded rows, or ``None`` when both sources fail or the
        fallback yields no row inside the requested window. An empty frame is
        never returned from the fallback, so callers that persist the result
        do not write empty files.
    """
    # download data from yahoo finance first
    data = yf.download(id, start="1989-01-01", end="2023-12-31", progress=False)
    if not data.empty:
        onlist_stocks[id] = {"begin_date": data.index.min(), "end_date": data.index.max()}
        return data

    # if the stock is delisted, try to download the data from akshare
    begin_date = end_date = None
    try:
        data = ak.stock_us_daily(symbol=id, adjust="")

        # adjust the format of the data to be the same as the one from yfinance
        data.rename(columns={"date": "Date", "open": "Open", "high": "High", "low": "Low", "close": "Close", "volume": "Volume"}, inplace=True)
        data.set_index("Date", inplace=True)

        if not data.empty:
            # filter the data to be within the range of our needs
            begin_date = max(data.index.min(), dt.datetime(1989, 1, 3))
            end_date = min(data.index.max(), dt.datetime(2023, 12, 29))
            data = data.loc[begin_date:end_date]
    except Exception:
        # Deliberate best effort: any failure of the fallback source just
        # marks the ticker as unavailable instead of aborting the whole run.
        data = None

    # No rows at all, or none inside the window: report "no data" consistently
    # rather than recording a date range for an empty frame.
    if data is None or data.empty:
        onlist_stocks[id] = {"begin_date": None, "end_date": None}
        return None

    onlist_stocks[id] = {"begin_date": begin_date, "end_date": end_date}
    return data

# Walk over every ticker: download and save the ones without a local file,
# and for already-saved tickers just record the cached date range.
for s_id in tqdm(stock_ids):
    parquet_name = f"{s_id}.parquet"

    if not os.path.exists(parquet_name):
        # download_stock records the date range itself; persist only real data
        data = download_stock(s_id)
        if data is not None:
            data.to_parquet(parquet_name)
        continue

    data = pd.read_parquet(parquet_name)
    onlist_stocks[s_id] = {"begin_date": data.index.min(), "end_date": data.index.max()}

  0%|          | 0/1224 [00:00<?, ?it/s]
1 Failed download:
['ABI']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')
  0%|          | 6/1224 [00:02<08:59,  2.26it/s]
1 Failed download:
['ACAS']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')
  1%|          | 13/1224 [00:05<08:01,  2.52it/s]
1 Failed download:
['ACK']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')
  2%|▏         | 29/1224 [00:06<02:58,  6.70it/s]Failed to get ticker 'AHM' reason: Expecting value: line 1 column 1 (char 0)

1 Failed download:
['AHM']: Exception('%ticker%: No timezone found, symbol may be delisted')
  5%|▌         | 65/1224 [00:07<01:01, 18.88it/s]
1 Failed download:
['ANDW']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')
  6%|▌         | 75/1224 [00:09<01:23, 13.76it/s]
1 Failed download:
['APCC']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')
  7%|▋         | 82/1224 [00:10<01:46, 10.68it/s]
1 Failed download:
['ARG']: JSONDecodeErro

In [14]:
# Turn the per-ticker date ranges into a table: the dict is keyed by ticker,
# so the transpose gives one row per ticker with begin_date / end_date columns.
# (The table is only built here; it is written to disk in a later cell.)
stock_info = pd.DataFrame(onlist_stocks).T
print(stock_info.shape)
stock_info.head()

(1224, 2)


Unnamed: 0,begin_date,end_date
A,1999-11-18 00:00:00,2023-12-29 00:00:00
AAL,2005-09-27 00:00:00,2023-12-29 00:00:00
AAP,2001-11-29 00:00:00,2023-12-29 00:00:00
AAPL,1989-01-03 00:00:00,2023-12-29 00:00:00
ABBV,2013-01-02 00:00:00,2023-12-29 00:00:00


In [18]:
# Write the per-ticker date ranges to the stocks_list folder (ticker is the index column).
stock_info.to_csv("../stocks_list/stock_info.csv")

In [17]:
# Non-null counts show how many tickers actually have a recorded date range.
stock_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1224 entries, A to ZTS
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   begin_date  1089 non-null   object
 1   end_date    1089 non-null   object
dtypes: object(2)
memory usage: 61.0+ KB


In [19]:
# Some tickers returned no data at all (treated here as delisted), so split
# the table by whether a begin_date was recorded and save each part.
has_data = stock_info["begin_date"].notnull()

stocks_onlisted = stock_info.loc[has_data]
stocks_onlisted.to_csv("../stocks_list/stocks_onlisted.csv")

stock_delisted = stock_info.loc[~has_data]
stock_delisted.to_csv("../stocks_list/stocks_delisted.csv")

In [20]:
# (rows, columns) of the with-data and without-data tables
stocks_onlisted.shape, stock_delisted.shape

((1089, 2), (135, 2))

# 3. Download other delisted stocks' data

In [16]:
import pandas as pd
import os

# Work from the stocks_data root.
# NOTE(review): relative path — assumes the kernel starts at the project
# root; re-running in the same kernel would presumably fail. Confirm.
os.chdir("data/stocks_data")

(1404,)

# 4. Check the data

In [1]:
import pandas as pd
import os

# Work from the stocks_data root so that "history_data/..." and
# "stocks_list/..." below resolve.
# NOTE(review): relative path — assumes the kernel starts at the project
# root; re-running in the same kernel would presumably fail. Confirm.
os.chdir("data/stocks_data")

In [11]:
# Tickers that have a parquet file in history_data.
# os.path.splitext strips only the trailing extension, so a ticker containing
# a dot is kept whole, and the extension filter skips stray non-parquet
# entries instead of turning them into bogus ticker names.
files = [os.path.splitext(x)[0] for x in os.listdir("history_data") if x.endswith(".parquet")]
print(len(files))
files[:5]

1089


['A', 'AAL', 'AAP', 'AAPL', 'ABBV']

In [3]:
# Tickers recorded as having data: the CSV's first column is the ticker index,
# so only the index labels are kept, as a plain list.
onlisted_stocks = pd.read_csv("stocks_list/stocks_onlisted.csv", index_col=0).index.to_list()
print(len(onlisted_stocks))
onlisted_stocks[:5]

1089


['A', 'AAL', 'AAP', 'AAPL', 'ABBV']

In [13]:
# Consistency check: (files without a listed ticker, listed tickers without a file).
# Both lists are empty when the folder and the CSV agree.
list(set(files) - set(onlisted_stocks)), list(set(onlisted_stocks) - set(files))

([], [])

In [4]:
# Tickers recorded as having no data: keep only the index labels of the CSV
# (its first column), as a plain list.
delisted_stocks = pd.read_csv("stocks_list/stocks_delisted.csv", index_col=0).index.to_list()
print(len(delisted_stocks))
delisted_stocks[:5]

135


['ABI', 'ACAS', 'ACK', 'AHM', 'ANDW']

In [14]:
# Collect the parquet files in history_data that contain no rows:
# ticker -> file name.
dic = {}
for file in os.listdir("history_data"):
    # splitext keeps tickers that contain a dot intact; entries that are not
    # parquet files are skipped instead of crashing read_parquet.
    ticker, extension = os.path.splitext(file)
    if extension != ".parquet":
        continue
    data = pd.read_parquet(f"history_data/{file}")
    if data.empty:
        dic[ticker] = file

In [15]:
# One row per ticker whose file was empty; the single "missing_values" column
# holds the file name. An empty table means no empty files were found.
df = pd.DataFrame(dic, index=["missing_values"]).T
df.head()

Unnamed: 0,missing_values


# 5. Add a return column

In [1]:
import pandas as pd
import os

# Work inside the per-ticker history folder; the files are rewritten in place below.
# NOTE(review): relative path — assumes the kernel starts at the project
# root; re-running in the same kernel would presumably fail. Confirm.
os.chdir("data/stocks_data/history_data")

In [2]:
# Every entry of the working directory (file names including the extension).
# NOTE(review): presumably the folder holds only parquet files — each entry is
# later passed to pd.read_parquet; confirm.
files = os.listdir()
files[:5]

['A.parquet', 'AAL.parquet', 'AAP.parquet', 'AAPL.parquet', 'ABBV.parquet']

In [3]:
import numpy as np

def calc_lag_return(close: pd.Series, lag: int) -> pd.Series:
    """Simple return of each price relative to the price ``lag`` rows earlier.

    For position ``i >= lag`` the value is ``close[i] / close[i - lag] - 1``.

    Parameters
    ----------
    close : pd.Series
        Prices; converted to float64 by position (the index is ignored).
    lag : int
        Number of rows to look back; must be >= 0. ``lag == 0`` compares each
        price with itself.

    Returns
    -------
    pd.Series
        Same length as ``close`` with a fresh default integer index (the input
        index is NOT carried over). The first ``lag`` entries are NaN, and an
        entry whose reference price is 0 is ``-inf`` (sentinel for "undefined").

    Raises
    ------
    ValueError
        If ``lag`` is negative.
    """
    if lag < 0:
        raise ValueError(f"lag must be non-negative, got {lag}")

    close = close.to_numpy(dtype=np.float64)

    # Pre-fill with -inf so positions skipped by `where` (zero reference
    # price) stay at the sentinel; -inf - 1 is still -inf.
    result = np.full_like(close, -np.inf)
    result[:lag] = np.nan

    # Slice the reference prices by length rather than with `close[:-lag]`,
    # which would be empty for lag == 0 and break the division.
    later = close[lag:]
    earlier = close[:later.size]
    np.divide(later, earlier, out=result[lag:], where=earlier != 0)
    return pd.Series(result - 1)

def generate_return(file_name):
    """Add a next-day return column to one parquet file, rewriting it in place.

    ``Return_tomorrow`` on a row is the one-row-lag return of ``Close``
    shifted up by one row, i.e. the return from that row to the following
    one. Rows containing NaN in any column are then dropped, which always
    includes the final row since it has no following row.

    The function is idempotent: a file that already has ``Return_tomorrow``
    is left untouched. Otherwise each re-run would recompute the column on
    the already-shortened data and drop the newest row again.

    Parameters
    ----------
    file_name : str
        Path of the parquet file; it must contain a ``Close`` column.
    """
    data = pd.read_parquet(file_name)

    # Already processed: do not truncate the file a second time.
    if "Return_tomorrow" in data.columns:
        return

    # .values: calc_lag_return returns a fresh integer index, so assign by
    # position rather than by label.
    data["Return_tomorrow"] = calc_lag_return(data["Close"], 1).shift(-1).values
    data.dropna(how="any", axis=0, inplace=True)
    data.to_parquet(file_name)

In [4]:
# Add the return column to every file in the folder, one at a time,
# printing the file name after each one is rewritten.
for file in files:
    generate_return(file)
    print(f"Finish {file}")

Finish A.parquet
Finish AAL.parquet
Finish AAP.parquet
Finish AAPL.parquet
Finish ABBV.parquet
Finish ABK.parquet
Finish ABMD.parquet
Finish ABNB.parquet
Finish ABS.parquet
Finish ABT.parquet
Finish ABX.parquet
Finish ACGL.parquet
Finish ACN.parquet
Finish ACS.parquet
Finish ACV.parquet
Finish ACY.parquet
Finish ADBE.parquet
Finish ADCT.parquet
Finish ADI.parquet
Finish ADM.parquet
Finish ADNT.parquet
Finish ADP.parquet
Finish ADPT.parquet
Finish ADS.parquet
Finish ADSK.parquet
Finish ADT.parquet
Finish AEE.parquet
Finish AEP.parquet
Finish AES.parquet
Finish AET.parquet
Finish AFL.parquet
Finish AFS.parquet
Finish AGC.parquet
Finish AGN.parquet
Finish AIG.parquet
Finish AIT.parquet
Finish AIV.parquet
Finish AIZ.parquet
Finish AJG.parquet
Finish AKAM.parquet
Finish AKS.parquet
Finish AL.parquet
Finish ALB.parquet
Finish ALGN.parquet
Finish ALK.parquet
Finish ALL.parquet
Finish ALLE.parquet
Finish ALTR.parquet
Finish ALXN.parquet
Finish AM.parquet
Finish AMAT.parquet
Finish AMCC.parquet

: 