In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
import os

In [2]:
# Directory holding the per-period input price tables (period_<n>.parquet).
dataset_path = "./data/stocks_data/single_period_data"
# Directory where the generated per-period feature tables are written.
features_path = "./data/model_data/period_features"
# exist_ok=True makes the creation idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(features_path, exist_ok=True)

In [3]:
def load_data(num_period: int, dataset_path: str = dataset_path, min_days: int = 600) -> pd.DataFrame:
    """Load one period's price table and keep tickers with enough history.

    Reads ``{dataset_path}/period_{num_period}.parquet``, indexes it by the
    ``Date`` column and keeps only the rows of tickers that have at least
    ``min_days`` non-null ``Adj_Close`` values. The number of retained tickers
    and the date range of the (unfiltered) table are printed.

    Parameters
    ----------
    num_period : int
        Period number used to build the parquet file name.
    dataset_path : str
        Directory containing the period files. The default is the value the
        module-level ``dataset_path`` had when this function was defined.
    min_days : int, default 600
        Minimum number of non-null ``Adj_Close`` rows a ticker must have to be
        kept.

    Returns
    -------
    pd.DataFrame
        The period table indexed by ``Date``, restricted to retained tickers.
    """
    data = pd.read_parquet(f"{dataset_path}/period_{num_period}.parquet")
    data = data.set_index("Date")
    # count() skips NaN, so this is the number of rows with a usable price.
    useful_days = data.groupby("Ticker")["Adj_Close"].count()
    reserved_stocks = useful_days[useful_days >= min_days].index
    print(f"Number of stocks: {len(reserved_stocks)}")
    # Reported before filtering, i.e. over every ticker in the file.
    print(f"Date range: {data.index.min()} - {data.index.max()}")
    data = data[data["Ticker"].isin(reserved_stocks)]
    return data

In [12]:
def calc_log_rtn(close, n=1):
    """Compute the n-step log return ``log(close[t] / close[t - n])``.

    Parameters
    ----------
    close : array-like
        Price series; converted to a float64 NumPy array.
    n : int, default 1
        Number of steps between the two prices being compared.

    Returns
    -------
    np.ndarray
        Array of the same length as ``close``. Entry ``t`` is NaN when
        ``t < n``, when ``close[t - n]`` is zero, or when the price ratio is
        NaN, zero or negative (the logarithm is undefined there).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    # convert the close price to numpy array
    close = np.array(close, dtype=np.float64)

    # check n
    if n < 1:
        raise ValueError("n must be at least 1")

    # Price ratio from the n-th element onward; positions with a zero
    # denominator are skipped and keep their NaN placeholder.
    ratio = np.full_like(close, np.nan)
    np.divide(close[n:], close[:-n], out=ratio[n:], where=(close[:-n] != 0))

    # Only take the log where it is defined. NaN never compares equal to
    # anything, so a `ratio != np.nan` test is always True and filters
    # nothing; test for NaN explicitly and require a strictly positive ratio.
    valid = np.zeros(ratio.shape, dtype=bool)
    np.greater(ratio, 0.0, out=valid, where=~np.isnan(ratio))

    log_result = np.full_like(close, np.nan)
    np.log(ratio, out=log_result, where=valid)

    return log_result


def calc_percent_return(close: pd.Series, lag: int) -> np.ndarray:
    """Compute the simple return ``close[t] / close[t - lag] - 1``.

    Parameters
    ----------
    close : pd.Series
        Price series; read as a float64 NumPy array.
    lag : int
        Number of steps between the two prices being compared.

    Returns
    -------
    np.ndarray
        Array of the same length as ``close``. The first ``lag`` entries are
        NaN. Entries whose denominator ``close[t - lag]`` is zero are ``-inf``
        (the fill value is left in place for them).

    Raises
    ------
    ValueError
        If ``lag`` is smaller than 1; a non-positive lag would otherwise slice
        the series incorrectly and return meaningless values.
    """
    if lag < 1:
        raise ValueError("lag must be at least 1")
    prices = close.to_numpy(dtype=np.float64)
    # -inf is the sentinel that survives wherever the division is skipped.
    result = np.full_like(prices, -np.inf)
    result[:lag] = np.nan
    np.divide(prices[lag:], prices[:-lag], out=result[lag:], where=prices[:-lag] != 0)
    return result - 1


def generate_features(period_data: pd.DataFrame, lags: list) -> pd.DataFrame:
    """Build the lagged log-return feature table and binary target.

    For every ``lag`` in ``lags`` a column ``rtn_{lag}`` is added holding the
    per-ticker log return of ``Adj_Close`` over ``lag`` rows (computed by
    ``calc_log_rtn``). Rows containing any NaN are dropped. ``Target`` is 1
    when a row's ``Return_tomorrow`` is strictly greater than the median
    ``Return_tomorrow`` of all remaining rows sharing its ``Date``, else 0.

    NOTE(review): lags are counted in rows within each ticker, so this assumes
    each ticker's rows are already in chronological order — confirm upstream.

    Parameters
    ----------
    period_data : pd.DataFrame
        Table indexed by ``Date`` with ``Ticker``, ``Adj_Close`` and
        ``Return_tomorrow`` columns.
    lags : list
        Row lags for which a return column is generated.

    Returns
    -------
    pd.DataFrame
        Feature table sorted by ``Date`` then ``Ticker`` with a fresh index.
    """
    prices_by_ticker = period_data.groupby("Ticker")["Adj_Close"]
    table = period_data[["Ticker", "Return_tomorrow"]].copy().reset_index()

    for lag in tqdm(lags):
        lagged = prices_by_ticker.transform(lambda prices: calc_log_rtn(prices, lag))
        # transform keeps period_data's row order, so positional assignment
        # lines up with the reset-index table.
        table[f"rtn_{lag}"] = lagged.values

    table = table.dropna(how="any", axis=0)

    # Cross-sectional median of next-day returns for each date.
    daily_median = table.groupby("Date")["Return_tomorrow"].transform("median")
    beats_median = table["Return_tomorrow"] > daily_median
    table = table.assign(Target=beats_median.astype(int))

    ordered = table.sort_values(["Date", "Ticker"])
    return ordered.reset_index(drop=True)

In [13]:
# Every daily lag from 1 to 20, then every 20th lag from 40 up to 240.
lags = [*range(1, 21), *range(40, 241, 20)]
print(lags[:5], lags[-5:], len(lags))

[1, 2, 3, 4, 5] [160, 180, 200, 220, 240] 31


In [14]:
# Load period 0 (load_data also prints the ticker count and date range)
# and preview the first rows of the filtered price table.
period_data = load_data(0)
period_data.head()

Number of stocks: 461
Date range: 1996-07-05 00:00:00 - 2000-06-20 00:00:00


Unnamed: 0_level_0,Ticker,Adj_Close,Return_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1996-07-05,AAPL,0.147199,-0.019417
1996-07-08,AAPL,0.144369,-0.006557
1996-07-09,AAPL,0.143425,-0.013244
1996-07-10,AAPL,0.141538,-0.047794
1996-07-11,AAPL,0.134933,0.010434


In [15]:
# Build the feature table for the period loaded above, one rtn_<lag> column
# per entry in `lags`, and preview the first rows.
features = generate_features(period_data, lags)
features.head()

  0%|          | 0/31 [00:00<?, ?it/s]

100%|██████████| 31/31 [00:03<00:00,  8.96it/s]


Unnamed: 0,Date,Ticker,Return_tomorrow,rtn_1,rtn_2,rtn_3,rtn_4,rtn_5,rtn_6,rtn_7,...,rtn_80,rtn_100,rtn_120,rtn_140,rtn_160,rtn_180,rtn_200,rtn_220,rtn_240,Target
0,1997-06-17,AAPL,-0.025169,0.053003,0.033043,0.017357,0.001914,0.005753,-0.017068,-0.024558,...,-0.00191,-0.05397,-0.34707,-0.425031,-0.40483,-0.305687,-0.394574,-0.280005,-0.17657,0
1,1997-06-17,ABMD,-0.047931,0.018867,0.0,-0.018518,-0.072098,0.184177,0.118942,0.184177,...,0.129609,0.108555,0.047931,-0.018518,-0.131117,-0.209898,0.067733,0.028513,-0.131117,0
2,1997-06-17,ABT,-0.009607,-0.011406,-0.01518,-0.017062,-0.003817,0.011539,0.005753,0.021258,...,0.133135,0.195983,0.228583,0.15549,0.287136,0.302109,0.392356,0.386817,0.449313,0
3,1997-06-17,ACGL,0.019293,0.0,-0.012903,0.0,0.016367,0.04652,0.03974,0.053346,...,0.124298,0.146604,0.060219,0.095311,0.060219,0.013072,0.131678,0.081126,0.0,1
4,1997-06-17,ADBE,-0.032088,0.004522,0.007547,0.001505,-0.004501,0.001505,0.001505,-0.036904,...,0.120835,0.131057,0.066078,0.053339,0.210659,0.111988,0.17919,0.301014,0.172047,0


In [8]:
# Generate and persist the feature table for periods 0 through 24.
for period in range(25):
    print(f"Processing period {period}")
    # Load the filtered prices, derive the features, then write them to
    # features_path as features_period_<period>.parquet.
    period_data = load_data(period)
    features = generate_features(period_data, lags)
    features.to_parquet(f"{features_path}/features_period_{period}.parquet")

Number of stocks: 461
Date range: 1996-07-05 00:00:00 - 2000-06-20 00:00:00


100%|██████████| 31/31 [00:02<00:00, 11.67it/s]


Number of stocks: 472
Date range: 1997-07-01 00:00:00 - 2001-06-18 00:00:00


100%|██████████| 31/31 [00:02<00:00, 13.41it/s]


Number of stocks: 495
Date range: 1998-06-29 00:00:00 - 2002-06-20 00:00:00


100%|██████████| 31/31 [00:02<00:00, 12.66it/s]


Number of stocks: 515
Date range: 1999-06-25 00:00:00 - 2003-06-18 00:00:00


100%|██████████| 31/31 [00:02<00:00, 12.34it/s]


Number of stocks: 531
Date range: 2000-06-21 00:00:00 - 2004-06-16 00:00:00


100%|██████████| 31/31 [00:02<00:00, 11.92it/s]


Number of stocks: 538
Date range: 2001-06-19 00:00:00 - 2005-06-13 00:00:00


100%|██████████| 31/31 [00:03<00:00, 10.06it/s]


Number of stocks: 545
Date range: 2002-06-21 00:00:00 - 2006-06-09 00:00:00


100%|██████████| 31/31 [00:03<00:00, 10.14it/s]


Number of stocks: 568
Date range: 2003-06-19 00:00:00 - 2007-06-08 00:00:00


100%|██████████| 31/31 [00:03<00:00,  9.44it/s]


Number of stocks: 587
Date range: 2004-06-17 00:00:00 - 2008-06-05 00:00:00


100%|██████████| 31/31 [00:02<00:00, 10.82it/s]


Number of stocks: 606
Date range: 2005-06-14 00:00:00 - 2009-06-03 00:00:00


100%|██████████| 31/31 [00:02<00:00, 10.56it/s]


Number of stocks: 625
Date range: 2006-06-12 00:00:00 - 2010-06-01 00:00:00


100%|██████████| 31/31 [00:03<00:00, 10.22it/s]


Number of stocks: 640
Date range: 2007-06-11 00:00:00 - 2011-05-26 00:00:00


100%|██████████| 31/31 [00:03<00:00,  8.91it/s]


Number of stocks: 647
Date range: 2008-06-06 00:00:00 - 2012-05-23 00:00:00


100%|██████████| 31/31 [00:03<00:00,  9.81it/s]


Number of stocks: 668
Date range: 2009-06-04 00:00:00 - 2013-05-23 00:00:00


100%|██████████| 31/31 [00:03<00:00,  9.57it/s]


Number of stocks: 688
Date range: 2010-06-02 00:00:00 - 2014-05-21 00:00:00


100%|██████████| 31/31 [00:03<00:00,  9.13it/s]


Number of stocks: 705
Date range: 2011-05-27 00:00:00 - 2015-05-19 00:00:00


100%|██████████| 31/31 [00:03<00:00,  8.05it/s]


Number of stocks: 715
Date range: 2012-05-24 00:00:00 - 2016-05-16 00:00:00


100%|██████████| 31/31 [00:03<00:00,  7.93it/s]


Number of stocks: 723
Date range: 2013-05-24 00:00:00 - 2017-05-12 00:00:00


100%|██████████| 31/31 [00:04<00:00,  7.46it/s]


Number of stocks: 726
Date range: 2014-05-22 00:00:00 - 2018-05-10 00:00:00


100%|██████████| 31/31 [00:03<00:00,  7.93it/s]


Number of stocks: 724
Date range: 2015-05-20 00:00:00 - 2019-05-09 00:00:00


100%|██████████| 31/31 [00:03<00:00,  8.73it/s]


Number of stocks: 700
Date range: 2016-05-17 00:00:00 - 2020-05-06 00:00:00


100%|██████████| 31/31 [00:04<00:00,  7.22it/s]


Number of stocks: 692
Date range: 2017-05-15 00:00:00 - 2021-05-04 00:00:00


100%|██████████| 31/31 [00:03<00:00,  8.01it/s]


Number of stocks: 700
Date range: 2018-05-11 00:00:00 - 2022-04-29 00:00:00


100%|██████████| 31/31 [00:03<00:00,  8.96it/s]


Number of stocks: 700
Date range: 2019-05-10 00:00:00 - 2023-04-28 00:00:00


100%|██████████| 31/31 [00:03<00:00,  8.87it/s]


Number of stocks: 686
Date range: 2020-05-07 00:00:00 - 2024-04-26 00:00:00


100%|██████████| 31/31 [00:03<00:00,  9.14it/s]


In [9]:
# Sanity check: reload the saved period-0 features, keep only the label-related
# columns, and recompute the per-date median of Return_tomorrow next to Target.
data = (
    pd.read_parquet(f"{features_path}/features_period_0.parquet")
    .loc[:, ["Date", "Ticker", "Return_tomorrow", "Target"]]
    .assign(
        median=lambda frame: frame.groupby("Date")["Return_tomorrow"].transform("median")
    )
)
data.head()

Unnamed: 0,Date,Ticker,Return_tomorrow,Target,median
0,1997-06-17,AAPL,-0.025169,0,0.0
1,1997-06-17,ABMD,-0.047931,0,0.0
2,1997-06-17,ABT,-0.009607,0,0.0
3,1997-06-17,ACGL,0.019293,1,0.0
4,1997-06-17,ADBE,-0.032088,0,0.0


In [10]:
# Inspect the last rows of the same check table (latest date in the period).
data.tail()

Unnamed: 0,Date,Ticker,Return_tomorrow,Target,median
343025,2000-06-20,XRAY,-0.001974,0,0.0
343026,2000-06-20,XRX,-0.033388,0,0.0
343027,2000-06-20,YUM,0.011508,1,0.0
343028,2000-06-20,ZBRA,-0.010695,0,0.0
343029,2000-06-20,ZION,-0.022093,0,0.0
