In [11]:
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
import os

In [12]:
# Directory the per-period parquet inputs are read from.
dataset_path = "./data/stocks_data/single_period_data"
# Directory the generated feature files are written to.
features_path = "./data/model_data/period_features"
# exist_ok=True keeps this cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(features_path, exist_ok=True)

In [13]:
def load_data(
    num_period: int,
    dataset_path: str = dataset_path,
    required_days: int = 1000,
) -> pd.DataFrame:
    """Load one period's data and keep only stocks with a complete history.

    Parameters
    ----------
    num_period : int
        Period number; the file read is
        ``{dataset_path}/period_{num_period}.parquet``.
    dataset_path : str
        Directory holding the per-period parquet files. The default is bound
        to the module-level ``dataset_path`` at function-definition time.
    required_days : int, default 1000
        Exact number of non-null ``Open`` values a stock must have in the
        period to be kept.

    Returns
    -------
    pd.DataFrame
        The rows of the loaded frame whose ``Stock`` passed the filter.

    Notes
    -----
    Prints the number of retained stocks and the date range of the loaded
    (unfiltered) frame.
    """
    data = pd.read_parquet(f"{dataset_path}/period_{num_period}.parquet")
    # count() skips NaN, so this is the number of rows with a valid Open per stock.
    open_counts = data.groupby("Stock")["Open"].count()
    reserved_stocks = open_counts[open_counts == required_days].index
    print(f"Number of stocks: {len(reserved_stocks)}")
    # The range is taken from the index before the stock filter is applied.
    print(f"Date range: {data.index.min()} - {data.index.max()}")
    return data[data["Stock"].isin(reserved_stocks)]

In [14]:
# Load period 31 as the worked example and preview its first rows.
test_df = load_data(num_period=31)
test_df.head()

Number of stocks: 437
Date range: 2020-01-09 00:00:00 - 2023-12-28 00:00:00


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Return_tomorrow,Stock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-09,86.459999,87.699997,86.169998,87.269997,1912700.0,0.003667,A
2020-01-10,87.720001,88.239998,87.32,87.589996,1417000.0,-0.001484,A
2020-01-13,87.809998,88.32,86.739998,87.459999,1630200.0,0.00606,A
2020-01-14,87.269997,88.209999,86.699997,87.989998,1675200.0,0.00716,A
2020-01-15,87.629997,89.110001,87.550003,88.620003,1630400.0,0.009479,A


In [15]:
def calc_log_rtn(close, n=1):
    """Compute the n-step log return ``log(close[t] / close[t - n])``.

    Parameters
    ----------
    close : array-like
        Price series; converted to a float64 NumPy array.
    n : int, default 1
        Look-back distance in rows. Must be at least 1.

    Returns
    -------
    np.ndarray
        Array shaped like ``close``. The first ``n`` entries are NaN, and so
        is every position where either of the two prices involved is NaN or
        not strictly positive (the log return is undefined there).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    # Validate before doing any work.
    if n < 1:
        raise ValueError("n must be at least 1")

    prices = np.asarray(close, dtype=np.float64)
    log_rtn = np.full_like(prices, np.nan)

    prev, curr = prices[:-n], prices[n:]
    # Comparisons with NaN are False, so this single mask rejects NaN, zero
    # and negative prices; np.log is therefore never evaluated on a
    # non-positive ratio and no RuntimeWarning is raised.
    valid = (prev > 0) & (curr > 0)

    # Masked-out slots keep the NaN they were initialised with.
    ratio = np.divide(curr, prev, out=np.full(curr.shape, np.nan), where=valid)
    np.log(ratio, out=log_rtn[n:], where=valid)

    return log_rtn


def calc_percent_return(close: pd.Series, lag: int) -> np.ndarray:
    """Compute the simple return over ``lag`` rows: ``close[t] / close[t - lag] - 1``.

    Parameters
    ----------
    close : pd.Series
        Price series (any array-like convertible to float64 is accepted).
    lag : int
        Look-back distance in rows. Must be at least 1.

    Returns
    -------
    np.ndarray
        Float64 array with the same length as ``close``. The first ``lag``
        entries are NaN. Where the lagged price is exactly 0 the entry is
        ``-inf``.

    Raises
    ------
    ValueError
        If ``lag`` is smaller than 1. A zero lag would produce mismatched
        slices, and a negative lag would silently divide by *future* prices.

    Notes
    -----
    NOTE(review): the ``-inf`` sentinel is not treated as missing by
    ``DataFrame.dropna`` under default pandas options, so such rows survive a
    later ``dropna`` -- confirm this is intended.
    """
    if lag < 1:
        raise ValueError("lag must be at least 1")

    prices = np.asarray(close, dtype=np.float64)
    # Start from -inf so positions skipped by the `where` mask keep the sentinel.
    ratio = np.full_like(prices, -np.inf)
    # Warm-up rows have no lagged price to compare against.
    ratio[:lag] = np.nan

    prev = prices[:-lag]
    np.divide(prices[lag:], prev, out=ratio[lag:], where=prev != 0)
    return ratio - 1


def generate_features(period_data: pd.DataFrame, lags: list) -> pd.DataFrame:
    """Build lagged-return features and a binary above-median target.

    Parameters
    ----------
    period_data : pd.DataFrame
        Frame with ``Stock``, ``Close`` and ``Return_tomorrow`` columns. Its
        index must become a ``Date`` column after ``reset_index()``.
        NOTE(review): returns are computed row-wise within each stock, so rows
        are presumably already in chronological order per stock -- confirm.
    lags : list
        Look-back distances; one ``rtn_{lag}`` column is produced per entry.

    Returns
    -------
    pd.DataFrame
        Columns ``Date``, ``Stock``, ``Return_tomorrow``, ``rtn_*`` and
        ``target``, sorted by ``Date`` then ``Stock`` with a fresh integer
        index. Rows containing any NaN are removed. ``target`` is 1 when a
        row's ``Return_tomorrow`` is strictly above that date's median
        ``Return_tomorrow`` across the remaining rows, otherwise 0.
    """
    features = period_data[["Stock", "Return_tomorrow"]].copy().reset_index()

    # The grouping does not depend on the lag, so build it once.
    close_by_stock = period_data.groupby("Stock")["Close"]
    for lag in tqdm(lags):
        # transform() keeps the original row order; .values assigns by
        # position because `features` has a fresh integer index.
        features[f"rtn_{lag}"] = close_by_stock.transform(
            lambda x: calc_percent_return(x, lag)
        ).values

    features = features.dropna(how="any", axis=0)

    # Cross-sectional median per date, computed after incomplete rows are removed.
    daily_median = features.groupby("Date")["Return_tomorrow"].transform("median")
    features = features.assign(
        target=(features["Return_tomorrow"] > daily_median).astype(int)
    )
    return features.sort_values(["Date", "Stock"]).reset_index(drop=True)

In [16]:
# Every lag from 1 to 20, then every 20th lag from 40 through 240.
lags = [*range(1, 21), *range(40, 241, 20)]
print(lags[:5], lags[-5:], len(lags))

[1, 2, 3, 4, 5] [160, 180, 200, 220, 240] 31


In [17]:
# Build the lagged-return features and the target for the example period.
features = generate_features(period_data=test_df, lags=lags)
features.head()

  0%|          | 0/31 [00:00<?, ?it/s]

100%|██████████| 31/31 [00:04<00:00,  6.64it/s]


Unnamed: 0,Date,Stock,Return_tomorrow,rtn_1,rtn_2,rtn_3,rtn_4,rtn_5,rtn_6,rtn_7,...,rtn_80,rtn_100,rtn_120,rtn_140,rtn_160,rtn_180,rtn_200,rtn_220,rtn_240,target
0,2020-12-21,A,-0.003481,-0.012741,-0.010003,0.004606,0.003322,0.008909,-0.005908,-0.008669,...,0.189457,0.22777,0.336738,0.30158,0.542229,0.583916,0.664735,0.416136,0.349605,1
1,2020-12-21,AAL,-0.038509,-0.024833,-0.041667,-0.045077,-0.053498,-0.035928,-0.055718,-0.105058,...,0.212349,0.440072,0.256831,0.35865,0.692955,0.694737,0.091525,-0.432699,-0.423971,0
2,2020-12-21,AAPL,0.028465,0.012395,-0.003652,0.003286,0.002737,0.052964,0.047545,0.04049,...,0.025758,0.333091,0.408695,0.577633,0.723753,0.954204,0.927039,0.602725,0.656558,1
3,2020-12-21,ABBV,-0.008774,-0.007085,-0.01125,-0.008888,0.008656,-0.004702,-0.024732,-0.035166,...,0.099788,0.079863,0.046096,0.14105,0.21483,0.36947,0.216254,0.12374,0.149651,0
4,2020-12-21,ABT,0.001758,-0.008076,-0.006343,0.006987,0.005956,0.012173,0.009998,0.014739,...,-0.028754,0.064611,0.179507,0.182604,0.161883,0.306539,0.403402,0.230113,0.250463,1


In [18]:
# Sanity check: distribution of the number of non-null rtn_1 rows per stock.
(
    features
    .groupby("Stock")["rtn_1"]
    .count()
    .value_counts()
)

rtn_1
760    437
Name: count, dtype: int64

In [19]:
# Persist the generated features; the period number in the file name is hard-coded.
features.to_parquet(path=f"{features_path}/features_period_31.parquet")

In [20]:
# Reload the saved features and put each date's median Return_tomorrow next to
# the stored target so the labelling can be inspected by eye.
data = (
    pd.read_parquet(f"{features_path}/features_period_31.parquet")
    .loc[:, ["Date", "Stock", "Return_tomorrow", "target"]]
    .sort_values(by=["Date", "Stock"])
)
data = data.assign(
    median=data.groupby("Date")["Return_tomorrow"].transform("median")
)
data.head()

Unnamed: 0,Date,Stock,Return_tomorrow,target,median
0,2020-12-21,A,-0.003481,1,-0.004234
1,2020-12-21,AAL,-0.038509,0,-0.004234
2,2020-12-21,AAPL,0.028465,1,-0.004234
3,2020-12-21,ABBV,-0.008774,0,-0.004234
4,2020-12-21,ABT,0.001758,1,-0.004234
