In [1]:
import pandas as pd
import os
import sklearn
import numpy as np
from utils import split_dataset

Load data

In [2]:
covariates = pd.read_csv("./data/bigdata.csv", index_col="Dates", skiprows=5)
covariates.index = pd.to_datetime(covariates.index)

labels = pd.read_csv("./data/jkse.csv", skiprows=5, index_col="Dates")
labels.index = pd.to_datetime(labels.index)

labels = labels.rename(columns={"PX_LAST": "JKSE_PRICE"})
labels['PCT_CHANGE_20_JKSE'] = ((labels['JKSE_PRICE'].shift(-20) - labels['JKSE_PRICE']) / labels['JKSE_PRICE']) * 100

covlist = split_dataset(covariates)

Split the dataset per ticker

## Create technical covariates


### PE Band & P/E Ratio  
$$
 \text{P/E Ratio} = \frac{1}{\text{EARN\_YLD}} 
$$

* $PE Band_q $ is the q-th percentile of P/E Ratio in a 60 trading days rolling window

### Mean Average Convergence/Divergence (MACD)

- MACD is the difference of short term $\text{EMA}_{12}$ and long term $\text{EMA}_{26}$ (signal for momentum)
- MACD Signal is the signal line of 9 day EMA of the MACD
- MACD Histogram is the difference between the MACD line and the signal line

In [3]:
colnames = covlist[0].columns.tolist()
for cov in covlist:
    # unify column names
    cov.columns = colnames
for cov in covlist:
    # Volume = Turnover / Close Price
    cov['VOLUME'] = cov['TURNOVER'] / cov['PX_LAST']
    # Calculate percent change * 100
    cov['PCT_CHANGE_20'] = ((cov['PX_LAST'].shift(-20) - cov['PX_LAST']) / cov['PX_LAST']) * 100
    # Ratio 10/30 = mean volume ratio for the last 10 days / mean volume ratio for the last 30 days
    cov['VOL_RATIO_10_20'] = cov['VOLUME'].rolling(window=10).mean() / cov['VOLUME'].rolling(window=20).mean()
    cov['VOL_RATIO_20_40'] = cov['VOLUME'].rolling(window=20).mean() / cov['VOLUME'].rolling(window=40).mean()
    cov['VOL_RATIO_40_80'] = cov['VOLUME'].rolling(window=40).mean() / cov['VOLUME'].rolling(window=80).mean()
    cov['VOL_RATIO_80_120'] = cov['VOLUME'].rolling(window=80).mean() / cov['VOLUME'].rolling(window=120).mean()
    
    # PE Band
    cov['PE_Ratio'] = 1 / cov['EARN_YLD']
    win = 60  # Set the rolling window period
    cov['PE_Band_25'] = cov['PE_Ratio'].rolling(win).quantile(0.25)
    cov['PE_Band_50'] = cov['PE_Ratio'].rolling(win).quantile(0.50)
    cov['PE_Band_75'] = cov['PE_Ratio'].rolling(win).quantile(0.75)

    # Calculate the 12-day EMA of PX_LAST
    ema_12 = cov['PX_LAST'].ewm(span=12, adjust=False).mean()

    # Calculate the 26-day EMA of PX_LAST
    ema_26 = cov['PX_LAST'].ewm(span=26, adjust=False).mean()

    # Calculate MACD
    cov['MACD'] = ema_12 - ema_26

    # Calculate the Signal line (9-day EMA of MACD)
    cov['MACD_Signal'] = cov['MACD'].ewm(span=9, adjust=False).mean()

    # Optionally, you can also calculate the MACD Histogram (the difference between MACD and Signal line)
    cov['MACD_Histogram'] = cov['MACD'] - cov['MACD_Signal']
    
    # Example: Momentum Indicator for various lags
    lags = [10, 20, 30, 60, 120]
    for lag in lags:
        cov[f'MOMENTUM_{lag}'] = cov['PX_LAST'] / cov['PX_LAST'].shift(lag)
        cov[f'TURNOVER_{lag}'] = cov['TURNOVER'].rolling(window=lag).mean()
        cov[f'PX_MOMENTUM_{lag}'] = cov['PX_LAST'] / cov['PX_LAST'].shift(lag)
        cov[f'PX_REVERSAL_{lag}'] = cov['PX_LAST'].shift(lag) / cov['PX_LAST']
        cov[f'VOLATILITY_{lag}'] = cov['PX_LAST'].rolling(window=lag).std()
        cov[f'VOLUME_STD_{lag}'] = cov['VOLUME'].rolling(window=lag).std()

In [4]:
# Remove duplicate indices if any
labels = labels[~labels.index.duplicated(keep='first')]

for i in range(len(covlist)):
    cov = covlist[i]
    cov = cov[~cov.index.duplicated(keep='first')]

    # Explicitly create a copy of the cov DataFrame to avoid SettingWithCopyWarning
    cov_copy = cov.copy()

    # Align the DataFrames on their indices (dates)
    aligned_df = labels.join(cov_copy[['PCT_CHANGE_20']], how='inner')

    # Calculate the difference and store it in cov_copy DataFrame
    cov_copy.loc[aligned_df.index, 'DELTA_20_CHANGE'] = aligned_df['PCT_CHANGE_20'] - aligned_df['PCT_CHANGE_20_JKSE']

    # Update the original DataFrame in covlist
    covlist[i] = cov_copy


## This is for hmm clean

In [None]:
deltas = []
for cov in covlist:
    delta = cov[['DELTA_20_CHANGE']]
    deltas.append(delta)

In [None]:
delta_df = pd.concat(deltas, axis=1)
tickernames = [col[:4] for col in pd.read_csv("./data/bigdata.csv", skiprows=3).columns if not col.startswith("Unnamed")]
delta_df.columns = tickernames

In [None]:
delta_df.drop(['MDKA', 'ICBP', 'ARTO', 'PGEO'], axis=1, inplace=True)
delta_df.dropna(inplace=True, axis=0, how='all')

In [None]:
delta_df.to_csv('./covset0/unnorm/hmm.csv')

In [5]:
def unique_index(df, suffix):
    df_copy = df.copy()
    new_index = [f"{date.strftime('%Y-%m-%d')}-{suffix}" for date in df.index]
    df_copy.index = new_index
    return df_copy

Create reindexed covlist + pool to find quantiles + insert index back to other covariates

In [6]:
covlist_reindex = []
for idx, cov in enumerate(covlist):
    covlist_reindex.append(unique_index(cov, idx))
    
pooled_df = pd.DataFrame()
for i, df in enumerate(covlist_reindex):
    pooled_df = pd.concat([pooled_df, df['DELTA_20_CHANGE']])

pooled_df['DELTA_20_QUINTILES'] = pd.qcut(pooled_df[0], q=5, labels=range(1, 6))

for i, df in enumerate(covlist_reindex):
    df['DELTA_20_QUINTILES'] = pooled_df.loc[df.index, 'DELTA_20_QUINTILES']
    covlist_reindex[i] = df

Drop NaNs

In [8]:
for i in range(len(covlist_reindex)):
    covlist_reindex[i] = covlist_reindex[i].dropna()
    covlist_reindex[i] = covlist_reindex[i].reset_index(drop=True)

In [9]:
def rolling_window_save(train_window=252, test_window=60):
    os.makedirs('train_data', exist_ok=True)
    
    for cov in covlist_reindex:
        # Also removes labels 
        cov.drop(['DELTA_20_CHANGE', 'PCT_CHANGE_20'], axis=1, inplace=True)
        tdf = []
        vdf = []
        for start in range(0, len(cov) - train_window - test_window, train_window + test_window):
            end_train = start + train_window
            end_test = end_train + test_window
            
            train_df = cov.iloc[start:end_train]
            test_df = cov.iloc[end_train:end_test]
            
            tdf.append(train_df)
            vdf.append(test_df)
    
    return tdf, vdf

tdf, vdf = rolling_window_save(252, 60)

save data

In [11]:
folder_name = 'covset1'
os.makedirs(folder_name, exist_ok=True)
for i in range(len(tdf)):
    tdf[i].to_csv(f'{folder_name}/train_{i}.csv', index=False)
    vdf[i].to_csv(f'{folder_name}/valid_{i}.csv', index=False)

In [13]:
vdf[1]

Unnamed: 0,EARN_YLD,PX_TO_BOOK_RATIO,PX_TO_SALES_RATIO,RETURN_ON_ASSET,RETURN_COM_EQY,CUR_MKT_CAP,TRAIL_12M_NET_INC_GROWTH,OPER_INC_GROWTH,VWAP_STANDARD_DEV,PX_LAST,...,PX_REVERSAL_60,VOLATILITY_60,VOLUME_STD_60,MOMENTUM_120,TURNOVER_120,PX_MOMENTUM_120,PX_REVERSAL_120,VOLATILITY_120,VOLUME_STD_120,DELTA_20_QUINTILES
564,8.5248,4.18,3.317,28.5283,39.691,37442146.3,33.2473,6.9424,33.0,3250.0,...,1.292308,238.121829,7682616.0,0.975976,45029950000.0,0.975976,1.024615,297.92888,7513552.0,1
565,8.5248,4.18,3.317,28.5283,39.691,37442146.3,33.2473,6.9424,18.0,3250.0,...,1.276923,252.949326,7636200.0,0.970149,45071100000.0,0.970149,1.030769,299.437837,7500991.0,1
566,8.3956,4.2443,3.368,28.5283,39.691,38018179.32,33.2473,6.9424,41.0,3300.0,...,1.260606,263.891938,7611785.0,0.982143,45245540000.0,0.982143,1.018182,300.288085,7476876.0,1
567,8.4211,4.2315,3.3578,28.5283,39.691,37902972.72,33.2473,6.9424,14.0,3290.0,...,1.276596,273.323343,7601717.0,0.967647,45394480000.0,0.967647,1.033435,301.792544,7451115.0,1
568,8.4726,4.2057,3.3374,28.5283,39.691,37672559.51,33.2473,6.9424,16.0,3270.0,...,1.266055,298.500017,7497507.0,0.942363,46002910000.0,0.942363,1.061162,309.173063,7429558.0,1
569,8.1487,4.3729,3.4701,28.5283,39.691,39170245.36,33.2473,6.9424,35.0,3400.0,...,1.208824,302.401826,7481553.0,0.988372,46055700000.0,0.988372,1.011765,309.617773,7421879.0,1
570,8.3956,4.2443,3.368,28.5283,39.691,38018179.32,33.2473,6.9424,27.0,3300.0,...,1.257576,307.804509,7166938.0,0.962099,46211070000.0,0.962099,1.039394,311.249301,7450382.0,1
571,8.1969,4.3472,3.4497,28.5283,39.691,38939832.15,33.2473,6.9424,20.0,3380.0,...,1.221893,310.726277,7144827.0,0.991202,46335060000.0,0.991202,1.008876,311.599785,7430671.0,1
572,8.5511,4.1671,3.3068,28.5283,39.691,37326939.7,33.2473,6.9424,30.0,3240.0,...,1.287037,316.111795,7241065.0,0.95858,46739910000.0,0.95858,1.04321,313.546623,7452093.0,1
573,8.4986,4.1929,3.3272,28.5283,39.691,37557352.91,33.2473,6.9424,18.0,3260.0,...,1.282209,319.871266,7190958.0,0.947674,46725560000.0,0.947674,1.055215,315.834185,7452449.0,1
