In [1]:
import pandas as pd
import os
import sklearn
import numpy as np

Load data

In [2]:
# Covariate table: skip the first 5 lines of the file and index rows by the
# "Dates" column (presumably export header metadata is being skipped — confirm).
covariates = pd.read_csv("./data/bigdata.csv", skiprows=5, index_col="Dates")
covariates.index = pd.to_datetime(covariates.index)

# Label table: same file layout; PX_LAST is renamed so it cannot be confused
# with the per-ticker PX_LAST columns.
labels = pd.read_csv("./data/jkse.csv", index_col="Dates", skiprows=5).rename(
    columns={"PX_LAST": "JKSE_PRICE"}
)
labels.index = pd.to_datetime(labels.index)

# 20-row percentage change of the label price, expressed in percent
jkse_price = labels['JKSE_PRICE']
labels['PCT_CHANGE_20'] = jkse_price.pct_change(periods=20) * 100

Split the dataset per ticker

In [3]:
def split_dataset(df, n_tickers=27, cols_per_ticker=11):
    """
    Split a wide dataframe into one dataframe per ticker.

    The columns are treated as consecutive, equally sized groups: the first
    ``cols_per_ticker`` columns form the first ticker, the next
    ``cols_per_ticker`` columns the second, and so on.

    :param df: dataframe to be split
    :param n_tickers: number of ticker groups to extract (default 27)
    :param cols_per_ticker: number of columns in each ticker group (default 11)
    :return: list of dataframes, one per ticker, in column order; each one is
        an independent copy so adding or renaming columns on it does not touch
        ``df`` or trigger SettingWithCopyWarning
    """
    tickerlist = []
    for i in range(n_tickers):
        # Each ticker owns the half-open column range [start, end)
        start = i * cols_per_ticker
        end = start + cols_per_ticker
        tickerlist.append(df.iloc[:, start:end].copy())
    return tickerlist

# List of per-ticker dataframes produced by split_dataset from the covariate table
covlist = split_dataset(covariates)

Create technical covariates

In [4]:
# Every ticker frame is given the first ticker's column labels so the same
# names (PX_LAST, TURNOVER, ...) address the same field in each frame.
colnames = covlist[0].columns.tolist()
lags = [10, 20, 30, 60, 120]

for cov in covlist:
    cov.columns = colnames

for cov in covlist:
    price = cov['PX_LAST']

    # Volume is derived as turnover divided by the last price
    cov['VOLUME'] = cov['TURNOVER'] / price
    # 20-row percentage change of the price, in percent
    cov['PCT_CHANGE_20'] = price.pct_change(periods=20) * 100

    # Ratio of a shorter rolling mean of volume to a longer one,
    # e.g. VOL_RATIO_10_30 = 10-row mean / 30-row mean
    volume = cov['VOLUME']
    volume_mean = {window: volume.rolling(window=window).mean() for window in (10, 30, 60, 120)}
    for fast, slow in ((10, 30), (30, 60), (60, 120)):
        cov[f'VOL_RATIO_{fast}_{slow}'] = volume_mean[fast] / volume_mean[slow]

    # Per-lag price and volume indicators
    for lag in lags:
        lagged_price = price.shift(lag)
        # NOTE(review): MOMENTUM_<lag> and PX_MOMENTUM_<lag> are the same
        # expression, so the two columns are duplicates of each other.
        cov[f'MOMENTUM_{lag}'] = price / lagged_price
        cov[f'TURNOVER_{lag}'] = cov['TURNOVER'].rolling(window=lag).mean()
        cov[f'PX_MOMENTUM_{lag}'] = price / lagged_price
        cov[f'PX_REVERSAL_{lag}'] = lagged_price / price
        cov[f'VOLATILITY_{lag}'] = price.rolling(window=lag).std()
        cov[f'VOLUME_STD_{lag}'] = volume.rolling(window=lag).std()

In [5]:
# Keep only the first row for any date that appears more than once in the labels
labels = labels.loc[~labels.index.duplicated(keep='first')]

for pos, frame in enumerate(covlist):
    # Drop repeated dates, then take an independent copy so the column write
    # below cannot raise SettingWithCopyWarning
    deduped = frame.loc[~frame.index.duplicated(keep='first')].copy()

    # Inner-join on the date index; the label-side PCT_CHANGE_20 receives the
    # '_labels' suffix, the ticker-side column keeps its name
    joined = labels.join(deduped[['PCT_CHANGE_20']], how='inner', lsuffix='_labels')

    # Label change minus ticker change, written only on the dates both share
    gap = joined['PCT_CHANGE_20_labels'] - joined['PCT_CHANGE_20']
    deduped.loc[joined.index, 'DELTA_20_CHANGE'] = gap

    # Replace the list entry with the de-duplicated, extended frame
    covlist[pos] = deduped


In [6]:
def unique_index(df, suffix):
    """
    Return a copy of ``df`` whose index labels are ``"<YYYY-MM-DD>-<suffix>"`` strings.

    :param df: dataframe whose index entries provide ``strftime`` (e.g. a DatetimeIndex)
    :param suffix: value appended after the formatted date, separated by a dash
    :return: relabelled copy; ``df`` itself is left untouched
    """
    tag = f"-{suffix}"
    relabelled = df.copy()
    relabelled.index = [stamp.strftime('%Y-%m-%d') + tag for stamp in df.index]
    return relabelled

Reindex each ticker frame with a unique date-ticker key, pool DELTA_20_CHANGE across all tickers to compute quintiles, and write the quintile labels back to each ticker frame

In [7]:
# Give every ticker frame a "<date>-<list position>" index so rows from
# different tickers stay distinguishable once they are pooled.
covlist_reindex = [unique_index(cov, idx) for idx, cov in enumerate(covlist)]

# Pool DELTA_20_CHANGE across all tickers with a single concat. Growing a
# frame inside a loop copies the data on every iteration and depends on how
# pandas labels a Series appended to an empty DataFrame.
pooled_delta = pd.concat([df['DELTA_20_CHANGE'] for df in covlist_reindex])

# The values stay under column label 0 so pooled_df keeps its previous layout.
pooled_df = pooled_delta.to_frame(name=0)
# Five equal-frequency bins over the pooled values, labelled 1..5
pooled_df['DELTA_20_QUINTILES'] = pd.qcut(pooled_delta, q=5, labels=range(1, 6))

# Write each row's quintile label back onto its ticker frame (frames are
# modified in place, so the list entries do not need to be reassigned).
for df in covlist_reindex:
    df['DELTA_20_QUINTILES'] = pooled_df.loc[df.index, 'DELTA_20_QUINTILES']

Drop NaNs

In [8]:
# Remove every row containing a NaN, then replace the index with a fresh
# 0..n-1 range (the previous index labels are discarded).
for pos, frame in enumerate(covlist_reindex):
    covlist_reindex[pos] = frame.dropna().reset_index(drop=True)

In [9]:
def rolling_window_save(train_window=252, test_window=60, frames=None):
    """
    Cut every dataframe into consecutive train/test windows.

    For each frame, windows start at rows 0, ``train_window``,
    ``2 * train_window``, ... Each window yields ``train_window`` training rows
    immediately followed by ``test_window`` test rows. Only windows that fit
    completely inside the frame are produced.

    Despite its name this function does not write any files; it only makes
    sure the ``train_data`` directory exists and returns the windows.

    :param train_window: number of rows in each training slice (also the step
        between window starts)
    :param test_window: number of rows in each test slice
    :param frames: iterable of dataframes to window; defaults to the
        module-level ``covlist_reindex``
    :return: ``(tdf, vdf)`` — two equally long lists with the training and
        test slices of every window from every frame, in frame order
    """
    if frames is None:
        frames = covlist_reindex

    os.makedirs('train_data', exist_ok=True)

    # Accumulate across ALL frames; creating these lists inside the loop would
    # keep only the last frame's windows (and leave them undefined when there
    # are no frames at all).
    tdf = []
    vdf = []
    for cov in frames:
        # Largest start row for which train + test rows still fit in the frame
        last_start = len(cov) - train_window - test_window
        # +1 because range() excludes its stop value; without it a window that
        # fits exactly would be dropped
        for start in range(0, last_start + 1, train_window):
            end_train = start + train_window
            end_test = end_train + test_window

            tdf.append(cov.iloc[start:end_train])
            vdf.append(cov.iloc[end_train:end_test])

    return tdf, vdf

# Build the train/validation window lists: 252-row training slices, 60-row test slices
tdf, vdf = rolling_window_save(252, 60)

Save data

In [10]:
folder_name = 'covset0'
# Create the output folder up front: to_csv does not create missing
# directories, so a fresh run would otherwise fail on the first write.
os.makedirs(folder_name, exist_ok=True)

# Write each train/validation pair under a shared window number, without the index
for i, (train_df, valid_df) in enumerate(zip(tdf, vdf)):
    train_df.to_csv(f'{folder_name}/train_{i}.csv', index=False)
    valid_df.to_csv(f'{folder_name}/valid_{i}.csv', index=False)

In [11]:
# Display the column labels of the first training window
tdf[0].columns

Index(['EARN_YLD', 'PX_TO_BOOK_RATIO', 'PX_TO_SALES_RATIO', 'RETURN_ON_ASSET',
       'RETURN_COM_EQY', 'CUR_MKT_CAP', 'TRAIL_12M_NET_INC_GROWTH',
       'OPER_INC_GROWTH', 'VWAP_STANDARD_DEV', 'PX_LAST', 'TURNOVER', 'VOLUME',
       'PCT_CHANGE_20', 'VOL_RATIO_10_30', 'VOL_RATIO_30_60',
       'VOL_RATIO_60_120', 'MOMENTUM_10', 'TURNOVER_10', 'PX_MOMENTUM_10',
       'PX_REVERSAL_10', 'VOLATILITY_10', 'VOLUME_STD_10', 'MOMENTUM_20',
       'TURNOVER_20', 'PX_MOMENTUM_20', 'PX_REVERSAL_20', 'VOLATILITY_20',
       'VOLUME_STD_20', 'MOMENTUM_30', 'TURNOVER_30', 'PX_MOMENTUM_30',
       'PX_REVERSAL_30', 'VOLATILITY_30', 'VOLUME_STD_30', 'MOMENTUM_60',
       'TURNOVER_60', 'PX_MOMENTUM_60', 'PX_REVERSAL_60', 'VOLATILITY_60',
       'VOLUME_STD_60', 'MOMENTUM_120', 'TURNOVER_120', 'PX_MOMENTUM_120',
       'PX_REVERSAL_120', 'VOLATILITY_120', 'VOLUME_STD_120',
       'DELTA_20_CHANGE', 'DELTA_20_QUINTILES'],
      dtype='object')