In [1]:
import pandas as pd
import os
import sklearn
import numpy as np

Load data

In [2]:
covariates = pd.read_csv("./data/bigdata.csv", index_col="Dates", skiprows=5)
covariates.index = pd.to_datetime(covariates.index)

labels = pd.read_csv("./data/jkse.csv", skiprows=5, index_col="Dates")
labels.index = pd.to_datetime(labels.index)

labels = labels.rename(columns={"PX_LAST": "JKSE_PRICE"})
labels['PCT_CHANGE_20'] = labels['JKSE_PRICE'].pct_change(periods=20) * 100

Split the dataset per ticker

In [3]:
def split_dataset(df):
    """
    splits dataset per ticker 
    :param df: dataframe to be split
    :return: list of dataframe split per ticker
    """
    tickerlist = []
    for i in range(27):
        start = i * 11
        end = start * 2
        ticker = df.iloc[:, 11:22]
        
        if i == 0:
            ticker = df.iloc[:, :11]
            
        tickerlist.append(ticker)
    return tickerlist

covlist = split_dataset(covariates)

Create technical covariates

In [4]:
colnames = covlist[0].columns.tolist()
for cov in covlist:
    # unify column names
    cov.columns = colnames
for cov in covlist:
    # Volume = Turnover / Close Price
    cov['VOLUME'] = cov['TURNOVER'] / cov['PX_LAST']
    # Calculate percent change * 100
    cov['PCT_CHANGE_20'] = cov['PX_LAST'].pct_change(periods=20) * 100
    # Ratio 10/30 = mean volume ratio for the last 10 days / mean volume ratio for the last 30 days
    cov['VOL_RATIO_10_30'] = cov['VOLUME'].rolling(window=10).mean() / cov['VOLUME'].rolling(window=30).mean()
    cov['VOL_RATIO_30_60'] = cov['VOLUME'].rolling(window=30).mean() / cov['VOLUME'].rolling(window=60).mean()
    cov['VOL_RATIO_60_120'] = cov['VOLUME'].rolling(window=60).mean() / cov['VOLUME'].rolling(window=120).mean()
    # Example: Momentum Indicator for various lags
    lags = [10, 20, 30, 60, 120]
    for lag in lags:
        cov[f'MOMENTUM_{lag}'] = cov['PX_LAST'] / cov['PX_LAST'].shift(lag)
        cov[f'TURNOVER_{lag}'] = cov['TURNOVER'].rolling(window=lag).mean()
        cov[f'PX_MOMENTUM_{lag}'] = cov['PX_LAST'] / cov['PX_LAST'].shift(lag)
        cov[f'PX_REVERSAL_{lag}'] = cov['PX_LAST'].shift(lag) / cov['PX_LAST']
        cov[f'VOLATILITY_{lag}'] = cov['PX_LAST'].rolling(window=lag).std()
        cov[f'VOLUME_STD_{lag}'] = cov['VOLUME'].rolling(window=lag).std()

In [5]:
# Remove duplicate indices if any
labels = labels[~labels.index.duplicated(keep='first')]

for i in range(len(covlist)):
    cov = covlist[i]
    cov = cov[~cov.index.duplicated(keep='first')]

    # Explicitly create a copy of the cov DataFrame to avoid SettingWithCopyWarning
    cov_copy = cov.copy()

    # Align the DataFrames on their indices (dates)
    aligned_df = labels.join(cov_copy[['PCT_CHANGE_20']], how='inner', lsuffix='_labels')

    # Calculate the difference and store it in cov_copy DataFrame
    cov_copy.loc[aligned_df.index, 'DELTA_20_CHANGE'] = aligned_df['PCT_CHANGE_20_labels'] - aligned_df['PCT_CHANGE_20']

    # Update the original DataFrame in covlist
    covlist[i] = cov_copy


In [6]:
def unique_index(df, suffix):
    df_copy = df.copy()
    new_index = [f"{date.strftime('%Y-%m-%d')}-{suffix}" for date in df.index]
    df_copy.index = new_index
    return df_copy

Create reindexed covlist + pool to find quantiles + insert index back to other covariates

In [7]:
covlist_reindex = []
for idx, cov in enumerate(covlist):
    covlist_reindex.append(unique_index(cov, idx))
    
pooled_df = pd.DataFrame()
for i, df in enumerate(covlist_reindex):
    pooled_df = pd.concat([pooled_df, df['DELTA_20_CHANGE']])

pooled_df['DELTA_20_QUINTILES'] = pd.qcut(pooled_df[0], q=5, labels=range(1, 6))

for i, df in enumerate(covlist_reindex):
    df['DELTA_20_QUINTILES'] = pooled_df.loc[df.index, 'DELTA_20_QUINTILES']
    covlist_reindex[i] = df

Drop NaNs

In [8]:
for i in range(len(covlist_reindex)):
    covlist_reindex[i] = covlist_reindex[i].dropna()
    covlist_reindex[i] = covlist_reindex[i].reset_index(drop=True)

In [14]:
def rolling_window_save(train_window=252, test_window=60):
    os.makedirs('train_data', exist_ok=True)
    
    for cov in covlist_reindex:
        tdf = []
        vdf = []
        for start in range(0, len(cov) - train_window - test_window, train_window + test_window):
            end_train = start + train_window
            end_test = end_train + test_window
            
            train_df = cov.iloc[start:end_train]
            test_df = cov.iloc[end_train:end_test]
            
            tdf.append(train_df)
            vdf.append(test_df)
    
    return tdf, vdf

tdf, vdf = rolling_window_save(252, 60)

In [15]:
len(tdf)

11

In [19]:
tdf[1]

Unnamed: 0,EARN_YLD,PX_TO_BOOK_RATIO,PX_TO_SALES_RATIO,RETURN_ON_ASSET,RETURN_COM_EQY,CUR_MKT_CAP,TRAIL_12M_NET_INC_GROWTH,OPER_INC_GROWTH,VWAP_STANDARD_DEV,PX_LAST,...,VOLATILITY_60,VOLUME_STD_60,MOMENTUM_120,TURNOVER_120,PX_MOMENTUM_120,PX_REVERSAL_120,VOLATILITY_120,VOLUME_STD_120,DELTA_20_CHANGE,DELTA_20_QUINTILES
312,6.8449,4.2622,1.6568,14.2932,32.3926,228732056.4,40.0394,33.6618,17,5650,...,209.811238,1.768850e+07,1.021700,2.510712e+11,1.021700,0.978761,300.214416,3.259739e+07,2.262523,4
313,6.8570,4.2547,1.6539,14.2932,32.3926,228327220.9,40.0394,33.6618,8,5640,...,198.886093,1.806132e+07,1.025455,2.501726e+11,1.025455,0.975177,301.271114,3.274642e+07,1.870479,3
314,6.7908,4.2962,1.6700,14.2932,32.3926,230553816.1,40.0394,33.6618,10,5695,...,187.414086,1.815726e+07,1.050738,2.487637e+11,1.050738,0.951712,303.227630,3.282986e+07,0.035052,3
315,6.5272,4.4697,1.7375,14.2932,32.3926,239865032.6,40.0394,33.6618,61,5925,...,179.261885,1.931083e+07,1.102326,2.515439e+11,1.102326,0.907173,308.440734,3.301322e+07,-2.417836,2
316,6.6506,4.3867,1.7052,14.2932,32.3926,235411842.1,40.0394,33.6618,30,5815,...,174.265691,1.794452e+07,1.048693,2.514845e+11,1.048693,0.953568,311.161813,3.302578e+07,-4.026886,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,6.1597,4.5537,1.7290,12.6845,30.6500,294315407.1,15.8033,7.3733,33,7270,...,189.044131,2.357532e+07,1.058224,2.368856e+11,1.058224,0.944979,285.549301,2.146030e+07,3.949928,4
560,6.2631,4.4785,1.7005,12.6845,30.6500,289457381.1,15.8033,7.3733,20,7150,...,188.633860,2.345225e+07,1.056911,2.381606e+11,1.056911,0.946154,281.330494,2.144406e+07,5.098818,5
561,6.3161,4.4410,1.6862,12.6845,30.6500,287028368.1,15.8033,7.3733,26,7090,...,188.228791,2.343152e+07,1.022350,2.392729e+11,1.022350,0.978138,279.859659,2.146704e+07,5.072115,5
562,6.4294,4.3627,1.6565,12.6845,30.6500,281967924.4,15.8033,7.3733,16,6965,...,190.556926,2.361864e+07,1.024265,2.417462e+11,1.024265,0.976310,277.630250,2.158641e+07,4.956461,4


In [20]:
vdf[0]

Unnamed: 0,EARN_YLD,PX_TO_BOOK_RATIO,PX_TO_SALES_RATIO,RETURN_ON_ASSET,RETURN_COM_EQY,CUR_MKT_CAP,TRAIL_12M_NET_INC_GROWTH,OPER_INC_GROWTH,VWAP_STANDARD_DEV,PX_LAST,...,VOLATILITY_60,VOLUME_STD_60,MOMENTUM_120,TURNOVER_120,PX_MOMENTUM_120,PX_REVERSAL_120,VOLATILITY_120,VOLUME_STD_120,DELTA_20_CHANGE,DELTA_20_QUINTILES
252,7.3541,3.9613,1.5136,14.2382,32.2093,195333127.8,43.0876,31.8698,21,4825,...,265.349612,39845540.0,0.991778,249426000000.0,0.991778,1.00829,344.197395,31947940.0,-0.779941,3
253,7.3541,3.9613,1.5136,14.2382,32.2093,195333127.8,43.0876,31.8698,27,4825,...,264.244686,39854620.0,0.996901,250411900000.0,0.996901,1.003109,344.382734,31891780.0,-5.616021,1
254,7.3313,3.9736,1.5183,14.2382,32.2093,195940381.1,43.0876,31.8698,25,4840,...,261.363277,40119460.0,1.018947,252515600000.0,1.018947,0.981405,343.185879,32172280.0,-5.690951,1
255,7.1611,4.0681,1.5544,14.2382,32.2093,200595989.3,43.0876,31.8698,11,4955,...,257.175485,39985000.0,1.036611,253458700000.0,1.036611,0.964682,341.154529,32076290.0,-8.694998,1
256,7.1975,4.0475,1.5466,14.2382,32.2093,199583900.5,43.0876,31.8698,13,4930,...,252.786882,39969590.0,0.997976,253037300000.0,0.997976,1.002028,338.921909,32052750.0,-3.976358,2
257,6.9575,4.1871,1.5999,14.2382,32.2093,206466104.0,43.0876,31.8698,37,5100,...,245.645753,40035320.0,1.018981,254420400000.0,1.018981,0.981373,338.237396,32287050.0,-9.914631,1
258,6.695,4.3513,1.6626,14.2382,32.2093,214562814.0,43.0876,31.8698,42,5300,...,243.448898,40038870.0,1.06,256851700000.0,1.06,0.943396,336.784907,32555030.0,-13.334895,1
259,6.8172,4.2733,1.6328,14.2382,32.2093,210716876.7,43.0876,31.8698,37,5205,...,239.558725,40139610.0,0.993321,255145200000.0,0.993321,1.006724,336.892362,32616180.0,-9.957859,1
260,6.87,4.2405,1.6203,14.2382,32.2093,209097534.7,43.0876,31.8698,19,5165,...,239.380675,39974400.0,0.965421,256588000000.0,0.965421,1.035818,337.294265,32605730.0,-2.862318,2
261,6.8369,4.261,1.6281,14.2382,32.2093,210109623.5,43.0876,31.8698,34,5190,...,238.829818,39933310.0,0.970093,257395200000.0,0.970093,1.030829,337.585965,32560730.0,0.851056,3


save data

In [21]:
folder_name = 'covset0'
os.makedirs(folder_name, exist_ok=True)
for i in range(len(tdf)):
    tdf[i].to_csv(f'{folder_name}/train_{i}.csv', index=False)
    vdf[i].to_csv(f'{folder_name}/valid_{i}.csv', index=False)

In [None]:
tdf[0].columns