In [2]:
import pandas as pd
from utils import split_dataset
import os

In [8]:
bigdataname = "bigdata2"
csv = pd.read_csv(f"./data/{bigdataname}.csv", skiprows=5)
covariates = csv if csv.shape[1] != 1 else pd.read_csv(f"./data/{bigdataname}.csv", skiprows=5, delimiter=';')


In [9]:
covariates

Unnamed: 0,Dates,EARN_YLD,PX_TO_BOOK_RATIO,PX_TO_SALES_RATIO,RETURN_ON_ASSET,RETURN_COM_EQY,CUR_MKT_CAP,TRAIL_12M_NET_INC_GROWTH,OPER_INC_GROWTH,VWAP_STANDARD_DEV,...,PX_TO_SALES_RATIO.113,RETURN_ON_ASSET.113,RETURN_COM_EQY.113,CUR_MKT_CAP.113,TRAIL_12M_NET_INC_GROWTH.113,OPER_INC_GROWTH.113,VWAP_STANDARD_DEV.113,PX_LAST.113,PX_OPEN.113,TURNOVER.113
0,1/1/2009,10.8646,2.5222,1.8021,2.6493,28.5128,5.641018e+07,23.1576,-13.9342,4,...,,,,,,,,,,
1,1/2/2009,10.8646,2.5222,1.8021,2.6493,28.5128,5.641018e+07,23.1576,-13.9342,4,...,,,,,,,,,,
2,1/5/2009,10.2485,2.6738,1.9104,2.6493,28.5128,5.980095e+07,23.1576,-13.9342,4,...,,,,,,,,,,
3,1/6/2009,10.3553,2.6463,1.8907,2.6493,28.5128,5.918580e+07,23.1576,-13.9342,7,...,,,,,,,,,,
4,1/7/2009,10.5196,2.6049,1.8612,2.6493,28.5128,5.826380e+07,23.1576,-13.9342,10,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4078,8/20/2024,8.0807,2.4398,3.3240,3.1926,20.1464,7.502171e+08,8.1974,4.6891,26,...,0.3016,1.1163,7.4556,1609704.450,104.5348,33.919,4.0,186.0,176.0,1.100717e+10
4079,8/21/2024,7.8431,2.5137,3.4247,3.1926,20.1464,7.729509e+08,8.1974,4.6891,31,...,0.2935,1.1163,7.4556,1566432.825,104.5348,33.919,2.0,181.0,186.0,3.507341e+09
4080,8/22/2024,7.9207,2.4891,3.3912,3.1926,20.1464,7.653730e+08,8.1974,4.6891,28,...,0.2854,1.1163,7.4556,1523161.200,104.5348,33.919,2.0,176.0,181.0,1.563136e+09
4081,8/23/2024,7.7669,2.5384,3.4583,3.1926,20.1464,7.805289e+08,8.1974,4.6891,13,...,0.2918,1.1163,7.4556,1557778.500,104.5348,33.919,1.0,180.0,176.0,1.001054e+09


In [10]:
covariates.index = pd.to_datetime(covariates.index)

labels = pd.read_csv("./data/jkse2.csv", index_col="Date")
labels.index = pd.to_datetime(labels.index)
labels = labels[['Close']]

labels = labels.rename(columns={"Close": "JKSE_PRICE"})
labels['PCT_CHANGE_20_JKSE'] = ((labels['JKSE_PRICE'].shift(-20) - labels['JKSE_PRICE']) / labels['JKSE_PRICE']) * 100

covlist = split_dataset(covariates)

In [11]:
colnames = covlist[0].columns.tolist()
for cov in covlist:
    # unify column names
    cov.columns = colnames
for cov in covlist:
    # Volume = Turnover / Close Price
    cov['VOLUME'] = cov['TURNOVER'] / cov['PX_LAST']
    # Calculate percent change * 100
    cov['PCT_CHANGE_20'] = ((cov['PX_LAST'].shift(-20) - cov['PX_LAST']) / cov['PX_LAST']) * 100
    # Ratio 10/30 = mean volume ratio for the last 10 days / mean volume ratio for the last 30 days
    cov['VOL_RATIO_10_20'] = cov['VOLUME'].rolling(window=10).mean() / cov['VOLUME'].rolling(window=20).mean()
    cov['VOL_RATIO_20_40'] = cov['VOLUME'].rolling(window=20).mean() / cov['VOLUME'].rolling(window=40).mean()
    cov['VOL_RATIO_40_80'] = cov['VOLUME'].rolling(window=40).mean() / cov['VOLUME'].rolling(window=80).mean()
    cov['VOL_RATIO_80_120'] = cov['VOLUME'].rolling(window=80).mean() / cov['VOLUME'].rolling(window=120).mean()
    
    # PE Band
    cov['PE_Ratio'] = 1 / cov['EARN_YLD']
    win = 60  # Set the rolling window period
    cov['PE_Band_25'] = cov['PE_Ratio'].rolling(win).quantile(0.25)
    cov['PE_Band_50'] = cov['PE_Ratio'].rolling(win).quantile(0.50)
    cov['PE_Band_75'] = cov['PE_Ratio'].rolling(win).quantile(0.75)

    # Calculate the 12-day EMA of PX_LAST
    ema_12 = cov['PX_LAST'].ewm(span=12, adjust=False).mean()

    # Calculate the 26-day EMA of PX_LAST
    ema_26 = cov['PX_LAST'].ewm(span=26, adjust=False).mean()

    # Calculate MACD
    cov['MACD'] = ema_12 - ema_26

    # Calculate the Signal line (9-day EMA of MACD)
    cov['MACD_Signal'] = cov['MACD'].ewm(span=9, adjust=False).mean()

    # Optionally, you can also calculate the MACD Histogram (the difference between MACD and Signal line)
    cov['MACD_Histogram'] = cov['MACD'] - cov['MACD_Signal']
    
    # Example: Momentum Indicator for various lags
    lags = [10, 20, 30, 60, 120]
    for lag in lags:
        cov[f'MOMENTUM_{lag}'] = cov['PX_LAST'] / cov['PX_LAST'].shift(lag)
        cov[f'TURNOVER_{lag}'] = cov['TURNOVER'].rolling(window=lag).mean()
        cov[f'PX_MOMENTUM_{lag}'] = cov['PX_LAST'] / cov['PX_LAST'].shift(lag)
        cov[f'PX_REVERSAL_{lag}'] = cov['PX_LAST'].shift(lag) / cov['PX_LAST']
        cov[f'VOLATILITY_{lag}'] = cov['PX_LAST'].rolling(window=lag).std()
        cov[f'VOLUME_STD_{lag}'] = cov['VOLUME'].rolling(window=lag).std()

In [12]:
covlist2b = []
for cov in covlist:
    cov2b = cov.tail(10)
    covlist2b.append(cov2b)

tickernames = [col[:4] for col in pd.read_csv(f"./data/{bigdataname}.csv", skiprows=3).columns if not col.startswith("Unnamed")]

for i in range(len(covlist)):
    covlist2b[i].loc[:, 'Ticker'] = tickernames[i]

  tickernames = [col[:4] for col in pd.read_csv(f"./data/{bigdataname}.csv", skiprows=3).columns if not col.startswith("Unnamed")]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covlist2b[i].loc[:, 'Ticker'] = tickernames[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covlist2b[i].loc[:, 'Ticker'] = tickernames[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-v

In [13]:
pred_covlist = pd.concat(covlist2b, axis=0)

In [14]:
pred_covlist.to_csv('./2bpred/Aug2024.csv')

## Per 15 Year Quintiles

In [None]:
import os
import pandas as pd

# Remove duplicate indices if any
labels = labels[~labels.index.duplicated(keep='first')]

for i in range(len(covlist)):
    cov = covlist[i]
    cov = cov[~cov.index.duplicated(keep='first')]

    # Explicitly create a copy of the cov DataFrame to avoid SettingWithCopyWarning
    cov_copy = cov.copy()

    # Align the DataFrames on their indices (dates)
    aligned_df = labels.join(cov_copy[['PCT_CHANGE_20']], how='inner')

    # Calculate the difference and store it in cov_copy DataFrame
    cov_copy.loc[aligned_df.index, 'DELTA_20_CHANGE'] = aligned_df['PCT_CHANGE_20'] - aligned_df['PCT_CHANGE_20_JKSE']

    # Update the original DataFrame in covlist
    covlist[i] = cov_copy

tickernames = [col[:4] for col in pd.read_csv(f"./data/{bigdataname}.csv", skiprows=3).columns if not col.startswith("Unnamed")]

for i in range(len(covlist)):
    covlist[i]['Ticker'] = tickernames[i]

def unique_index(df, suffix):
    df_copy = df.copy()
    new_index = [f"{date.strftime('%Y-%m-%d')}-{suffix}" for date in df.index]
    df_copy.index = new_index
    return df_copy

covlist_reindex = []
for idx, cov in enumerate(covlist):
    cov['Ticker'] = cov['Ticker'].unique()[0]
    covlist_reindex.append(unique_index(cov, idx))

pooled_df = pd.DataFrame()
for i, df in enumerate(covlist_reindex):
    pooled_df = pd.concat([pooled_df, df['DELTA_20_CHANGE']])

# Apply quintiles to pooled data
pooled_df['DELTA_20_QUINTILES'] = pd.qcut(pooled_df[0], q=4, labels=range(1, 5))

# Apply the quintiles to each DataFrame in covlist_reindex
for i, df in enumerate(covlist_reindex):
    df['DELTA_20_QUINTILES'] = pooled_df.loc[df.index, 'DELTA_20_QUINTILES']
    covlist_reindex[i] = df

# Create the TOP_5 column based on the 75th percentile
for cov in covlist_reindex:
    cov['TOP_5'] = cov['DELTA_20_QUINTILES'].apply(lambda x: 1 if x == 4 else 0)
    cov['TOP_5'] = cov['TOP_5'].fillna(0).astype(int)  # Fill NaN values with 0 before conversion

for i in range(len(covlist_reindex)):
    covlist_reindex[i] = covlist_reindex[i].dropna()
    covlist_reindex[i] = covlist_reindex[i].reset_index(drop=False)

def rolling_window_save_oob(train_window=252, test_window=20, last_n_obs=40):
    tdf = []
    vdf = []
    last_obs_vdf = []

    for cov in covlist_reindex:
        if cov.empty:  # Skip empty DataFrames
            continue

        ticker = cov['Ticker'].iloc[0] if not cov.empty else 'Unknown'
        cov_data = cov

        # Split the last N observations as a separate validation set
        last_obs_df = cov_data.iloc[-last_n_obs:].copy()
        cov_data = cov_data.iloc[:-last_n_obs].copy()

        # Add the last observations to the validation set
        if not last_obs_df.empty:
            last_obs_df['Ticker'] = ticker
            last_obs_df['Window'] = 'Last_N'
            last_obs_vdf.append(last_obs_df)

        # Calculate the number of complete windows
        num_windows = max((len(cov_data) - train_window - test_window) // test_window + 1, 0)

        for i in range(num_windows):
            start_train = i * (train_window + test_window)
            end_train = start_train + train_window
            start_test = end_train
            end_test = start_test + test_window

            train_df = cov_data.iloc[start_train:end_train].copy()
            test_df = cov_data.iloc[start_test:end_test].copy()

            # Verify window sizes
            if len(train_df) == 0 or len(test_df) == 0:
                continue  # Skip any incomplete windows

            print(f"Ticker: {ticker}, Window: {i}, Train size: {len(train_df)}, Test size: {len(test_df)}")

            # Check for non-overlapping dates
            overlapping_dates = train_df.index.intersection(test_df.index)
            if not overlapping_dates.empty:
                print(f"Warning: Overlapping dates found for Ticker: {ticker}, Window: {i}. Overlapping dates: {overlapping_dates}")
            else:
                print(f"No overlapping dates for Ticker: {ticker}, Window: {i}.")

            train_df['Ticker'] = ticker
            test_df['Ticker'] = ticker
            test_df['Window'] = i

            tdf.append(train_df)
            vdf.append(test_df)

    # Combine last observations into the validation DataFrame
    vdf.extend(last_obs_vdf)

    return tdf, vdf

# Run the function
tdf, vdf = rolling_window_save_oob(252, 10, last_n_obs=40)

# Display information about the combined vdf

# Display information about the combined vdf

In [None]:
# Step 1: Concatenate all DataFrames in tdf into train_data
train_data = pd.concat(tdf, ignore_index=True)

# Step 2: Set the 'index' column as the index for train_data
train_data.set_index('index', inplace=True)

# Since vdf is already a DataFrame, directly set the 'index' column as its index
valid_data = pd.concat(vdf, ignore_index=True)
valid_data = valid_data.set_index('index', inplace=False)


In [None]:
train_data.drop(['DELTA_20_CHANGE'], axis=1, inplace=True)

names = ['quintiles', "top_5"]
folder_name = 'covariatesbig'
for name in names:
    if name == 'quintiles':
        tdata = train_data.drop(['TOP_5'], axis=1)
        vdata = valid_data.drop(['TOP_5'], axis=1)
        
        os.makedirs(folder_name, exist_ok=True)
        tdata.to_csv(f"{folder_name}/train_{name}.csv")
        vdata.to_csv(f"{folder_name}/valid_{name}.csv")
    elif name == 'top_5':
        tdata = train_data.drop(['DELTA_20_QUINTILES'], axis=1)
        vdata = valid_data.drop(['DELTA_20_QUINTILES'], axis=1)

        os.makedirs(folder_name, exist_ok=True)
        pred_covlist.to_csv(f"{folder_name}/covpred_{name}.csv")
        tdata.to_csv(f"{folder_name}/train_{name}.csv")
        vdata.to_csv(f"{folder_name}/valid_{name}.csv") 


## This is for hmm clean

In [None]:
deltas = []
for cov in covlist:
    delta = cov[['DELTA_20_CHANGE']]
    deltas.append(delta)

In [None]:
delta_df = pd.concat(deltas, axis=1)
tickernames = [col[:4] for col in pd.read_csv(f"./data/{bigdataname}.csv", skiprows=3).columns if not col.startswith("Unnamed")]
delta_df.columns = tickernames

In [None]:
delta_df.drop(['MDKA', 'ICBP', 'ARTO', 'PGEO'], axis=1, inplace=True)
delta_df.dropna(inplace=True, axis=0, how='all')

In [None]:
delta_df.to_csv('./covset0/unnorm/hmm.csv')

In [None]:
def unique_index(df, suffix):
    df_copy = df.copy()
    new_index = [f"{date.strftime('%Y-%m-%d')}-{suffix}" for date in df.index]
    df_copy.index = new_index
    return df_copy

Create reindexed covlist + pool to find quantiles + insert index back to other covariates

In [None]:
covlist_reindex = []
for idx, cov in enumerate(covlist):
    covlist_reindex.append(unique_index(cov, idx))
    
pooled_df = pd.DataFrame()
for i, df in enumerate(covlist_reindex):
    pooled_df = pd.concat([pooled_df, df['DELTA_20_CHANGE']])

pooled_df['DELTA_20_QUINTILES'] = pd.qcut(pooled_df[0], q=5, labels=range(1, 6))

for i, df in enumerate(covlist_reindex):
    df['DELTA_20_QUINTILES'] = pooled_df.loc[df.index, 'DELTA_20_QUINTILES']
    covlist_reindex[i] = df

Drop NaNs

In [None]:
covlist_reindex[1]

In [None]:
for cov in covlist_reindex:
    cov['TOP_5'] = cov['DELTA_20_QUINTILES'].apply(lambda x: 1 if x == 5 else 0)
    cov['TOP_5'] = cov['TOP_5'].fillna(0).astype(int)  # Fill NaN values with 0 before conversion
    cov.drop('DELTA_20_QUINTILES', axis=1, inplace=True)

In [None]:
for i in range(len(covlist_reindex)):
    covlist_reindex[i] = covlist_reindex[i].dropna()
    covlist_reindex[i] = covlist_reindex[i].reset_index(drop=True)

In [None]:
def rolling_window_save(train_window=252, test_window=60):
    os.makedirs('train_data', exist_ok=True)
    
    for cov in covlist_reindex:
        # Also removes labels 
        cov.drop(['DELTA_20_CHANGE', 'PCT_CHANGE_20'], axis=1, inplace=True)
        tdf = []
        vdf = []
        for start in range(0, len(cov) - train_window - test_window, train_window + test_window):
            end_train = start + train_window
            end_test = end_train + test_window
            
            train_df = cov.iloc[start:end_train]
            test_df = cov.iloc[end_train:end_test]
            
            tdf.append(train_df)
            vdf.append(test_df)
    
    return tdf, vdf

tdf, vdf = rolling_window_save(252, 60)

save data

In [None]:
folder_name = 'covset2'
os.makedirs(folder_name, exist_ok=True)
for i in range(len(tdf)):
    tdf[i].to_csv(f'{folder_name}/train_{i}.csv', index=False)
    vdf[i].to_csv(f'{folder_name}/valid_{i}.csv', index=False)