In [1]:
from pathlib import Path
import sys

# Go 4 levels up to reach topquartile module from the notebook
root = Path().resolve().parents[2]

# Add it to the top of sys.path
if str(root) not in sys.path:
    sys.path.insert(0, str(root))

print("✅ Project root added to sys.path:", root)

✅ Project root added to sys.path: /Users/admin/RR_Project_Regime_Prediction/topquartile


In [2]:
import pandas as pd
import numpy as np
from ta import add_all_ta_features
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from BorutaShap import BorutaShap
from statsmodels.tsa.regime_switching.markov_regression import MarkovRegression
from arch.__future__ import reindexing
from topquartile.modules.datamodule.partitions import PurgedGroupTimeSeriesPartition
from topquartile.modules.datamodule.dataloader import DataLoader
from topquartile.modules.datamodule.transforms.covariate import (TechnicalCovariateTransform,
                                                                 FundamentalCovariateTransform,
                                                                 MacroeconomicCovariateTransform)
from topquartile.modules.datamodule.transforms.label import KMRFLabelTransform

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def frac_diff(series, d, thres=0.01):
    w = [1.]
    for k in range(1, len(series)):
        w_ = -w[-1] * (d - k + 1) / k
        if abs(w_) < thres:
            break
        w.append(w_)
    w = np.array(w[::-1]).reshape(-1, 1)
    df = pd.Series(series)
    result = pd.Series(dtype='float64')

    for i in range(len(w), len(df)):
        val = np.dot(w.T, df.iloc[i - len(w):i])[0]
        result.at[df.index[i]] = val

    return result

In [4]:
macro_df = pd.read_csv("/Users/admin/Desktop/macro_may2025v2.csv", parse_dates=["Dates"])
macro_df = macro_df.loc[:, ~macro_df.columns.str.contains('^Unnamed')]

In [5]:
macro_df

Unnamed: 0,Dates,VIX Index,GTIDR10Y Govt,IDBIRRPO Index,FEDL01 Index,IDCPIY Index,IDR Curncy,DXY Curncy
0,2015-04-28,12.41,7.736,,0.13,6.38,12982,96.092
1,2015-04-29,13.39,7.695,,0.13,6.38,12944,95.209
2,2015-04-30,14.55,7.667,,0.08,6.79,12964,94.600
3,2015-01-05,12.70,7.665,,0.13,6.79,12964,95.297
4,2015-04-05,12.85,7.746,,0.13,6.79,12983,95.480
...,...,...,...,...,...,...,...,...
2625,2025-05-20,18.09,6.830,5.75,4.33,1.95,16415,100.118
2626,2025-05-21,20.87,6.814,5.50,4.33,1.95,16395,99.559
2627,2025-05-22,20.28,6.832,5.50,4.33,1.95,16330,99.960
2628,2025-05-23,22.29,6.811,5.50,4.33,1.95,16222,99.112


In [6]:
print ("✅ Macro data loaded:", macro_df.shape)

✅ Macro data loaded: (2630, 8)


In [7]:
print(macro_df.columns)

Index(['Dates', 'VIX Index', 'GTIDR10Y Govt', 'IDBIRRPO Index', 'FEDL01 Index',
       'IDCPIY Index', 'IDR Curncy', 'DXY Curncy'],
      dtype='object')


In [8]:
macro_df.isnull().sum()

Dates               0
VIX Index           0
GTIDR10Y Govt       0
IDBIRRPO Index    257
FEDL01 Index        0
IDCPIY Index        0
IDR Curncy          0
DXY Curncy          0
dtype: int64

In [9]:
macro_df

Unnamed: 0,Dates,VIX Index,GTIDR10Y Govt,IDBIRRPO Index,FEDL01 Index,IDCPIY Index,IDR Curncy,DXY Curncy
0,2015-04-28,12.41,7.736,,0.13,6.38,12982,96.092
1,2015-04-29,13.39,7.695,,0.13,6.38,12944,95.209
2,2015-04-30,14.55,7.667,,0.08,6.79,12964,94.600
3,2015-01-05,12.70,7.665,,0.13,6.79,12964,95.297
4,2015-04-05,12.85,7.746,,0.13,6.79,12983,95.480
...,...,...,...,...,...,...,...,...
2625,2025-05-20,18.09,6.830,5.75,4.33,1.95,16415,100.118
2626,2025-05-21,20.87,6.814,5.50,4.33,1.95,16395,99.559
2627,2025-05-22,20.28,6.832,5.50,4.33,1.95,16330,99.960
2628,2025-05-23,22.29,6.811,5.50,4.33,1.95,16222,99.112


In [10]:
required_cols = ['GTIDR10Y Govt', 'IDCPIY Index', 'VIX Index', 'FEDL01 Index', 'IDBIRRPO Index', 'IDR Curncy', 'DXY Curncy']
print("Missing:", [col for col in required_cols if col not in macro_df.columns])

Missing: []


In [11]:
covtrans_config = [

    (FundamentalCovariateTransform, dict(
        pe_ratio=True,
        earnings_yield=True,
        pe_band=([60, 120], [25, 50, 75]),
        debt_to_assets=True,
        debt_to_capital=True,
        equity_ratio=True,
        market_to_book=True,
        eps_growth=True,
        price_to_book=True,
        dividend_yield=True,
    )),

    (MacroeconomicCovariateTransform, dict(
        root_path="/Users/admin/Desktop/macro_may2025.csv", 
        vix_index=True,
        indo_10y_yield=True,
        bi_rate=True,
        fed_funds_rate=True,
        indo_cpi_yoy=True,
        usd_idr=True,
        dxy_index=True,
    )),

    (TechnicalCovariateTransform, dict(
        sma=[10, 20, 50, 100],
        ema=[10, 20, 50],
        rsi=[14],
        macd=True,
        macd_signal=True,
        macd_histogram=True,
        roc=[6, 10, 20],
        cmo=[14],
        atr=True,
        trix=[21],
        obv=True,
        mfi=True,
        force_index=True,
        stc=True,
        bb=True,
        ultimate=True,
        awesome=True,
        plus_di=True,
        minus_di=True,
        max_return=[5, 10, 20],
        price_gap=[20],
        price_vs_sma=[20],
        momentum_change=True,
        ulcer=True,
        mean_price_volatility=[21, 252],

        # TSB + TSFRESH features
        approximate_entropy=True,
        adfuller=True,
        binned_entropy=True,
        cid_ce=True,
        count_above_mean=True,
        count_below_mean=True,
        energy_ratio_chunks=True,
        fft_aggregated=True,
        first_location_maximum=True,
        first_location_minimum=True,
        fourier_entropy=True,
        index_mass_quantile=0.5,
        kurtosis=True,
        last_location_of_maximum=True,
        lempel_ziv_complexity=True,
        linear_trend_timewise=True,
        longest_strike_above_mean=True,
        longest_strike_below_mean=True,
        mean_change=True,
        mean_abs_change=True,
        mean_second_derivative_central=True,
        number_cwt_peaks=True,
        permutation_entropy=True,
        sample_entropy=True,
        skewness=True,
        spkt_welch_density=True,
        time_reversal_asymmetry_statistic=True,
        variation_coefficient=True,
    )),
]

labeltrans_config = [
    (KMRFLabelTransform, dict(price_column="PX_LAST", kama_n=10, gamma=0.5))]

partition_config = dict(
    n_splits=5,
    max_test_group_size=30,
    gap=5,
    verbose=True
)

dataloader = DataLoader(data_id='covariates_may2025v2', covariate_transform=covtrans_config,
                  label_transform=labeltrans_config, partition_class=PurgedGroupTimeSeriesPartition,
                  partition_kwargs=partition_config)

dataloader.covariates_path = Path("/Users/admin/Desktop/covariates_may2025v2.csv")

In [13]:
cv_folds = dataloader.get_cv_folds()
print("Data shape:", dataloader.data.shape)

Partitioning data using PurgedGroupTimeSeriesPartition for 5 splits across 85 tickers.
 Using date groups for ticker ADHI with PurgedGroupTimeSeriesPartition.
[fold 0] train groups 2015-01-05T00:00:00.000000000–2024-10-23T00:00:00.000000000, test groups 2024-10-31T00:00:00.000000000–2024-12-08T00:00:00.000000000
[fold 1] train groups 2015-01-05T00:00:00.000000000–2024-12-02T00:00:00.000000000, test groups 2024-12-09T00:00:00.000000000–2025-01-27T00:00:00.000000000
[fold 2] train groups 2015-01-05T00:00:00.000000000–2025-01-20T00:00:00.000000000, test groups 2025-01-28T00:00:00.000000000–2025-03-21T00:00:00.000000000
[fold 3] train groups 2015-01-05T00:00:00.000000000–2025-03-14T00:00:00.000000000, test groups 2025-03-24T00:00:00.000000000–2025-05-19T00:00:00.000000000
[fold 4] train groups 2015-01-05T00:00:00.000000000–2025-05-05T00:00:00.000000000, test groups 2025-05-20T00:00:00.000000000–2025-12-05T00:00:00.000000000
 Using date groups for ticker ADMR with PurgedGroupTimeSeriesParti

In [14]:
cv_folds= pd.concat(cv_folds[0], axis=0)

In [15]:
print(cv_folds.isnull().sum().sort_values(ascending=False).head(15))

pe_band_120_50     213010
pe_band_120_75     213010
pe_band_120_25     213010
pe_band_60_75      213010
pe_band_60_50      213010
pe_band_60_25      213010
eps_growth         212982
pe_ratio           211863
earnings_yield     211690
price_to_book      211687
market_to_book     211673
RETURN_COM_EQY     210136
REVENUE_PER_SH     210106
IS_EPS             210102
BOOK_VAL_PER_SH    210094
dtype: int64


In [16]:
print(cv_folds.shape)
print(cv_folds.isnull().mean().sort_values(ascending=False).head(10))

(213010, 32)
pe_band_120_50    1.000000
pe_band_120_75    1.000000
pe_band_120_25    1.000000
pe_band_60_75     1.000000
pe_band_60_50     1.000000
pe_band_60_25     1.000000
eps_growth        0.999869
pe_ratio          0.994615
earnings_yield    0.993803
price_to_book     0.993789
dtype: float64


In [17]:
cv_folds

Unnamed: 0_level_0,Unnamed: 1_level_0,TOTAL_EQUITY,BOOK_VAL_PER_SH,REVENUE_PER_SH,RETURN_COM_EQY,CUR_MKT_CAP,PX_LAST,TOT_DEBT_TO_TOT_ASSET,TOT_DEBT_TO_TOT_EQY,BS_TOT_LIAB2,BS_TOT_ASSET,...,pe_band_120_50,pe_band_120_75,earnings_yield,debt_to_assets,debt_to_capital,equity_ratio,market_to_book,eps_growth,price_to_book,dividend_yield
TickerIndex,DateIndex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ADHI,2015-01-05,,,,,,,,,,,...,,,,,,,,,,
ADHI,2015-01-06,,,,,4.404227e+06,1900.0,,,,,...,,,,,,,,,,0.014713
ADHI,2015-01-07,,,,,3.674693e+06,1585.0,,,,,...,,,,,,,,,,0.017637
ADHI,2015-01-09,,,,,3.458534e+06,1492.0,,,,,...,,,,,,,,,,0.018736
ADHI,2015-01-10,,,,,7.246328e+06,1863.0,,,,,...,,,,,,,,,,0.015005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UNVR,2024-12-03,,,,,9.880850e+07,2590.0,,,,,...,,,,,,,,,,0.051737
UNVR,2024-12-04,,,,,1.030050e+08,2700.0,,,,,...,,,,,,,,,,0.049630
UNVR,2024-12-06,,,,,1.243690e+08,3260.0,,,,,...,,,,,,,,,,0.041104
UNVR,2024-12-07,,,,,1.148315e+08,3010.0,,,,,...,,,,,,,,,,0.044518


In [18]:
print("Sample columns:", dataloader.data.columns.tolist()[:10])
print("Total columns:", len(dataloader.data.columns))

Sample columns: ['TOTAL_EQUITY', 'BOOK_VAL_PER_SH', 'REVENUE_PER_SH', 'RETURN_COM_EQY', 'CUR_MKT_CAP', 'PX_LAST', 'TOT_DEBT_TO_TOT_ASSET', 'TOT_DEBT_TO_TOT_EQY', 'BS_TOT_LIAB2', 'BS_TOT_ASSET']
Total columns: 32
