In [2]:
import numpy as np
import wandb

from sklearn.metrics import mean_squared_error
from quantile_forest import RandomForestQuantileRegressor

from topquartile.modules.datamodule.dataloader import DataLoader
from topquartile.modules.datamodule.transforms.covariate import (
    TechnicalCovariateTransform, FundamentalCovariateTransform)
from topquartile.modules.datamodule.transforms.label import BinaryLabelTransform, ExcessReturnTransform
from topquartile.modules.datamodule.partitions import PurgedTimeSeriesPartition

# Value forwarded to ExcessReturnTransform as `label_duration`; later cells
# also use it to build the label column names.
LABEL_DURATION = 20

# Covariate pipeline: a list of (transform class, keyword arguments) pairs.
covtrans_config = [
    (
        TechnicalCovariateTransform,
        {
            "sma": [20, 40, 60],
            "ema": [20, 40, 60],
            "turnover": [20, 40, 60, 120, 240],
            "bb": True,
            "mean_price_volatility": True,
            "awesome": True,
            "macd": [(12, 26, 9)],
            "price_gap": [20, 40, 60],
            "price_ratio": [9, 19, 39, 59, 119],
            "acceleration_rate": True,
            "volatility": [10, 20, 40, 60, 120],
            "volume_std": [10, 20, 40, 60, 120],
        },
    ),
]

# Label pipeline, same (class, kwargs) layout as the covariate pipeline.
labeltrans_config = [
    (ExcessReturnTransform, {"label_duration": LABEL_DURATION}),
]

# Keyword arguments forwarded to PurgedTimeSeriesPartition.
# NOTE(review): `gap` equals LABEL_DURATION here -- presumably on purpose so the
# purge window covers the label horizon; confirm before changing either value.
partition_config = {
    "n_splits": 5,
    "gap": 20,
    "max_train_size": 504,
    "test_size": 60,
}

dataloader = DataLoader(
    data_id="covariates_may2025v2",
    covariate_transform=covtrans_config,
    label_transform=labeltrans_config,
    partition_class=PurgedTimeSeriesPartition,
    partition_kwargs=partition_config,
)

# Each fold is unpacked downstream as a (train_df, test_df) pair.
folds = dataloader.get_cv_folds()

Data not yet processed. Processing now...
Reading data from: /Users/gregruyoga/gmoneycodes/topquartile/topquartile/data/covariates_may2025v2.csv
Found 134 raw ticker names.
 Applying TechnicalCovariateTransform with params {'sma': [20, 40, 60], 'ema': [20, 40, 60], 'turnover': [20, 40, 60, 120, 240], 'price_gap': [20, 40, 60], 'price_ratio': [9, 19, 39, 59, 119], 'acceleration_rate': True, 'volatility': [10, 20, 40, 60, 120], 'volume_std': [10, 20, 40, 60, 120]}
THIS IS COLUMNS Index(['TOTAL_EQUITY', 'BOOK_VAL_PER_SH', 'REVENUE_PER_SH', 'RETURN_COM_EQY',
       'CUR_MKT_CAP', 'PX_LAST', 'TOT_DEBT_TO_TOT_ASSET',
       'TOT_DEBT_TO_TOT_EQY', 'BS_TOT_LIAB2', 'BS_TOT_ASSET', 'IS_EPS',
       'PX_HIGH', 'PX_LOW', 'PX_CLOSE_1D', 'PX_VOLUME', 'TURNOVER', 'ticker',
       'DVD_SH_12M'],
      dtype='object')
Applying label transformations globally to the dataset (before partitioning).
 Applying ExcessReturnTransform with params {'label_duration': 20} (globally)


  self.ihsg.index = pd.to_datetime(self.ihsg.index)
  self.data = self.data.apply(pd.to_numeric, errors='ignore')


Data processing complete.
Partitioning data using PurgedTimeSeriesPartition for 5 splits across 85 tickers.
Fold 0: Train shape (42840, 64), Test shape (5100, 64)
Fold 1: Train shape (42840, 64), Test shape (5100, 64)
Fold 2: Train shape (42840, 64), Test shape (5100, 64)
Fold 3: Train shape (42840, 64), Test shape (5100, 64)
Fold 4: Train shape (42840, 64), Test shape (5100, 64)
Partitioning complete. Generated 5 CV folds.


In [3]:
# Fraction of missing values per column in the training split of the first fold.
first_fold_train = folds[0][0]
first_fold_train.isna().mean()

TOTAL_EQUITY          0.000000
BOOK_VAL_PER_SH       0.000000
REVENUE_PER_SH        0.000000
RETURN_COM_EQY        0.000000
CUR_MKT_CAP           0.242834
                        ...   
acceleration_20/60    0.174837
acceleration_40/60    0.173063
eq_returns_20         0.293651
index_returns_20      0.123016
excess_returns_20     0.378385
Length: 64, dtype: float64

In [1]:
# Fit a quantile random forest on one CV fold and report the RMSE of its median
# prediction on that fold's held-out split.
#
# Depends on `LABEL_DURATION` and `folds` from the data-loading cell; that cell
# must be executed first (Restart Kernel -> Run All keeps the order right).
TARGET = f'excess_returns_{LABEL_DURATION}'

train_df, test_df = folds[1]
# Rows with any missing covariate or label are discarded in both splits.
train_df, test_df = train_df.dropna(), test_df.dropna()

# Columns that are the label itself, its components, or the ticker column;
# none of them may be fed to the model as covariates.
DROP_COLS = [TARGET, f"index_returns_{LABEL_DURATION}", f"eq_returns_{LABEL_DURATION}", "ticker"]

train_labels = train_df[TARGET]
train_covs = train_df.drop(columns=DROP_COLS)
# Validation data comes from the held-out split, not from the training split.
valid_labels = test_df[TARGET]
valid_covs = test_df.drop(columns=DROP_COLS)

config = dict(
    n_estimators=100,
    max_depth=50,
    min_samples_leaf=2,
    max_features=1.0,
    bootstrap=True,
    min_samples_split=15,
    criterion='absolute_error',
    random_state=42,  # fixed seed so repeated runs give the same forest
)

# Pass the whole config so no declared hyperparameter (e.g. `bootstrap`) is
# silently left out.
model = RandomForestQuantileRegressor(**config)

model.fit(train_covs, train_labels)

QUANTILES = [0.05, 0.5, 0.95]
# With several quantiles requested, predictions come back with one column per
# quantile, in the order given in QUANTILES.
preds = model.predict(valid_covs, quantiles=QUANTILES)

# RMSE needs a single point forecast per row: use the median (0.5) column.
median_preds = preds[:, QUANTILES.index(0.5)]
rmse = float(np.sqrt(mean_squared_error(valid_labels, median_preds)))
print(rmse)


NameError: name 'LABEL_DURATION' is not defined