In [1]:
import sys
sys.path.append('/Users/shintarou/coding/topquartile')

from topquartile.modules.datamodule.dataloader import DataLoader


In [2]:
# Run only if you have errors loading the topquartile module

from pathlib import Path
import sys

root = Path().resolve().parent.parent.parent
sys.path.append(root)

In [3]:
print(root)
print(sys.path)

/Users/shintarou/coding/topquartile
['/opt/homebrew/Caskroom/miniconda/base/envs/topq/lib/python313.zip', '/opt/homebrew/Caskroom/miniconda/base/envs/topq/lib/python3.13', '/opt/homebrew/Caskroom/miniconda/base/envs/topq/lib/python3.13/lib-dynload', '', '/opt/homebrew/Caskroom/miniconda/base/envs/topq/lib/python3.13/site-packages', '/Users/shintarou/coding/topquartile', '/opt/homebrew/Caskroom/miniconda/base/envs/topq/lib/python3.13/site-packages/setuptools/_vendor', PosixPath('/Users/shintarou/coding/topquartile')]


In [4]:
from topquartile.modules.datamodule.dataloader import DataLoader
from topquartile.modules.datamodule.transforms.covariate import (TechnicalCovariateTransform, FundamentalCovariateTransform)
from topquartile.modules.datamodule.transforms.label import BinaryLabelTransform
from topquartile.modules.datamodule.partitions import PurgedTimeSeriesPartition

In [5]:
covtrans_config = [((TechnicalCovariateTransform, dict(sma = [20, 30],
                                                       ema = [20, 30],
                                                       momentum_change=True,
                                                       volatility = [20, 30],)))]

labeltrans_config = [(BinaryLabelTransform, dict(label_duration=20,
                                                quantile=0.75))]

partition_config = dict(n_splits=5, gap=20, max_train_size=504, test_size=60, verbose=False)

In [6]:
dataloader = DataLoader(data_id='dec2024', covariate_transform=covtrans_config,
                  label_transform=labeltrans_config, partition_class=PurgedTimeSeriesPartition,
                  partition_kwargs=partition_config)

In [7]:
folds = dataloader.get_cv_folds()

Data not yet processed. Processing now...
Reading data from: /Users/shintarou/coding/topquartile/topquartile/data/dec2024.csv
Found 342 raw ticker names.
 Applying TechnicalCovariateTransform with params {'sma': [20, 30], 'ema': [20, 30], 'momentum_change': True, 'volatility': [20, 30]}
Applying label transformations globally to the dataset (before partitioning).
 Applying BinaryLabelTransform with params {'label_duration': 20, 'quantile': 0.75} (globally)


  df_copy.groupby(level=self.ticker_level_name, group_keys=False)[self.price_column]


Data processing complete.
Partitioning data using PurgedTimeSeriesPartition for 5 splits across 113 tickers.
Fold 0: Train shape (56952, 39), Test shape (6780, 39)


  data_grouped_by_ticker = self.data.groupby(level="TickerIndex")


Fold 1: Train shape (56952, 39), Test shape (6780, 39)
Fold 2: Train shape (56952, 39), Test shape (6780, 39)
Fold 3: Train shape (56952, 39), Test shape (6780, 39)
Fold 4: Train shape (56952, 39), Test shape (6780, 39)
Partitioning complete. Generated 5 CV folds.


In [8]:
train = folds[0][0]
valid = folds[0][1]


In [9]:
train = train.dropna(how='any', inplace=False)
valid = valid.dropna(how='any', inplace=False)

train.drop('ticker', axis=1, inplace=True)
valid.drop('ticker', axis=1, inplace=True)

to_remove = ['INDEX_RETURN', 'EXCESS_RETURN', '20d_stock_return', 'label']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop('ticker', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid.drop('ticker', axis=1, inplace=True)


In [10]:
train_label = train['EXCESS_RETURN']
train_covariates = train.drop(to_remove, axis=1, inplace=False)

In [11]:
valid_label = valid['EXCESS_RETURN']
valid_covariates = valid.drop(to_remove, axis=1, inplace=False)

In [12]:
from quantile_forest import RandomForestQuantileRegressor
qrf = RandomForestQuantileRegressor()
qrf.fit(train_covariates.to_numpy(), train_label.to_numpy())
y_pred = qrf.predict(valid_covariates.to_numpy(), quantiles=[0.025, 0.5, 0.975])

In [13]:
y_pred

array([[ -6.76628361,   3.8114594 ,  14.79235203],
       [ -4.6183646 ,   5.73611089,  31.52581966],
       [ -6.28290445,   4.89947832,  15.31006166],
       ...,
       [-14.03731297,   0.19225049,   8.8777626 ],
       [-27.32517125,   0.46776354,  49.32271189],
       [-24.63845489,   0.44077901,  49.32271189]], shape=(4940, 3))

## Evaluation

In [22]:
import pandas as pd
import numpy as np

# y_pred: shape (n_samples, 3) with quantiles [0.025, 0.5, 0.975]
df = pd.DataFrame(y_pred, columns=['q025', 'q50', 'q975'])

# Risk = width of the prediction interval
df['risk'] = df['q975'] - df['q025']
df['sharpe'] = df['q50'] / df['risk']

# Remove invalid values (Inf, NaN, near-zero risk)
df = df.replace([np.inf, -np.inf], np.nan).dropna()
df = df[df['risk'] > 1e-6]  # Only keep non-zero risk samples

# Summary statistics
print("Sharpe-like score summary:")
print(df['sharpe'].describe())

# Extract samples with high Sharpe-like scores (e.g., top 10%)
top_sharpes = df[df['sharpe'] > df['sharpe'].quantile(0.9)]
print(f"Top {len(top_sharpes)} samples with high Sharpe-like scores")


Sharpe-like score summary:
count    4940.000000
mean        0.011232
std         0.084462
min        -0.295040
25%        -0.038763
50%         0.014236
75%         0.063946
max         0.268930
Name: sharpe, dtype: float64
Top 494 samples with high Sharpe-like scores


## Optimazation　using parameters same as the paper

In [21]:
import optuna
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from quantile_forest import RandomForestQuantileRegressor

# Preloaded and preprocessed data
X_train = train_covariates.to_numpy()
y_train = train_label.to_numpy()
X_valid = valid_covariates.to_numpy()
y_valid = valid_label.to_numpy()

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 300, 600, step=100)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 15)
    max_features = trial.suggest_float('max_features', 0.3, 0.7)

    model = RandomForestQuantileRegressor(
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        n_jobs=-1,
        random_state=42
    )

    # Train on a smaller subset to reduce time
    subset = slice(0, int(len(X_train) * 0.5))  # 50% of training data
    model.fit(X_train[subset], y_train[subset])
    y_pred = model.predict(X_valid, quantiles=[0.025, 0.5, 0.975])

    df = pd.DataFrame(y_pred, columns=['q025', 'q50', 'q975'])
    df['y_true'] = y_valid
    df['risk'] = df['q975'] - df['q025']
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    df = df[df['risk'] > 1e-6]

    if df.empty:
        return -np.inf

    df['sharpe'] = df['q50'] / df['risk']
    df['covered'] = ((df['y_true'] >= df['q025']) & (df['y_true'] <= df['q975'])).astype(int)

    mae = mean_absolute_error(df['y_true'], df['q50'])
    sharpe_median = df['sharpe'].median()
    coverage = df['covered'].mean()

    score = sharpe_median + (2 * coverage) - (0.05 * mae)
    return score

# Optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20, timeout=600)

print("Best trial params:", study.best_trial.params)
print("Best score:", study.best_value)


[I 2025-05-24 01:52:40,505] A new study created in memory with name: no-name-47fec855-cadb-4dfc-a8fc-cf80f5bb47d1
[I 2025-05-24 01:52:49,551] Trial 0 finished with value: 1.317345045552114 and parameters: {'n_estimators': 300, 'min_samples_leaf': 6, 'max_features': 0.3155280197979616}. Best is trial 0 with value: 1.317345045552114.
[I 2025-05-24 01:53:14,284] Trial 1 finished with value: 1.3253362325638647 and parameters: {'n_estimators': 500, 'min_samples_leaf': 12, 'max_features': 0.6139876485139416}. Best is trial 1 with value: 1.3253362325638647.
[I 2025-05-24 01:53:27,129] Trial 2 finished with value: 1.3141553514538291 and parameters: {'n_estimators': 300, 'min_samples_leaf': 6, 'max_features': 0.4497982533391982}. Best is trial 1 with value: 1.3253362325638647.
[I 2025-05-24 01:53:48,443] Trial 3 finished with value: 1.3324046302210393 and parameters: {'n_estimators': 500, 'min_samples_leaf': 14, 'max_features': 0.5057215376766494}. Best is trial 3 with value: 1.3324046302210393

Best trial params: {'n_estimators': 500, 'min_samples_leaf': 13, 'max_features': 0.3505239316906511}
Best score: 1.3463267649803288
