In [5]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection._split import _BaseKFold

In [3]:
def information_ratio(strategy_returns, benchmark_returns):
    """
    Compute the Information Ratio (IR) of a strategy.

    IR = mean(active return) / std(active return)
    Active return = strategy return - benchmark return

    Observations where either series is NaN are dropped pairwise before the
    ratio is computed. The tracking error is the population standard
    deviation (``np.std`` with its default ``ddof=0``).

    Parameters:
        strategy_returns (array-like): Returns of your strategy.
        benchmark_returns (array-like): Returns of benchmark (e.g., market).

    Returns:
        float: Information ratio. Returns 0.0 if no valid (non-NaN) pair of
            observations exists or if the tracking error is 0 up to
            floating-point rounding.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce to float arrays so integer inputs work with np.isnan and
    # None entries become NaN.
    strategy_returns = np.asarray(strategy_returns, dtype=float)
    benchmark_returns = np.asarray(benchmark_returns, dtype=float)

    # Validation
    if strategy_returns.shape != benchmark_returns.shape:
        raise ValueError("Strategy and benchmark returns must have the same shape.")

    # Handle NaNs: keep only positions where both series are observed.
    mask = ~(np.isnan(strategy_returns) | np.isnan(benchmark_returns))
    if not mask.any():
        return 0.0

    active_returns = strategy_returns[mask] - benchmark_returns[mask]
    tracking_error = np.std(active_returns)

    # A constant series can yield a tiny non-zero std purely from rounding
    # (e.g. np.std([0.1, 0.1, 0.1]) ~ 1.4e-17), which would blow the ratio up
    # to ~1e15. Treat anything at the rounding-noise level, relative to the
    # magnitude of the data, as zero tracking error.
    tolerance = (
        np.finfo(float).eps * active_returns.size * np.max(np.abs(active_returns))
    )
    # Written as "not >" so a NaN tracking error (e.g. from infinite inputs)
    # also falls back to 0.0 instead of propagating NaN.
    if not tracking_error > tolerance:
        return 0.0

    return float(np.mean(active_returns) / tracking_error)

In [4]:
def objective(trial, df, label_col='MarketRegimeLabel', return_col='returns', drop_cols=None,
              n_splits=5, group_gap=5):
    """
    Optuna objective function using Information Ratio as the evaluation metric.

    For each cross-validation fold a RandomForestClassifier is fitted on the
    training rows, its predictions on the test rows are mapped to positions
    (1 -> long, 0 -> short, anything else -> neutral), and the resulting
    strategy returns are scored with `information_ratio` against an all-zero
    (cash) benchmark.

    Parameters:
        trial: Optuna trial object
        df (pd.DataFrame): DataFrame containing features, labels, and returns
        label_col (str): Column name for target variable
        return_col (str): Column name for realized returns
        drop_cols (iterable of str): Columns to drop from features
            (e.g., ['Date', 'Asset', ...]). None means no extra columns.
        n_splits (int): Number of cross-validation splits passed to
            PurgedGroupTimeSeriesSplit.
        group_gap (int): `group_gap` passed to PurgedGroupTimeSeriesSplit.

    Returns:
        float: Average information ratio across cross-validation folds
    """
    from sklearn.ensemble import RandomForestClassifier

    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42,
        n_jobs=-1
    )

    # Prepare data. list(...) lets callers pass a tuple or other iterable
    # without tripping over list concatenation.
    excluded = list(drop_cols) if drop_cols is not None else []
    X = df.drop(columns=excluded + [label_col, return_col])
    y = df[label_col].values
    returns = df[return_col].values

    # Time-series aware CV
    # NOTE(review): split() is called with X only. If this splitter requires a
    # `groups` argument, it has to be supplied here — confirm against the
    # PurgedGroupTimeSeriesSplit definition.
    pgts = PurgedGroupTimeSeriesSplit(n_splits=n_splits, group_gap=group_gap)
    scores = []

    for train_idx, test_idx in pgts.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train = y[train_idx]
        returns_test = returns[test_idx]

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        # Generate strategy returns based on predictions:
        # long (1) earns r, short (0) earns -r, any other label stays flat.
        # Values are selected rather than multiplied by a position so that a
        # neutral row contributes exactly 0 even when its return is NaN.
        strat_returns = np.where(
            preds == 1,
            returns_test,
            np.where(preds == 0, -returns_test, 0.0),
        )
        benchmark_returns = np.zeros_like(strat_returns)  # assuming cash as benchmark

        scores.append(information_ratio(strat_returns, benchmark_returns))

    return np.mean(scores)