In [10]:
!pip install optuna scikit-learn alphalens




In [12]:
!pip install ta --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ta (setup.py) ... [?25l[?25hdone


In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from scipy.special import gamma
import ta

In [17]:
from google.colab import files
uploaded = files.upload()

Saving final_df_20250601_130044.csv to final_df_20250601_130044.csv


In [22]:
final_df = pd.read_csv("final_df_20250601_130044.csv")

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Separate features and target
X = final_df.drop(columns=['Date', 'Asset', 'MarketRegime', 'MarketRegimeLabel'])
y = final_df['MarketRegimeLabel']

# Optional: fill or drop any remaining NaNs in features
X = X.ffill().bfill()

# Train/test split (e.g., 80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)

In [24]:
!pip install optuna
!pip install pyfinance
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

Collecting pyfinance
  Downloading pyfinance-1.3.0-py3-none-any.whl.metadata (16 kB)
Collecting xmltodict (from pyfinance)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading pyfinance-1.3.0-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict, pyfinance
Successfully installed pyfinance-1.3.0 xmltodict-0.14.2


In [31]:
def adjusted_sharpe(y_true, y_pred):
    """
    Compute an adjusted Sharpe ratio penalized by max drawdown.

    Parameters:
    - y_true: np.array or list of true future returns
    - y_pred: np.array or list of predictions (1 for long, anything else for short)

    Returns:
    - Adjusted Sharpe ratio: sharpe * (1 - max_dd)
    """
    # Generate trading signal: 1 for long, -1 for short
    signal = np.where(y_pred == 1, 1, -1)

    # Simulated strategy returns
    returns = pd.Series(signal * y_true).dropna()

    # Standard Sharpe ratio
    mean_ret = returns.mean()
    std_ret = returns.std()
    sharpe = mean_ret / std_ret if std_ret > 0 else 0

    # Cumulative return and drawdown
    cumulative = returns.cumsum()
    roll_max = cumulative.cummax()

    # Safe max drawdown calculation
    if roll_max.empty or roll_max.max() == 0:
        max_dd = 0.0
    else:
        drawdown = (cumulative - roll_max).min()
        max_dd = -drawdown / roll_max.max()

    # Adjusted Sharpe penalized by drawdown
    return sharpe * (1 - max_dd)

In [26]:
from sklearn.model_selection import TimeSeriesSplit

class PurgedGroupTimeSeriesSplit:
    def __init__(self, n_splits=5, group_gap=5):
        self.n_splits = n_splits
        self.group_gap = group_gap  # number of samples to purge between train/test

    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        tscv = TimeSeriesSplit(n_splits=self.n_splits)
        indices = np.arange(n_samples)
        for train_idx, test_idx in tscv.split(X):
            # Purge samples within group_gap from test set in train set
            max_train = train_idx.max()
            min_test = test_idx.min()
            purge_start = max_train - self.group_gap
            if purge_start < 0:
                purge_start = 0
            train_idx = train_idx[train_idx < purge_start]
            yield train_idx, test_idx

In [27]:
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42,
        n_jobs=-1
    )

    # Prepare data
    X = final_df.drop(columns=['Date', 'Asset', 'MarketRegime', 'MarketRegimeLabel', 'returns'])
    y = final_df['MarketRegimeLabel'].values
    returns = final_df['returns'].values  # actual returns aligned with labels

    # Initialize PGTS splitter
    pgts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)
    scores = []

    for train_idx, test_idx in pgts.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        returns_test = returns[test_idx]

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        # Strategy returns based on predicted regime
        strat_returns = []
        for p, r in zip(preds, returns_test):
            if p == 1:  # bullish
                strat_returns.append(r)
            elif p == 0:  # bearish
                strat_returns.append(-r)
            else:
                strat_returns.append(0)

        strat_returns = np.array(strat_returns)

        score = adjusted_sharpe(y_test, preds)
        scores.append(score)

    # Return the average score across all splits
    return np.mean(scores)

In [28]:
final_df['returns'] = final_df.groupby('Asset')['close'].pct_change()
final_df = final_df.dropna(subset=['returns'])  # drop rows with NA returns after pct_change

In [34]:
from optuna.samplers import TPESampler

sampler = TPESampler(multivariate=True)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=50, n_jobs=-1)

print("Best hyperparameters:", study.best_params)
print("Best Adjusted Sharpe ratio:", study.best_value)

[I 2025-06-01 14:25:18,026] A new study created in memory with name: no-name-76eea12e-3421-471a-b35a-230c2adc5f01
[I 2025-06-01 14:29:29,342] Trial 4 finished with value: -6.745696076665612 and parameters: {'n_estimators': 153, 'max_depth': 14, 'min_samples_split': 4}. Best is trial 1 with value: -2.432347763657499.
[I 2025-06-01 14:30:41,291] Trial 0 finished with value: 12.881436601490554 and parameters: {'n_estimators': 55, 'max_depth': 14, 'min_samples_split': 2}. Best is trial 0 with value: 12.881436601490554.
[I 2025-06-01 14:32:51,315] Trial 1 finished with value: 0.11138075985411418 and parameters: {'n_estimators': 228, 'max_depth': 5, 'min_samples_split': 8}. Best is trial 0 with value: 12.881436601490554.
[I 2025-06-01 14:36:10,514] Trial 2 finished with value: -7.775718040759452 and parameters: {'n_estimators': 229, 'max_depth': 15, 'min_samples_split': 7}. Best is trial 1 with value: -2.432347763657499.
[I 2025-06-01 14:40:17,045] Trial 3 finished with value: -19.7709020679

Best hyperparameters: {'n_estimators': 55, 'max_depth': 14, 'min_samples_split': 2}
Best Adjusted Sharpe ratio: 12.881436601490554


In [36]:
best_params = study.best_params
final_model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)

X = final_df.drop(columns=['Date', 'Asset', 'MarketRegime', 'MarketRegimeLabel', 'returns'])
y = final_df['MarketRegimeLabel']

final_model.fit(X, y)