# 03 - Predictability & Universe Filtering

1. Compute rolling predictability metrics for each ticker
2. Visualize and compare scores across universe and time
3. Select top-N most “learnable” tickers for RL agent
4. Document all decisions, assumptions, and open questions


In [1]:
# SETUP: Imports & Paths ===========================
import jupyter
from src.utils.system import boot
import os
import pandas as pd

boot()
from tqdm import tqdm
from src.data.feature_pipeline import basic_chart_features,load_base_dataframe
from src.predictability.easiness import rolling_sharpe, rolling_r2, rolling_info_ratio, rolling_autocorr
from src.predictability.pipeline import generate_universe_easiness_report
from IPython import display
import warnings
import numpy.linalg as la

# Suppress HMM warnings
warnings.filterwarnings("ignore")


  from pandas.core import (


In [10]:
# LOAD OHLCV ==========================================
ohlcv = load_base_dataframe()
ohlcv.tail()
ohlcv.drop(columns=['sector_id','industry_id'],inplace=True)

In [11]:
# CROP THE SAMPLE =======================================
tickers = ohlcv['symbol'].unique()

In [15]:
# Adaptive Market Regime Classification Pipeline (Per Stock + Timeframe)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from hmmlearn.hmm import GaussianHMM
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import RobustScaler
from collections import defaultdict
import warnings
import numpy.linalg as la

# Suppress HMM warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Utility function to check positive-definiteness
def is_positive_definite(matrix):
    try:
        la.cholesky(matrix)
        return True
    except la.LinAlgError:
        return False

# 1. Feature Engineering
def compute_regime_features(df):
    df['return'] = df['close'].pct_change()
    df['volatility'] = df['return'].rolling(21).std()
    df['momentum'] = df['close'] / df['close'].shift(21) - 1
    df['drawdown'] = (df['close'] / df['close'].cummax()) - 1
    df['rolling_corr'] = df['return'].rolling(21).corr(df['return'].shift(1))
    return df.dropna()

# 2. Try Different Regime Models
def try_regime_models(X):
    models = {
        "kmeans": KMeans(n_clusters=3, random_state=0),
        "gmm": GaussianMixture(n_components=3, covariance_type='full', random_state=0),
        "hmm": GaussianHMM(n_components=3, covariance_type='diag', n_iter=200, tol=1e-3)
    }

    results = {}
    for name, model in models.items():
        try:
            if name == "hmm":
                model.fit(X)
                if not all(is_positive_definite(np.diag(cov) if cov.ndim == 1 else cov) for cov in model.covars_):
                    raise ValueError("HMM covariance not positive-definite.")
                labels = model.predict(X)
            else:
                labels = model.fit_predict(X)
            score = silhouette_score(X, labels)
            results[name] = {'model': model, 'labels': labels, 'score': score}
        except Exception as e:
            print(f"{name} failed: {e}")
    return results

# Standardize features using robust scaling
def standardize_features(X):
    return pd.DataFrame(RobustScaler().fit_transform(X), columns=X.columns, index=X.index)

# 3. Walk-Forward Adaptive Optimization by Stock-Timeframe
def walk_forward_regime_search_grouped(df, group_col='symbol', window_size=252, step=21):
    all_results = defaultdict(list)

    for group, df_group in tqdm(df.groupby(group_col)):
        df_group = compute_regime_features(df_group)
        for start in range(0, len(df_group) - window_size, step):
            X = df_group.iloc[start:start+window_size][['return', 'volatility', 'momentum', 'drawdown', 'rolling_corr']]
            X = standardize_features(X)
            models = try_regime_models(X)
            for name, result in models.items():
                all_results[(group, name)].append({
                    'score': result['score'],
                    'labels': result['labels'],
                    'start': start,
                    'end': start+window_size
                })
    return all_results

# 4. Summarize Results Per Group
def summarize_results(results):
    summary = []
    for key, res_list in results.items():
        group, name = key
        scores = [r['score'] for r in res_list]
        summary.append({
            'group': group,
            'model': name,
            'avg_score': np.median(scores),
            'std_score': np.std(scores),
            'num_windows': len(scores)
        })
    return pd.DataFrame(summary).sort_values(['group', 'avg_score'], ascending=[True, False])

# 5. Plotting Function
def plot_regimes(df, labels, title="Regime Assignment"):
    df = df.copy()
    df['regime'] = labels
    df['close'].plot(figsize=(15, 4), title=title, color='black', alpha=0.3)
    for regime in sorted(df['regime'].unique()):
        df[df['regime'] == regime]['close'].plot(label=f'Regime {regime}', alpha=0.7)
    plt.legend()
    plt.show()

# 6. Final Report Generator (Markdown)
def generate_report(summary_df):
    report_lines = [
        "# 📈 Adaptive Market Regime Classification Report\n",
        "\n",
        "## Model Performance by Stock\n",
        summary_df.to_markdown(index=False),
        "\n",
        "---\n",
        "## Interpretation\n",
        "- Higher silhouette score → better cluster separation.\n",
        "- Monitor which model works best for each stock over time.\n",
        "- Combine regime classification with strategy backtests.\n"
    ]
    with open("regime_classification_report.md", "w") as f:
        f.write("\n".join(report_lines))
    print("Report saved as 'regime_classification_report.md'")

# Example Usage:
# df = pd.read_csv("your_data.csv")
df = ohlcv.copy()
results = walk_forward_regime_search_grouped(df)
summary = summarize_results(results)
generate_report(summary)
for (symbol, model) in results:
    last = results[(symbol, model)][-1]
    df_group = compute_regime_features(df[df['symbol'] == symbol])
    plot_regimes(df_group.iloc[last['start']:last['end']], last['labels'], title=f"{symbol} - {model} Regimes")


  0%|          | 1/504 [00:02<22:45,  2.71s/it]Model is not converging.  Current: -899.819393217708 is not greater than -899.8192312808463. Delta is -0.00016193686167298438
Model is not converging.  Current: -863.6075602575104 is not greater than -863.6075576550219. Delta is -2.602488507363887e-06
Model is not converging.  Current: -783.7624015615747 is not greater than -783.7612282906892. Delta is -0.0011732708854879093
  0%|          | 2/504 [00:05<21:06,  2.52s/it]Model is not converging.  Current: -980.9831885888506 is not greater than -980.9819133784779. Delta is -0.0012752103726825226
Model is not converging.  Current: -1236.049309401171 is not greater than -1236.0475552929342. Delta is -0.0017541082368097705
  1%|          | 3/504 [00:06<18:19,  2.19s/it]Model is not converging.  Current: -886.1019695099558 is not greater than -886.1018834336473. Delta is -8.60763084347127e-05
Model is not converging.  Current: -978.8077781441153 is not greater than -978.807728237265. Delta is -

 17%|█▋        | 85/504 [03:11<16:33,  2.37s/it]Model is not converging.  Current: -920.8533703815071 is not greater than -920.8531374172077. Delta is -0.0002329642994709502
 18%|█▊        | 93/504 [03:29<16:56,  2.47s/it]Model is not converging.  Current: -987.5937571250249 is not greater than -987.5937548807417. Delta is -2.244283223262755e-06
 19%|█▉        | 96/504 [03:36<15:54,  2.34s/it]Model is not converging.  Current: -798.506251220738 is not greater than -798.5057335334598. Delta is -0.0005176872782612918
 21%|██        | 106/504 [04:04<17:37,  2.66s/it]Model is not converging.  Current: -947.5026818655697 is not greater than -947.5026534683044. Delta is -2.839726528236497e-05
 21%|██▏       | 108/504 [04:09<16:29,  2.50s/it]Model is not converging.  Current: -1100.9458382397906 is not greater than -1100.945814816672. Delta is -2.3423118591381353e-05
 22%|██▏       | 111/504 [04:15<14:53,  2.27s/it]Some rows of transmat_ have zero sum because no transition from the state was 

hmm failed: transmat_ rows must sum to 1 (got row sums of [1. 0. 1.])


 22%|██▏       | 113/504 [04:18<12:27,  1.91s/it]Model is not converging.  Current: -1039.6714774206243 is not greater than -1039.671402750445. Delta is -7.467017940143705e-05
 23%|██▎       | 114/504 [04:20<11:24,  1.76s/it]Model is not converging.  Current: -898.3749787587022 is not greater than -898.3664768401884. Delta is -0.00850191851384352
 23%|██▎       | 115/504 [04:21<10:34,  1.63s/it]Model is not converging.  Current: -887.6759803751073 is not greater than -887.6758883018089. Delta is -9.207329844684864e-05
Model is not converging.  Current: -1082.6653569676164 is not greater than -1082.6652734082973. Delta is -8.35593191368389e-05
 23%|██▎       | 116/504 [04:23<10:50,  1.68s/it]Model is not converging.  Current: -1080.1046607441497 is not greater than -1080.103522958183. Delta is -0.0011377859666481527
 24%|██▍       | 120/504 [04:31<12:37,  1.97s/it]Model is not converging.  Current: -821.0200782307409 is not greater than -821.019574316416. Delta is -0.0005039143248950495

Model is not converging.  Current: -1306.4521936300173 is not greater than -1306.451983431019. Delta is -0.00021019899827479094
 39%|███▉      | 196/504 [07:49<19:36,  3.82s/it]Model is not converging.  Current: -1036.156836461329 is not greater than -1036.15683599671. Delta is -4.646190063795075e-07
 39%|███▉      | 197/504 [07:52<18:29,  3.61s/it]Model is not converging.  Current: -912.8353448112096 is not greater than -912.832930147865. Delta is -0.0024146633445525367
Model is not converging.  Current: -1102.0723847462161 is not greater than -1102.0715641663726. Delta is -0.0008205798435483302
 39%|███▉      | 198/504 [07:55<18:16,  3.58s/it]Model is not converging.  Current: -1058.7929196968742 is not greater than -1058.7929181639458. Delta is -1.5329283087339718e-06
Model is not converging.  Current: -1226.0114159937953 is not greater than -1226.0113772401378. Delta is -3.8753657463530544e-05
 39%|███▉      | 199/504 [08:00<19:56,  3.92s/it]Model is not converging.  Current: -1103

 55%|█████▌    | 278/504 [12:08<09:43,  2.58s/it]Model is not converging.  Current: -956.5235759753613 is not greater than -956.5234488247694. Delta is -0.00012715059187939914
Model is not converging.  Current: -1102.0171532414215 is not greater than -1102.0170997305243. Delta is -5.351089725991187e-05
 55%|█████▌    | 279/504 [12:11<10:09,  2.71s/it]Model is not converging.  Current: -1139.8987237624833 is not greater than -1139.8987234321678. Delta is -3.3031551538442727e-07
 56%|█████▌    | 280/504 [12:15<10:34,  2.83s/it]Model is not converging.  Current: -1039.352018196752 is not greater than -1039.3518423948397. Delta is -0.0001758019122917176
Model is not converging.  Current: -1123.396804930608 is not greater than -1123.3946556830233. Delta is -0.0021492475846116577
 56%|█████▌    | 283/504 [12:22<09:37,  2.61s/it]Model is not converging.  Current: -986.9269635559472 is not greater than -986.8400226956834. Delta is -0.08694086026378045
Model is not converging.  Current: -979.62

 72%|███████▏  | 362/504 [16:49<06:20,  2.68s/it]Model is not converging.  Current: -980.9658648727448 is not greater than -980.9658626687132. Delta is -2.204031602559553e-06
 72%|███████▏  | 365/504 [16:55<05:29,  2.37s/it]Model is not converging.  Current: -1029.8753641712149 is not greater than -1029.874777641335. Delta is -0.0005865298799108132
 73%|███████▎  | 366/504 [16:58<05:23,  2.34s/it]Model is not converging.  Current: -917.498742872511 is not greater than -917.4975614731558. Delta is -0.0011813993552323154
 73%|███████▎  | 369/504 [17:07<06:19,  2.81s/it]Model is not converging.  Current: -978.2646346506207 is not greater than -978.2541502672659. Delta is -0.01048438335476476
 74%|███████▍  | 372/504 [17:13<05:14,  2.38s/it]Model is not converging.  Current: -1155.8129794104761 is not greater than -1155.81253291398. Delta is -0.00044649649612438225
 75%|███████▍  | 376/504 [17:25<05:44,  2.70s/it]Model is not converging.  Current: -1202.3095840322349 is not greater than -1

hmm failed: transmat_ rows must sum to 1 (got row sums of [1. 1. 0.])


 89%|████████▉ | 448/504 [20:40<02:23,  2.56s/it]Model is not converging.  Current: -817.020690953876 is not greater than -817.0205465953369. Delta is -0.00014435853915983898
 89%|████████▉ | 449/504 [20:44<02:30,  2.74s/it]Model is not converging.  Current: -1085.8693733278092 is not greater than -1085.8653153650987. Delta is -0.004057962710476204
 89%|████████▉ | 450/504 [20:47<02:36,  2.90s/it]Model is not converging.  Current: -1167.0247926815348 is not greater than -1167.0240080376213. Delta is -0.0007846439134482353
 89%|████████▉ | 451/504 [20:50<02:32,  2.88s/it]Model is not converging.  Current: -1050.7788543573881 is not greater than -1050.7787302451218. Delta is -0.0001241122663486749
 90%|████████▉ | 453/504 [20:55<02:19,  2.74s/it]Model is not converging.  Current: -988.2528378214299 is not greater than -988.2528041068274. Delta is -3.3714602523104986e-05
 90%|█████████ | 456/504 [21:04<02:21,  2.95s/it]Model is not converging.  Current: -686.0079524191689 is not greater t

ImportError: Pandas requires version '0.9.0' or newer of 'tabulate' (version '0.8.10' currently installed).

In [31]:


def summarize_regimes_v2(df, results):
    rows = []
    for (symbol, model), runs in tqdm(results.items()):
        for run in runs:
            sub_df = compute_regime_features(df[df['symbol'] == symbol]).iloc[run['start']:run['end']].copy()
            sub_df['regime'] = run['labels']
            for regime in sorted(sub_df['regime'].unique()):
                regime_df = sub_df[sub_df['regime'] == regime]
                rows.append({
                    'symbol': symbol,
                    'model': model,
                    'regime': regime,
                    'mean_return': regime_df['return'].mean(),
                    'volatility': regime_df['return'].std(),
                    'frequency': len(regime_df)
                })
    summary_df = pd.DataFrame(rows)
    return summary_df.groupby(['symbol', 'model', 'regime']).agg(
        mean_return=('mean_return', 'mean'),
        volatility=('volatility', 'mean'),
        frequency=('frequency', 'sum')
    ).reset_index().sort_values(by='mean_return', ascending=False)

detailed_stats = summarize_regimes_v2(df, results)
print(detailed_stats.head())

100%|██████████| 1512/1512 [42:59<00:00,  1.71s/it]


     symbol   model  regime  mean_return  volatility  frequency
536    BIIB     hmm       2     0.112633    0.017837       1298
539    BIIB  kmeans       2     0.071240    0.015408       2168
533    BIIB     gmm       2     0.070695    0.016127       2244
2336   JNPR     hmm       2     0.053508    0.013800       1243
2333   JNPR     gmm       2     0.043821    0.014168       1914


In [None]:
!pip install tabulate

In [33]:
detailed_stats['regime'].unique()

array([2, 0, 1], dtype=int64)