# 03 - Predictability & Universe Filtering

1. Compute rolling predictability metrics for each ticker
2. Visualize and compare scores across universe and time
3. Select top-N most “learnable” tickers for RL agent
4. Document all decisions, assumptions, and open questions


In [None]:
# SETUP: Imports & Paths ===========================
import jupyter

import os
import pandas as pd


from tqdm import tqdm
from src.data.feature_pipeline import basic_chart_features,load_base_dataframe
from src.predictability.easiness import rolling_sharpe, rolling_r2, rolling_info_ratio, rolling_autocorr
from src.predictability.pipeline import generate_universe_easiness_report
from IPython import display


  from pandas.core import (


In [None]:
# LOAD OHLCV ==========================================
ohlcv_df = load_base_dataframe()
# Untouched snapshot of the freshly loaded frame, so ohlcv_df can be reset
# from it without reloading.
_ohlcv = ohlcv_df.copy()
# Preview the last rows. This must be the final expression of the cell:
# by default a notebook only renders the value of the last expression.
ohlcv_df.tail()

In [None]:
# CROP THE SAMPLE =======================================
# NOTE: no rows are filtered here. This cell only collects the distinct values
# of the 'symbol' column of ohlcv_df into `tickers`.
tickers = ohlcv_df['symbol'].unique()

In [None]:

#all_metric = generate_universe_easiness_report(ohlcv_df,tickers)

In [None]:
#all_metric.dropna()

# Walkforward study

In [None]:

# Start from a fresh copy of the snapshot taken right after loading.
ohlcv_df = _ohlcv.copy()
# NOTE(review): the explicit pd.to_datetime conversion of ohlcv_df['date'] is
# disabled, so 'date' is used exactly as loaded - presumably already
# datetime-like; TODO confirm.

# --- Walkforward parameters -------------------------------------------
window_length = 60
freq = '6MS'  # every 6th month start; use 'W' for weekly, etc.
start_date = pd.to_datetime('2023-05-01')
end_date = ohlcv_df['date'].max()

# The first candidate cutoff is pushed window_length *calendar days* past
# start_date; from there one cutoff is generated per `freq` step up to the
# last date available in the data.
# NOTE(review): window_length is also handed to the report as its window
# size, which may be counted in rows rather than calendar days - confirm.
walkforward_dates = pd.date_range(
    start=start_date + pd.Timedelta(days=window_length),
    end=end_date,
    freq=freq,
)

walkforward_dates

In [None]:
# Walk-forward run: one easiness report per consecutive window
# [prev_date, cutoff], each written to its own CSV under data/experiments/.
prev_date = "2023-01-01"  # start of the first window

# Currently restricted to a single symbol. To study the whole universe use
# ohlcv_df['symbol'].unique() instead (the previous code computed that value
# and then immediately overwrote it with this list).
tickers = ['AAPL']

for cutoff in tqdm(walkforward_dates):
    cutoff_str = cutoff.strftime('%Y-%m-%d')

    print(f"\n=== Universe study up to {cutoff_str} ===")
    # The return value is intentionally discarded (results are persisted via
    # save_csv_path); it is not bound to `_` to avoid shadowing IPython's
    # last-output variable.
    generate_universe_easiness_report(
        # Fresh copy per window - presumably guards against the report
        # mutating its input; TODO confirm whether that is still needed.
        ohlcv_df=ohlcv_df.copy(),
        tickers=tickers,
        window_length=window_length,
        target="return_1d",
        benchmark_col="market_return_1d",
        visualize=False,   # Skip plotting for speed, or True for debug
        cutoff_end_date=cutoff_str,
        cutoff_start_date=prev_date,
        # "{{hash}}" renders as the literal text "{hash}", which is passed
        # through unchanged - presumably filled in by the report with the
        # config hash; verify against the callee.
        save_csv_path=f"data/experiments/predictability_metrics-{{hash}}-{prev_date}-{cutoff_str}.csv",
    )
    # The next window starts where this one ended.
    prev_date = cutoff_str

# Future work and nice-to-haves

In [None]:
import os
import pandas as pd
import json

EXPERIMENTS_DIR = "data/experiments"

# List all experiment result CSVs. os.listdir returns entries in arbitrary
# order, so sort them to keep the study order reproducible between runs.
all_files = sorted(
    f for f in os.listdir(EXPERIMENTS_DIR)
    if f.startswith('predictability_metrics-') and f.endswith('.csv')
)

# One entry per result file:
#   {'df': DataFrame, 'config': dict, 'hash': str, 'filename': str}
studies = []
for fname in all_files:
    # Read the hash column as text: a hash made only of digits (or shaped
    # like "1e5") would otherwise be parsed as a number and could no longer
    # be sliced as a string downstream.
    df = pd.read_csv(
        os.path.join(EXPERIMENTS_DIR, fname),
        dtype={'config_hash': str},
    )
    if df.empty:
        # A header-only file has no first row to read the config from.
        print(f"Skipping {fname}: file contains no rows")
        continue
    # Parse config from first row (all rows have same config)
    config = json.loads(df['config_json'].iloc[0])
    studies.append({
        'df': df,
        'config': config,
        'hash': df['config_hash'].iloc[0],
        'filename': fname,
    })


In [None]:
# Print a textual summary of every loaded study: its config, then for each
# metric column the overall mean/std and the best/worst tickers by mean.
NON_METRIC_COLUMNS = {"ticker", "date", "config_hash", "config_json", "symbol"}

for study in studies:
    print(f"--- Study Hash: {study['hash']} | File: {study['filename']}")
    print(json.dumps(study['config'], indent=2))
    df = study['df']
    # Both 'ticker' and 'symbol' are treated as identifier columns above, so
    # group by whichever one this result file actually contains instead of
    # assuming 'ticker'.
    id_col = next((c for c in ("ticker", "symbol") if c in df.columns), None)
    # Only numeric columns can be averaged and formatted with ':.4f'.
    metrics = [
        col for col in df.columns
        if col not in NON_METRIC_COLUMNS
        and pd.api.types.is_numeric_dtype(df[col])
    ]
    for metric in metrics:
        print(f"Metric: {metric}")
        print(f"  Mean: {df[metric].mean():.4f}, Std: {df[metric].std():.4f}")
        if id_col is None:
            # No identifier column: the per-ticker ranking is not possible.
            continue
        # Top and bottom tickers (by mean). Tickers whose metric is entirely
        # NaN are dropped; otherwise they would sort last and be reported as
        # the "bottom" tickers.
        agg = (
            df.groupby(id_col)[metric]
            .mean()
            .dropna()
            .sort_values(ascending=False)
        )
        print("    Top 3 tickers:", agg.head(3).to_dict())
        print("    Bottom 3 tickers:", agg.tail(3).to_dict())
    print()

In [None]:
import matplotlib.pyplot as plt

def plot_metric_across_studies(metric_name, study_list=None):
    """Overlay one histogram of ``metric_name`` per study on a single figure.

    Args:
        metric_name: Column to plot. Studies whose DataFrame lacks this
            column are skipped.
        study_list: Iterable of study dicts with at least the keys ``'df'``
            (a DataFrame) and ``'hash'``. Defaults to the notebook-level
            ``studies`` list.

    Returns:
        None. The figure is rendered with ``plt.show()``. If no study
        contains the metric, a message is printed and nothing is drawn.
    """
    if study_list is None:
        study_list = studies

    # Collect (label, values) pairs first, so an empty figure with an empty
    # legend is never produced when the metric is missing everywhere.
    labelled_values = [
        # str() keeps the label working even if the hash is not a string.
        # NaNs are dropped explicitly rather than relying on how the
        # installed matplotlib/numpy versions treat them.
        (f"Study {str(study['hash'])[:6]}", study['df'][metric_name].dropna())
        for study in study_list
        if metric_name in study['df'].columns
    ]
    if not labelled_values:
        print(f"No study contains metric {metric_name!r}; nothing to plot.")
        return

    plt.figure(figsize=(12, 6))
    for label, values in labelled_values:
        plt.hist(values, bins=80, alpha=0.3, label=label)
    plt.legend()
    plt.xlabel(metric_name)
    plt.ylabel("Count")
    plt.title(f"Distribution of {metric_name} across studies")
    plt.show()

# Example: Compare sharpe distributions
# "sharpe" must match a column name in the loaded study DataFrames; studies
# that do not have such a column are skipped by the function.
# NOTE(review): presumably the column written by the easiness report - confirm.
plot_metric_across_studies("sharpe")