In [3]:
import jupyter

In [4]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


from src.utils.system import boot
from src.data.feature_pipeline import load_base_dataframe
from experiments import check_if_experiment_exists, register_experiment ,experiment_hash
from trackers import EpisodeTracker,EnvironmentTracker,AgentTracker
from environments import PositionTradingEnvV2

# ========== SYSTEM BOOT ==========
DEVICE = boot()
EXPERIMENT_NAME = "core_learnability_test"
DEFAULT_PATH = "data/experiments/" + EXPERIMENT_NAME
DEVICE = boot()
OHLCV_DF = load_base_dataframe()

# Steps of the Learning Test Suite Pipeline

1. get monthly episodes
2. for each month, get prev episode_length days + lookback + required rollback for things to work.
3. the data for the train dataframe should end in the 1st day of a given month
4. th data for the test dataframe should start on the 1st day of a given month

Having the episodes sampled, let's give them a id and lets train each of the agents with their barebones , to compare what every one of them was able to learn in that timeframe and how usefull it was to th next timeframe

Classify episodes by:
1. meta features
2. next day classifier meta features
3. agent learning efficiency
4. noise 
5. financial metrics
5. transferability

Final goal is to predict if a episode is usefull to be trained at

In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from scipy.stats import skew, kurtosis, entropy
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import acf, acovf


In [30]:
def mean_policy(arr):
    # return np.median(arr)
    return pd.Series(arr).ewm(span=5).mean().iloc[-1]

In [31]:
episode_tracker = EpisodeTracker()

In [75]:
import joblib

EXPERIMENT_NAME = "core_episode_rank"
DEFAULT_PATH = "data/experiments/" + EXPERIMENT_NAME
FEATURES_PATH = DEFAULT_PATH+'/features'
TARGETS_PATH = DEFAULT_PATH+'/targets'
META_PATH = DEFAULT_PATH+'/meta'

class EpisodeRanker:
    def __init__(self,ticker,episode_length=120):
        self.episode_tracker = EpisodeTracker()
        self.agent_tracker = AgentTracker()
        self.environment_tracker = EnvironmentTracker()
        self.ticker = ticker
        self.episode_length = episode_length
        self.episodes = self.episode_tracker.monthly_episode_walkforward(OHLCV_DF,ticker,episode_length)
        self.features = []
        self.targets = []
        self.metadata = []
        self.load()
    
    def load(self):
        self.df = OHLCV_DF.copy()
        if all([os.path.exists(path) for path in [FEATURES_PATH, TARGETS_PATH, META_PATH]]):
            self.features = joblib.load(FEATURES_PATH)
            self.targets = joblib.load(TARGETS_PATH)
            self.metadata = joblib.load(META_PATH)
            print("Loaded cached feature/target/meta lists.")

    def evaluate(self,env_cls,agent_cls,env_kwargs={},agent_kwargs={}):
        episodes = self.episodes
        environment = env_cls(self.df,self.ticker,**env_kwargs)
        
        agent = agent_cls.init(environment)
        
        evaluation_entry = {
            "env_id": self.environment_tracker.findOrCreate(env_cls,env_kwargs),
            "agent_id": self.agent_tracker.findOrCreate(agent_cls,agent_kwargs)
        }
        for ep in episodes:
            evaluation_entry['episode_id']= ep["episode_id"]
            df_t1= ep["train_episode"].copy()
            r1d =  ep["train_episode"]['return_1d'].astype(float).values
            v = ep['train_episode']['volume'].astype(float).values
            feat = {
                'symbol': self.ticker,
                'month_str': str(ep['date']),
                'mean_return': mean_policy(r1d),
                'std_return': r1d.std(),
                'skew': skew(r1d),
                'kurtosis': kurtosis(r1d),
                'entropy': entropy(np.histogram(r1d, bins=10, density=True)[0] + 1e-8),
                'vol_mean': mean_policy(v),
                'vol_std': v.std()
            }
            df_lag = df_t1.copy()
            for lag in range(1, 5 + 1):
                df_lag[f'return_lag_{lag}'] = df_lag['return_1d'].shift(lag)
            df_lag = df_lag.dropna()
          
            if len(df_lag) < 5:
                continue
            X = df_lag[[f'return_lag_{i}' for i in range(1, 5+ 1)]].values
            y = df_lag['return_1d'].values
            model = RandomForestRegressor(n_estimators=300, random_state=42)
            model.fit(X, y)
            residuals = y - model.predict(X)
            # Meta-diagnostics
            ljung_pval = acorr_ljungbox(residuals, lags=[5], return_df=True).iloc[0]['lb_pvalue']
            feat['ljung_pval'] = ljung_pval
            feat['episode_id'] = ep['episode_id']
            feat['resid_acf1'] = pd.Series(residuals).autocorr(lag=1)
            feat['resid_std'] = residuals.std()
            feat['resid_skew'] = skew(residuals)
            feat['resid_kurtosis'] = kurtosis(residuals)
            # Predictability label (cross-val R²)
            cv_r2 = mean_policy(cross_val_score(model, X, y, cv=2, scoring='r2'))
            self.features.append(feat)
            self.targets.append(cv_r2)
            self.metadata.append(("AAPL", str(ep['date'])))
            
        
        return agent,environment,evaluation_entry

In [76]:
e["agent_id"]["id"]

0

In [77]:
ep_ranker.features

[{'symbol': 'AAPL',
  'month_str': '2024-01-01',
  'mean_return': -0.00217315495308203,
  'std_return': 0.011972717765185093,
  'skew': -0.8511455333878075,
  'kurtosis': 1.7345324574674956,
  'entropy': 1.8472986697540643,
  'vol_mean': 41490640.90139169,
  'vol_std': 16460479.888669945,
  'ljung_pval': 0.9480842635535118,
  'resid_acf1': 0.08014185924628414,
  'resid_std': 0.005028164947951779,
  'resid_skew': -0.7163116354205482,
  'resid_kurtosis': 1.5369467844632716},
 {'symbol': 'AAPL',
  'month_str': '2024-02-01',
  'mean_return': -0.011569650399054046,
  'std_return': 0.01223517879599969,
  'skew': -0.35847476091546415,
  'kurtosis': 0.5330960628384496,
  'entropy': 1.9811566737652901,
  'vol_mean': 53134145.90497142,
  'vol_std': 15040296.392909277,
  'ljung_pval': 0.8464709129200448,
  'resid_acf1': 0.0064673057163466535,
  'resid_std': 0.0048243155197991465,
  'resid_skew': -0.22244313640745023,
  'resid_kurtosis': 0.3417824934522793},
 {'symbol': 'AAPL',
  'month_str': '202

In [78]:
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor

class CustomAgent:
    def __init__(self,model_cls,policy, **kwargs):
        #self.model = model_cls(policy,environment,**kwargs)
        self.__version__ = model_cls.__name__
        self.model_cls = model_cls
        self.policy = policy
        self.kwargs = {
            **kwargs
        }
        self.config = {
            "name":self.__version__,
            "policy": policy,
            **kwargs
        }
    def init(self,environment):
        self.model = self.model_cls(self.policy,environment,**self.kwargs)
        return self.model
    
    
PpoAgent = CustomAgent(PPO,"MlpPolicy", ent_coef=0.1, verbose=1)
PpoAgent

<__main__.CustomAgent at 0x23bcd534bd0>

In [79]:
ep_ranker = EpisodeRanker('AAPL',120)
a,_,e=ep_ranker.evaluate(PositionTradingEnvV2,PpoAgent)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  record['kwargs']= json.loads(record['kwargs'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  record['kwargs']= json.loads(record['kwargs'])


In [None]:
# Meta-feature & Label Extraction =======================

tickers = ohlcv_df['symbol'].unique()
tickers = tickers[~np.isin(tickers, excluded_tickers)]
def mean_policy(arr):
    # return np.median(arr)
    return pd.Series(arr).ewm(span=5).mean().iloc[-1]

# Attempt to load if already exists (resumability)
if all([os.path.exists(path) for path in [FEATURES_PATH, TARGETS_PATH, META_PATH]]):
    features = joblib.load(FEATURES_PATH)
    targets = joblib.load(TARGETS_PATH)
    metadata = joblib.load(META_PATH)
    print("Loaded cached feature/target/meta lists.")
    
else:
    features, targets, metadata = [], [], []
    #tickers = ohlcv_df['symbol'].unique()
    #tickers = [t for t in tickers if t not in run_settings["excluded_tickers"]]
    for symbol in tqdm(tickers):
        df = ohlcv_df[ohlcv_df['symbol'] == symbol].sort_values('date').copy()
        months = df['month'].unique()
        for i in range(1, len(months)):
            m_t = months[i-1]
            m_t1 = months[i]
            df_t = df[df['month'] == m_t]
            df_t1 = df[df['month'] == m_t1]
            if len(df_t1) < run_settings["min_samples"]:
                continue
            r1d = df_t['return_1d'].astype(float).values
            v = df_t['volume'].astype(float).values
            feat = {
                'symbol': symbol,
                'month_str': str(m_t),
                'mean_return': mean_policy(r1d),
                'std_return': r1d.std(),
                'skew': skew(r1d),
                'kurtosis': kurtosis(r1d),
                'entropy': entropy(np.histogram(r1d, bins=10, density=True)[0] + 1e-8),
                'vol_mean': mean_policy(v),
                'vol_std': v.std()
            }
            # Residual diagnostics from simple RF on t+1
            df_lag = df_t1.copy()
            for lag in range(1, 5+ 1):
                df_lag[f'return_lag_{lag}'] = df_lag['return_1d'].shift(lag)
            df_lag = df_lag.dropna()
            if len(df_lag) < run_settings["min_samples"]:
                continue
            X = df_lag[[f'return_lag_{i}' for i in range(1, 5 + 1)]].values
            y = df_lag['return_1d'].values
            model = RandomForestRegressor(n_estimators=300, random_state=config['random_state'])
            model.fit(X, y)
            residuals = y - model.predict(X)
            # Meta-diagnostics
            ljung_pval = acorr_ljungbox(residuals, lags=5, return_df=True).iloc[0]['lb_pvalue']
            feat['ljung_pval'] = ljung_pval
            feat['resid_acf1'] = pd.Series(residuals).autocorr(lag=1)
            feat['resid_std'] = residuals.std()
            feat['resid_skew'] = skew(residuals)
            feat['resid_kurtosis'] = kurtosis(residuals)
            # Predictability label (cross-val R²)
            cv_r2 = mean_policy(cross_val_score(model, X, y, cv=run_settings["cv_folds"], scoring='r2'))
            features.append(feat)
            targets.append(cv_r2)
            metadata.append((symbol, str(m_t)))
    # Save for future resumes
    joblib.dump(features, FEATURES_PATH)
    joblib.dump(targets, TARGETS_PATH)
    joblib.dump(metadata, META_PATH)
    print("Feature/target/meta lists saved.")


In [None]:
# ltm_test_suite.py

import os
import json
import hashlib
import pandas as pd
import numpy as np
from datetime import datetime
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from environments import PositionTradingEnv,PositionTradingEnvV1,PositionTradingEnvV2  # assumed to exist
from data import sample_valid_episodes, extract_meta_features 

# ========== CONFIG ==========
TICKER = "AAPL"
TIMESTEPS = 10_000
EVAL_EPISODES = 5
N_TIMESTEPS = 60
LOOKBACK = 0
SEEDS = [42, 52, 62]
BENCHMARK_PATH = DEFAULT_PATH+"/benchmark_episodes.json"
CHECKPOINT_DIR = DEFAULT_PATH+"/checkpoints"
SCORES_DIR = DEFAULT_PATH+"/scores"
META_PATH = DEFAULT_PATH+"/meta_df.csv"

os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(SCORES_DIR, exist_ok=True)
os.makedirs(os.path.dirname(BENCHMARK_PATH), exist_ok=True)

# ========== UTILITIES ==========
def generate_config_hash(config):
    raw = json.dumps(config, sort_keys=True)
    return hashlib.sha256(raw.encode()).hexdigest()

def save_model(model, config_hash):
    path = os.path.join(CHECKPOINT_DIR, f"agent_{config_hash}.zip")
    model.save(path)
    with open(path.replace(".zip", "_config.json"), "w") as f:
        json.dump(config_hash, f, indent=2)

# ========== STEP 1: Load Data ==========
print("[INFO] Loading data...")
# Replace this with real OHLCV loading
df = OHLCV_DF.copy()#[OHLCV_DF['symbol']==TICKER].copy()


# ========== STEP 2: Sample Benchmark Episodes ==========
if os.path.exists(BENCHMARK_PATH):
    with open(BENCHMARK_PATH) as f:
        benchmark_episodes = json.load(f)
else:
    print("[INFO] Sampling benchmark episodes...")
    np.random.seed(0)
    benchmark_episodes = sample_valid_episodes(df, TICKER, N_TIMESTEPS, LOOKBACK, EVAL_EPISODES)
    with open(BENCHMARK_PATH, "w") as f:
        json.dump(benchmark_episodes.tolist(), f)  # ← ✅ Convert to list here

# ========== STEP 3: Run Learnability Tests ==========
meta_records = []
for seed in SEEDS:
    for start_idx in benchmark_episodes:
        print(f"[INFO] Running episode from idx {start_idx} with seed {seed}")

        # Prepare Env
        env = Monitor(PositionTradingEnv(df, TICKER, N_TIMESTEPS, LOOKBACK, start_idx=start_idx, seed=seed))
        model = PPO("MlpPolicy", env, verbose=0, seed=seed)

        model.learn(total_timesteps=TIMESTEPS)

        # Evaluate PPO agent
        obs,_ = env.reset()
        done, score = False, 0
        while not done:
            action = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            score += reward

        # Evaluate random agent
        obs,_ = env.reset()
        done, rand_score = False, 0
        while not done:
            action = env.action_space.sample()
            obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            rand_score += reward

        # Calculate advantage
        advantage = score - rand_score

        # Log meta-data
        start_date = df.loc[start_idx, "date"]
        end_date = df.loc[start_idx + N_TIMESTEPS - 1, "date"]
        config = {
            "ticker": TICKER,
            "start_date": str(start_date),
            "end_date": str(end_date),
            "timesteps": TIMESTEPS,
            "seed": seed
        }
        config_hash = generate_config_hash(config)
        save_model(model, config_hash)

        meta = extract_meta_features(df.iloc[start_idx: start_idx + N_TIMESTEPS])
        meta.update({
            "config_hash": config_hash,
            "score": score,
            "rand_score": rand_score,
            "advantage": advantage,
            "seed": seed,
            "ticker": TICKER,
            "start_date": str(start_date),
            "end_date": str(end_date)
        })
        meta_records.append(meta)

# ========== STEP 4: Save Results ==========
pd.DataFrame(meta_records).to_csv(META_PATH, index=False)
print("[INFO] Learnability test complete. Results saved to:", META_PATH)


In [None]:
env.reset()

In [None]:
pd.DataFrame(meta_records)

In [None]:


def generate_config_hash(config):
    raw = json.dumps(config, sort_keys=True)
    return hashlib.sha256(raw.encode()).hexdigest()

def save_model(model, config_hash):
    path = os.path.join(CHECKPOINT_DIR, f"agent_{config_hash}.zip")
    model.save(path)
    with open(path.replace(".zip", "_config.json"), "w") as f:
        json.dump(config_hash, f, indent=2)




print("[INFO] Loading benchmark episodes...")
with open(BENCHMARK_PATH) as f:
    benchmark_episodes = json.load(f)

meta_records = []

for seed in SEEDS:
    for start_idx in benchmark_episodes:
        print(f"[INFO] Transferability episode: seed={seed}, train_idx={start_idx}")

        # TRAIN on Month T
        env_train = Monitor(PositionTradingEnv(df[df['symbol'] ==TICKER].reset_index(), TICKER, N_TIMESTEPS, LOOKBACK, seed=seed, start_idx=start_idx))
        model = PPO("MlpPolicy", env_train, verbose=0, seed=seed)
        model.learn(total_timesteps=TIMESTEPS)

        # Evaluate PPO on Month T
        obs, _= env_train.reset()
        done, score_train = False, 0
        while not done:
            action = model.predict(obs, deterministic=True)
            obs, reward, done, _, _ = env_train.step(action)
            score_train += reward

        # Random agent on Month T
        obs, _ = env_train.reset()
        done, rand_train = False, 0
        while not done:
            action = env_train.action_space.sample()
            obs, reward, done, _, _ = env_train.step(action)
            rand_train += reward

        # TEST on Month T+1
        test_idx = start_idx + 60  # approx. one month later
        if test_idx + N_TIMESTEPS >= len(df):
            print("[WARN] Skipping episode — test idx out of range")
            continue

        env_test = Monitor(PositionTradingEnv(df[df['symbol'] ==TICKER].reset_index(), TICKER, N_TIMESTEPS, LOOKBACK, seed=seed, start_idx=test_idx))

        obs, _ = env_test.reset()
        done, score_test = False, 0
        while not done:
            action = model.predict(obs, deterministic=True)
            obs, reward, done, _, _ = env_test.step(action)
            score_test += reward

        # Random agent on Month T+1
        obs, _ = env_test.reset()
        done, rand_test = False, 0
        while not done:
            action = env_test.action_space.sample()
            obs, reward, done, _, _ = env_test.step(action)
            rand_test += reward

        advantage_train = score_train - rand_train
        advantage_test = score_test - rand_test
        transfer_delta = score_test - score_train

        config = {
            "ticker": TICKER,
            "train_idx": int(start_idx),
            "test_idx": int(test_idx),
            "timesteps": TIMESTEPS,
            "seed": seed
        }
        config_hash = generate_config_hash(config)
        save_model(model, config_hash)

        features = extract_meta_features(df.iloc[start_idx:start_idx + N_TIMESTEPS])
        features.update({
            "config_hash": config_hash,
            "score_train": score_train,
            "score_test": score_test,
            "advantage_train": advantage_train,
            "advantage_test": advantage_test,
            "transfer_delta": transfer_delta,
            "transfer_success": int(transfer_delta > 0),
            "ticker": TICKER,
            "seed": seed
        })
        meta_records.append(features)

pd.DataFrame(meta_records).to_csv(TRANSFER_META_PATH, index=False)
print("[INFO] Transferability test complete. Results saved to:", TRANSFER_META_PATH)



In [None]:
benchmark_episodes


In [None]:
df[df['symbol']=="AAPL"].reset_index().iloc[672 ]

In [None]:

import os
import json
import numpy as np
import pandas as pd
from typing import Callable, Dict, List, Union
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.monitor import Monitor
from scipy.stats import ttest_ind

# Example agent registry
AGENT_REGISTRY = {
    "ppo": PPO,
    "a2c": A2C
}

def compute_additional_metrics(env):
    values = np.array(env.values)
    rewards = np.array(env.rewards)
    actions = np.array(env.actions)

    returns = pd.Series(values).pct_change().dropna()
    volatility = returns.std()
    entropy = -np.sum(np.bincount(actions, minlength=2)/len(actions) * np.log2(np.bincount(actions, minlength=2)/len(actions) + 1e-9))
    max_drawdown = (values / np.maximum.accumulate(values)).min() - 1
    sharpe = returns.mean() / (returns.std() + 1e-9) * np.sqrt(252)
    sortino = returns.mean() / (returns[returns < 0].std() + 1e-9) * np.sqrt(252)
    calmar = returns.mean() / abs(max_drawdown + 1e-9)
    success_trades = np.sum((np.diff(values) > 0) & (actions[1:] == 1)) + np.sum((np.diff(values) < 0) & (actions[1:] == 0))

    return {
        "volatility": volatility,
        "entropy": entropy,
        "max_drawdown": max_drawdown,
        "sharpe": sharpe,
        "sortino": sortino,
        "calmar": calmar,
        "success_trades": success_trades,
        "action_hold_ratio": np.mean(actions == 0),
        "action_long_ratio": np.mean(actions == 1)
    }

def formalized_learning_evaluation(
    df: pd.DataFrame,
    ticker: str,
    agents: List[str] = ["ppo", "a2c"],
    env_cls: Callable = None,
    env_name: str = "PositionTradingEnv",
    env_version: str = "v0",
    env_config: Dict = None,
    timesteps: int = 10_000,
    eval_episodes: int = 10,
    n_timesteps: int = 60,
    lookback: int = 0,
    seed: int = 42,
    result_path: str = "data/eval/ltm_learnability.csv"
):
    os.makedirs(os.path.dirname(result_path), exist_ok=True)

    # Load previously completed configs
    if os.path.exists(result_path):
        past_df = pd.read_csv(result_path)
        past_configs = set(past_df['config_hash'].unique())
    else:
        past_df = pd.DataFrame()
        past_configs = set()

    # Filter data and sample episodes
    df = df[df['symbol'] == ticker].copy().sort_values('date')
    df['date'] = pd.to_datetime(df['date'])
    mondays = df[df['date'].dt.weekday == 0]
    valid_starts = []
    for date in mondays['date']:
        start_idx = df.index[df['date'] == date][0]
        if start_idx + n_timesteps < len(df):
            valid_starts.append(start_idx)
    sampled_starts = np.random.default_rng(seed).choice(valid_starts, size=min(eval_episodes, len(valid_starts)), replace=False)

    all_results = []

    for agent_name in agents:
        agent_cls = AGENT_REGISTRY[agent_name]
        for start_idx in sampled_starts:
            config = {
                "agent": agent_name,
                "ticker": ticker,
                "start_idx": start_idx,
                "timesteps": timesteps,
                "n_timesteps": n_timesteps,
                "lookback": lookback,
                "seed": seed,
                "env_name": env_name,
                "env_version": env_version,
                "env_config": env_config or {}
            }
            config_hash = hash(json.dumps(config, sort_keys=True))
            if config_hash in past_configs:
                continue

            env_train = env_cls(df, ticker, n_timesteps, lookback, seed=seed, start_idx=start_idx)
            env_train = Monitor(env_train)
            model = agent_cls("MlpPolicy", env_train, seed=seed, verbose=0)
            model.learn(total_timesteps=timesteps)

            # Eval PPO
            env_eval = env_cls(df, ticker, n_timesteps, lookback, seed=seed, start_idx=start_idx)
            obs, _ = env_eval.reset()
            done, ppo_score = False, 0
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, _, _ = env_eval.step(action)
                ppo_score += reward

            metrics = compute_additional_metrics(env_eval)

            # Eval Random
            env_rand = env_cls(df, ticker, n_timesteps, lookback, seed=seed, start_idx=start_idx)
            obs, _ = env_rand.reset()
            done, rand_score = False, 0
            while not done:
                action = env_rand.action_space.sample()
                obs, reward, done, _, _ = env_rand.step(action)
                rand_score += reward

            row = {
                "config_hash": config_hash,
                "agent": agent_name,
                "env_name": env_name,
                "env_version": env_version,
                "ticker": ticker,
                "seed": seed,
                "start_idx": start_idx,
                "timesteps": timesteps,
                "ppo_score": ppo_score,
                "rand_score": rand_score,
                "ppo_advantage": ppo_score - rand_score,
                "ppo_std": np.std([ppo_score]),
                "rand_std": np.std([rand_score]),
                "ppo_median": ppo_score,
                "rand_median": rand_score,
                "train_start_date": df.loc[start_idx, 'date'],
                "train_end_date": df.loc[start_idx + n_timesteps, 'date'],
                "config_json": json.dumps(config, sort_keys=True),
                **metrics
            }
            all_results.append(row)

    if all_results:
        new_df = pd.DataFrame(all_results)
        combined = pd.concat([past_df, new_df], ignore_index=True)
        combined.to_csv(result_path, index=False)

    return pd.DataFrame(all_results)


Beautiful — you’ve just outlined the foundation of a **meta-learning curriculum engine** for reinforcement learning in financial environments. Let’s break this down methodically, with **concrete scoring logic** for each goal and then tie it all into a full meta-classifier pipeline.

---

## 🎯 GOAL: Quantify and predict *how good or bad* an episode is for learning or transferring.

---

## ✅ 1. **Scoring Learnability \[0–100]**

> "How well can an agent learn from this episode?"

### ✔️ Definition:

Based on the PPO agent’s performance **vs. random policy on the training episode**.

### ✅ Formula:

```python
learnability_score = score_train_norm = normalize(score_train, 0, 100)
```

If your environment already outputs `score_train` normalized to \[0, 100], you're done.

You can also smooth it:

```python
learnability_score = 100 * sigmoid(advantage_train / scaling_factor)
```

---

## ✅ 2. **Scoring Transferability \[0–100]**

> "How well does what was learned transfer to a new episode?"

### ✔️ Definition:

Based on the **delta** between training and test advantage:

### ✅ Formula:

```python
transferability_score = 100 * sigmoid(advantage_test)
```

Alternatively:

```python
transferability_score = 100 - abs(score_test - score_train)
```

You may also combine both:

```python
transferability_score = 100 * sigmoid(advantage_test) * (1 - abs(transfer_delta)/100)
```

---

## ✅ 3. **Scoring Difficulty \[0–100]**

> "How inherently hard is this episode for an agent to learn?"

This should be **inversely related** to learnability and advantage.

### ✅ Formula:

```python
difficulty_score = 100 - learnability_score
```

Or more robust:

```python
difficulty_score = 100 * (1 - sigmoid(advantage_train))  # inverse learnability
```

You could enhance this using entropy, kurtosis, adf\_pval:

```python
difficulty_score += chaos_bonus  # if entropy or pval is high
```

---

## ✅ 4. **Clustering Episodes by Agent Performance**

Use unsupervised learning to group episodes based on key outcome metrics:

### ✅ Input Features:

```python
['score_train', 'score_test', 'advantage_train', 'advantage_test', 'transfer_delta', 'success_trades', 'action_hold_ratio']
```

### ✅ Method:

```python
from sklearn.cluster import KMeans
X = df[feature_columns]
clusters = KMeans(n_clusters=3, random_state=42).fit_predict(X)
df['performance_cluster'] = clusters
```

You can later analyze:

* Cluster 0 = high learn, high transfer
* Cluster 1 = learnable but not transferable
* Cluster 2 = hard/misleading episodes

---

## ✅ 5. **Meta-Classify an Episode Before Running It**

Now we build a classifier that learns:

> **f(episode\_meta\_features) → predict (learnability, transferability)**

### ✅ Input Features:

```python
['entropy', 'volatility', 'hurst', 'adf_pval', 'mean_return', 'return_trend', 'kurtosis', 'skew_return', 'max_drawdown', ...]
```

### ✅ Targets:

```python
'learnability_score' and 'transferability_score'
```

### ✅ Classifier Example:

```python
from sklearn.ensemble import RandomForestRegressor

X = df[meta_feature_columns]
y_learn = df['learnability_score']
y_transfer = df['transferability_score']

clf_learn = RandomForestRegressor().fit(X, y_learn)
clf_transfer = RandomForestRegressor().fit(X, y_transfer)

# Predict on new episodes
new_df['predicted_learnability'] = clf_learn.predict(new_df[meta_feature_columns])
new_df['predicted_transferability'] = clf_transfer.predict(new_df[meta_feature_columns])
```

---

## ✅ Final Step: Filter by Predicted Scores

```python
train_candidates = new_df[new_df['predicted_learnability'] > 70]
test_candidates = new_df[
    (new_df['predicted_learnability'] > 50) &
    (new_df['predicted_transferability'] > 50)
]
```

---

## ✅ Summary

| Goal                | Metric Used                      | Scale       |
| ------------------- | -------------------------------- | ----------- |
| Learnability        | `score_train`, `advantage_train` | 0–100       |
| Transferability     | `advantage_test`, `score_test`   | 0–100       |
| Difficulty          | Inverse of learnability          | 0–100       |
| Clustering          | KMeans on agent outcome metrics  | categorical |
| Predict meta-scores | RF or XGBoost on meta-features   | regression  |

---

Would you like me to generate the full code to:

* Compute the three scores
* Cluster episodes
* Train a meta-model
* Predict and filter future training/test episodes?

This can become your **adaptive episode sampler**.


In [80]:

EXPERIENCE_NAME = "stock_predictability"
FEATURES_PATH = f".data/cache/features_{EXPERIENCE_NAME}.pkl"
TARGETS_PATH = f"data/cache/targets_{EXPERIENCE_NAME}.pkl"
META_PATH = f"data/cache/meta_{EXPERIENCE_NAME}.pkl"

excluded_tickers=['CEG', 'GEHC', 'GEV', 'KVUE', 'SOLV']
excluded_tickers.sort()
#tickers = TOP2_STOCK_BY_SECTOR

config={
    "regressor":"RandomForestRegressor",
    "n_estimators": 100,
    "random_state":314
}
run_settings={
    "excluded_tickers": excluded_tickers,
    "min_samples": 10,
    "cv_folds": 3,
    "lags": 5,
    "start_date":"2022-01-01",
    "end_date":"2023-01-01"
}

# Config section


In [81]:

# LOAD OHLCV ==========================================


ohlcv_df = load_base_dataframe()
ohlcv_df['date'] = pd.to_datetime(ohlcv_df['date'])
ohlcv_df = ohlcv_df[(ohlcv_df['date'] >= run_settings["start_date"]) & (ohlcv_df['date'] < run_settings["end_date"])]
ohlcv_df['month'] = ohlcv_df['date'].dt.to_period('M')
ohlcv_df['return_1d'] = ohlcv_df['return_1d'].fillna(0)
ohlcv_df['sector_id'] = ohlcv_df['sector_id'].fillna('unknown')
ohlcv_df['industry_id'] = ohlcv_df['industry_id'].fillna('unknown')

In [83]:
from tqdm import tqdm
# Meta-feature & Label Extraction =======================
"""
# BASIC PREPROCESSING ===================================
excluded_tickers = run_settings["excluded_tickers"]
min_samples = run_settings["min_samples"]
cv_folds = run_settings["cv_folds"]
lags = run_settings["lags"]
start_date = run_settings["start_date"]
end_date = run_settings["end_date"]

# CROP THE SAMPLE =======================================
tickers = ohlcv_df['symbol'].unique()[:100]
tickers = tickers[~np.isin(tickers, excluded_tickers)]
tickers = ["AAPL","MSFT","JPM","V",'LLY','UNH','AMZN','TSLA','META','GOOGL','GE','UBER','COST','WMT','XOM','CVX'.'NEE','SO','AMT','PLD','LIN','SHW']

# FOR POC ONLY


ohlcv_df = ohlcv_df.copy()
ohlcv_df['date'] = pd.to_datetime(ohlcv_df['date'])
ohlcv_df = ohlcv_df[(ohlcv_df['date'] >= start_date) & (ohlcv_df['date'] < end_date)]
ohlcv_df['month'] = ohlcv_df['date'].dt.to_period('M')
ohlcv_df['return_1d'] = ohlcv_df['return_1d'].fillna(0)
"""
tickers = ohlcv_df['symbol'].unique()
tickers = tickers[~np.isin(tickers, excluded_tickers)]
def mean_policy(arr):
    # return np.median(arr)
    return pd.Series(arr).ewm(span=5).mean().iloc[-1]

# Attempt to load if already exists (resumability)
if all([os.path.exists(path) for path in [FEATURES_PATH, TARGETS_PATH, META_PATH]]):
    features = joblib.load(FEATURES_PATH)
    targets = joblib.load(TARGETS_PATH)
    metadata = joblib.load(META_PATH)
    print("Loaded cached feature/target/meta lists.")
    
else:
    features, targets, metadata = [], [], []
    #tickers = ohlcv_df['symbol'].unique()
    #tickers = [t for t in tickers if t not in run_settings["excluded_tickers"]]
    for symbol in tqdm(tickers):
        df = ohlcv_df[ohlcv_df['symbol'] == symbol].sort_values('date').copy()
        months = df['month'].unique()
        for i in range(1, len(months)):
            m_t = months[i-1]
            m_t1 = months[i]
            df_t = df[df['month'] == m_t]
            df_t1 = df[df['month'] == m_t1]
            if len(df_t1) < run_settings["min_samples"]:
                continue
            r1d = df_t['return_1d'].astype(float).values
            v = df_t['volume'].astype(float).values
            feat = {
                'symbol': symbol,
                'month_str': str(m_t),
                'mean_return': mean_policy(r1d),
                'std_return': r1d.std(),
                'skew': skew(r1d),
                'kurtosis': kurtosis(r1d),
                'entropy': entropy(np.histogram(r1d, bins=10, density=True)[0] + 1e-8),
                'vol_mean': mean_policy(v),
                'vol_std': v.std()
            }
            # Residual diagnostics from simple RF on t+1
            df_lag = df_t1.copy()
            for lag in range(1, run_settings['lags'] + 1):
                df_lag[f'return_lag_{lag}'] = df_lag['return_1d'].shift(lag)
            df_lag = df_lag.dropna()
            if len(df_lag) < run_settings["min_samples"]:
                continue
            X = df_lag[[f'return_lag_{i}' for i in range(1, run_settings['lags'] + 1)]].values
            y = df_lag['return_1d'].values
            model = RandomForestRegressor(n_estimators=config['n_estimators'], random_state=config['random_state'])
            model.fit(X, y)
            residuals = y - model.predict(X)
            # Meta-diagnostics
            ljung_pval = acorr_ljungbox(residuals, lags=[run_settings['lags']], return_df=True).iloc[0]['lb_pvalue']
            feat['ljung_pval'] = ljung_pval
            feat['resid_acf1'] = pd.Series(residuals).autocorr(lag=1)
            feat['resid_std'] = residuals.std()
            feat['resid_skew'] = skew(residuals)
            feat['resid_kurtosis'] = kurtosis(residuals)
            # Predictability label (cross-val R²)
            cv_r2 = mean_policy(cross_val_score(model, X, y, cv=run_settings["cv_folds"], scoring='r2'))
            features.append(feat)
            targets.append(cv_r2)
            metadata.append((symbol, str(m_t)))
    # Save for future resumes
    joblib.dump(features, FEATURES_PATH)
    joblib.dump(targets, TARGETS_PATH)
    joblib.dump(metadata, META_PATH)
    print("Feature/target/meta lists saved.")


100%|██████████| 498/498 [38:40<00:00,  4.66s/it]


FileNotFoundError: [Errno 2] No such file or directory: '.data/cache/features_stock_predictability.pkl'

In [None]:
# DataFrame Construction  ============================
X_df = pd.DataFrame(features)
y_df = pd.Series(targets, name='cv_r2')
meta_df = pd.DataFrame(metadata, columns=['symbol', 'month'])

In [None]:
# Scaling & Preparation ==============================

X = X_df.drop(columns=['symbol', 'month_str'])
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
import joblib
from src.env.base_trading_env import CumulativeTradingEnv

RL_LABELS_PATH = f"../../data/cache/meta_rl_labels_{EXPERIENCE_NAME}.pkl"

feature_cols = ["return_1d", "volume"]  # Or your preferred features
episode_length = 18  # Or whatever fits your month
train_steps = 300    # Fast!
min_ep_len = 18
# Resume logic: Load meta_df with RL columns if available
if os.path.exists(RL_LABELS_PATH):
    meta_df_rl = pd.read_pickle(RL_LABELS_PATH)
    print("Loaded meta_df with RL columns.")
else:
    # Copy original meta_df and initialize RL columns
    meta_df_rl = meta_df.copy()
    meta_df_rl['agent_reward'] = np.nan
    meta_df_rl['random_reward'] = np.nan
    meta_df_rl['advantage'] = np.nan
    meta_df_rl['sharpe'] = np.nan
    meta_df_rl['cum_return'] = np.nan
    meta_df_rl['alpha'] = np.nan


for i, row in tqdm(meta_df_rl.iterrows(), total=len(meta_df_rl), desc="Meta-RL Agent Loop"):
    # Skip if already computed
    if not np.isnan(meta_df_rl.loc[i, 'agent_reward']):
        continue

    symbol, month = row['symbol'], row['month']
    df_env = ohlcv_df[(ohlcv_df['symbol'] == symbol) & (ohlcv_df['month'] == month)].sort_values("date")
    if len(df_env) < min_ep_len:
        min_ep_len = len(df_env)
        print('new min',min_ep_len)
    if len(df_env) < episode_length:
        print('x',len(df_env) ,episode_length)
        continue  # Not enough data, skip

    try:
        env = PositionTradingEnvV2sitionTradingEnvV2sitionTradingEnvV2sitionTradingEnvV2sitionTradingEnvV2(
            df=df_env,
            feature_cols=feature_cols,
            episode_length=episode_length,
            transaction_cost=0.0001,
            seed=42
        )
        env = gym.wrappers.FlattenObservation(env)
        check_env(env, warn=True)

        model = PPO("MlpPolicy", env, verbose=0, n_steps=64, batch_size=16, learning_rate=0.001, seed=42)
        model.learn(total_timesteps=train_steps)

        # Evaluate PPO
        obs, _ = env.reset()
        agent_rewards, done = [], False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, truncated, info = env.step(action)
            agent_rewards.append(reward)
        agent_reward = np.sum(agent_rewards)

        # Evaluate Random
        obs, _ = env.reset()
        random_rewards, done = [], False
        while not done:
            action = env.action_space.sample()
            obs, reward, done, truncated, info = env.step(action)
            random_rewards.append(reward)
        random_reward = np.sum(random_rewards)

        advantage = agent_reward - random_reward

        meta_df_rl.loc[i, 'agent_reward'] = agent_reward
        meta_df_rl.loc[i, 'random_reward'] = random_reward
        meta_df_rl.loc[i, 'advantage'] = advantage
        meta_df_rl.loc[i, 'sharpe'] = info.get("episode_sharpe", np.nan)
        meta_df_rl.loc[i, 'cum_return'] = info.get("cumulative_return", np.nan)
        meta_df_rl.loc[i, 'alpha'] = info.get("alpha", np.nan)
        #print(info)
        
    except Exception as e:
        print(f"Skipped ({symbol})",e)


In [None]:
meta_df_rl.to_csv('mrl.csv')
meta_df_rl['target'] = (meta_df_rl['advantage'] > 0).astype(int)
meta_df_rl

In [None]:
feature_cols = [col for col in meta_df_rl.columns if col not in ['symbol', 'month', 'agent_reward', 'random_reward', 'advantage', 'target']]
feature_cols

In [None]:
meta_df_rl['target'].value_counts()

In [None]:
# Make sure columns are compatible for merge
X_df['month'] = X_df['month_str']
merged = pd.merge(X_df, meta_df_rl, on=['symbol', 'month'], how='inner')
merged

In [None]:
feature_cols = [
    col for col in merged.columns
    if col not in ['symbol', 'month', 'month_str', 'agent_reward', 'random_reward', 'advantage', 'target']
]

X = merged[feature_cols]
y = merged['target']

# Scale features
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
feature_cols

In [None]:
#ohlcv_df.sort_values(by="date").head().to_csv('ohlcv_to_upload.csv')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

clf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
import matplotlib.pyplot as plt
import numpy as np

importances = clf.feature_importances_
sorted_idx = np.argsort(importances)[::-1]
plt.figure(figsize=(12, 6))
plt.bar(range(len(importances)), importances[sorted_idx])
plt.xticks(range(len(importances)), [feature_cols[i] for i in sorted_idx], rotation=90)
plt.title("Meta-Feature Importances for Predicting RL Agent Advantage")
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# y_true: true labels, y_pred: predicted labels
cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)  # Optional: color map
plt.title("Confusion Matrix")
plt.show()


class_names = ['Will Learn', 'Wont learn']  # Adjust to your problem

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Oranges)
plt.title("Confusion Matrix")
plt.show()