# 01 - Environment Reward System Ablation Test

1. Compute rolling predictability metrics for each ticker
2. Visualize and compare scores across universe and time
3. Select top-N most “learnable” tickers for RL agent
4. Document all decisions, assumptions, and open questions


In [195]:
# SETUP: Imports & Paths ===========================
import jupyter

from src.utils.system import boot
import os
import numpy as np
import pandas as pd

boot()
from tqdm import tqdm
from src.data.feature_pipeline import load_base_dataframe
from src.predictability.easiness import rolling_sharpe, rolling_r2, rolling_info_ratio, rolling_autocorr
from src.predictability.pipeline import generate_universe_easiness_report
from IPython import display

from stable_baselines3 import PPO
from src.experiments.experiment_tracker import ExperimentTracker  
from src.env.base_trading_env import (  
    BaseTradingEnv, SharpeTradingEnv, SortinoTradingEnv, AlphaTradingEnv,
    DrawdownTradingEnv, CumulativeTradingEnv, CalmarTradingEnv, HybridTradingEnv,BaselineTradingAgent
)

# --- Experiment Constants ---
# Project label handed to ExperimentTracker for every run in this notebook.
EXPERIMENT_NAME = "walkforward_ablation_rl_env"
# Symbols filtered out of both the training and the test slices.
EXCLUDED_TICKERS = ["CEG", "GEHC", "GEV", "KVUE", "SOLV"]

# Observation columns fed to the environments, kept in alphabetical order.
# Deliberately left out: candle_size, sector_id / industry_id (0 non-null),
# and the raw vix, sp500 and sp500_norm columns.
FEATURE_COLS = sorted(
    [
        "candle_body",
        "candle_change",
        "day_of_month",
        "day_of_week",
        "lower_shadow",
        "market_return_1d",
        "order_flow",
        "order_flow_change",
        "overnight_price_change",
        "price_change",
        "return_1d",
        "trade_count_change",
        "upper_shadow",
        "vix_norm",
        "volume_change",
        "vwap_change",
    ]
)

# Training budget: total environment steps and the fixed length of one episode.
TOTAL_TIMESTEPS = 20000
EPISODE_LENGTH = 50
# One episode more than the budget strictly needs.
TOTAL_TRAIN_EPISODES = TOTAL_TIMESTEPS // EPISODE_LENGTH + 1
TOTAL_TEST_EPISODES = 5

SEED = 314
# Costs are switched off for this study (0.0001 was the previous setting).
TRANSACTION_COST = 0

# --- Walk-forward Splits ---
# Each entry is (train_start, train_end, test_start, test_end); the loops
# below treat every start as inclusive and every end as exclusive.
walk_forward_splits = [
    (f"{year}-01-01", f"{year}-07-01", f"{year}-09-01", f"{year}-12-01")
    for year in (2023, 2024)
]

# --- Ablation Variants ---
# One (label, environment class) pair per reward formulation under comparison.
# A dict literal preserves insertion order, so the resulting list of tuples
# is in the same order as it is written here.
reward_envs = list(
    {
        "cumulative": CumulativeTradingEnv,
        "sharpe": SharpeTradingEnv,
        "sortino": SortinoTradingEnv,
        "drawdown": DrawdownTradingEnv,
        "calmar": CalmarTradingEnv,
        "alpha": AlphaTradingEnv,
        "hybrid": HybridTradingEnv,
    }.items()
)

# Settings snapshot that is stored with each tracked run.
CONFIG = dict(
    feature_cols=FEATURE_COLS,
    total_train_episodes=TOTAL_TRAIN_EPISODES,
    total_test_episodes=TOTAL_TEST_EPISODES,
    episode_length=EPISODE_LENGTH,
    seed=SEED,
    transaction_cost=TRANSACTION_COST,
    total_timesteps=TOTAL_TIMESTEPS,
    agent="PPO",
)


In [196]:
# LOAD OHLCV ==========================================
# Load the base dataframe once; later cells filter it on its 'symbol' and
# 'date' columns.
ohlcv_df = load_base_dataframe()
# Not displayed: a notebook only echoes the last expression of a cell.
ohlcv_df.tail()
# Spare copy of the loaded frame; not referenced again in the visible cells.
_ohlcv=ohlcv_df.copy()

In [197]:
# CROP THE SAMPLE =======================================
# NOTE(review): nothing is cropped here — this only collects the distinct
# symbols present in the loaded frame.
tickers = ohlcv_df['symbol'].unique()

In [None]:
#  Deterministic Episode Generator ====================
def generate_episode_sequences(df, episode_length, n_episodes, excluded_tickers, seed=314):
    """Draw a reproducible list of ``(ticker, start_index)`` episode anchors.

    For each of the ``n_episodes`` draws a ticker is picked uniformly from the
    symbols of ``df`` that are not excluded, then a start offset is picked
    uniformly from ``[0, max_start)`` where
    ``max_start = rows_for_ticker - episode_length - 1``. A draw whose ticker
    has ``max_start < 1`` is dropped, so the result can hold fewer than
    ``n_episodes`` entries.

    Args:
        df: DataFrame with a ``'symbol'`` column.
        episode_length: Number of steps in one episode.
        n_episodes: Number of draws to attempt.
        excluded_tickers: Symbols that must never be selected.
        seed: Seed for ``numpy.random.default_rng``; equal seeds and inputs
            give equal output.

    Returns:
        List of ``(ticker, start_index)`` tuples with ``start_index`` a plain
        ``int``. Empty when no ticker is eligible.
    """
    rng = np.random.default_rng(seed)
    excluded = set(excluded_tickers)
    eligible_tickers = [t for t in df['symbol'].unique() if t not in excluded]
    if not eligible_tickers:
        # rng.choice raises on an empty population; there is nothing to sample.
        return []

    # Row counts never change between draws, so compute them once instead of
    # filtering the whole frame for every episode.
    rows_per_ticker = df['symbol'].value_counts().to_dict()

    sequences = []
    for _ in range(n_episodes):
        ticker = rng.choice(eligible_tickers)
        max_start = rows_per_ticker.get(ticker, 0) - episode_length - 1
        if max_start < 1:
            # Too little history for this ticker. The draw is skipped (not
            # retried) so the random stream stays aligned across runs.
            continue
        start_idx = rng.integers(0, max_start)
        sequences.append((ticker, int(start_idx)))
    return sequences

In [None]:
# Working copy that the ablation loop further down slices by date and symbol.
df = ohlcv_df.copy()

In [None]:
class BaselineTradingAgent:
    """Random-action baseline with the ``predict`` call shape used by ``evaluate_env``.

    The agent owns a private ``BaseTradingEnv`` and answers every ``predict``
    call with a sample from that environment's action space; the observation
    is ignored.

    NOTE(review): this definition shadows the ``BaselineTradingAgent`` that
    the setup cell imports from ``src.env.base_trading_env``.
    """

    def __init__(self, df, feature_cols=None,
            episode_length=100, seed=314, set_episode_sequence=None):
        """Build the internal environment and hand it the episode sequence.

        Args:
            df: Data passed straight to ``BaseTradingEnv``.
            feature_cols: Feature column names; ``None`` means an empty list.
            episode_length: Episode length passed to the environment.
            seed: Seed passed to the environment.
            set_episode_sequence: Episode anchors forwarded to the
                environment's ``set_episode_sequence``; ``None`` means an
                empty list.
        """
        # None sentinels replace the former mutable ``[]`` defaults, which
        # were shared between every instance built without these arguments.
        feature_cols = [] if feature_cols is None else feature_cols
        if set_episode_sequence is None:
            set_episode_sequence = []

        self.env = BaseTradingEnv(df, feature_cols=feature_cols,
            episode_length=episode_length, seed=seed)
        self.env.set_episode_sequence(set_episode_sequence)

    def predict(self, obs, *args, **kwargs):
        """Return ``(random_action, {})``; ``obs`` and extra arguments are ignored."""
        return self.env.action_space.sample(), {}

    def set_episode_sequence(self, seq):
        """Accept an episode sequence and do nothing with it.

        The sequence given to ``__init__`` is the only one forwarded to the
        internal environment.
        """
        pass


In [None]:
 def evaluate_env(env, agent, n_episodes):
    """Roll out ``agent`` for ``n_episodes`` episodes and collect the final ``info`` dicts.

    Each episode starts with ``env.reset()`` and steps the environment with
    ``agent.predict(obs, deterministic=True)`` until the step reports either
    termination or truncation.

    Args:
        env: Object whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        agent: Object whose ``predict(obs, deterministic=True)`` returns
            ``(action, state)``.
        n_episodes: Number of episodes to play.

    Returns:
        List with one entry per episode: the ``info`` returned by that
        episode's last step.
    """
    episode_metrics = []
    for _ in range(n_episodes):
        obs, _ = env.reset()
        terminated = truncated = False
        # Stop on truncation as well: an episode that ends by truncation
        # never sets the termination flag, so checking only that flag would
        # keep stepping a finished episode.
        while not (terminated or truncated):
            action, _ = agent.predict(obs, deterministic=True)
            obs, _, terminated, truncated, info = env.step(action)
        # Keep the info returned at episode end.
        episode_metrics.append(info)
    return episode_metrics

#test_metrics = evaluate_env(test_env, agent, len(test_episodes))
#train_metrics = evaluate_env(train_env, agent, min(100, len(train_episodes)))  # sample few train episodes


In [None]:
def _strip_episode_series(results):
    """Remove the 'returns' and 'downside' entries from an episode info dict, in place."""
    for key in ('returns', 'downside'):
        # Tolerate an info dict that does not carry the entry.
        results.pop(key, None)
    return results


def baseline_run(df,config):
    """Evaluate the random baseline agent on every walk-forward split and track the runs.

    For each entry of ``walk_forward_splits`` the frame is cut into a train
    and a test slice (excluded tickers removed), a ``SharpeTradingEnv`` is
    built on each slice, and a ``BaselineTradingAgent`` is evaluated for one
    episode in-sample and one episode out-of-sample. Both results are saved
    through the tracker with ``train_end`` as target date.

    The ``ExperimentTracker`` and the episode sequences are created while
    processing the first split only and are reused for every later split.

    Args:
        df: DataFrame with ``'date'`` and ``'symbol'`` columns.
        config: Configuration object stored with every saved run.

    Returns:
        The ``ExperimentTracker`` used for the runs, or ``None`` when
        ``walk_forward_splits`` is empty.
    """
    tracker = None
    train_episode_sequences = None
    test_episode_sequences = None
    # The exclusion mask does not depend on the split, so build it once.
    allowed = ~df['symbol'].isin(EXCLUDED_TICKERS)
    print(walk_forward_splits)
    for train_start, train_end, test_start, test_end in walk_forward_splits:
        in_train = (df['date'] >= train_start) & (df['date'] < train_end)
        in_test = (df['date'] >= test_start) & (df['date'] < test_end)
        df_train = df[in_train & allowed].reset_index(drop=True)
        df_test = df[in_test & allowed].reset_index(drop=True)

        if tracker is None:
            # NOTE(review): sequences are drawn from the first split's slices
            # and then applied to the environments of every later split —
            # confirm the start indices are valid for those slices too.
            split_seed = int(pd.Timestamp(test_start).timestamp())
            train_episode_sequences = generate_episode_sequences(
                df_train, EPISODE_LENGTH, TOTAL_TRAIN_EPISODES, EXCLUDED_TICKERS, seed=split_seed)
            test_episode_sequences = generate_episode_sequences(
                df_test, EPISODE_LENGTH, TOTAL_TEST_EPISODES, EXCLUDED_TICKERS, seed=split_seed + 1)
            tracker = ExperimentTracker(
                project=EXPERIMENT_NAME,
                train_episode_sequences=train_episode_sequences,
                test_episode_sequences=test_episode_sequences,
            )
            # Force plain ints on the start indices held by the tracker.
            tracker.train_episode_sequences = [(sym, int(num)) for sym, num in tracker.train_episode_sequences]
            tracker.test_episode_sequences = [(sym, int(num)) for sym, num in tracker.test_episode_sequences]

        train_env = SharpeTradingEnv(
            df_train, feature_cols=FEATURE_COLS,
            episode_length=EPISODE_LENGTH, seed=SEED
        )
        train_env.set_episode_sequence(train_episode_sequences)
        test_env = SharpeTradingEnv(
            df_test, feature_cols=FEATURE_COLS,
            episode_length=EPISODE_LENGTH, seed=SEED
        )
        test_env.set_episode_sequence(test_episode_sequences)
        print(test_episode_sequences)
        if len(test_env.stocks) == 0:
            # No stock available in the test environment: skip this split.
            continue

        print('pass')
        # NOTE(review): second identical call kept from the original in case
        # set_episode_sequence is not idempotent — drop it once confirmed.
        test_env.set_episode_sequence(test_episode_sequences)
        agent = BaselineTradingAgent(
            df_train.copy(), feature_cols=FEATURE_COLS,
            episode_length=EPISODE_LENGTH, seed=SEED,
            set_episode_sequence=train_episode_sequences)
        target_date = train_end
        run_settings = {
            "train_start":train_start,
            "train_end":train_end,
            "test_end":test_end,
            "reward":"baseline",
            "mode":"in-sample"
        }
        # NOTE(review): this reset happens in addition to the one inside
        # evaluate_env, so the environment is reset twice before stepping.
        print(train_env.reset())
        print('')
        results = _strip_episode_series(evaluate_env(train_env, agent, 1)[0])
        print(results)
        print('d')
        tracker.save_run(config,results,target_date,run_settings)

        results = _strip_episode_series(evaluate_env(test_env, agent, 1)[0])
        print('e')
        # Pass a fresh dict instead of mutating the one already handed to
        # save_run, in case the tracker keeps a reference to it.
        tracker.save_run(config,results,target_date,{**run_settings, "mode": "out-of-sample"})
    return tracker
    
# Run the random-action baseline on a private copy of the loaded data; the
# bare name on the last line makes the notebook display the returned tracker.
tracker = baseline_run(ohlcv_df.copy(),CONFIG)
tracker

In [None]:
tracker.load_runs()
# Reuse the episode sequences held by the tracker instead of drawing new ones
# with generate_episode_sequences, so later cells use the same episodes.
train_episodes = tracker.train_episode_sequences
test_episodes = tracker.test_episode_sequences


In [205]:
# Ablation Loop ==============================================
# For every walk-forward split and every reward environment: train a PPO agent
# on the train slice, evaluate it on train and test episodes, save the run to
# a tracker and collect the test summary in `all_results`.

def summarize(metrics):
    """Average the headline metrics over a list of per-episode info dicts.

    A metric missing from an episode counts as NaN and is ignored by the
    NaN-aware mean.
    """
    keys = ['episode_sharpe', 'calmar', 'cumulative_return', 'alpha', 'max_drawdown', 'win_rate']
    return {k: np.nanmean([m.get(k, np.nan) for m in metrics]) for k in keys}


all_results = []

for train_start, train_end, test_start, test_end in walk_forward_splits:
    # --- Slice data for split ---
    df_train = df[(df['date'] >= train_start) & (df['date'] < train_end) & ~df['symbol'].isin(EXCLUDED_TICKERS)].reset_index(drop=True)
    df_test = df[(df['date'] >= test_start) & (df['date'] < test_end) & ~df['symbol'].isin(EXCLUDED_TICKERS)].reset_index(drop=True)

    for reward_name, EnvClass in reward_envs:
        # --- Per-ablation config ---
        # Built from the shared CONFIG: the former `base_config` / `config`
        # names only existed in commented-out code, so this cell raised
        # NameError on a fresh kernel.
        config = {**CONFIG, "reward": reward_name}
        # NOTE(review): "mode" says in-sample although the saved results
        # below hold both the train and the test summary.
        run_settings = {
            "train_start":train_start,
            "train_end":train_end,
            "test_end":test_end,
            "reward":reward_name,
            "mode":"in-sample"
        }
        tracker = ExperimentTracker(
            project=EXPERIMENT_NAME,
            train_episode_sequences=train_episodes,
            test_episode_sequences=test_episodes,
        )

        # --- Build env ---
        train_env = EnvClass(
            df_train, feature_cols=FEATURE_COLS,
            episode_length=EPISODE_LENGTH, seed=SEED
        )
        train_env.set_episode_sequence(train_episodes)
        test_env = EnvClass(
            df_test, feature_cols=FEATURE_COLS,
            episode_length=EPISODE_LENGTH, seed=SEED
        )
        test_env.set_episode_sequence(test_episodes)

        # --- RL Training ---
        # NOTE(review): n_steps=EPISODE_LENGTH triggers the "batch_size should
        # be a factor of n_steps * n_envs" warning seen in the cell output;
        # consider passing an explicit batch_size.
        agent = PPO("MlpPolicy", train_env, verbose=0, n_steps=EPISODE_LENGTH, seed=SEED)
        agent.learn(total_timesteps=TOTAL_TIMESTEPS)

        # --- Evaluation ---
        # Uses the notebook-level evaluate_env instead of redefining an
        # identical copy on every iteration.
        test_metrics = evaluate_env(test_env, agent, len(test_episodes))
        train_metrics = evaluate_env(train_env, agent, min(100, len(train_episodes)))  # at most 100 train episodes

        # --- Aggregate episode metrics ---
        train_summary = summarize(train_metrics)
        test_summary = summarize(test_metrics)

        # --- Save to tracker ---
        results = {
            "train": train_summary,
            "test": test_summary,
        }
        tracker.save_run(
            config=config,
            results=results,
            target_date=test_start,
            run_settings=run_settings,
        )
        # --- Save to aggregate results for reporting ---
        all_results.append({
            "split": f"{test_start}--{test_end}",
            "reward": reward_name,
            **test_summary
        })

We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=50 and n_envs=1)
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=50 and n_envs=1)
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=50 and n_envs=1)
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=50 and n_envs=1)
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=50 and n_envs=1)
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=50 and n_envs=1)
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=50 and n_envs=1)
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=50 and n_envs=1)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
 

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean

In [None]:
# Output and Report ===============================================
# Persist the collected test summaries, plot test Sharpe per split and reward,
# and write a per-reward markdown summary.

results_df = pd.DataFrame(all_results)
results_csv_path = f"{EXPERIMENT_NAME}_walkforward_ablation_results.csv"
results_df.to_csv(results_csv_path, index=False)
print(results_df)

# --- Example plot (Sharpe by reward function and split) ---
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 6))
sns.barplot(
    data=results_df,
    x="split",
    y="episode_sharpe",
    hue="reward",
)
plt.title("Test Sharpe by Reward Function (Ablation, Walk-Forward)")
plt.ylabel("Sharpe Ratio")
plt.show()

# --- Save markdown summary ---
# Mean of each headline metric per reward variant, across all splits.
per_reward_means = results_df.groupby("reward").agg(
    mean_sharpe=('episode_sharpe', 'mean'),
    mean_calmar=('calmar', 'mean'),
    mean_return=('cumulative_return', 'mean'),
)
summary_md = per_reward_means.reset_index().to_markdown(index=False)
summary_md_path = f"{EXPERIMENT_NAME}_summary.md"
with open(summary_md_path, "w") as f:
    f.write("# Walk-Forward Ablation Study Results\n\n")
    f.write(summary_md)