In [1]:
# SETUP ===================================
import jupyter
import warnings

from src.utils.system import boot, Notify

# Project bootstrap helper imported from src.utils.system.
# NOTE(review): what boot() configures is not visible in this notebook — confirm before relying on it.
boot()
# Suppress every Python warning for the rest of the kernel session.
# This also hides deprecation notices from gymnasium / pandas / numpy.
warnings.filterwarnings("ignore")



# PACKAGES ================================
import os
import torch
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import gymnasium as gym
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.preprocessing import  RobustScaler

# FRAMEWORK STUFF =========================
from src.defaults import TOP2_STOCK_BY_SECTOR, FEATURE_COLS,EPISODE_LENGTH
from src.data.feature_pipeline import load_base_dataframe
from src.experiments.experiment_tracker import ExperimentTracker
from src.env.base_timeseries_trading_env import BaseSequenceAwareTradingEnv,SequenceAwareAlphaTradingEnv,SequenceAwareBaselineTradingAgent,SequenceAwareCalmarTradingEnv,SequenceAwareCumulativeTradingEnv,SequenceAwareDrawdownTradingEnv,SequenceAwareHybridTradingEnv,SequenceAwareHybridTradingEnv,SequenceAwareSharpeTradingEnv,SequenceAwareSortinoTradingEnv

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.policies import ActorCriticPolicy



In [18]:
import random
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces


from src.env.step_rewards import reward_sharpe,reward_sortino,reward_drawdown,reward_alpha,reward_cumulative,reward_calmar,reward_hybrid

class BaseSequenceAwareTradingEnv(gym.Env):
    """
    Flexible RL trading environment that emits a rolling window of observations
    (usable with Transformer / LSTM / MLP policies).

    - ``return_sequences=True``  -> observations of shape ``(window_length, obs_dim)``.
    - ``return_sequences=False`` -> flattened observations of shape ``(window_length * obs_dim,)``.

    Actions are ``Discrete(3)``: 0 = Hold, 1 = Buy (position +1), 2 = Sell (position -1).
    Episodes are replayed from a list of ``(symbol, start_index)`` pairs that must be
    installed with ``set_episode_sequence`` before the first ``reset``.

    The input frame must contain at least the columns ``symbol``, ``close`` and
    ``return_1d``; ``market_return_1d`` is optional and enables the benchmark metrics.
    """
    metadata = {"render_modes": ["human"]}

    def __init__(
        self, df, feature_cols=None, reward_fn=None, internal_features=None,
        episode_length=100, transaction_cost=0.0001, seed=314, window_length=10, return_sequences=True):
        """
        Args:
            df: Source frame with one row per (symbol, time step).
            feature_cols: Column names copied into every observation row.
            reward_fn: Callable invoked as
                ``reward_fn(position=, price_change=, prev_position=, env=)``;
                defaults to ``default_reward_fn``.
            internal_features: Names of agent-state values appended to each row.
            episode_length: Requested episode length; two rows are added internally.
            transaction_cost: Cost subtracted from the reward when a position is flipped.
            seed: Seed applied to the global NumPy RNG (here and on every reset).
            window_length: Number of rows in the observation window (minimum 1).
            return_sequences: 2-D window if True, flattened vector otherwise.
        """
        super().__init__()
        self.df = df.copy()
        self.feature_cols = feature_cols or []
        self.internal_features = internal_features or [
            "position", "holding_period", "cumulative_reward", "pct_time",
            "drawdown", "rel_perf", "unrealized_pnl", "entry_price", "time_in_position"
        ]
        self.obs_dim = len(self.feature_cols) + len(self.internal_features)
        # Internal episode length is the requested length plus two rows.
        self.episode_length = episode_length + 2
        self.window_length = max(1, window_length)
        self.return_sequences = return_sequences  # True: (window, obs_dim), False: flat
        self.transaction_cost = transaction_cost
        self.seed = seed
        self.action_counts = {0: 0, 1: 0, 2: 0}  # Hold, Buy, Sell
        if seed is not None:
            np.random.seed(seed)
        # Symbols with at least `episode_length` rows are candidates for sampling;
        # generate_episode_sequences() applies the stricter per-episode length check.
        counts = df['symbol'].value_counts()
        eligible = counts[counts >= episode_length].index
        self.stocks = df[df['symbol'].isin(eligible)]['symbol'].unique()
        self.episode_df = df.copy()
        self.reward_fn = reward_fn or self.default_reward_fn

        # Observation space depends on whether the window is flattened.
        if self.return_sequences:
            obs_shape = (self.window_length, self.obs_dim)
        else:
            obs_shape = (self.window_length * self.obs_dim,)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=obs_shape, dtype=np.float32
        )
        self.action_space = spaces.Discrete(3)  # Hold, Buy, Sell

    def get_current_regime(self):
        """
        Classify the recent market regime from the last ``window_length`` values of
        ``return_1d`` preceding the current step.

        Returns:
            0 = Bull, 1 = Bear, 2 = Sideways
        """
        if self.current_step < self.window_length:
            return 2  # Not enough data, assume sideways

        window_start = self.current_step - self.window_length
        returns = self.episode_df['return_1d'].iloc[window_start:self.current_step].values
        mean_return = returns.mean()
        std_return = returns.std()

        # Thresholds can be tuned
        if mean_return > 0.001 and std_return < 0.01:
            return 0  # Bull
        elif mean_return < -0.001 and std_return < 0.01:
            return 1  # Bear
        else:
            return 2  # Sideways

    def default_reward_fn(self, position, price_change, **kwargs):
        """Fallback reward: the signed one-step return of the held position."""
        return position * price_change

    def set_episode_sequence(self, sequence):
        """Install the list of ``(symbol, start_index)`` episodes and rewind to the first."""
        self.episode_sequence = sequence
        self.episode_counter = 0

    def generate_episode_sequences_v1(self, train_steps=10000):
        """
        Legacy sampler: draw random start offsets for the FIRST symbol in ``df`` only.

        Start offsets are bounded by that symbol's own row count (the previous version
        used the length of the whole multi-symbol frame, which produced out-of-range
        starts whenever ``df`` held more than one symbol).

        Raises:
            ValueError: if the symbol has too few rows for a single episode.
        """
        episodes = int(train_steps / self.episode_length) + 1
        ticker = self.df['symbol'].unique()[0]
        dataset_length = int((self.df['symbol'] == ticker).sum())
        max_start = dataset_length - self.episode_length - 2
        if max_start <= 0:
            raise ValueError(
                f"Not enough rows for {ticker}: {dataset_length} available, "
                f"more than {self.episode_length + 2} required."
            )
        episode_sequences = [
            (ticker, np.random.randint(0, max_start)) for _ in range(episodes)
        ]
        np.random.shuffle(episode_sequences)
        return episode_sequences

    def generate_episode_sequences(self, train_steps=10000):
        """
        Sample roughly ``train_steps / episode_length`` episodes as
        ``(symbol, start_index)`` pairs, drawing the symbol uniformly from ``self.stocks``.

        Symbols with too few rows for one episode are skipped, so the returned list
        may be shorter than the requested number of episodes.
        """
        episodes = int(train_steps / self.episode_length) + 1
        # Row counts per symbol are loop-invariant; compute them once.
        rows_per_symbol = self.df['symbol'].value_counts()
        episode_sequences = []
        for _ in range(episodes):
            ticker = np.random.choice(self.stocks)
            max_start = int(rows_per_symbol.loc[ticker]) - self.episode_length - 2
            if max_start <= 0:
                continue  # skip if not enough data
            start = np.random.randint(0, max_start)
            episode_sequences.append((ticker, start))
        np.random.shuffle(episode_sequences)
        return episode_sequences

    def reset(self, seed=None, options=None, start_index=None):
        """
        Start the next episode of the installed episode sequence.

        Args:
            seed: Forwarded to ``gym.Env.reset`` to seed the env's own RNG.
            options: Accepted for API compatibility; unused.
            start_index: Accepted for backward compatibility; unused.

        Returns:
            ``(observation, info)`` with an empty info dict.

        Raises:
            RuntimeError: if no episode sequence was installed, or if none of its
                entries has enough rows to build an episode.
        """
        super().reset(seed=seed)

        # NOTE(review): this re-seeds the *global* NumPy RNG with the same value on
        # every reset, so any np.random consumer restarts the same stream each episode.
        if self.seed is not None:
            np.random.seed(self.seed)

        sequence = getattr(self, "episode_sequence", None)
        if sequence is None or len(sequence) == 0:
            raise RuntimeError(
                "No episode sequence set; call set_episode_sequence() before reset()."
            )

        # Try each entry at most once instead of recursing without bound.
        for _ in range(len(sequence)):
            symbol, start_idx = sequence[self.episode_counter]
            symbol_df = self.df[self.df['symbol'] == symbol].reset_index(drop=True)
            # Always advance (with wrap-around), whether or not this entry is usable.
            self.episode_counter = (self.episode_counter + 1) % len(sequence)

            if start_idx + self.episode_length > len(symbol_df):
                print(f"[WARN] Episode too short for {symbol} at {start_idx}, skipping...")
                continue

            # Safe extraction: one extra row so the last step still has a "next" row.
            end = start_idx + self.episode_length + 1
            if end > len(symbol_df):
                print(f"[WARN] Not enough data for {symbol} from {start_idx}, skipping.")
                continue

            self.episode_df = symbol_df.iloc[start_idx:end].copy()
            break
        else:
            raise RuntimeError("Failed to find a valid episode in the episode sequence.")

        # Per-episode agent state.
        self.current_step = 0
        self.entry_step = None
        self.entry_price = None
        self.position = 0
        self.holding_period = 0
        self.time_in_position = 0
        self.unrealized_pnl = 0
        self.relative_perf = 0
        self.drawdown = 0
        self.cumulative_reward = 0
        self.action_counts = {0: 0, 1: 0, 2: 0}  # Hold, Buy, Sell
        self.returns_history = []
        self.reward_history = []
        self.episode_pct_changes = self.episode_df['return_1d'].values
        self.max_possible_reward = np.sum(np.abs(self.episode_pct_changes))
        self.current_wealth = 1.0
        self.peak_wealth = 1.0

        return self._get_obs(), {}

    def _feature_matrix(self):
        """Return the episode's feature columns as a float32 matrix, cached per episode frame."""
        # Rebuild only when episode_df has been replaced (in-place edits are not detected).
        if getattr(self, "_features_source", None) is not self.episode_df:
            self._episode_features = self.episode_df[list(self.feature_cols)].to_numpy(dtype=np.float32)
            self._features_source = self.episode_df
        return self._episode_features

    def _get_obs(self):
        """
        Build the rolling observation window ending at ``current_step``.

        Rows that fall outside the episode are zero-padded in the feature part. The
        internal-state part holds the CURRENT agent state, repeated on every row.
        """
        features = self._feature_matrix()
        n_features = len(self.feature_cols)

        internal_state = {
            "position": self.position,
            "holding_period": self.holding_period,
            "cumulative_reward": self.cumulative_reward,
            "pct_time": self.current_step / self.episode_length,
            "drawdown": self.drawdown,
            "rel_perf": self.relative_perf,
            "unrealized_pnl": self.unrealized_pnl,
            "entry_price": self.entry_price if self.entry_price is not None else 0.0,
            "time_in_position": self.time_in_position,
        }
        internal = np.array([internal_state[name] for name in self.internal_features], dtype=np.float32)

        obs_window = np.zeros((self.window_length, self.obs_dim), dtype=np.float32)
        first = self.current_step - self.window_length + 1
        for row, idx in enumerate(range(first, self.current_step + 1)):
            if 0 <= idx < len(features):
                obs_window[row, :n_features] = features[idx]
        obs_window[:, n_features:] = internal

        if self.return_sequences:
            return obs_window  # shape: (window_length, obs_dim)
        return obs_window.flatten()  # shape: (window_length * obs_dim,)

    @staticmethod
    def _trade_profits(returns):
        """
        Split the step-return series into "trades" and return the summed return of each.

        A trade opens on the first non-zero return after a zero (or at the start) and
        closes on the first return that is zero or flips sign; the closing return is
        included in the trade. NOTE(review): this segments by return sign, not by the
        agent's actual position changes.
        """
        profits = []
        entry_idx = None
        prev = 0
        for i, ret in enumerate(returns):
            if prev == 0 and ret != 0:
                entry_idx = i
            elif prev != 0 and (ret == 0 or np.sign(ret) != np.sign(prev)):
                if entry_idx is not None:
                    profits.append(np.sum(returns[entry_idx:i + 1]))
                    entry_idx = None
            prev = ret
        return profits

    def _episode_summary(self, regime):
        """Compute the end-of-episode metrics that are merged into the final ``info`` dict."""
        returns = np.array(self.returns_history)
        has_returns = len(returns) > 0

        # Sharpe / Sortino use the MEAN step return (previously the median was used).
        mean = returns.mean() if has_returns else np.nan
        std = returns.std() if len(returns) > 1 else np.nan
        downside = returns[returns < 0]
        downside_std = downside.std() if len(downside) > 1 else np.nan
        # Comparisons with NaN are False, so NaN deviations fall through to NaN ratios.
        sharpe = mean / std if std > 0 else np.nan
        sortino = mean / downside_std if downside_std > 0 else np.nan

        if has_returns:
            wealth_curve = np.cumprod(1 + returns)
            running_peak = np.maximum.accumulate(wealth_curve)
            drawdowns = (wealth_curve - running_peak) / (running_peak + 1e-8)
            max_drawdown = np.abs(drawdowns.min())
            final_wealth = wealth_curve[-1]
            cum_return = final_wealth - 1
            calmar = cum_return / max_drawdown if max_drawdown > 0 else np.nan
        else:
            max_drawdown = final_wealth = cum_return = calmar = np.nan

        # Win rate is the FRACTION of profitable trades (previously a median of booleans).
        trade_profits = self._trade_profits(returns)
        win_rate = np.mean(np.array(trade_profits) > 0) if trade_profits else np.nan

        market_returns = []
        alpha = np.nan
        if 'market_return_1d' in self.episode_df.columns:
            # Agent returns come from rows 1..len(returns); take the same rows for the
            # benchmark (the old slice stopped one row early).
            market_returns = self.episode_df['market_return_1d'].values[1:len(returns) + 1]
            if len(market_returns) > 0:
                market_cum_return = np.cumprod(1 + market_returns)[-1] - 1
                alpha = cum_return - market_cum_return  # NaN propagates if either side is NaN

        return {
            "episode_sharpe": sharpe,
            "episode_sortino": sortino,
            "rewards_history": np.array(self.reward_history),
            "episode_total_reward": np.sum(self.reward_history) if len(self.reward_history) > 0 else np.nan,
            "cumulative_return": cum_return,
            "calmar": calmar,
            "max_drawdown": max_drawdown,
            "win_rate": win_rate,
            "alpha": alpha,
            "returns": returns,
            "market_returns": market_returns,
            "downside": downside,
            "regime": regime,
            "final_wealth": final_wealth,
            "action_hold_count": self.action_counts[0],
            "action_buy_count": self.action_counts[1],
            "action_sell_count": self.action_counts[2]
        }

    def step(self, action):
        """
        Apply ``action`` at the current row and advance one step.

        Returns:
            ``(obs, reward, done, False, info)``. ``info`` always carries ``"regime"``;
            on the final step it additionally carries the episode summary metrics.
        """
        # Accept Python ints, NumPy scalars and size-1 arrays alike.
        action = int(np.asarray(action).item())

        done = self.current_step >= self.episode_length - 1
        current_row = self.episode_df.iloc[self.current_step]

        # Guard against reading past the end of the episode frame.
        if self.current_step + 1 < len(self.episode_df):
            next_row = self.episode_df.iloc[self.current_step + 1]
        else:
            next_row = current_row.copy()  # safe fallback

        price_change = next_row['return_1d']
        prev_position = self.position
        cost = 0

        self.action_counts[action] += 1

        # Buy targets +1, Sell targets -1; Hold (or repeating the held side) changes nothing.
        target_position = {1: 1, 2: -1}.get(action)
        if target_position is not None and self.position != target_position:
            if self.position != 0:
                # Cost is charged only when flipping an existing position.
                cost = self.transaction_cost
            self.position = target_position
            self.holding_period = 0
            self.entry_price = current_row['close']
            self.entry_step = self.current_step

        if self.position != 0:
            self.holding_period += 1

        step_return = self.position * price_change
        self.returns_history.append(step_return)
        self.current_wealth *= (1 + step_return)
        if self.current_wealth > self.peak_wealth:
            self.peak_wealth = self.current_wealth
        self.drawdown = 1 - self.current_wealth / self.peak_wealth

        if self.position != 0 and self.entry_price is not None:
            current_price = next_row['close']
            self.unrealized_pnl = (current_price - self.entry_price) * self.position / self.entry_price
            self.time_in_position = self.current_step - self.entry_step
        else:
            self.unrealized_pnl = 0
            self.time_in_position = 0

        if 'market_return_1d' in self.episode_df.columns:
            self.relative_perf = price_change - next_row['market_return_1d']
        else:
            self.relative_perf = 0

        reward = self.reward_fn(
            position=self.position,
            price_change=price_change,
            prev_position=prev_position,
            env=self
        )
        reward -= cost
        self.reward_history.append(reward)
        self.cumulative_reward += reward

        self.current_step += 1
        obs = self._get_obs()
        regime = self.get_current_regime()
        info = {"regime": regime}

        # Episode-level metrics are only computed on the final step.
        if done:
            info.update(self._episode_summary(regime))

        return obs, reward, done, False, info

    def render(self):
        """Print a one-line summary of the current agent state."""
        print(f"Step: {self.current_step} | Pos: {self.position} | Hold: {self.holding_period} | CumRew: {self.cumulative_reward:.4f}")



class SequenceAwareSharpeTradingEnv(BaseSequenceAwareTradingEnv):
    """Base trading env with ``reward_sharpe`` fixed as the step reward function."""

    def __init__(self, *args, **kwargs):
        # Every other argument is forwarded untouched to the base environment.
        super().__init__(
            *args,
            reward_fn=reward_sharpe,
            **kwargs,
        )

class SequenceAwareSortinoTradingEnv(BaseSequenceAwareTradingEnv):
    """Base trading env with ``reward_sortino`` fixed as the step reward function."""

    def __init__(self, *args, **kwargs):
        # Every other argument is forwarded untouched to the base environment.
        super().__init__(
            *args,
            reward_fn=reward_sortino,
            **kwargs,
        )

class SequenceAwareAlphaTradingEnv(BaseSequenceAwareTradingEnv):
    """Base trading env with ``reward_alpha`` fixed as the step reward function."""

    def __init__(self, *args, **kwargs):
        # Every other argument is forwarded untouched to the base environment.
        super().__init__(
            *args,
            reward_fn=reward_alpha,
            **kwargs,
        )

class SequenceAwareDrawdownTradingEnv(BaseSequenceAwareTradingEnv):
    """Base trading env with ``reward_drawdown`` fixed as the step reward function."""

    def __init__(self, *args, **kwargs):
        # Every other argument is forwarded untouched to the base environment.
        super().__init__(
            *args,
            reward_fn=reward_drawdown,
            **kwargs,
        )

class SequenceAwareCumulativeTradingEnv(BaseSequenceAwareTradingEnv):
    """Base trading env with ``reward_cumulative`` fixed as the step reward function."""

    def __init__(self, *args, **kwargs):
        # Every other argument is forwarded untouched to the base environment.
        super().__init__(
            *args,
            reward_fn=reward_cumulative,
            **kwargs,
        )

class SequenceAwareCalmarTradingEnv(BaseSequenceAwareTradingEnv):
    """Base trading env with ``reward_calmar`` fixed as the step reward function."""

    def __init__(self, *args, **kwargs):
        # Every other argument is forwarded untouched to the base environment.
        super().__init__(
            *args,
            reward_fn=reward_calmar,
            **kwargs,
        )

class SequenceAwareHybridTradingEnv(BaseSequenceAwareTradingEnv):
    """Base trading env with ``reward_hybrid`` fixed as the step reward function."""

    def __init__(self, *args, **kwargs):
        # Every other argument is forwarded untouched to the base environment.
        super().__init__(
            *args,
            reward_fn=reward_hybrid,
            **kwargs,
        )

class SequenceAwareBaselineTradingAgent:
    """
    Random-action baseline exposing a ``predict`` method like a trained agent.

    It owns a private ``BaseSequenceAwareTradingEnv`` and uses that env's action
    space to sample uniformly random actions.
    """

    def __init__(self,df,feature_cols=None,
            episode_length=100, seed=314,set_episode_sequence=None):
        """
        Args:
            df: Frame passed to the internal environment.
            feature_cols: Feature column names (defaults to no features).
            episode_length: Episode length passed to the internal environment.
            seed: Seed passed to the internal environment.
            set_episode_sequence: Initial ``(symbol, start_index)`` list
                (defaults to an empty list).
        """
        # None sentinels replace the former mutable `[]` defaults, which were shared
        # between every instance created without explicit arguments.
        if feature_cols is None:
            feature_cols = []
        if set_episode_sequence is None:
            set_episode_sequence = []

        self.env = BaseSequenceAwareTradingEnv(df, feature_cols=feature_cols,
            episode_length=episode_length, seed=seed)
        self.env.set_episode_sequence(set_episode_sequence)

    def predict(self,obs,*args,**kwargs):
        """Ignore ``obs`` and return ``(random_action, {})``."""
        return self.env.action_space.sample(),{}

    def set_episode_sequence(self,seq):
        """Forward the episode sequence to the internal env (previously a silent no-op)."""
        self.env.set_episode_sequence(seq)

    
    
# Regime Augmentation Wrapper ===========================
class RegimeAugmentingWrapper(gym.ObservationWrapper):
    """
    Append a one-hot market-regime vector (bull, bear, sideways) to the last axis
    of every observation.

    For 2-D ``(window, obs_dim)`` observations the same one-hot is repeated on every
    row; for flat 1-D observations it is appended once at the end.

    NOTE: a later notebook cell defines this class again; the later definition is
    the one in effect after running the notebook top to bottom.
    """

    def __init__(self, env):
        super().__init__(env)
        self.regime_dim = 3  # One-hot: bull, bear, sideways
        obs_shape = self.observation_space.shape
        # Only the last axis grows, so this works for 1-D and 2-D inputs alike.
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf,
            shape=obs_shape[:-1] + (obs_shape[-1] + self.regime_dim,),
            dtype=np.float32
        )

    def observation(self, obs):
        # `unwrapped` reaches the base env even if other wrappers sit in between.
        regime = self.env.unwrapped.get_current_regime()  # should return 0, 1, or 2
        # Build everything in float32: a float64 one-hot would upcast the whole
        # observation and no longer match the declared float32 Box.
        one_hot = np.zeros(self.regime_dim, dtype=np.float32)
        one_hot[regime] = 1.0
        obs = np.asarray(obs, dtype=np.float32)
        tiled = np.broadcast_to(one_hot, obs.shape[:-1] + (self.regime_dim,))
        return np.concatenate([obs, tiled], axis=-1)

In [19]:
# Regime Augmentation Wrapper ===========================
class RegimeAugmentingWrapper(gym.ObservationWrapper):
    """
    Append a one-hot market-regime vector (bull, bear, sideways) to the last axis
    of every observation.

    For 2-D ``(window, obs_dim)`` observations the same one-hot is repeated on every
    row; for flat 1-D observations it is appended once at the end.

    NOTE: this re-defines the class of the same name from the previous cell and
    shadows it.
    """

    def __init__(self, env):
        super().__init__(env)
        self.regime_dim = 3  # One-hot: bull, bear, sideways
        obs_shape = self.observation_space.shape
        # Only the last axis grows, so this works for 1-D and 2-D inputs alike.
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf,
            shape=obs_shape[:-1] + (obs_shape[-1] + self.regime_dim,),
            dtype=np.float32
        )

    def observation(self, obs):
        # `unwrapped` reaches the base env even if other wrappers sit in between.
        regime = self.env.unwrapped.get_current_regime()  # 0,1,2 -> bull,bear,sideways
        # Build everything in float32: a float64 one-hot would upcast the whole
        # observation and no longer match the declared float32 Box.
        one_hot = np.zeros(self.regime_dim, dtype=np.float32)
        one_hot[regime] = 1.0
        obs = np.asarray(obs, dtype=np.float32)
        tiled = np.broadcast_to(one_hot, obs.shape[:-1] + (self.regime_dim,))
        return np.concatenate([obs, tiled], axis=-1)

class PerEpisodeRewardNormalizer(gym.Wrapper):
    """
    Z-score each reward against the rewards seen so far in the current episode.

    The running statistics include the current reward, so the first reward of an
    episode always normalizes to 0.
    """

    def __init__(self, env):
        super().__init__(env)
        # Created here as well so step() cannot hit a missing attribute before reset().
        self.episode_rewards = []

    def reset(self, **kwargs):
        """Clear the per-episode reward buffer and reset the wrapped env."""
        self.episode_rewards = []
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped env and return its transition with the normalized reward."""
        obs, reward, done, truncated, info = self.env.step(action)
        self.episode_rewards.append(reward)
        mean = np.mean(self.episode_rewards)
        std = np.std(self.episode_rewards) + 1e-8  # epsilon avoids division by zero
        norm_reward = (reward - mean) / std
        return obs, norm_reward, done, truncated, info

In [20]:
class MarketVersusWalletHistoryTracker:
    """
    Track a long-only wallet against a buy-and-hold benchmark over one episode.

    Action codes: 1 opens a position if none is open, 2 closes an open position,
    anything else (or a redundant 1/2) is recorded as 0. While a position is open
    the wallet value follows the price; otherwise it stays flat.
    """

    def __init__(self, initial_wallet=1.0):
        # Remembered so reset() restores the configured value rather than a fixed 1.0.
        self.initial_wallet = initial_wallet
        self._clear_state()

    def _clear_state(self):
        """Return every mutable field to its start-of-episode value."""
        self.wallet_value = self.initial_wallet
        self.prev_wallet_value = self.initial_wallet
        self.wallet_locked = False
        self.buy_price = None
        self.market_entry_price = None
        self.last_price = None
        self.has_opened_position = False  # set together with wallet_locked on buy/sell

        self.wallet_history = []
        self.market_history = []
        self.price_history = []
        self.action_history = []

    def reset(self, initial_price):
        """Start a new episode whose benchmark is anchored at ``initial_price``."""
        self._clear_state()
        self.market_entry_price = initial_price
        self.last_price = initial_price
        self.wallet_history.append(self.wallet_value)
        self.market_history.append(1.0)
        self.price_history.append(initial_price)
        self.action_history.append(0)

    def step(self, action, current_price):
        """
        Record one time step.

        The wallet is marked to ``current_price`` first, and only then is ``action``
        applied, so a buy takes effect from the following step onwards.

        Raises:
            RuntimeError: if called before ``reset``.
        """
        if self.market_entry_price is None:
            raise RuntimeError("reset() must be called before step().")

        self.price_history.append(current_price)
        agent_action = 0

        # === 1. Update market benchmark ===
        market_perf = current_price / self.market_entry_price
        self.market_history.append(market_perf)

        # === 2. Update wallet value ===
        if self.wallet_locked and self.has_opened_position:
            self.wallet_value *= current_price / self.last_price

        self.wallet_history.append(self.wallet_value)
        self.prev_wallet_value = self.wallet_value
        self.last_price = current_price  # must be set after wallet update!

        # === 3. Process Action ===
        if action == 1 and not self.wallet_locked:
            self.buy_price = current_price
            self.wallet_locked = True
            self.has_opened_position = True
            agent_action = 1

        elif action == 2 and self.wallet_locked:
            self.wallet_locked = False
            self.buy_price = None
            self.has_opened_position = False
            agent_action = 2

        self.action_history.append(agent_action)

    def export(self):
        """Return the recorded histories as a dict of lists (not copies)."""
        return {
            "wallet_history": self.wallet_history,
            "market_history": self.market_history,
            "market_price_history": self.price_history,
            "performed_action_history": self.action_history
        }


In [None]:
# === Imports ===
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.stats import ttest_ind, mannwhitneyu

from sb3_contrib import RecurrentPPO
#from src.env.base_timeseries_trading_env import SequenceAwareCumulativeTradingEnv
from src.defaults import TOP2_STOCK_BY_SECTOR, EPISODE_LENGTH
from src.data.feature_pipeline import load_base_dataframe
#from src.env.base_timeseries_trading_env import RegimeAugmentingWrapper
# === Config ===
TOTAL_TIMESTEPS = 200000
ENV_CLASS = SequenceAwareCumulativeTradingEnv
N_EVAL_EPISODES = 5

# === Load Data ===
ohlcv_df = load_base_dataframe()

# Test split: rows dated from 2023-07-01 up to (not including) 2024-01-01,
# restricted to the symbols listed in TOP2_STOCK_BY_SECTOR.
in_test_period = (ohlcv_df['date'] >= "2023-07-01") & (ohlcv_df['date'] < "2024-01-01")
in_test_universe = ohlcv_df['symbol'].isin(TOP2_STOCK_BY_SECTOR)
test_df = ohlcv_df[in_test_period & in_test_universe]

# This env instance is only used to sample the episode sequence shared by all
# evaluation envs; the number of episodes is derived from TOTAL_TIMESTEPS.
test_env = ENV_CLASS(
    test_df,
    episode_length=EPISODE_LENGTH,
    feature_cols=FEATURE_COLS,
)
test_seq = test_env.generate_episode_sequences(int(TOTAL_TIMESTEPS))




def evaluate_random_agent( env, n_episodes=22):
    """
    Roll out a uniformly random policy for ``n_episodes`` episodes and collect metrics.

    Args:
        env: A (possibly wrapped) trading env. The base env is reached through
            ``env.unwrapped``, so any wrapper depth works.
        n_episodes: Number of episodes to run.

    Returns:
        ``(metrics_df, episode_infos)``: one row of summary metrics per episode, and
        for each episode a copy of the final-step ``info`` dict extended with the
        ticker and the wallet/market/price/action histories of the tracker.
    """
    base_env = env.unwrapped
    episode_metrics = []
    episode_infos = []

    for _ in tqdm(range(n_episodes), desc="Evaluating Agent"):
        obs, _ = env.reset()
        done = False
        rewards = []
        infos = []

        # === Tracker setup ===
        first_row = base_env.episode_df.iloc[0]
        tracker = MarketVersusWalletHistoryTracker()
        tracker.reset(first_row['close'])
        initial_symbol = first_row['symbol']

        while not done:
            action = int(env.action_space.sample())
            current_row = base_env.episode_df.iloc[base_env.current_step]
            current_price = current_row['close']
            current_symbol = current_row['symbol']
            # Sanity check: the symbol must not change inside one episode.
            if(current_symbol != initial_symbol):
                print('EPISODE SWITCHED', initial_symbol,current_symbol)
                initial_symbol = current_symbol

            # Step the wallet tracker with the price the agent acts on.
            tracker.step(action, current_price)

            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            rewards.append(reward)
            infos.append(info)

        # === Episode summary ===
        final_info = infos[-1]
        agent_wealth = final_info.get("final_wealth", np.nan)
        if 'market_return_1d' in base_env.episode_df.columns:
            # Skip row 0: its return was realized before the episode started, so the
            # agent (whose returns come from rows 1 onward) could never earn it.
            market_wealth = np.prod(1 + base_env.episode_df['market_return_1d'].values[1:])
        else:
            market_wealth = np.nan
        alpha = agent_wealth - market_wealth

        metrics = {
            "total_reward": np.sum(rewards),
            "final_wealth": agent_wealth,
            "market_wealth": market_wealth,
            "calmar": final_info.get("calmar", np.nan),
            "sharpe": final_info.get("episode_sharpe", np.nan),
            "sortino": final_info.get("episode_sortino", np.nan),
            "alpha": alpha,
            "episode_id": base_env.episode_counter,
            "regime": final_info.get("regime", np.nan)
        }

        # Work on a copy so the dict produced by the env is not mutated.
        tracker_data = tracker.export()
        episode_info = dict(final_info)
        episode_info["ticker"] = base_env.episode_df.iloc[0]['symbol']
        episode_info["wallet_history"] = tracker_data["wallet_history"]
        episode_info["market_history"] = tracker_data["market_history"]
        episode_info["market_price_history"] = tracker_data["market_price_history"]
        episode_info["performed_action_history"] = tracker_data["performed_action_history"]
        episode_infos.append(episode_info)
        episode_metrics.append(metrics)

    return pd.DataFrame(episode_metrics), episode_infos



def make_test_env():
    """
    Build a fresh evaluation env on ``test_df`` that replays ``test_seq``.

    The base env is wrapped first with ``RegimeAugmentingWrapper`` and then with
    ``PerEpisodeRewardNormalizer`` (outermost).
    """
    base_env = ENV_CLASS(test_df, episode_length=EPISODE_LENGTH, feature_cols=FEATURE_COLS)
    base_env.set_episode_sequence(test_seq)
    regime_env = RegimeAugmentingWrapper(base_env)
    return PerEpisodeRewardNormalizer(regime_env)



# Evaluate the random baseline once on every episode of the shared test sequence.
# (For a quick smoke test, pass a small n_episodes such as 5 instead.)
random_eval_env = make_test_env()
random_agent_df, random_agent_infos = evaluate_random_agent(
    random_eval_env,
    n_episodes=len(test_seq),
)

# Tag the rows so they can later be concatenated with other agents' results
# (a recurrent PPO comparison was sketched here but is currently disabled).
random_agent_df["agent"] = "random"



Evaluating Agent:   9%|▉         | 183/1961 [02:22<24:55,  1.19it/s]

In [None]:
# Inspect the info dict of the last evaluated episode. The bare name `info` is
# not defined at this point on a fresh kernel (it is only bound in later cells),
# so read it from the evaluation results instead.
random_agent_infos[-1]

In [None]:
# REWARD ANALYSIS

import matplotlib.pyplot as plt

# Pick one episode
# Pick one episode
info = random_agent_infos[0]  # or any i
rewards = info["episode_total_reward"]
returns = info["returns"]
# NOTE(review): this multiplies step returns by the raw action codes (0/1/2) and
# the result is not used below — confirm whether it is still needed.
cum_rewards = np.cumsum(info["returns"] * info["performed_action_history"][1:])
cum_returns = np.cumprod(1 + np.array(returns)) - 1
cum_reward_curve = np.cumsum(info["rewards_history"])

fig, (ax_curves, ax_scatter) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: both cumulative curves over the episode.
ax_curves.plot(cum_returns, label="Cumulative Return")
ax_curves.plot(cum_reward_curve, label="Cumulative Reward")
ax_curves.set_title("Cumulative Return vs Reward")
ax_curves.legend()
ax_curves.grid(True)

# Right panel: cumulative reward against cumulative return, step by step.
ax_scatter.scatter(cum_reward_curve, cum_returns)
ax_scatter.set_xlabel("Cumulative Reward")
ax_scatter.set_ylabel("Cumulative Return")
ax_scatter.set_title("Reward vs Return Correlation")
ax_scatter.grid(True)

plt.show()

In [None]:
import pandas as pd
import seaborn as sns

# One entry per evaluated episode: summed reward, net return and alpha.
reward_list, return_list, alpha_list = [], [], []
for info in random_agent_infos:
    reward_list += [np.sum(info["rewards_history"])]
    return_list += [info["final_wealth"] - 1]
    alpha_list += [info["alpha"]]

df = pd.DataFrame({
    "reward": reward_list,
    "return": return_list,
    "alpha": alpha_list
})

# Pairwise correlations between the three series, then the matching scatter matrix.
corr_matrix = df.corr()
print(corr_matrix)
sns.pairplot(df)