# Data Wrangling and Feature Engineering for RL Environment
This notebook preprocesses and engineers features from raw trading data to prepare it for Soft Actor-Critic training.


In [32]:
import os
import pandas as pd
import numpy as np
import yfinance as yf
from src.utils.features import add_features, build_all_features  # assuming build_all_features uses add_features per ticker

# ---------------------------------------------------------------------------
# 1. Download and clean intraday price data for the selected ETFs
# ---------------------------------------------------------------------------
tickers = ["SPY", "QQQ", "IWM", "TLT", "GLD", "XLE", "XLF", "EEM", "HYG", "DBC"]

raw = yf.download(
    tickers=tickers,
    start="2025-08-19",
    end="2025-10-09",
    interval="2m",
    group_by="ticker",
    auto_adjust=True,
    threads=True
)

def clean_panel(raw):
    """
    Cleans raw Yahoo Finance panel data:
    - Restricts to standard OHLCV fields
    - Localizes to NY time zone if missing
    - Trims to regular trading hours
    - Forward/backward fills short gaps
    - Drops entirely empty rows
    """
    fields = ["Open", "High", "Low", "Close", "Volume"]
    df = raw.dropna(how="all")
    df = df.loc[:, pd.IndexSlice[:, fields]]
    df = df.sort_index()
    if df.index.tz is None:
        df = df.tz_localize("America/New_York", nonexistent="shift_forward", ambiguous="NaT")
    df = df.between_time("09:30", "16:00")
    df = df.groupby(axis=1, level=0).ffill(limit=5).bfill(limit=5)
    df = df.dropna(how="all")
    return df

panel = clean_panel(raw)

# ---------------------------------------------------------------------------
# 2. Save cleaned price panel to disk for reproducibility
# ---------------------------------------------------------------------------
os.makedirs("data/raw", exist_ok=True)
os.makedirs("data/processed", exist_ok=True)
panel.to_pickle("data/processed/etfs_2m_clean.pkl")
print(" Clean panel saved:", panel.shape)

# ---------------------------------------------------------------------------
# 3. Feature construction per ticker
#    Each ticker is processed individually using the custom feature builder
# ---------------------------------------------------------------------------
feat = build_all_features(panel)
print(" Raw feature matrix:", feat.shape)

# Standardize column structure to a consistent MultiIndex (Ticker, Feature)
if feat.columns.nlevels == 3:
    feat.columns = feat.columns.droplevel(2)
feat.columns.set_names(["Ticker", "Feature"], inplace=True)

# ---------------------------------------------------------------------------
# 4. Remove raw OHLCV fields from the feature set to keep only engineered features
# ---------------------------------------------------------------------------
drop_fields = ["Open", "High", "Low", "Close", "Volume"]
feat = feat.drop(columns=drop_fields, level="Feature", errors="ignore")

# ---------------------------------------------------------------------------
# 5. Normalize features by feature-wise z-scoring across the entire dataset
#    This ensures comparable scale across assets and features
# ---------------------------------------------------------------------------
feat = (
    feat.T
    .groupby(level="Feature")
    .apply(lambda x: (x - x.mean()) / (x.std() + 1e-9))
    .T
)

# ---------------------------------------------------------------------------
# 6. Final cleanup and persistence of normalized feature matrix
# ---------------------------------------------------------------------------
feat = feat.replace([np.inf, -np.inf], np.nan).dropna(how="all")
feat.to_pickle("data/processed/etfs_2m_features_clean.pkl")
print(" Normalized features saved:", feat.shape)

# ---------------------------------------------------------------------------
# 7. Diagnostic: compute per-feature variance to verify successful normalization
# ---------------------------------------------------------------------------
# Drop extra column levels if present
if feat.columns.nlevels > 2:
    feat.columns = feat.columns.droplevel(list(range(feat.columns.nlevels - 2)))

# Rebuild MultiIndex with clear level names
feat.columns = pd.MultiIndex.from_tuples(feat.columns, names=["Ticker", "Feature"])

# Calculate average standard deviation per feature across tickers
feat_var = (
    feat.T
    .groupby(level="Feature")
    .std()
    .mean(axis=1)
    .sort_values(ascending=False)
)

print(" Feature variance summary:")
print(feat_var)


[*********************100%***********************]  10 of 10 completed
  df = df.groupby(axis=1, level=0).ffill(limit=5).bfill(limit=5)
  df["returns"] = df["Close"].pct_change()
  df["momentum_5"] = df["Close"].pct_change(5)
  df["momentum_20"] = df["Close"].pct_change(20)
  df["vol_change"] = df["Volume"].pct_change()


 Clean panel saved: (2128, 50)
 Raw feature matrix: (2108, 130)
 Normalized features saved: (2108, 80)
 Feature variance summary:
Feature
rsi              1.000000
zscore           1.000000
vol_change       1.000000
macd             1.000000
momentum_20      0.999999
momentum_5       0.999999
returns          0.999997
volatility_20    0.999995
dtype: float64


In [33]:
def compute_turbulence(r1, window=390):
    """
    Computes a turbulence index using Mahalanobis distance over a rolling window.
    This measures how unusual the current return vector is relative to its recent history.
    """

    turb = []
    idx = r1.index

    # Iterate through the time series, starting after the initial window
    for i in range(window, len(r1)):
        # Extract historical return window
        hist = r1.iloc[i - window:i]

        # Compute historical mean and covariance matrix
        mu = hist.mean().values
        cov = hist.cov().values

        # Skip periods where covariance matrix is singular or ill-conditioned
        if np.linalg.det(cov) <= 0:
            turb.append(np.nan)
            continue

        # Compute Mahalanobis distance between current returns and historical distribution
        diff = r1.iloc[i].values - mu
        m_dist = np.sqrt(diff.T @ np.linalg.inv(cov) @ diff)
        turb.append(m_dist)

    # Align turbulence series with original index
    turb = pd.Series(turb, index=idx[window:])
    return turb.reindex(idx)


In [28]:
%%writefile src/utils/costs.py
import numpy as np
import pandas as pd

def spread_proxy(high, low):
    """
    Estimates the bid-ask half-spread using high–low ranges.
    Acts as a proxy when direct spread data is unavailable.
    """
    rng = (high - low).clip(lower=0)
    mid = 0.5 * (high + low)
    sp = (rng / mid).fillna(0.0)
    return 0.25 * sp  # Scales the range to approximate half-spread

def realized_vol(close, win=60):
    """
    Computes rolling realized volatility based on percentage returns.
    Uses a simple standard deviation over a specified window.
    """
    r = close.pct_change()
    return r.rolling(win).std().fillna(0.0)

def participation(dollar_trade, price, volume):
    """
    Calculates participation rate relative to the dollar volume of the bar.
    Used to cap trading volume relative to available liquidity.
    """
    dollar_bar = (price * volume).replace(0, np.nan)
    return (dollar_trade / dollar_bar).fillna(0.0).clip(lower=0.0)

def exec_price(side, mid, half_spread, sigma, dt_min, part, k=0.6, lam=2e-4):
    """
    Models execution price with drift and impact components.
    - side: trade direction (+1 buy, -1 sell, 0 no trade)
    - mid: current mid price
    - half_spread: estimated half-spread
    - sigma: local volatility estimate
    - dt_min: time increment in minutes
    - part: participation rate
    - k: drift scaling coefficient
    - lam: impact parameter
    """
    drift = k * sigma * np.sqrt(max(dt_min, 1.0))
    impact = lam * (part ** 2)
    if side > 0:
        px = mid * (1.0 + half_spread + drift + impact)
    elif side < 0:
        px = mid * (1.0 - half_spread - drift - impact)
    else:
        px = mid
    return px

def turnover_l1(w_new, w_old):
    """
    L1 turnover penalty: sum of absolute changes in portfolio weights.
    Used to penalize excessive rebalancing.
    """
    return np.abs(w_new - w_old).sum()



Overwriting src/utils/costs.py


In [29]:
%%writefile src/env/portfolio_env.py
import numpy as np
import pandas as pd
from src.utils.costs import spread_proxy, realized_vol, participation, exec_price, turnover_l1

class PortfolioEnv:
    def __init__(
        self,
        prices,
        freq_min=1,
        start_equity=1_000_000,
        part_cap=0.05,
        k=0.6,
        lam=2e-4,
        gamma_bar=12,
        eta_turnover=0.001
    ):
        # Store raw price panel and extract core fields
        self.prices = prices
        self.close = prices.loc[:, pd.IndexSlice[:, "Close"]]
        self.high  = prices.loc[:, pd.IndexSlice[:, "High"]]
        self.low   = prices.loc[:, pd.IndexSlice[:, "Low"]]
        self.vol   = prices.loc[:, pd.IndexSlice[:, "Volume"]]
        self.mid   = self.close

        # Precompute spread and volatility proxies used in execution cost and risk models
        self.half_spread = spread_proxy(self.high, self.low)
        self.sigma = realized_vol(self.close, win=60)

        # Core trading and cost parameters
        self.part_cap = part_cap
        self.k = k
        a = lam
        self.lam = a
        self.freq_min = freq_min
        self.gamma = gamma_bar / (252 * 390)   # Convert annualized gamma to per-minute scale
        self.eta_turnover = eta_turnover
        self.start_equity = start_equity
        self.tickers = self.close.columns.get_level_values(0).unique()

        self.reset()

    # ------------------------------------------------------
    # RESET
    # ------------------------------------------------------
    def reset(self):
        # Initialize environment state: starting equity, positions, weights, and time index
        self.t = 1
        self.equity = float(self.start_equity)
        self.w = np.zeros(len(self.tickers))
        self.q = np.zeros(len(self.tickers))        # positions in shares
        self.cash = float(self.start_equity)
        return self._obs()

    # ------------------------------------------------------
    # OBSERVATION
    # ------------------------------------------------------
    def _obs(self):
        # Expose weights and current equity as the observation state
        return {"weights": self.w.copy(), "equity": float(self.equity)}

    # ------------------------------------------------------
    # STEP
    # ------------------------------------------------------
    def step(self, w_target):
        # Clamp and normalize target weights to ensure valid portfolio allocation
        w_target = np.array(w_target).flatten()
        w_target = np.clip(w_target, 0, 1)
        if w_target.sum() > 0:
            w_target /= w_target.sum()

        # Current and next index for price lookup
        idx = self.close.index[self.t]
        idx_next = self.close.index[min(self.t + 1, len(self.close) - 1)]

        # Retrieve current price and liquidity state, replacing bad values
        mid = np.nan_to_num(self.mid.loc[idx].values, nan=0.0, posinf=0.0, neginf=0.0)
        hs  = np.nan_to_num(self.half_spread.loc[idx].values, nan=0.0, posinf=0.0, neginf=0.0)
        sg  = np.nan_to_num(self.sigma.loc[idx].values, nan=1e-6, posinf=1e-6, neginf=1e-6)
        vol = np.nan_to_num(self.vol.loc[idx].values,  nan=1.0, posinf=1.0, neginf=1.0)

        # Calculate current portfolio value based on mid-prices
        dollar_pos_now = self.q * mid
        port_now = float(self.cash + np.sum(dollar_pos_now))
        self.equity = max(port_now, 1.0)

        # Compute target vs current weight difference
        delta_w = w_target - self.w

        # Convert target weight change into dollar notional and cap participation
        desired_notional = delta_w * self.equity
        part = participation(pd.Series(np.abs(desired_notional)), pd.Series(mid), pd.Series(vol)).values
        part = np.nan_to_num(np.clip(part, 0, self.part_cap), nan=0.0)
        feasible_notional = desired_notional * part
        signed_shares = np.nan_to_num(feasible_notional / np.maximum(mid, 1e-12))

        # Determine execution prices incorporating spread, drift, and impact
        exec_px = np.array([
            exec_price(np.sign(dw), m, h, sgm, self.freq_min, p, self.k, self.lam)
            for dw, m, h, sgm, p in zip(delta_w, mid, hs, sg, part)
        ])
        exec_px = np.nan_to_num(exec_px, nan=mid, posinf=mid, neginf=mid)

        # Apply cash impact of trades and update share positions
        trade_cash_flow = float(np.sum(signed_shares * exec_px))
        self.cash -= trade_cash_flow
        self.q += signed_shares

        # Revalue portfolio using next prices
        next_px = np.nan_to_num(self.mid.loc[idx_next].values, nan=mid, posinf=mid, neginf=mid)
        portfolio_value = float(self.cash + np.sum(self.q * next_px))
        self.equity = max(portfolio_value, 1.0)

        # Update portfolio weights based on new equity and prices
        dollar_pos_next = self.q * next_px
        self.w = np.clip(np.nan_to_num(dollar_pos_next / np.maximum(self.equity, 1e-12)), 0, 1)

        # Compute risk penalty based on instantaneous variance estimate
        var_diag = np.nan_to_num((self.sigma.loc[idx] ** 2).values, nan=0.0)
        risk_pen = 0.5 * self.gamma * float(np.dot(self.w, var_diag * self.w))

        # Compute turnover penalty relative to pre-trade weights
        prev_w = np.nan_to_num(dollar_pos_now / np.maximum(port_now, 1e-12))
        tvr = turnover_l1(self.w, prev_w)
        tvr_pen = self.eta_turnover * tvr

        # Calculate realized return using price change between previous and next ticks
        prev_idx = self.close.index[self.t - 1]
        r_bar = np.nan_to_num(
            self.mid.loc[idx_next].values / np.maximum(self.mid.loc[prev_idx].values, 1e-12) - 1.0,
            nan=0.0,
        )
        pnl_ret = float(np.dot(self.w, r_bar))

        # Add small drift penalty to discourage frequent allocation shifts
        drift_pen = np.clip(1e-5 * np.square(delta_w).sum(), 0, 1e-3)

        # Aggregate reward components and apply squashing for stability
        raw = pnl_ret - risk_pen - tvr_pen - drift_pen
        reward = float(np.clip(np.tanh(raw * 20.0), -1.0, 1.0))

        info = {
            "pnl_ret": pnl_ret,
            "risk_pen": risk_pen,
            "tvr_pen": tvr_pen,
            "equity": float(self.equity)
        }

        # Advance simulation clock and detect terminal state
        self.t += 1
        done = self.t >= len(self.close) - 1

        return self._obs(), reward, done, info


Overwriting src/env/portfolio_env.py


In [30]:
%%writefile src/env/portfolio_gym.py
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from src.env.portfolio_env import PortfolioEnv

class PortfolioGym(gym.Env):
    metadata = {"render.modes": ["human"]}

    def __init__(self, prices, feature_df=None, start_equity=1_000_000):
        super().__init__()
        # Core environment handles portfolio mechanics and reward calculation
        self.core = PortfolioEnv(prices, start_equity=start_equity)
        self.feature_df = feature_df
        self.tickers = self.core.tickers
        self.n_assets = len(self.tickers)

        # Observation vector consists of equity, portfolio weights, and optional features
        feat_dim = 0 if feature_df is None else feature_df.shape[1]
        obs_dim = 1 + self.n_assets + feat_dim
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)

        # Action space corresponds to target portfolio weights for each asset
        self.action_space = spaces.Box(low=0.0, high=1.0, shape=(self.n_assets,), dtype=np.float32)

    def reset(self, *, seed=None, options=None):
        # Reset both the wrapper and the underlying portfolio environment
        super().reset(seed=seed)
        obs_core = self.core.reset()
        obs = self._build_obs(obs_core)
        return obs.astype(np.float32), {}

    def _build_obs(self, core_obs):
        # Construct the observation vector at the current timestep
        equity = float(core_obs["equity"])
        w = np.array(core_obs["weights"], dtype=np.float32)

        if self.feature_df is not None:
            # Align features with current environment timestep
            t = min(self.core.t, len(self.feature_df) - 1)
            f = self.feature_df.iloc[t].to_numpy(dtype=np.float32)
            f = np.nan_to_num(f, nan=0.0, posinf=0.0, neginf=0.0)
            f = (f - np.mean(f)) / (np.std(f) + 1e-8)
            obs = np.concatenate(([equity], w, f))
        else:
            obs = np.concatenate(([equity], w))

        return np.nan_to_num(obs, nan=0.0, posinf=0.0, neginf=0.0)

    def step(self, action):
        # Delegate trade execution and reward calculation to the core environment
        obs_core, reward, done, info = self.core.step(action)

        # Replace non-finite rewards to avoid propagation of numerical errors
        if not np.isfinite(reward):
            print(f"[Warning] Non-finite reward at t={self.core.t}: {reward}")
            reward = 0.0

        obs = self._build_obs(obs_core)

        # Replace NaNs in observation with zeros to maintain numerical stability
        if np.isnan(obs).any():
            print(f"[NaN detected in obs] t={self.core.t}, replacing with zeros.")
            obs = np.nan_to_num(obs, nan=0.0, posinf=0.0, neginf=0.0)

        if np.isnan(reward):
            reward = 0.0

        # Clip extreme values to avoid destabilizing the learning process
        obs = np.clip(obs, -1e6, 1e6)
        reward = float(np.clip(reward, -1e3, 1e3))

        # Gymnasium interface compatibility: separate terminated and truncated flags
        terminated = bool(done)
        truncated = False

        return obs.astype(np.float32), reward, terminated, truncated, info

    def render(self):
        # Basic render method for quick debugging or logging
        print(f"Step {self.core.t}, Equity {self.core.equity:.2f}")


Overwriting src/env/portfolio_gym.py


In [31]:
%%writefile src/utils/features.py

import pandas as pd
import numpy as np

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Enriches a single-ticker price dataframe with a set of standard technical indicators.
    Assumes columns: ['Open', 'High', 'Low', 'Close', 'Volume'].
    """
    df = df.copy()

    # Basic return and volatility metrics
    df["returns"] = df["Close"].pct_change()
    df["volatility_20"] = df["returns"].rolling(20).std()

    # Momentum indicators over short and medium windows
    df["momentum_5"] = df["Close"].pct_change(5)
    df["momentum_20"] = df["Close"].pct_change(20)

    # Relative Strength Index (RSI) over a 14-period lookback
    delta = df["Close"].diff()
    gain = (delta.where(delta > 0, 0)).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / (loss + 1e-9)
    df["rsi"] = 100 - (100 / (1 + rs))

    # Moving Average Convergence Divergence (MACD)
    ema12 = df["Close"].ewm(span=12, adjust=False).mean()
    ema26 = df["Close"].ewm(span=26, adjust=False).mean()
    df["macd"] = ema12 - ema26

    # Z-score of price relative to a 20-period rolling mean
    df["zscore"] = (df["Close"] - df["Close"].rolling(20).mean()) / (df["Close"].rolling(20).std() + 1e-9)

    # Volume change as a percentage
    df["vol_change"] = df["Volume"].pct_change()

    # Drop initial NaNs introduced by rolling calculations
    return df.dropna()

def build_all_features(panel):
    """
    Applies the add_features function to each ticker in a multi-index price panel.
    Returns a feature panel with a two-level column index: (Ticker, Feature).
    """
    tickers = panel.columns.get_level_values(0).unique()
    feat_list = []

    # Iterate over tickers and compute features individually
    for t in tickers:
        df_t = panel[t].copy()
        df_feat = add_features(df_t)
        feat_list.append(df_feat)

    # Combine feature dataframes for all tickers into a single multi-index dataframe
    feat = pd.concat(feat_list, axis=1, keys=tickers)
    return feat


Overwriting src/utils/features.py


In [34]:
%%writefile src/env/portfolio_env.py
import numpy as np
import pandas as pd
from src.utils.costs import spread_proxy, realized_vol, participation, exec_price, turnover_l1

class PortfolioEnv:
    def __init__(
        self,
        prices,
        freq_min=1,
        start_equity=1_000_000,
        part_cap=0.05,
        k=0.6,
        lam=2e-4,
        gamma_bar=11.0,
        eta_turnover=0.001
    ):
        self.prices = prices
        self.close = prices.loc[:, pd.IndexSlice[:, "Close"]]
        self.high = prices.loc[:, pd.IndexSlice[:, "High"]]
        self.low = prices.loc[:, pd.IndexSlice[:, "Low"]]
        self.vol = prices.loc[:, pd.IndexSlice[:, "Volume"]]
        self.mid = self.close

        # Cost and risk model components
        self.half_spread = spread_proxy(self.high, self.low)
        self.sigma = realized_vol(self.close, win=60)

        self.part_cap = part_cap
        self.k = k
        self.lam = lam
        self.freq_min = freq_min
        self.gamma = gamma_bar / (252 * 390)  # annualize risk cost
        self.eta_turnover = eta_turnover
        self.start_equity = start_equity
        self.tickers = self.close.columns.get_level_values(0).unique()

        self.reset()

    # ------------------------------------------------------
    # RESET
    # ------------------------------------------------------
    def reset(self):
        self.t = 1
        self.equity = float(self.start_equity)
        self.w = np.zeros(len(self.tickers))
        self.cash = self.equity
        return self._obs()

    # ------------------------------------------------------
    # OBSERVATION
    # ------------------------------------------------------
    def _obs(self):
        return {"weights": self.w.copy(), "equity": float(self.equity)}

    # ------------------------------------------------------
    # STEP
    # ------------------------------------------------------
    def step(self, w_target):
        # --- Safety clamp ---
        w_target = np.array(w_target).flatten()
        w_target = np.clip(w_target, 0, 1)
        if w_target.sum() > 0:
            w_target /= w_target.sum()

        # --- Time + current prices ---
        idx = self.close.index[self.t]
        idx_next = self.close.index[min(self.t + 1, len(self.close) - 1)]

        mid = np.nan_to_num(self.mid.loc[idx].values, nan=0.0, posinf=0.0, neginf=0.0)
        hs = np.nan_to_num(self.half_spread.loc[idx].values, nan=0.0, posinf=0.0, neginf=0.0)
        sg = np.nan_to_num(self.sigma.loc[idx].values, nan=1e-6, posinf=1e-6, neginf=1e-6)
        vol = np.nan_to_num(self.vol.loc[idx].values, nan=1.0, posinf=1.0, neginf=1.0)

        # --- Compute trade ---
        delta_w = w_target - self.w
        dollar_trade = np.abs(delta_w) * self.equity
        part = participation(pd.Series(dollar_trade), pd.Series(mid), pd.Series(vol)).values
        part = np.nan_to_num(np.clip(part, 0, self.part_cap), nan=0.0)
        side = np.sign(delta_w)

        exec_px = np.array([
            exec_price(s, m, h, sgm, self.freq_min, p, self.k, self.lam)
            for s, m, h, sgm, p in zip(side, mid, hs, sg, part)
        ])
        exec_px = np.nan_to_num(exec_px, nan=mid, posinf=mid, neginf=mid)

        # --- Trade cash impact ---
        shares = np.nan_to_num((np.abs(delta_w) * self.equity) / np.maximum(mid, 1e-12))
        trade_cash = np.sum(shares * exec_px * side)

        # --- Update positions ---
        dollar_pos = self.w * self.equity
        dollar_pos += shares * exec_px * side
        self.equity = np.maximum(self.equity - np.abs(trade_cash), 1.0)
        self.cash = max(self.start_equity - np.sum(dollar_pos), 0.0)

        next_px = np.nan_to_num(self.mid.loc[idx_next].values, nan=mid, posinf=mid, neginf=mid)
        dollar_pos = dollar_pos * (next_px / np.maximum(mid, 1e-12))
        self.equity = float(np.nan_to_num(np.sum(dollar_pos) + self.cash, nan=self.start_equity))
        self.w = np.clip(np.nan_to_num(dollar_pos / np.maximum(self.equity, 1e-12)), 0, 1)

        # --- Penalties ---
        var_diag = np.nan_to_num((self.sigma.loc[idx] ** 2).values, nan=0.0)
        risk_pen = 0.5 * self.gamma * float(np.dot(self.w, var_diag * self.w))
        tvr = turnover_l1(self.w, self.w - delta_w)
        tvr_pen = self.eta_turnover * tvr

        # --- Reward ---
        prev_idx = self.close.index[self.t - 1]
        r_bar = np.nan_to_num(
            self.mid.loc[idx_next].values / np.maximum(self.mid.loc[prev_idx].values, 1e-12) - 1.0,
            nan=0.0,
        )
        pnl_ret = float(np.dot(self.w, r_bar))

        drift_pen = 0.001 * np.square(delta_w).sum()

        # --- Reward decomposition ---
        # Scale pnl by equity to get relative return per step
        step_ret = pnl_ret

        # Risk penalty scaled down: risk cost shouldn't dwarf pnl
        risk_pen_scaled = 0.1 * risk_pen

        # Turnover penalty usually small relative to pnl
        tvr_pen_scaled = tvr_pen

        # Drift penalty already tiny
        drift_pen_scaled = drift_pen

        # Raw reward
        raw_reward = step_ret - risk_pen_scaled - tvr_pen_scaled - drift_pen_scaled

        # Normalize by a typical step volatility to keep magnitude reasonable
        reward_norm = raw_reward / (np.std([step_ret, risk_pen_scaled, tvr_pen_scaled, 1e-6]) + 1e-6)

        # Soft clipping to avoid tanh saturation
        reward = float(np.clip(reward_norm, -5.0, 5.0))


        # --- Stability guards ---
        reward = float(np.nan_to_num(reward, nan=0.0, posinf=0.0, neginf=0.0))
        self.equity = float(np.nan_to_num(self.equity, nan=self.start_equity, posinf=self.start_equity, neginf=self.start_equity))

        # Scale reward
        

        info = {
            "pnl_ret": pnl_ret,
            "risk_pen": risk_pen,
            "tvr_pen": tvr_pen,
            "equity": float(self.equity)
        }

        self.t += 1
        done = self.t >= len(self.close) - 1
        return self._obs(), reward, done, info


Overwriting src/env/portfolio_env.py
