In [None]:
"""
Multi-asset PPO trading setup

Assumptions:
- prices: np.ndarray of shape (T, N_assets)
- features: np.ndarray of shape (T, F), already aligned with prices by time
- At each step t:
    * Agent observes state s_t = [features_t, position_pct, cash_pct]
    * Agent outputs a_t in [-1,1]^N_assets (one PPO output per asset)
    * We follow your mapping:
        E_t = cash_t + sum(p_i * S_i_t)
        For N assets, each asset gets E_t / N as base scale:
            V_i = (E_t / N) * a_i
            N_i_target = V_i / S_i_t
            Delta_i = N_i_target - p_i (trade size)
    * Reward is portfolio return from t to t+1 minus transaction costs
"""

import numpy as np
import gymnasium as gym
from gymnasium import spaces

import torch
import torch.nn as nn
from torch.distributions import Normal


# ==========================
# 1. Trading Environment (gymnasium style)
# ==========================

class MultiAssetTradingEnv(gym.Env):
    """
    FinRL-style multi-asset trading environment with continuous actions,
    following the order-size mapping described in ``step``.

    State:  [features_t, position_value_pct (one per asset), cash_pct]
    Action: Box[-1, 1] of size N_assets
    Reward: simple portfolio return from t to t+1, measured after the trades
            at t have been executed and their transaction costs deducted.

    Gymnasium API:
        reset() -> obs, info
        step(action) -> obs, reward, terminated, truncated, info

    Numerical note:
        Cash, equity and share counts are tracked in float64.  Only the
        observation (and the ``positions`` entry of ``info``) is exposed as
        float32.  Keeping the books in float32 rounds balances of ~1e5 at the
        ~1e-2 level on every step, which is the same order of magnitude as the
        transaction costs being modelled.
    """

    metadata = {"render_modes": ["human"]}

    def __init__(
        self,
        prices: np.ndarray,
        features: np.ndarray,
        initial_cash: float = 100000.0,
        transaction_cost: float = 0.0005,  # 5 bps per dollar traded
        render_mode: str | None = None,
    ):
        """
        Args:
            prices: array of shape (T, N_assets) with one price per asset and step.
            features: array of shape (T, F), aligned with ``prices`` on axis 0.
            initial_cash: starting cash balance.
            transaction_cost: proportional cost charged on absolute traded notional.
            render_mode: stored on the instance; ``render`` always prints.

        Raises:
            ValueError: if ``prices`` or ``features`` is not 2-D.
            AssertionError: if the two arrays differ in length along axis 0.
        """
        super().__init__()

        prices = np.asarray(prices)
        features = np.asarray(features)
        if prices.ndim != 2 or features.ndim != 2:
            raise ValueError(
                "prices must have shape (T, N_assets) and features shape (T, F); "
                f"got {prices.shape} and {features.shape}"
            )

        assert prices.shape[0] == features.shape[0], \
            "prices and features must have the same time dimension"

        self.prices = prices.astype(np.float32)
        self.features = features.astype(np.float32)

        self.T, self.n_assets = self.prices.shape
        self.feature_dim = self.features.shape[1]

        self.initial_cash = float(initial_cash)
        self.transaction_cost = float(transaction_cost)
        self.render_mode = render_mode

        # Action: continuous allocation signals in [-1,1] for each asset
        self.action_space = spaces.Box(
            low=-1.0,
            high=1.0,
            shape=(self.n_assets,),
            dtype=np.float32,
        )

        # Observation: features + position percentage per asset + cash percentage
        self.obs_dim = self.feature_dim + self.n_assets + 1
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.obs_dim,),
            dtype=np.float32,
        )

        # Internal state
        self._reset_internal_state()

    # ---------- internal helpers ----------

    def _reset_internal_state(self):
        """Return to t=0 with no holdings and the full initial cash balance."""
        self.t = 0  # current time index
        # Share counts are kept in float64 so repeated rebalancing does not
        # accumulate float32 rounding error.
        self.positions = np.zeros(self.n_assets, dtype=np.float64)  # shares
        self.cash = float(self.initial_cash)
        self.equity = float(self.initial_cash)

    def _get_observation(self) -> np.ndarray:
        """
        Build the float32 observation vector at the current time index self.t:
        [features_t, position_value_pct, cash_pct]
        """
        features_t = self.features[self.t]  # shape (F,)
        price_t = self.prices[self.t]       # shape (N_assets,)

        position_values = self.positions * price_t  # dollar value per asset
        total_equity = self.equity + 1e-8  # epsilon guards against division by zero

        position_pct = position_values / total_equity  # shape (N_assets,)
        cash_pct = np.array([self.cash / total_equity], dtype=np.float32)

        obs = np.concatenate(
            [features_t, position_pct.astype(np.float32), cash_pct],
            axis=0
        )
        return obs.astype(np.float32)

    # ---------- gymnasium API ----------

    def reset(self, *, seed: int | None = None, options: dict | None = None):
        """
        Gymnasium reset: returns (obs, info) for t=0 with an empty info dict.
        """
        super().reset(seed=seed)
        self._reset_internal_state()
        obs = self._get_observation()
        info = {}
        return obs, info

    def step(self, action):
        """
        One trading step:
        - Use current prices S_t and action a_t to compute desired target positions:
            E_t = cash + sum(p_i * S_i_t)
            For general N assets:
                base_scale = E_t / N
                V_i = base_scale * a_i
                N_i_target = V_i / S_i_t
                Delta_i = N_i_target - p_i
        - Apply trades, charge transaction cost on traded notional
        - Move to next time t+1, compute portfolio return as reward

        Returns:
            (obs, reward, terminated, truncated, info); ``terminated`` becomes
            True once the last price row has been reached.

        Raises:
            RuntimeError: if called when no price row exists for t+1, i.e.
                after the episode has terminated and before ``reset()``.
        """
        # Fail before touching any state.  Without this guard the trade would
        # be booked and the observation lookup would then fail with an opaque
        # IndexError, leaving cash/positions already modified.
        if self.t + 1 >= self.T:
            raise RuntimeError(
                "step() called with no remaining price data; call reset() first"
            )

        # Clip action into [-1,1]; do the arithmetic in float64
        action = np.clip(np.asarray(action, dtype=np.float64), -1.0, 1.0)
        price_t = self.prices[self.t].astype(np.float64)        # prices at time t
        price_tp1 = self.prices[self.t + 1].astype(np.float64)  # prices at time t+1

        # 1. Compute current equity before trades
        current_position_value = float(np.sum(self.positions * price_t))
        self.equity = self.cash + current_position_value

        # 2. Mapping: compute target dollar exposure and shares
        base_scale = self.equity / float(self.n_assets)  # E / N
        target_values = base_scale * action  # V_i = E/N * a_i

        # Convert target dollar values to target positions (in shares).
        # Assets with a non-positive price get a zero target.
        target_positions = np.where(
            price_t > 0,
            target_values / price_t,
            0.0,
        )

        # Trade size (Delta_i)
        trade_shares = target_positions - self.positions  # + buy, - sell

        # 3. Apply trades, charge transaction cost
        trade_values = trade_shares * price_t  # signed trade notional per asset
        # total dollar turnover (absolute)
        dollar_turnover = float(np.sum(np.abs(trade_values)))
        transaction_costs = self.transaction_cost * dollar_turnover

        # Update cash and positions:
        self.cash = float(self.cash - np.sum(trade_values) - transaction_costs)
        self.positions = self.positions + trade_shares

        # 4. Move to t+1 and compute new portfolio value
        new_position_value = float(np.sum(self.positions * price_tp1))
        new_equity = self.cash + new_position_value

        # Reward: simple portfolio return from t to t+1 (costs already deducted
        # from cash, so they are reflected in new_equity)
        reward = (new_equity - self.equity) / (self.equity + 1e-8)

        # Update equity and time
        self.equity = new_equity
        self.t += 1

        # Episode termination: reached last usable time index
        terminated = self.t >= self.T - 1
        truncated = False  # max_steps-style truncation could be added here; none for now

        obs = self._get_observation()
        info = {
            "equity": self.equity,
            "cash": self.cash,
            # exposed as a float32 copy, matching the observation dtype
            "positions": self.positions.astype(np.float32),
        }

        # Gymnasium: (obs, reward, terminated, truncated, info)
        return obs, float(reward), terminated, truncated, info

    def render(self):
        """Print the current time index, equity, cash and share positions."""
        print(
            f"t={self.t}, equity={self.equity:.2f}, cash={self.cash:.2f}, "
            f"positions={self.positions}"
        )


# ==========================
# 2. PPO Agent
# ==========================

class PolicyValueNet(nn.Module):
    """
    Actor-critic network: a shared ReLU MLP trunk feeding two linear heads.

    * ``mu_head`` gives the per-dimension action mean, squashed with tanh.
    * ``log_std`` is a state-independent learnable log standard deviation.
    * ``value_head`` gives a scalar state-value estimate.

    The action distribution is a diagonal Normal; callers clip samples to
    [-1, 1] themselves.
    """

    def __init__(self, obs_dim, action_dim, hidden_sizes=(128, 128)):
        super().__init__()

        # widths[i] -> widths[i + 1] describes the i-th hidden Linear layer.
        widths = [obs_dim, *hidden_sizes]
        trunk = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            trunk.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        self.base = nn.Sequential(*trunk)

        # With no hidden layers the heads read the raw observation.
        trunk_out = widths[-1]
        self.mu_head = nn.Linear(trunk_out, action_dim)
        self.log_std = nn.Parameter(torch.zeros(action_dim))
        self.value_head = nn.Linear(trunk_out, 1)

    def forward(self, x):
        """Return ``(mean, std, value)`` for a batch of observations ``x``."""
        hidden = self.base(x)
        # tanh keeps the mean inside [-1, 1]; samples may still fall outside.
        mean = torch.tanh(self.mu_head(hidden))
        scale = torch.exp(self.log_std)
        state_value = self.value_head(hidden).squeeze(-1)
        return mean, scale, state_value

    def get_dist_and_value(self, obs):
        """Return the Normal action distribution and the value estimate for ``obs``."""
        mean, scale, state_value = self.forward(obs)
        return Normal(mean, scale), state_value


class PPOAgent:
    """
    Minimal PPO (clipped surrogate objective) agent around ``PolicyValueNet``.

    Responsibilities:
        * ``act``         - sample one action for one observation.
        * ``compute_gae`` - turn a rollout into GAE(lambda) advantages and returns.
        * ``update``      - run several epochs of minibatch PPO updates.
    """

    def __init__(
        self,
        obs_dim,
        action_dim,
        lr=3e-4,
        gamma=0.99,
        lam=0.95,
        clip_ratio=0.2,
        vf_coef=0.5,
        ent_coef=0.0,
        max_grad_norm=0.5,
        device=None,
    ):
        """
        Args:
            obs_dim: observation vector length.
            action_dim: number of action dimensions.
            lr: Adam learning rate.
            gamma: discount factor.
            lam: GAE lambda.
            clip_ratio: PPO ratio clipping half-width.
            vf_coef: weight of the value loss in the total loss.
            ent_coef: weight of the entropy bonus in the total loss.
            max_grad_norm: global gradient-norm clip applied before each step.
            device: torch device; defaults to CUDA when available, else CPU.
        """
        self.device = device or (
            "cuda" if torch.cuda.is_available() else "cpu"
        )

        self.gamma = gamma
        self.lam = lam
        self.clip_ratio = clip_ratio
        self.vf_coef = vf_coef
        self.ent_coef = ent_coef
        self.max_grad_norm = max_grad_norm

        self.net = PolicyValueNet(obs_dim, action_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=lr)

    def act(self, obs):
        """
        Given a single observation (numpy array), return
        ``(action, log_prob, value)``.

        The returned action is clipped to [-1, 1] and ``log_prob`` is the
        policy's log-density of that same clipped action.  ``update``
        re-evaluates the stored action under the current policy, so both
        log-probs must refer to the same point; otherwise the importance ratio
        is not 1 even before any gradient step.
        """
        obs_t = torch.as_tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)
        # Rollout collection needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            dist, value = self.net.get_dist_and_value(obs_t)
            action = torch.clamp(dist.sample(), -1.0, 1.0)
            log_prob = dist.log_prob(action).sum(-1)  # sum over action dimensions
        action_np = action.squeeze(0).cpu().numpy()
        return action_np, log_prob.item(), value.item()

    def compute_gae(self, rewards, values, dones, last_value):
        """
        Generalized Advantage Estimation (GAE-Lambda).

        Args:
            rewards, values, dones: numpy arrays of length T.
            last_value: scalar bootstrap value V(s_T) for the state that
                follows the final transition.

        Returns:
            (advantages, returns), where returns = values + advantages.
        """
        T = len(rewards)
        adv = np.zeros(T, dtype=np.float32)
        last_adv = 0.0
        for t in reversed(range(T)):
            # mask is 0 at episode boundaries: no bootstrapping across them
            mask = 1.0 - float(dones[t])
            delta = rewards[t] + self.gamma * last_value * mask - values[t]
            last_adv = delta + self.gamma * self.lam * mask * last_adv
            adv[t] = last_adv
            last_value = values[t]  # becomes V(s_{t+1}) for the previous step
        returns = values + adv
        return adv, returns

    def update(self, batch, epochs=10, batch_size=64):
        """
        Run ``epochs`` passes of shuffled minibatch PPO updates.

        Args:
            batch: dict with keys
                'obs', 'actions', 'log_probs', 'returns', 'advantages'
            epochs: number of passes over the batch.
            batch_size: minibatch size (the last minibatch may be smaller).
        """
        obs = torch.as_tensor(batch["obs"], dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(batch["actions"], dtype=torch.float32, device=self.device)
        old_log_probs = torch.as_tensor(batch["log_probs"], dtype=torch.float32, device=self.device)
        returns = torch.as_tensor(batch["returns"], dtype=torch.float32, device=self.device)
        advantages = torch.as_tensor(batch["advantages"], dtype=torch.float32, device=self.device)

        # The sample standard deviation of a single element is NaN, which
        # would turn every weight into NaN.  Normalise only when there are at
        # least two samples.
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        n = obs.size(0)
        idxs = np.arange(n)

        for _ in range(epochs):
            np.random.shuffle(idxs)
            for start in range(0, n, batch_size):
                end = start + batch_size
                mb_idx = idxs[start:end]

                mb_obs = obs[mb_idx]
                mb_actions = actions[mb_idx]
                mb_old_log_probs = old_log_probs[mb_idx]
                mb_returns = returns[mb_idx]
                mb_advantages = advantages[mb_idx]

                dist, values = self.net.get_dist_and_value(mb_obs)
                new_log_probs = dist.log_prob(mb_actions).sum(-1)
                entropy = dist.entropy().sum(-1).mean()

                # Policy loss (clipped surrogate objective)
                ratio = torch.exp(new_log_probs - mb_old_log_probs)
                surr1 = ratio * mb_advantages
                surr2 = torch.clamp(
                    ratio,
                    1.0 - self.clip_ratio,
                    1.0 + self.clip_ratio
                ) * mb_advantages
                policy_loss = -torch.min(surr1, surr2).mean()

                # Value loss
                value_loss = nn.functional.mse_loss(values, mb_returns)

                # Total loss
                loss = policy_loss + self.vf_coef * value_loss - self.ent_coef * entropy

                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
                self.optimizer.step()


# ==========================
# 3. Minimal training loop (compatible with gymnasium)
# ==========================

def train_ppo_on_env(
    env: MultiAssetTradingEnv,
    agent: PPOAgent,
    total_steps: int = 50_000,
    rollout_horizon: int = 512,
):
    """
    Bare-bones on-policy PPO driver.

    Repeats "collect up to ``rollout_horizon`` transitions, compute GAE,
    run ``agent.update``" until ``total_steps`` environment steps have been
    taken.  The environment is reset whenever an episode ends mid-rollout.
    Progress is printed after every update; add logging or checkpointing
    as needed.
    """
    obs, info = env.reset()
    step_count = 0

    while step_count < total_steps:
        # Fresh per-rollout storage, one list per recorded quantity.
        rollout = {name: [] for name in ("obs", "act", "logp", "rew", "val", "done")}

        collected = 0
        while collected < rollout_horizon and step_count < total_steps:
            action, logp, value = agent.act(obs)
            next_obs, reward, terminated, truncated, info = env.step(action)
            episode_over = terminated or truncated

            rollout["obs"].append(obs)
            rollout["act"].append(action)
            rollout["logp"].append(logp)
            rollout["rew"].append(reward)
            rollout["val"].append(value)
            rollout["done"].append(episode_over)

            collected += 1
            step_count += 1

            # Continue from a fresh episode if this one just finished.
            if episode_over:
                obs, info = env.reset()
            else:
                obs = next_obs

        # Bootstrap value for the state following the last stored transition.
        with torch.no_grad():
            final_obs = torch.as_tensor(
                obs, dtype=torch.float32, device=agent.device
            ).unsqueeze(0)
            _, bootstrap = agent.net.get_dist_and_value(final_obs)
            last_value = bootstrap.item()

        rewards = np.array(rollout["rew"], dtype=np.float32)
        values = np.array(rollout["val"], dtype=np.float32)
        dones = np.array(rollout["done"], dtype=bool)
        adv, ret = agent.compute_gae(rewards, values, dones, last_value)

        agent.update(
            {
                "obs": np.array(rollout["obs"], dtype=np.float32),
                "actions": np.array(rollout["act"], dtype=np.float32),
                "log_probs": np.array(rollout["logp"], dtype=np.float32),
                "returns": ret,
                "advantages": adv,
            }
        )

        print(f"Trained up to step {step_count}")

    print("Training finished.")




In [6]:
# ==========================
# Example usage with dummy data
# ==========================

if __name__ == "__main__":
    # Seed every random source the run touches, not just the data generator:
    #   * torch       - network initialisation and action sampling
    #   * np.random   - minibatch shuffling inside PPOAgent.update
    #   * default_rng - the synthetic prices/features below
    SEED = 42
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    # Example: 5 ETFs, 1000 days, 20 signals per day
    T = 1000
    N_assets = 5
    F = 20

    # Dummy price paths (geometric random walk starting near 100)
    rng = np.random.default_rng(SEED)
    log_returns = rng.normal(0, 0.01, size=(T, N_assets))
    prices = 100 * np.exp(np.cumsum(log_returns, axis=0)).astype(np.float32)

    # Dummy features (here just some random numbers; in your project use real signals)
    features = rng.normal(size=(T, F)).astype(np.float32)

    env = MultiAssetTradingEnv(
        prices=prices,
        features=features,
        initial_cash=100000.0,
        transaction_cost=0.0005,
    )
    ppo_agent = PPOAgent(
        obs_dim=env.obs_dim,
        action_dim=env.n_assets,
        lr=3e-4,
        gamma=0.99,
        lam=0.95,
        clip_ratio=0.2,
    )

    train_ppo_on_env(env, ppo_agent, total_steps=5000, rollout_horizon=256)

Trained up to step 256
Trained up to step 512
Trained up to step 768
Trained up to step 1024
Trained up to step 1280
Trained up to step 1536
Trained up to step 1792
Trained up to step 2048
Trained up to step 2304
Trained up to step 2560
Trained up to step 2816
Trained up to step 3072
Trained up to step 3328
Trained up to step 3584
Trained up to step 3840
Trained up to step 4096
Trained up to step 4352
Trained up to step 4608
Trained up to step 4864
Trained up to step 5000
Training finished.


In [1]:
! pip install numpy pandas matpolib

Collecting numpy
  Downloading numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp310-cp310-win_amd64.whl.metadata (19 kB)


ERROR: Ignored the following versions that require a different python version: 2.3.0 Requires-Python >=3.11; 2.3.1 Requires-Python >=3.11; 2.3.2 Requires-Python >=3.11; 2.3.3 Requires-Python >=3.11; 2.3.4 Requires-Python >=3.11; 2.3.5 Requires-Python >=3.11; 2.4.0rc1 Requires-Python >=3.11; 3.0.0rc0 Requires-Python >=3.11
ERROR: Could not find a version that satisfies the requirement matpolib (from versions: none)
ERROR: No matching distribution found for matpolib


In [2]:
! pip install numpy
! pip install "gymnasium>=0.29.0"
! pip install "torch>=2.2.0"


Collecting numpy
  Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)
Downloading numpy-2.2.6-cp310-cp310-win_amd64.whl (12.9 MB)
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
    --------------------------------------- 0.3/12.9 MB ? eta -:--:--
   -- ------------------------------------- 0.8/12.9 MB 2.6 MB/s eta 0:00:05
   ---- ----------------------------------- 1.6/12.9 MB 3.4 MB/s eta 0:00:04
   -------- ------------------------------- 2.6/12.9 MB 3.7 MB/s eta 0:00:03
   ---------- ----------------------------- 3.4/12.9 MB 3.7 MB/s eta 0:00:03
   -------------- ------------------------- 4.7/12.9 MB 4.1 MB/s eta 0:00:02
   ------------------ --------------------- 6.0/12.9 MB 4.6 MB/s eta 0:00:02
   ---------------------- ----------------- 7.3/12.9 MB 4.7 MB/s eta 0:00:02
   -------------------------- ------------- 8.4/12.9 MB 4.7 MB/s eta 0:00:01
   ----------------------------- ---------- 9.4/12.9 MB 4.9 MB/s eta 0:00:01
   -----------------

In [4]:
! pip install pandas matplotlib


Collecting pandas
  Using cached pandas-2.3.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.7-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp310-cp310-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.61.0-cp310-cp310-win_amd64.whl.metadata (115 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp310-cp310-win_amd64.whl.metadata (6.4 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-12.0.0-cp310-cp310-win_amd64.whl.metadata (9.0 kB)
Collecting pyparsing>=3 (