# Toy version

In [1]:
import gym
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from gym import spaces
from gym_trading_env.environments import TradingEnv

from tqdm import trange, tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
df = pd.read_csv('data/INTC_1Min_2023-08-01_2024-01-31.csv')
df.set_index('timestamp', inplace=True)

In [4]:
class POMDPTEnv(TradingEnv):
    def __init__(self, df, window_size=60, initial_balance=100_000,transaction_cost=2.3e-5, slippage=0.2, eta=0.01, alpha=0, beta=0):
        super().__init__(df=df)
        
        self.df = df
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.transaction_cost = transaction_cost
        self.slippage = slippage

        self.observation_space = spaces.Box(
            low=-np.inf, 
            high=np.inf, 
            shape=(4 + 4 * window_size,), # OHLCV + 2 indicators + account
            )
        self.action_space = spaces.Discrete(3) # buy or sell

        # Reward variables
        self.eta = eta
        self.alpha = alpha
        self.beta = beta

        # Initialize
        self.vectorize()
        self.reset()
        
    def _compute_dual_thrust(self, k1=0.3, k2=0.3):
        idx = self.current_step - 1

        hh = self.high_rolling_max[idx]
        hc = self.close_rolling_max[idx]
        lc = self.close_rolling_min[idx]
        ll = self.low_rolling_min[idx]

        if np.isnan(hh) or np.isnan(hc) or np.isnan(lc) or np.isnan(ll):
            #set them to open price so that range = 0
            hh = hc = lc = ll = self.opens[idx]

        self.range = max(hh - lc, hc - ll)
        self.buy_line = df['open'].iloc[-1] + k1 * self.range
        self.sell_line = df['open'].iloc[-1] - k2 * self.range

    def _compute_differential_sharpe_ratio(self, reward, eps=1e-6):
        delta_alpha = reward - self.alpha
        delta_beta = reward**2 - self.beta

        if (self.beta - self.alpha**2) < eps: # prevent blow-up
            dsr = 0
        else:
            dsr = (self.beta*delta_alpha - 0.5*self.alpha*delta_beta) / (self.beta - self.alpha**2)**1.5

        # update
        self.alpha += self.eta * delta_alpha
        self.beta += self.eta * delta_beta

        return dsr


    def _next_observation(self):
        start_idx = self.current_step - self.window_size
        end_idx   = self.current_step
        
        prices = np.array([
            self.opens[start_idx:end_idx],
            self.highs[start_idx:end_idx],
            self.lows[start_idx:end_idx],
            self.closes[start_idx:end_idx]
        ]).flatten() # (4 * window_size)

        self._compute_dual_thrust()
        indicators = [self.buy_line, self.sell_line]

        account = [self.position, self.balance/self.initial_balance]
        return np.concatenate((prices, indicators, account))
    
    def reset(self):
        self.current_step = self.window_size
        self.balance = self.initial_balance
        self.position = 0 # 0: no position, 1: long, -1: short
        self.entry_price = 0
        self.buy_line = 0
        self.sell_line = 0

        # first observation (update lines)
        self._compute_dual_thrust()
        return self._next_observation()

    def vectorize(self):
        
        self.opens = self.df['open'].values
        self.highs = self.df['high'].values
        self.lows  = self.df['low'].values
        self.closes = self.df['close'].values

        self.high_rolling_max  = self.df['high'].rolling(self.window_size).max().values
        self.close_rolling_max = self.df['close'].rolling(self.window_size).max().values
        self.close_rolling_min = self.df['close'].rolling(self.window_size).min().values
        self.low_rolling_min   = self.df['low'].rolling(self.window_size).min().values



    def step(self, action):
            done = False
            if self.current_step >= len(self.df) - 1:
                done = True
            
            if action == 1:
                desired_position = 1
            elif action == 2:
                desired_position = -1
            else:
                desired_position = self.position

            price_open = self.opens[self.current_step]
            
            # Close hold open new
            if desired_position != self.position:
                # close old
                if self.position != 0:
                    old_pnl = (price_open - self.entry_price) * self.position
                    cost = (abs(self.position - desired_position) * self.transaction_cost * price_open 
                            + abs(self.position)*self.slippage)
                    self.balance += old_pnl - cost
                
                # open new
                if desired_position != 0:
                    self.entry_price = price_open
                    cost = (abs(self.position - desired_position) * self.transaction_cost * price_open
                            + abs(desired_position)*self.slippage)
                    self.balance -= cost
                
                self.position = desired_position
            
            # next step
            self.current_step += 1
            if self.current_step < len(self.df):
                self._compute_dual_thrust()
            
            price_close = self.closes[self.current_step-1]
            step_pnl = (price_close - self.entry_price) * self.position
            
            dsr = self._compute_differential_sharpe_ratio(step_pnl)
            obs = self._next_observation()  if not done else np.zeros(self.observation_space.shape, dtype=np.float32)
            
            if self.balance <= 0:
                done = True
            
            return obs, dsr, done, {}

In [None]:
def dt_policy(env):

    o = env._next_observation()
    n = env.window_size

    buy_line = o[4*n]
    sell_line= o[4*n + 1]

    curr = self.opens[self.current_step]

    if curr > buy_line:
        return 1
    elif curr < sell_line:
        return 2
    else:
        return 0
    

def intraday_greedy_actions(env_df, window_size=60, device="cuda"):

    num_steps = len(env_df)
    day_len = 240  

    open_prices = torch.tensor(env_df["open"].values, dtype=torch.float32, device=device)
    actions = torch.zeros(num_steps, dtype=torch.int, device=device)

    i = window_size
    while i < (num_steps - 1):
        day_start = i
        day_end = min(i + day_len, num_steps)
        
        day_opens = open_prices[day_start:day_end]
        idx_min = torch.argmin(day_opens).item()  # Buy
        idx_max = torch.argmax(day_opens).item()  # Sell

        actions[day_start + idx_min] = 1  # Long
        actions[day_start + idx_max] = 2  # Short

        i = day_end  

    return actions

In [None]:
class Episode:
    def __init__(self):
        self.obs = []
        self.actions = []
        self.rewards = []
        self.done_flags = []
        self.expert_actions = []  # 'prophetic' (intraday actions)

class RBuffer:
    def __init__(self, max_episodes=1000, device="cuda"):
        self.max_episodes = max_episodes
        self.device = device
        self.episodes = []
    
    def add_episode(self, ep):
        if len(self.episodes) >= self.max_episodes:
            self.episodes.pop(0)

        ep.obs = torch.tensor(np.array(ep.obs, dtype=np.float32), device=self.device)
        ep.actions = torch.tensor(np.array(ep.actions, dtype=np.int64), device=self.device)
        ep.rewards = torch.tensor(np.array(ep.rewards, dtype=np.float32), device=self.device)
        ep.done_flags = torch.tensor(np.array(ep.done_flags, dtype=bool), device=self.device)
        
        self.episodes.append(ep)

    def sample(self, batch_size):
        indices = torch.randint(0, len(self.episodes), (batch_size,), device=self.device)
        batch_eps = [self.episodes[i] for i in indices.cpu().numpy()]
        return batch_eps
    
    def __len__(self):
        return len(self.episodes)

In [None]:
class iRDPGAgent(nn.Module):
    def __init__(self, obs_dim, action_dim=3, hidden_dim=64, device="cuda"):
        super().__init__()

        self.actor_gru = nn.GRU(obs_dim, hidden_dim, batch_first=True) # [batch, seq_len, obs_dim]
        self.actor_fc = nn.Linear(hidden_dim, action_dim) # [batch, seq_len, action_dim]

        self.critic_gru = nn.GRU(obs_dim + action_dim, hidden_dim, batch_first=True) # [batch, seq_len, obs_dim + action_dim]
        self.critic_fc = nn.Linear(hidden_dim, action_dim) # [batch, seq_len, action_dim]

        self.device = device

    def forward(self, obs, actions, h_actor=None, h_critic=None):

        obs = obs.to(self.device)
        
        # Actor
        z_actor, h_actor_next = self.actor_gru(obs, h_actor)
        logits = self.actor_fc(z_actor)
        # Critic
        actions_onehot = torch.nn.functional.one_hot(actions, num_classes=self.action_dim).float() # Ensure correct structure

        z_critic, h_critic_next = self.critic_gru(torch.concat([obs, actions_onehot], dim=-1), h_critic)
        q_value = self.critic_fc(z_critic[:, -1])

        return logits, q_value, h_actor_next, h_critic_next
    
    def act(self, obs, h_actor=None):

        with torch.no_grad():
            obs_t = torch.tensor(obs, dtype=torch.float32, device=self.device).view(1,1,-1)  
            logits, _, h_actor, _ = self.forward(obs_t, h_actor)
            probs = torch.softmax(logits[0,0,:], dim=-1)  
            action = probs.argmax().item()
            
        return action, h_actor

In [None]:
def collect_episode(env, agent, noise):

    ep = Episode()
    obs = env.reset()
    done = False

    dummy_action = torch.zeros((1,1), dtype=torch.int64, device=device)
    
    while not done:

        with torch.no_grad():
            obs_t = torch.tensor(obs, dtype=torch.float, device=device).view(1,1,-1)
            logits, Q_vals, _, _ = agent(obs_t)
            # pick a policy action
            probs = torch.softmax(logits[0,0,:], dim=-1)
            if noise:
                dist = torch.distributions.Categorical(probs)
                action = dist.sample().item()
            else:
                action = probs.argmax().item()
        
        next_obs, reward, done, _ = env.step(action)
        
        ep.obs.append(obs)
        ep.actions.append(action)
        ep.rewards.append(reward)
        ep.done_flags.append(done)
        
        obs = next_obs
    
    return ep

def collect_demonstrations(df, window_size=60, n_episodes=50):
    demos = []
    env = POMDPTEnv(df, window_size=window_size)
    for _ in trange(n_episodes, desc="Collecting Demonstrations"):
        ep = Episode()
        obs = env.reset()
        done = False
        while not done:
            a = dt_policy(env)
            next_obs, rew, done, _ = env.step(a)
            ep.obs.append(obs)
            ep.actions.append(a)
            ep.rewards.append(rew)
            ep.done_flags.append(done)
            obs = next_obs
        demos.append(ep)
    return demos

In [None]:
def train_iRDPG(df, window_size=60, hidden_dim=64,
                train_episodes=500, batch_size=8, gamma=0.99,
                tau=0.01, expert_demos=None):
    
    env = POMDPTEnv(df, window_size=window_size)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    
    agent = iRDPGAgent(obs_dim, act_dim, hidden_dim).to(device)
    target_agent = iRDPGAgent(obs_dim, act_dim, hidden_dim).to(device)
    target_agent.load_state_dict(agent.state_dict())
    
    # Create a replay buffer
    buffer = RBuffer(max_episodes=1000)
    print(f"Initialized the environment, agent and replay buffer.")
    
    # Collect some demonstration episodes
    for ep in expert_demos:
        buffer.add_episode(ep)
    print(f"Collected {len(expert_demos)} demonstration episodes.")
    
    # Create "prophetic" array for entire df:
    prophecy = intraday_greedy_actions(df, window_size=window_size, device=device)
    print(f"Created the 'prophetic' array for the entire dataset.")
    
    # Attach them to each Episode so we can do Q-filter BC later
    for ep in tqdm(buffer.episodes, desc="Attaching Prophetic Actions"):
        ep.expert_actions = []
        # naive mapping: the environment index starts at window_size for step=0
        # so step i -> global index = window_size + i
        idx_global = window_size
        for t in range(len(ep.obs)):
            if idx_global < len(prophecy):
                ep.expert_actions.append(prophecy[idx_global])
            else:
                ep.expert_actions.append(0)
            idx_global += 1
    print(f"Attached the 'prophetic' actions to demonstration episodes.")
    
    optimizer = torch.optim.Adam(agent.parameters(), lr=1e-3)
    ce = nn.CrossEntropyLoss()
    
    def soft_update(net, net_targ, tau):
        for p, p_targ in zip(net.parameters(), net_targ.parameters()):
            p_targ.data.copy_(tau * p.data + (1.0 - tau) * p_targ.data)
    
    def update_agent(batch_eps):
        """
        Unroll entire episodes on GPU for BPTT.
        """
        total_loss = 0.0
        agent.train()
        optimizer.zero_grad()
        
        for ep in tqdm(batch_eps, desc="Updating agent"):
            obs_seq = torch.tensor(ep.obs, dtype=torch.float, device=device)
            actions_seq = torch.tensor(ep.actions, dtype=torch.long, device=device)
            rewards_seq = torch.tensor(ep.rewards, dtype=torch.float, device=device)
            exp_seq = torch.tensor(ep.expert_actions, dtype=torch.long, device=device)
            
            # shape => [1, T, obs_dim]
            obs_seq = obs_seq.unsqueeze(0) 
            
            logits, Q_vals, _, _ = agent(obs_seq, actions_seq)  # [1,T,act_dim]
            logits = logits[0]  # [T,act_dim]
            Q_vals = Q_vals[0] # [T,act_dim]
            T_len = len(actions_seq)
            
            # Critic loss: basic 1-step TD
            td_errors = []
            for t in range(T_len):
                q_sa = Q_vals[t, actions_seq[t]]
                if t < T_len-1 and not ep.done_flags[t]:
                    # next action from agent policy
                    next_a = logits[t+1].argmax().item()
                    q_next = Q_vals[t+1, next_a].detach()
                    target = rewards_seq[t] + gamma * q_next
                else:
                    target = rewards_seq[t]
                td_errors.append(target - q_sa)
            
            td_errors = torch.stack(td_errors)
            critic_loss = (td_errors**2).mean()
            
            # Actor loss: - Q(s, a_pi)
            pi = torch.softmax(logits, dim=-1)
            Q_expected = (pi * Q_vals).sum(dim=-1)
            actor_loss = - Q_expected.mean()
            
            # Behavior cloning with Q-filter
            a_agent = logits.argmax(dim=-1)
            Q_expert = Q_vals[torch.arange(T_len), exp_seq]
            Q_agent  = Q_vals[torch.arange(T_len), a_agent]
            mask = (Q_expert > Q_agent).float()
            
            bc_loss_t = ce(logits, exp_seq)  # cross-entropy stepwise
            bc_loss   = (bc_loss_t * mask).mean()
            
            ep_loss = critic_loss + actor_loss + bc_loss
            ep_loss.backward(retain_graph=True)
            total_loss += ep_loss.item()
        
        nn.utils.clip_grad_norm_(agent.parameters(), 5.0)
        optimizer.step()
        
        soft_update(agent, target_agent, tau)
        
        return total_loss / len(batch_eps)
    
    # main training loop
    update_trange = trange(train_episodes, desc="Training")
    for ep_i in update_trange:
        new_ep = collect_episode(env, agent=agent, noise=True)
        # attach prophecy to new episode
        new_ep.expert_actions = []
        idx_global = window_size
        for t in range(len(new_ep.obs)):
            if idx_global < len(prophecy):
                new_ep.expert_actions.append(prophecy[idx_global])
            else:
                new_ep.expert_actions.append(0)
            idx_global += 1
        
        buffer.add_episode(new_ep)
        
        if len(buffer) >= batch_size:
            batch_eps = buffer.sample(batch_size)
            loss_val = update_agent(batch_eps)
        else:
            loss_val = 0.0
        update_trange.set_postfix(loss=loss_val)
        
        print(f"Episode {ep_i+1}/{train_episodes}, Loss={loss_val:.4f}")
    
    return agent, target_agent

In [10]:
expert_demos = collect_demonstrations(df, window_size=60, n_episodes=5)
# torch.save(expert_demos, 'expert_demos.pt')
# print("Expert demonstrations collected and saved.")

Collecting Demonstrations: 100%|██████████| 5/5 [00:33<00:00,  6.65s/it]


In [11]:
t = 1_000

trained, target = train_iRDPG(df=df, window_size=60, train_episodes=t, batch_size=8, gamma=0.99, tau=0.01, expert_demos=expert_demos)

Initialized the environment, agent and replay buffer.
Collected 5 demonstration episodes.
Created the 'prophetic' array for the entire dataset.


Attaching Prophetic Actions: 100%|██████████| 5/5 [00:01<00:00,  3.69it/s]


Attached the 'prophetic' actions to demonstration episodes.


Training:   0%|          | 0/1000 [00:00<?, ?it/s]


TypeError: forward() missing 1 required positional argument: 'actions'

In [None]:
import numpy as np
import torch

def compute_metrics(balances: np.ndarray):
    """
    Compute classic trading metrics from a time series of balances.
    balances: array of shape [T+1], e.g., balances at each step (including initial).
    Returns a dict with total_return, volatility, sharpe_ratio, and max_drawdown.
    """
    initial_balance = balances[0]
    final_balance = balances[-1]
    total_return = (final_balance - initial_balance) / initial_balance

    # Per-step returns, e.g. (balance[t+1] - balance[t]) / balance[t]
    step_returns = (balances[1:] - balances[:-1]) / balances[:-1]
    if len(step_returns) > 1:
        mean_return = step_returns.mean()
        vol = step_returns.std()
        sharpe_ratio = mean_return / (vol + 1e-8)  # prevent divide-by-zero
    else:
        # Degenerate case: not enough steps
        mean_return = 0.0
        vol = 0.0
        sharpe_ratio = 0.0

    # Max drawdown
    # For each t, drawdown = (peak_so_far - balances[t]) / peak_so_far
    running_peaks = np.maximum.accumulate(balances)
    drawdowns = (running_peaks - balances) / (running_peaks + 1e-8)
    max_drawdown = drawdowns.max()

    return {
        "final_balance": final_balance,
        "total_return": total_return,
        "volatility": vol,
        "sharpe_ratio": sharpe_ratio,
        "max_drawdown": max_drawdown,
    }

def collect_episode_with_balance(env, agent, noise=False):
    """
    Similar to collect_episode but also stores the balance time series each step.
    Returns an Episode object and a list of balance values over the episode.
    """
    ep = Episode()  # or your custom object
    obs = env.reset()
    
    done = False
    balances = [env.balance]  # track initial balance

    while not done:
        obs_t = torch.tensor(obs, dtype=torch.float, device=device).unsqueeze(0).unsqueeze(0)
        with torch.no_grad():
            logits, Q_vals, _, _ = agent(obs_t)
        # pick an action greedily (no noise in eval)
        action = logits[0, 0].argmax().item()
        
        next_obs, reward, done, _ = env.step(action)
        
        ep.obs.append(obs)
        ep.actions.append(action)
        ep.rewards.append(reward)
        ep.done_flags.append(done)

        obs = next_obs
        balances.append(env.balance)
    
    return ep, np.array(balances)

def evaluate_agent(
    agent,
    df,
    window_size=60,
    n_eval_episodes=10
):
    """
    Evaluate a trained iRDPG agent for n_eval_episodes episodes with no noise.
    We compute metrics like total return, volatility, Sharpe, max drawdown, etc.
    """
    eval_env = POMDPTEnv(df, window_size=window_size)
    
    # For aggregated results:
    all_metrics = []

    for _ in range(n_eval_episodes):
        ep, balances = collect_episode_with_balance(eval_env, agent, noise=False)
        
        # Compute metrics from the balance trajectory
        metrics = compute_metrics(balances)
        all_metrics.append(metrics)

    # Average the metrics across episodes
    avg_metrics = {}
    for key in all_metrics[0].keys():
        avg_metrics[key] = float(np.mean([m[key] for m in all_metrics]))

    print(f"[Evaluation over {n_eval_episodes} episodes]")
    print(f"  Final Balance:  {avg_metrics['final_balance']:.2f}")
    print(f"  Total Return:   {avg_metrics['total_return']:.4f} (fractional)")
    print(f"  Volatility:     {avg_metrics['volatility']:.4f} (per-step)")
    print(f"  Sharpe Ratio:   {avg_metrics['sharpe_ratio']:.4f}")
    print(f"  Max Drawdown:   {avg_metrics['max_drawdown']:.4f}")

    return avg_metrics