# Toy version

In [1]:
import numpy as np
import pandas as pd

import torch
import torch.optim as optim
from torch.nn import functional as F


from scripts.Agent import Episode, iRDPGAgent, PERBuffer, generate_demonstration_episodes, collect_episode
from scripts.Env import POMDPTEnv, dt_policy, intraday_greedy_actions

from tqdm import trange, tqdm

In [2]:
# Run on the GPU when PyTorch reports one as available; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Load the price CSV (1-minute INTC data, going by the file name) and keep
# only the first 1/50 of its rows so this toy run stays small.
df = pd.read_csv('data/INTC_1Min_2023-08-01_2024-01-31.csv')
df = df.head(len(df) // 50)

In [4]:
def train(config, env, agent, buffer):
    """Train ``agent`` from replayed episodes, stepping the online networks one timestep at a time.

    The buffer is first filled with ``config["min_demo_episodes"]``
    demonstration episodes. Each epoch then collects one noisy agent episode,
    samples ``config["batch_size"]`` episodes, takes one critic step and one
    actor step on the batch-averaged losses, writes new priorities for the
    sampled episodes and calls ``agent._update_target_networks(config["tau"])``.

    Args:
        config: Mapping read for the keys ``actor_lr``, ``critic_lr``,
            ``min_demo_episodes``, ``epochs``, ``batch_size``, ``gamma``,
            ``lambda0``, ``lambda1``, ``lambda2``, ``eps_demo`` and ``tau``.
        env: Environment passed to ``generate_demonstration_episodes`` and
            ``collect_episode``.
        agent: Object exposing ``actor_gru``, ``actor_fc``, ``critic_gru``,
            ``critic_fc``, ``forward``, ``critic_forward``, ``target_forward``
            and ``_update_target_networks``.
        buffer: Replay buffer exposing ``add_episode``, ``sample``,
            ``update_priorities`` and ``__len__``.

    Returns:
        None. The agent and the buffer are updated in place.
    """
    actor_params = list(agent.actor_gru.parameters()) + list(agent.actor_fc.parameters())
    critic_params = list(agent.critic_gru.parameters()) + list(agent.critic_fc.parameters())
    actor_optim = optim.Adam(actor_params, lr=config["actor_lr"])
    critic_optim = optim.Adam(critic_params, lr=config["critic_lr"])

    demo_episodes = generate_demonstration_episodes(
        env, n_episodes=config["min_demo_episodes"]
    )
    for episode in tqdm(demo_episodes, desc="Pre-filling buffer"):
        buffer.add_episode(episode)

    # Training loop
    for epoch in trange(config["epochs"], desc="Training"):
        agent_episode = collect_episode(env, agent, add_noise=True)
        buffer.add_episode(agent_episode)

        if len(buffer) < config["min_demo_episodes"]:
            continue

        # NOTE(review): `weights` (presumably importance-sampling weights) is
        # never applied to the losses below -- confirm whether that is intended.
        batch, indices, weights = buffer.sample(config["batch_size"])

        critic_losses = []
        actor_losses = []
        new_priorities = []

        for episode in batch:
            # Shapes per the original comments: obs [1, T, obs_dim],
            # actions / expert_acts [1, T, action_dim], rewards / dones [1, T, 1].
            obs = episode.obs.unsqueeze(0)
            actions = episode.actions.unsqueeze(0)
            rewards = episode.rewards.unsqueeze(0).unsqueeze(-1)
            expert_acts = episode.expert_actions.unsqueeze(0)
            dones = episode.dones.unsqueeze(0).unsqueeze(-1)
            n_steps = obs.size(1)

            # TD targets for the whole episode in one target-network pass.
            # The bootstrap value for step t must come from step t + 1, so the
            # target Q sequence is shifted left by one; the final step has no
            # successor in the stored episode and bootstraps from zero.
            with torch.no_grad():
                _, target_q_seq, _, _ = agent.target_forward(obs)
                next_q_seq = torch.cat(
                    [target_q_seq[:, 1:], torch.zeros_like(target_q_seq[:, :1])], dim=1
                )
                td_target_seq = rewards + (1 - dones.float()) * config["gamma"] * next_q_seq

            # Online hidden states restart at the beginning of every episode.
            h_actor, h_critic = None, None

            # Keep the running losses as tensors: converting them with .item()
            # would drop the autograd graph and make torch.stack() fail later.
            episode_critic_loss = torch.zeros((), device=obs.device)
            episode_actor_loss = torch.zeros((), device=obs.device)

            for t in range(n_steps):
                obs_t = obs[:, t:t+1, :]
                action_t = actions[:, t:t+1, :]
                expert_act_t = expert_acts[:, t:t+1, :]
                target_q = td_target_seq[:, t:t+1, :]

                # Every Q evaluated at this step uses the same incoming critic
                # hidden state, so the values are comparable.
                h_critic_in = h_critic

                # ----------------- Critic loss -----------------
                q_value_t, h_critic = agent.critic_forward(obs_t, action_t, h_critic_in)
                episode_critic_loss = episode_critic_loss + F.mse_loss(q_value_t, target_q)

                # ----------------- Actor loss -----------------
                action_probs_t, _, h_actor, _ = agent.forward(obs_t, h_actor, h_critic_in)

                # Policy-gradient term: score the actor's own action with the
                # critic so the gradient reaches the actor parameters.
                q_pi_t, _ = agent.critic_forward(obs_t, action_probs_t, h_critic_in)
                actor_loss_pg = -q_pi_t.mean()

                # Behaviour cloning with a Q-filter: imitate the expert only
                # where the critic rates the expert action above the actor's.
                with torch.no_grad():
                    expert_q_t, _ = agent.critic_forward(obs_t, expert_act_t, h_critic_in)
                mask = (expert_q_t > q_pi_t.detach()).float()
                bc_loss = F.mse_loss(action_probs_t, expert_act_t.float()) * mask.mean()

                episode_actor_loss = episode_actor_loss + (
                    config["lambda1"] * actor_loss_pg + config["lambda2"] * bc_loss
                )

                # Cut the graph between timesteps (one-step truncated BPTT).
                h_actor = h_actor.detach() if h_actor is not None else None
                h_critic = h_critic.detach() if h_critic is not None else None

            mean_critic_loss = episode_critic_loss / n_steps
            mean_actor_loss = episode_actor_loss / n_steps

            # Priority (the original comment cites Eq 10). The actor term can be
            # negative, so its magnitude is used to keep the priority positive.
            priority = (
                mean_critic_loss.item()
                + config["lambda0"] * abs(mean_actor_loss.item())
                + 1e-6
            )
            if episode.is_demo:
                priority += config["eps_demo"]
            new_priorities.append(priority)

            critic_losses.append(mean_critic_loss)
            actor_losses.append(mean_actor_loss)

        critic_loss = torch.stack(critic_losses).mean()
        actor_loss = torch.stack(actor_losses).mean()

        # Both backward passes run before either optimizer step: the actor loss
        # flows through the critic weights, which must not be modified in place
        # before its backward pass. `inputs=` restricts accumulation to the
        # actor parameters so the critic gradients stay those of the critic loss.
        critic_optim.zero_grad()
        critic_loss.backward()
        actor_optim.zero_grad()
        actor_loss.backward(inputs=actor_params)
        critic_optim.step()
        actor_optim.step()

        # Update buffer priorities
        buffer.update_priorities(indices, new_priorities)

        # Update target networks
        agent._update_target_networks(config["tau"])

        # Same truth value as the original `epoch % config['epochs']/1 == 0`:
        # since epoch < config["epochs"], this only fires on the first epoch.
        if epoch % config["epochs"] == 0:
            print(f"Epoch {epoch} | Critic Loss: {critic_loss.item():.4f} | "
                  f"Actor Loss: {actor_loss.item():.4f}")
            

In [5]:
def save_model(agent, filename="trained_irdpg.pth"):
    """Write the state dicts of the agent's online and target networks to ``filename``.

    Args:
        agent: Object exposing the modules listed in ``layout`` below.
        filename: Destination path handed to ``torch.save``.
    """
    # (checkpoint key, attribute on the agent that holds the module)
    layout = (
        ("actor_gru", "actor_gru"),
        ("actor_fc", "actor_fc"),
        ("critic_gru", "critic_gru"),
        ("critic_fc", "critic_fc"),
        ("target_actor_gru", "target_actor"),
        ("target_actor_fc", "target_actor_fc"),
        ("target_critic_gru", "target_critic"),
        ("target_critic_fc", "target_critic_fc"),
    )
    checkpoint = {key: getattr(agent, attr).state_dict() for key, attr in layout}
    torch.save(checkpoint, filename)
    print(f"Model saved as {filename}")

In [6]:
# Run on the GPU when PyTorch reports one as available; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

# Reload the price CSV, this time keeping the first 1/10 of its rows.
df = pd.read_csv('data/INTC_1Min_2023-08-01_2024-01-31.csv')
df = df.head(len(df) // 10)



def train(config, env, agent, buffer):
    """Train ``agent`` from replayed episodes, processing each episode as one sequence.

    The buffer is first filled with ``config["min_demo_episodes"]``
    demonstration episodes. Each epoch then collects one noisy agent episode,
    samples ``config["batch_size"]`` episodes, takes one critic step and one
    actor step on the batch-averaged losses, writes new priorities for the
    sampled episodes and calls ``agent._update_target_networks(config["tau"])``.
    A checkpoint is written through ``save_model`` roughly every 1% of the run.

    Args:
        config: Mapping read for the keys ``actor_lr``, ``critic_lr``,
            ``min_demo_episodes``, ``epochs``, ``batch_size``, ``gamma``,
            ``lambda0``, ``lambda1``, ``lambda2``, ``eps_demo`` and ``tau``.
        env: Environment passed to ``generate_demonstration_episodes`` and
            ``collect_episode``.
        agent: Object exposing ``actor_gru``, ``actor_fc``, ``critic_gru``,
            ``critic_fc``, ``__call__``, ``critic_forward``, ``target_forward``
            and ``_update_target_networks``.
        buffer: Replay buffer exposing ``add_episode``, ``sample``,
            ``update_priorities`` and ``__len__``.

    Returns:
        None. The agent and the buffer are updated in place.
    """
    actor_params = list(agent.actor_gru.parameters()) + list(agent.actor_fc.parameters())
    critic_params = list(agent.critic_gru.parameters()) + list(agent.critic_fc.parameters())
    actor_optim = optim.Adam(actor_params, lr=config["actor_lr"])
    critic_optim = optim.Adam(critic_params, lr=config["critic_lr"])

    demo_episodes = generate_demonstration_episodes(
        env, n_episodes=config["min_demo_episodes"]
    )
    for episode in tqdm(demo_episodes, desc="Pre-filling buffer"):
        buffer.add_episode(episode)

    # Integer interval (at least 1) for logging and checkpointing. The previous
    # float modulus `epochs / 100` is unreliable when epochs is not a multiple of 100.
    report_every = max(1, config["epochs"] // 100)

    # Training loop
    for epoch in trange(config["epochs"], desc="Training"):
        agent_episode = collect_episode(env, agent, add_noise=True)
        buffer.add_episode(agent_episode)

        if len(buffer) < config["min_demo_episodes"]:
            continue

        # NOTE(review): `weights` (presumably importance-sampling weights) is
        # never applied to the losses below -- confirm whether that is intended.
        batch, indices, weights = buffer.sample(config["batch_size"])

        critic_losses = []
        actor_losses = []
        new_priorities = []

        for episode in batch:
            # A leading batch axis of size 1 is added; rewards and dones also
            # get a trailing axis so they broadcast against the Q sequence.
            obs = episode.obs.unsqueeze(0)
            actions = episode.actions.unsqueeze(0)
            rewards = episode.rewards.unsqueeze(0).unsqueeze(-1)
            expert_acts = episode.expert_actions.unsqueeze(0)
            dones = episode.dones.unsqueeze(0).unsqueeze(-1)

            # TD target: the bootstrap value for step t must come from step
            # t + 1, so the target Q sequence is shifted left by one. The final
            # step has no successor in the stored episode and bootstraps from zero.
            with torch.no_grad():
                _, target_q, _, _ = agent.target_forward(obs)
                next_q = torch.cat(
                    [target_q[:, 1:], torch.zeros_like(target_q[:, :1])], dim=1
                )
                td_target = rewards + (1 - dones.float()) * config["gamma"] * next_q

            # Critic loss on the actions actually stored in the episode.
            q_values, _ = agent.critic_forward(obs, actions)
            critic_loss = F.mse_loss(q_values, td_target)
            critic_losses.append(critic_loss)

            # Policy-gradient term: score the actor's own action with the
            # critic. Using the detached Q of the stored action (as before)
            # yields a constant with no gradient for the actor.
            action_probs, _, _, _ = agent(obs)
            q_pi, _ = agent.critic_forward(obs, action_probs)
            actor_loss = -q_pi.mean()

            # Behaviour cloning with a Q-filter: imitate the expert only where
            # the critic rates the expert action above the actor's action.
            with torch.no_grad():
                expert_q, _ = agent.critic_forward(obs, expert_acts)
            mask = (expert_q > q_pi.detach()).float()
            bc_loss = F.mse_loss(action_probs, expert_acts.float()) * mask.mean()

            total_actor_loss = config["lambda1"] * actor_loss + config["lambda2"] * bc_loss
            actor_losses.append(total_actor_loss)

            # The actor term can be negative, so its magnitude is used to keep
            # the replay priority non-negative.
            priority = critic_loss.item() + config["lambda0"] * abs(actor_loss.item())
            if episode.is_demo:
                priority += config["eps_demo"]
            new_priorities.append(priority)

        critic_loss = torch.stack(critic_losses).mean()
        actor_loss = torch.stack(actor_losses).mean()

        # Both backward passes run before either optimizer step: the actor loss
        # flows through the critic weights, which must not be modified in place
        # before its backward pass. `inputs=` restricts accumulation to the
        # actor parameters so the critic gradients stay those of the critic loss.
        critic_optim.zero_grad()
        critic_loss.backward()
        actor_optim.zero_grad()
        actor_loss.backward(inputs=actor_params)
        critic_optim.step()
        actor_optim.step()

        # Update buffer priorities
        buffer.update_priorities(indices, new_priorities)

        # Update target networks
        agent._update_target_networks(config["tau"])

        if (epoch + 1) % report_every == 0:
            print(f"Epoch {epoch} | Critic Loss: {critic_loss.item():.4f} | "
                  f"Actor Loss: {actor_loss.item():.4f}")
            save_model(agent, filename=f"trained_irdpg_{epoch}.pth")
            

def save_model(agent, filename="trained_irdpg.pth"):
    """Write the agent's network and optimizer state dicts to ``filename``.

    Args:
        agent: Object exposing the attributes listed in ``layout`` below.
        filename: Destination path handed to ``torch.save``.

    NOTE(review): the optimizer states come from ``agent.actor_optimizer`` and
    ``agent.critic_optimizer``, while ``train`` in this file builds its own
    optimizers -- confirm these are the states that should be checkpointed.
    """
    # (checkpoint key, attribute on the agent that provides state_dict())
    layout = (
        ("actor_gru", "actor_gru"),
        ("actor_fc", "actor_fc"),
        ("critic_gru", "critic_gru"),
        ("critic_fc", "critic_fc"),
        ("target_actor_gru", "target_actor"),
        ("target_actor_fc", "target_actor_fc"),
        ("target_critic_gru", "target_critic"),
        ("target_critic_fc", "target_critic_fc"),
        ("actor_optimizer", "actor_optimizer"),
        ("critic_optimizer", "critic_optimizer"),
    )
    checkpoint = {key: getattr(agent, attr).state_dict() for key, attr in layout}
    torch.save(checkpoint, filename)
    print(f"\nModel saved as {filename}\n")


Device: cuda


In [None]:
def load_model(agent, filename="trained_irdpg.pth"):
    """Restore the agent's online and target network weights from a checkpoint.

    Args:
        agent: Object exposing ``device`` and the modules listed in ``layout``
            below; the modules are updated in place.
        filename: Path of a checkpoint containing the keys listed in ``layout``.

    Returns:
        The same ``agent`` object, with the weights loaded.

    Raises:
        KeyError: If the checkpoint lacks one of the expected keys.
    """
    checkpoint = torch.load(filename, map_location=agent.device)

    # (checkpoint key, attribute on the agent that receives the state dict)
    layout = (
        ("actor_gru", "actor_gru"),
        ("actor_fc", "actor_fc"),
        ("critic_gru", "critic_gru"),
        ("critic_fc", "critic_fc"),
        ("target_actor_gru", "target_actor"),
        ("target_actor_fc", "target_actor_fc"),
        ("target_critic_gru", "target_critic"),
        ("target_critic_fc", "target_critic_fc"),
    )
    for key, attr in layout:
        getattr(agent, attr).load_state_dict(checkpoint[key])

    # Report before returning; the message used to sit after `return` and
    # therefore never ran.
    print(f"\nModel loaded from {filename}\n")
    return agent

In [26]:
# Display the 'actor_fc' entry of the checkpoint stored in
# "trained_irdpg_200.pth" (the truncated cell output follows).
torch.load("trained_irdpg_200.pth")['actor_fc']

OrderedDict([('weight',
              tensor([[ 0.1027,  0.0210,  0.0794, -0.0954,  0.0909,  0.0452,  0.0868, -0.1024,
                        0.1211, -0.0515,  0.1004,  0.0420,  0.0295,  0.1051, -0.0607,  0.0284,
                        0.0683, -0.0185,  0.1066, -0.0764,  0.0235, -0.0984,  0.0024, -0.1149,
                        0.0137, -0.0884, -0.0858, -0.0363,  0.0203, -0.0447,  0.0693,  0.0247,
                       -0.1353,  0.0586, -0.0826, -0.0429,  0.0173, -0.0338,  0.0196,  0.1142,
                        0.0859, -0.0136, -0.0256,  0.0387,  0.0106,  0.0802,  0.0704, -0.0459,
                       -0.0330, -0.0314,  0.0756,  0.1033, -0.0600, -0.0821,  0.0530,  0.0871,
                       -0.0985, -0.0254, -0.1232, -0.0322, -0.0981,  0.0149,  0.0448,  0.0608],
                      [-0.0724, -0.0845, -0.0466, -0.0092,  0.0117,  0.0819,  0.0978, -0.0568,
                       -0.0615,  0.0178,  0.1019, -0.0615,  0.0828,  0.1118,  0.0128,  0.1117,
                       -0

In [None]:

# Hyper-parameters handed to train(); the per-key notes are the author's.
config = dict(
    epochs=500,            # Total training epochs
    batch_size=32,         # Episodes per batch
    gamma=0.99,            # Discount factor
    tau=0.01,              # Target network update rate
    lambda0=0.6,           # Priority weight
    lambda1=0.8,           # Policy gradient weight
    lambda2=0.2,           # BC loss weight
    actor_lr=1e-4,         # Learning rates
    critic_lr=1e-3,
    eps_demo=0.1,          # Priority boost for demos
    noise_std=0.1,         # Exploration noise
    demo_ratio=0.3,        # Ratio of demo episodes in buffer
    min_demo_episodes=10,  # Min demo episodes to start
    seq_len=60,            # Match window_size
)

# Build the environment on the loaded frame, a fresh agent sized to the
# environment's observation vector, and an episode buffer capped at 100.
env = POMDPTEnv(df)
agent = iRDPGAgent(
    obs_dim=env.observation_space.shape[0],
    device=device,
)
buffer = PERBuffer(max_episodes=100)

train(config, env, agent, buffer)


In [23]:
# Rebuild the environment on `df` and create a freshly initialised agent
# whose observation size is taken from the environment.
env = POMDPTEnv(df)
agent = iRDPGAgent(obs_dim=env.observation_space.shape[0], device=device)
# Optional: restore trained weights into the fresh agent (currently disabled).
# agent = load_model(agent, "trained_irdpg_200.pth")

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

@torch.no_grad()
def evaluate_agent(env, agent, test_df, num_episodes=5, verbose=True):
    """
    Roll the agent out on test data and summarise per-episode performance.

    A new ``POMDPTEnv`` is built on ``test_df`` using the window size, initial
    balance, transaction cost and slippage of ``env``. Each episode is played
    with ``agent.act(..., add_noise=False)`` and the environment's ``balance``
    is recorded after every step.

    Args:
        env: Environment whose configuration attributes are copied.
        agent: Agent exposing ``act(obs, h_actor, add_noise=...)``.
        test_df: DataFrame handed to the new environment.
        num_episodes: Number of evaluation episodes.
        verbose: When true, plot each episode and print the summary tables.
    Returns:
        metrics_df: One row per episode plus 'Mean' and 'Std' rows, both
            computed over the episode rows only.
        cumulative_returns: List with one account-value trajectory per episode.
    """
    # Create test environment with same params as training
    test_env = POMDPTEnv(
        df=test_df,
        window_size=env.window_size,
        initial_balance=env.initial_balance,
        transaction_cost=env.transaction_cost,
        slippage=env.slippage
    )

    # Storage for results
    all_total_returns = []
    all_sharpe_ratios = []
    all_volatilities = []
    all_max_drawdowns = []
    cumulative_returns = []

    for ep in range(num_episodes):
        # Initialize episode
        obs = test_env.reset()
        done = False
        h_actor = None

        # The trajectory starts at the initial balance; the running peak is
        # tracked alongside it for the drawdown calculation.
        account_values = [test_env.initial_balance]
        peak_value = test_env.initial_balance
        max_drawdown = 0

        while not done:
            # Deterministic action (exploration noise disabled)
            action, h_actor = agent.act(obs, h_actor, add_noise=False)

            # Environment step; the reward is not needed for these metrics
            obs, _, done, _ = test_env.step(action)

            current_value = test_env.balance
            account_values.append(current_value)

            # Update max drawdown relative to the highest value seen so far
            peak_value = max(peak_value, current_value)
            current_drawdown = (peak_value - current_value) / peak_value
            max_drawdown = max(max_drawdown, current_drawdown)

        # Per-step simple returns of the account value
        returns = np.diff(account_values) / account_values[:-1]

        # 1. Total Return
        total_return = (account_values[-1] - account_values[0]) / account_values[0]

        # 2. Volatility (standard deviation of the per-step returns)
        volatility = np.std(returns)

        # 3. Sharpe Ratio (risk-free rate = 0); the epsilon avoids division by zero
        sharpe_ratio = np.mean(returns) / (volatility + 1e-9)

        # 4. Max Drawdown
        max_drawdown_pct = max_drawdown * 100

        # Store results
        all_total_returns.append(total_return)
        all_sharpe_ratios.append(sharpe_ratio)
        all_volatilities.append(volatility)
        all_max_drawdowns.append(max_drawdown_pct)
        cumulative_returns.append(account_values)

        # Plotting
        if verbose:
            plt.figure(figsize=(10,6))
            plt.plot(account_values, label='iRDPG Agent')
            plt.title(f"Evaluation Episode {ep+1}\nTotal Return: {total_return*100:.1f}%")
            plt.xlabel("Time Steps")
            plt.ylabel("Account Value ($)")
            plt.legend()
            plt.show()

    # Aggregate metrics
    metrics_df = pd.DataFrame({
        'Total Return (%)': np.array(all_total_returns) * 100,
        'Sharpe Ratio': all_sharpe_ratios,
        'Volatility': all_volatilities,
        'Max Drawdown (%)': all_max_drawdowns
    })

    # Compute both summaries before appending either row; otherwise the 'Std'
    # row would include the freshly added 'Mean' row in its calculation.
    episode_mean = metrics_df.mean()
    episode_std = metrics_df.std()
    metrics_df.loc['Mean'] = episode_mean
    metrics_df.loc['Std'] = episode_std

    if verbose:
        print("\nFinal Evaluation Metrics:")
        print(metrics_df.round(2).to_markdown())
        print("\nKey Metrics Across Episodes:")
        print(f"Average Total Return: {np.mean(all_total_returns)*100:.1f}%")
        print(f"Average Sharpe Ratio: {np.mean(all_sharpe_ratios):.2f}")
        print(f"Average Volatility: {np.mean(all_volatilities):.4f}")
        print(f"Average Max Drawdown: {np.mean(all_max_drawdowns):.1f}%")

    return metrics_df, cumulative_returns

# Usage Example
# Evaluate over five episodes. `df` is the frame loaded earlier in this
# notebook, so replace it with a held-out frame for a true test run.
test_metrics, test_returns = evaluate_agent(env, agent, test_df=df, num_episodes=5)

AttributeError: 'dict' object has no attribute 'act'