# Toy version

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.optim as optim
from torch.nn import functional as F


from scripts.Agent import Episode, iRDPGAgent, PERBuffer, generate_demonstration_episodes, collect_episode
from scripts.Env import POMDPTEnv, dt_policy, intraday_greedy_actions

from tqdm import trange, tqdm

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [None]:
# Market data used by the environment below.
# NOTE(review): the file name suggests 1-minute INTC bars for
# 2023-08-01 .. 2024-01-31 -- confirm against the actual file contents.
csv_path = 'data/INTC_1Min_2023-08-01_2024-01-31.csv'
df = pd.read_csv(csv_path)

In [None]:
def train(config, env, agent, buffer):
    """Pre-fill ``buffer`` with demonstrations, then run the training loop.

    Each epoch collects one noisy agent episode, samples a batch of episodes
    from the buffer, accumulates one critic loss and one actor loss per
    episode, takes a single optimizer step for each network, refreshes the
    priorities of the sampled episodes and updates the target networks with
    rate ``config["tau"]``.

    Args:
        config: Mapping of hyperparameters. Keys read here: ``actor_lr``,
            ``critic_lr``, ``min_demo_episodes``, ``epochs``, ``batch_size``,
            ``gamma``, ``lambda0``, ``lambda1``, ``lambda2``, ``eps_demo``
            and ``tau``.
        env: Environment handed to ``generate_demonstration_episodes`` and
            ``collect_episode``.
        agent: Agent exposing ``actor_gru``, ``actor_fc``, ``critic_gru``,
            ``critic_fc``, ``target_forward``, ``critic_forward``,
            ``_update_target_networks`` and a callable forward pass.
        buffer: Episode buffer providing ``add_episode``, ``sample``,
            ``update_priorities`` and ``len()``.

    Returns:
        None. Progress is printed about ten times over the run.
    """
    actor_params = list(agent.actor_gru.parameters()) + list(agent.actor_fc.parameters())
    critic_params = list(agent.critic_gru.parameters()) + list(agent.critic_fc.parameters())
    actor_optim = optim.Adam(actor_params, lr=config["actor_lr"])
    critic_optim = optim.Adam(critic_params, lr=config["critic_lr"])

    # Log roughly ten times per run. The previous condition,
    # `epoch % config['epochs']/10 == 0`, parsed as `(epoch % epochs) / 10 == 0`
    # and was therefore true only at epoch 0. `max(1, ...)` keeps the modulus
    # non-zero for runs shorter than ten epochs.
    log_every = max(1, config["epochs"] // 10)

    demo_episodes = generate_demonstration_episodes(
        env, n_episodes=config["min_demo_episodes"]
    )

    for episode in tqdm(demo_episodes, desc="Pre-filling buffer"):
        buffer.add_episode(episode)

    # Training loop
    for epoch in trange(config["epochs"], desc="Training"):

        agent_episode = collect_episode(env, agent, add_noise=True)
        buffer.add_episode(agent_episode)

        if len(buffer) < config["min_demo_episodes"]:
            continue

        # NOTE(review): `weights` is returned by the sampler but never applied
        # to the losses -- presumably importance-sampling weights; confirm
        # whether they should scale the per-episode losses.
        batch, indices, weights = buffer.sample(config["batch_size"])

        critic_losses = []
        actor_losses = []
        new_priorities = []

        # Process episodes one at a time, adding a leading batch dimension of 1.
        for episode in batch:
            obs = episode.obs.unsqueeze(0)
            actions = episode.actions.unsqueeze(0)
            rewards = episode.rewards.unsqueeze(0).unsqueeze(-1)
            expert_acts = episode.expert_actions.unsqueeze(0)
            dones = episode.dones.unsqueeze(0).unsqueeze(-1)

            # TD target from the target networks; no gradient flows through it.
            # NOTE(review): the target Q is used at the same index as the
            # reward (no visible one-step shift) -- confirm that
            # `target_forward` already returns next-step values.
            with torch.no_grad():
                _, target_q, _, _ = agent.target_forward(obs)
                target_q = rewards + (1 - dones.float()) * config["gamma"] * target_q

            _, q_values, _, _ = agent(obs)

            # Critic loss
            critic_loss = F.mse_loss(q_values, target_q)
            critic_losses.append(critic_loss)

            with torch.no_grad():  # Detach from graph
                expert_q, _ = agent.critic_forward(obs, expert_acts)
                current_q = q_values.detach()

            # Second forward pass, so the actor loss has its own graph that is
            # still alive after the critic's backward pass.
            action_probs, _, _, _ = agent(obs)

            # Policy gradient loss
            # NOTE(review): `current_q` is detached, so this term is a
            # constant with respect to the actor parameters and contributes no
            # gradient; only the behavior-cloning term below trains the actor.
            actor_loss = -current_q.mean()

            # Behavior cloning loss, scaled by the fraction of entries where
            # the expert's Q exceeds the agent's current Q.
            mask = (expert_q > current_q).float()
            bc_loss = F.mse_loss(action_probs, expert_acts.float()) * mask.mean()

            total_actor_loss = config["lambda1"] * actor_loss + config["lambda2"] * bc_loss
            actor_losses.append(total_actor_loss)

            # Update priorities
            # NOTE(review): `actor_loss` is the negated mean Q, so this
            # priority can become negative when Q is large and positive --
            # confirm the buffer tolerates that.
            priority = critic_loss.item() + config["lambda0"] * actor_loss.item() + 1e-6
            if episode.is_demo:
                priority += config["eps_demo"]
            new_priorities.append(priority)

        # Update critic
        critic_optim.zero_grad()
        critic_loss = torch.stack(critic_losses).mean()
        critic_loss.backward()
        critic_optim.step()

        # Update actor
        actor_optim.zero_grad()
        actor_loss = torch.stack(actor_losses).mean()
        actor_loss.backward()
        actor_optim.step()

        # Update buffer priorities
        buffer.update_priorities(indices, new_priorities)

        # Update target networks
        agent._update_target_networks(config["tau"])

        if epoch % log_every == 0:
            print(f"Epoch {epoch} | Critic Loss: {critic_loss.item():.4f} | "
                  f"Actor Loss: {actor_loss.item():.4f}")

In [None]:
# Hyperparameters for the toy run, handed to train() below.
config = dict(
    epochs=1_000,          # Total training epochs
    batch_size=32,         # Episodes per batch
    gamma=0.99,            # Discount factor
    tau=0.01,              # Target network update rate
    lambda0=0.5,           # Priority weight
    lambda1=0.8,           # Policy gradient weight
    lambda2=0.2,           # BC loss weight
    actor_lr=1e-4,         # Learning rates
    critic_lr=1e-3,
    eps_demo=0.1,          # Priority boost for demos
    noise_std=0.1,         # Exploration noise
    demo_ratio=0.3,        # Ratio of demo episodes in buffer
    min_demo_episodes=50,  # Min demo episodes to start
    seq_len=60,            # Match window_size
)

# Build the environment from the loaded frame, size the agent from the
# environment's observation space, and create the episode buffer.
env = POMDPTEnv(df)
obs_dim = env.observation_space.shape[0]
agent = iRDPGAgent(obs_dim=obs_dim, device=device)
buffer = PERBuffer(max_episodes=200)

# Kick off training; train() returns nothing.
train(config, env, agent, buffer)

In [None]:
import matplotlib.pyplot as plt

def evaluate_agent(env, agent, test_df, num_episodes=5, plot=True):
    """Roll out ``agent`` without exploration noise and report per-episode metrics.

    A fresh ``POMDPTEnv`` is built from ``test_df`` using ``env.window_size``.
    For every episode the function records the account balance after each
    step and derives:

    * total return: relative change between the first and last balance;
    * Sharpe ratio: mean / std of the per-step change in
      ``test_env.cumulative_profit`` (risk-free rate 0, not annualised);
    * volatility: std of that same per-step change;
    * max drawdown: largest relative drop of the balance from its running peak.

    Args:
        env: Reference environment; only ``env.window_size`` is read.
        agent: Object whose ``act(obs, hidden, add_noise=False)`` returns
            ``(action, hidden)``.
        test_df: Data used to construct the evaluation environment.
        num_episodes: Number of evaluation episodes to run.
        plot: When true, show one account-value plot per episode.

    Returns:
        dict with the per-episode lists ``total_returns``, ``sharpe_ratios``,
        ``volatilities`` and ``max_drawdowns``, plus ``account_values`` holding
        the balance series of the LAST episode only (an empty list when
        ``num_episodes`` is 0).
    """
    # Create test environment
    test_env = POMDPTEnv(test_df, window_size=env.window_size)

    # Storage for metrics
    all_total_returns = []
    all_sharpe_ratios = []
    all_volatilities = []
    all_max_drawdowns = []

    # Bound up front so the return statement below does not raise
    # UnboundLocalError when no episode is run.
    account_values = []

    for ep in range(num_episodes):
        # Initialize episode
        obs = test_env.reset()
        done = False
        h_actor = None

        # Track values for metrics
        cumulative_profits = []
        account_values = [test_env.initial_balance]
        peak = test_env.initial_balance
        max_drawdown = 0

        while not done:
            # Get agent action (no exploration noise)
            action, h_actor = agent.act(obs, h_actor, add_noise=False)

            # Environment step (reward and info are not needed here)
            obs, _reward, done, _info = test_env.step(action)

            # Track per-step performance
            cumulative_profits.append(test_env.cumulative_profit)
            current_value = test_env.balance
            account_values.append(current_value)

            # Update max drawdown against the running peak
            peak = max(peak, current_value)
            drawdown = (peak - current_value) / peak
            max_drawdown = max(max_drawdown, drawdown)

        # Per-step profit: first difference of the cumulative profit series.
        returns = np.diff(cumulative_profits)

        # Total Return
        total_return = (account_values[-1] - account_values[0]) / account_values[0]

        if returns.size:
            # Volatility
            volatility = np.std(returns)
            # Sharpe Ratio (assuming risk-free rate = 0); the epsilon avoids
            # dividing by zero when every step has the same profit.
            sharpe_ratio = np.mean(returns) / (volatility + 1e-9)
        else:
            # A single-step episode has no differences; report zeros instead
            # of the NaN (and RuntimeWarning) that mean/std of an empty array
            # would produce.
            volatility = 0.0
            sharpe_ratio = 0.0

        # Store metrics
        all_total_returns.append(total_return)
        all_sharpe_ratios.append(sharpe_ratio)
        all_volatilities.append(volatility)
        all_max_drawdowns.append(max_drawdown)

        # Plot the account value over the episode
        if plot:
            plt.figure(figsize=(10, 6))
            plt.plot(account_values, label='iRDPG')
            plt.title(f"Evaluation Episode {ep+1} - Cumulative Account Value")
            plt.xlabel("Time Steps")
            plt.ylabel("Account Value ($)")
            plt.legend()
            plt.show()

    # Aggregate results
    metrics = {
        'Total Return (%)': [f"{tr*100:.2f}%" for tr in all_total_returns],
        'Sharpe Ratio': [f"{sr:.3f}" for sr in all_sharpe_ratios],
        'Volatility': [f"{vol:.5f}" for vol in all_volatilities],
        'Max Drawdown (%)': [f"{mdd*100:.2f}%" for mdd in all_max_drawdowns]
    }

    # Print results table
    print("\nEvaluation Results:")
    print(pd.DataFrame(metrics))

    return {
        'total_returns': all_total_returns,
        'sharpe_ratios': all_sharpe_ratios,
        'volatilities': all_volatilities,
        'max_drawdowns': all_max_drawdowns,
        'account_values': account_values
    }

# Usage: run five noise-free evaluation episodes and keep the metrics dict.
# NOTE(review): `df` is the same DataFrame the training environment was built
# from, so these numbers are in-sample -- consider a held-out split.
test_results = evaluate_agent(env, agent, test_df=df, num_episodes=5)