# 03_Agent_Training

Train a deep RL agent (PPO) in the custom multi-asset environment and monitor learning performance.


In [None]:
# 3.1 Imports
from src.environment.Multi_asset_env import MultiAsset21DeepHedgingEnv
from src.agents.ppo_agent import PPOAgent
from src.utils.data_utils import download_market_data
from src.environment.option_pricing import create_synthetic_option_chain
from src.config.settings import get_config

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# 3.2 Prepare Data & Env
# Market data is downloaded with the 'data' config section expanded as keyword
# arguments; the synthetic option chain is then built from that data together
# with the 'option' config section.
df = download_market_data(**get_config('data'))
opt_chain = create_synthetic_option_chain(df, get_config('option'))

# Read the three grid axes from the 'option' config section.
strikes, expiries, types_ = (
    get_config('option')[key]
    for key in ('strike_offsets', 'expiry_days', 'option_types')
)

# One entry per (expiry, strike offset, option type) combination; expiry is the
# slowest-varying axis and option type the fastest.
asset_universe = [
    dict(strike_offset=offset, expiry_days=days, type=kind)
    for days in expiries
    for offset in strikes
    for kind in types_
]

env = MultiAsset21DeepHedgingEnv(df, opt_chain, asset_universe)


## 3.3 PPO Training


In [None]:
# Train a PPO agent on the environment built in section 3.2 and save the result.
# Training takes a while; lower `total_timesteps` for a quick demo run.
agent = PPOAgent(env)
model = agent.create_model()

# NOTE(review): if `create_model()` returns a Stable-Baselines3 model, `learn()`
# returns the model itself rather than a training-history object -- confirm
# what `history` actually holds before using it for plots.
history = model.learn(total_timesteps=50000)
# NOTE(review): the path is relative to the notebook's working directory;
# confirm that `results/models/` exists or that `save()` creates it.
model.save("results/models/ppo_polyhedge_agent")


## 3.4 Plotting Cumulative Reward of the Trained Agent


In [None]:
# Roll the trained model through one evaluation episode (capped at 200 steps)
# and plot the running sum of rewards.
# If using wandb or TensorBoard, the training logs could be shown here instead.
obs, _ = env.reset()
rewards = []   # per-step reward
cum_pnl = []   # running total of rewards after each step
total = 0.0
for _ in range(200):
    # deterministic=True is passed so repeated runs use the same action choice
    # for the same observation (as far as the model's predict() honours it).
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, trunc, info = env.step(action)
    rewards.append(reward)
    total += reward
    cum_pnl.append(total)
    # The episode is over when it terminates OR is truncated (e.g. a step
    # limit); stepping past either flag without reset() is not valid, so stop
    # on both rather than on `done` alone.
    if done or trunc:
        break

plt.plot(cum_pnl)
plt.title("Cumulative P&L after PPO Training")
plt.xlabel("Step")
plt.ylabel("Cumulative Reward")
plt.show()
