In [7]:
import copy
import numpy as np
from numpy import random

from models_monopoly import CentralizedDQNAgent
from Monopoly_Go.monopoly_go import monopoly_go_v0  

env = monopoly_go_v0.env(render_mode="human")
env.reset()

# --- Config ---
n_agents = 3
num_episodes = 10000
epsilon = 1.0
epsilon_min = 0.05
epsilon_decay = 0.995
update_freq = 5  # update every N episodes
obs_dim = env.observe("player_0").shape[0]
act_dim = 611

# --- Agent ---
cdqn = CentralizedDQNAgent(obs_dim=obs_dim, act_dim=act_dim, n_agents=n_agents, lr=1e-5)

# --- Tracking ---
winners = [0] * n_agents
win_rates = []
losses = []

In [8]:
def evaluate_agent_vs_random(cdqn, env_fn, eval_episodes=100, agent_idx=0):
    """
    Evaluate a trained CDQN agent against random opponents.

    Args:
        cdqn: CentralizedDQNAgent instance.
        env_fn: Function that returns a new PettingZoo env (e.g. monopoly_go_v0.env).
        eval_episodes: Number of evaluation games to run.
        agent_idx: The index of the agent you want to evaluate (e.g., player_0 → 0).

    Returns:
        Win rate of the evaluated agent.
    """
    n_agents = cdqn.n_agents
    wins = 0

    for ep in range(eval_episodes):
        env = env_fn(render_mode=None)
        env.reset(seed=10000 + ep)
        # env.shaped = True
        terminated = [False] * n_agents

        while not all(terminated):
            curr_agent = env.agent_selection
            curr_idx = env.curr_agent_index
            obs, _, term, _, info = env.last()

            if term:
                terminated[curr_idx] = True
                env.step(None)
                continue

            full_obs = [env.observe(f"player_{i}") for i in range(n_agents)]
            masks = [env.infos[f"player_{i}"]["action_mask"] for i in range(n_agents)]

            if curr_idx == agent_idx:
                action = cdqn.select(full_obs, eps=0.0, masks=masks, acting_agent=curr_idx)
            else:
                legal_actions = np.where(masks[curr_idx])[0]
                action = int(np.random.choice(legal_actions)) if len(legal_actions) > 0 else None

            env.step(action)

        if getattr(env, "winner", -1) == agent_idx:
            wins += 1

    win_rate = wins / eval_episodes
    print(f"[Evaluation] Agent {agent_idx} win rate vs randoms: {win_rate:.2f}")
    return win_rate


In [9]:
def evaluate_agent_vs_past_self(cdqn, old_cdqn, env_fn, eval_episodes=100, agent_idx=0):
    """
    Evaluate a trained CDQN agent against older versions of itself.
    
    Args:
        cdqn: The current agent to evaluate.
        old_cdqn: A frozen (non-updating) copy of an earlier CDQN.
        env_fn: Function that returns a new PettingZoo env.
        eval_episodes: Number of games to evaluate.
        agent_idx: Which agent you're evaluating.
    """
    n_agents = cdqn.n_agents
    wins = 0

    for ep in range(eval_episodes):
        env = env_fn(render_mode=None)
        env.reset(seed=10000 + ep)
        terminated = [False] * n_agents

        while not all(terminated):
            curr_agent = env.agent_selection
            curr_idx = env.curr_agent_index
            obs, _, term, _, info = env.last()

            if term:
                terminated[curr_idx] = True
                env.step(None)
                continue

            full_obs = [env.observe(f"player_{i}") for i in range(n_agents)]
            masks = [env.infos[f"player_{i}"]["action_mask"] for i in range(n_agents)]

            if curr_idx == agent_idx:
                action = cdqn.select(full_obs, eps=0.0, masks=masks, acting_agent=curr_idx)
            else:
                action = old_cdqn.select(full_obs, eps=0.0, masks=masks, acting_agent=curr_idx)

            env.step(action)

        if getattr(env, "winner", -1) == agent_idx:
            wins += 1

    win_rate = wins / eval_episodes
    print(f"[Eval vs past self] Agent {agent_idx} win rate: {win_rate:.2f}")
    return win_rate


In [None]:
all_rewards = []
old_cdqn = None
for ep in range(num_episodes):
    env = monopoly_go_v0.env(render_mode=None)
    env.reset(seed=ep)
    # if ep < 1000:
    #     env.shaped = True

    episode_rewards = [0] * n_agents
    terminated = [False] * n_agents

    # Initialize full observation and mask per agent
    full_obs = [env.observe(f"player_{i}") for i in range(n_agents)]
    masks = [env.infos[f"player_{i}"]["action_mask"] for i in range(n_agents)]
    assert all(m.sum() > 0 for m in masks)
    
    while True:
        curr_agent = env.agent_selection
        curr_idx = env.curr_agent_index
        obs, reward, term, trunc, info = env.last()

        if term:
            terminated[curr_idx] = True
            env.step(None)

        # Get centralized state (all obs + masks)
        full_obs = [env.observe(f"player_{i}") for i in range(n_agents)]
        masks = [env.infos[f"player_{i}"]["action_mask"] for i in range(n_agents)]
        assert masks[curr_idx].sum() > 0

        # Select only the current agent's action using CDQN
        if not term:
            action = cdqn.select(full_obs, epsilon, masks, curr_idx)
            assert masks[curr_idx][action] == 1, f"Agent {curr_idx} chose masked action {action}"

            # Step with the selected action
            env.step(action)

        # Log transition for centralized training
        next_obs = [env.observe(f"player_{i}") for i in range(n_agents)]
        rewards = [env._cumulative_rewards[f"player_{i}"] for i in range(n_agents)]
        dones = [env.terminations[f"player_{i}"] for i in range(n_agents)]

        # Store entire transition (centralized)
        cdqn.store(full_obs, [action if i == curr_idx else -1 for i in range(n_agents)],
                   rewards, next_obs, dones)

        # Update running obs + rewards
        full_obs = next_obs
        episode_rewards = [r + ep_r for r, ep_r in zip(rewards, episode_rewards)]
        
        if all(terminated):
            all_rewards.append(sum(episode_rewards))
            break

    # Learn
    if ep % update_freq == 0:
        cdqn.update()

    # Epsilon decay
    if ep > 500:
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Logging
    winner_idx = getattr(env, "winner", -1)
    if winner_idx >= 0:
        winners[winner_idx] += 1

    if ep % 100 == 0 and ep > 0:
        win_rate = [w / 100 for w in winners]
        win_rates.append(win_rate)
        print(f"[EP {ep}] Win rates: {win_rate}, Epsilon: {epsilon:.3f}")
        reward_avg = np.mean(all_rewards[-100:])
        print(f"[EP {ep}] Avg reward: {reward_avg:.2f}")

        winners = [0] * n_agents
    
    if ep % 250 == 0 and ep > 0:
        old_cdqn = copy.deepcopy(cdqn)
        
    if ep % 500 == 0 and ep > 0:
        for i in range(3):
            env = monopoly_go_v0.env
            evaluate_agent_vs_past_self(cdqn, old_cdqn, env, agent_idx=i)
            evaluate_agent_vs_random(cdqn, env, agent_idx=i)

[EP 100] Win rates: [0.38, 0.28, 0.31], Epsilon: 1.000
[EP 100] Avg reward: -65.60
[EP 200] Win rates: [0.34, 0.33, 0.3], Epsilon: 1.000
[EP 200] Avg reward: -64.20
[EP 300] Win rates: [0.27, 0.26, 0.45], Epsilon: 1.000
[EP 300] Avg reward: -62.80
[EP 400] Win rates: [0.36, 0.27, 0.34], Epsilon: 1.000
[EP 400] Avg reward: -64.20
[EP 500] Win rates: [0.27, 0.39, 0.31], Epsilon: 1.000
[EP 500] Avg reward: -64.20
[Eval vs past self] Agent 0 win rate: 0.25
[Evaluation] Agent 0 win rate vs randoms: 0.05
[Eval vs past self] Agent 1 win rate: 0.59
[Evaluation] Agent 1 win rate vs randoms: 0.22
[Eval vs past self] Agent 2 win rate: 0.21
[Evaluation] Agent 2 win rate vs randoms: 0.05
[EP 600] Win rates: [0.36, 0.33, 0.29], Epsilon: 0.606
[EP 600] Avg reward: -62.80
[EP 700] Win rates: [0.26, 0.52, 0.22], Epsilon: 0.367
[EP 700] Avg reward: -60.00
[EP 800] Win rates: [0.19, 0.62, 0.19], Epsilon: 0.222
[EP 800] Avg reward: -60.00
[EP 900] Win rates: [0.22, 0.57, 0.21], Epsilon: 0.135
[EP 900] Avg