In [1]:
# POV_sweep_PPO_vs_L_depth.ipynb  -- standalone test harness

import numpy as np
import torch
import pandas as pd
from tqdm.auto import tqdm
from C4.connect4_env import Connect4Env
from C4.fast_connect4_lookahead import Connect4Lookahead
from PPO.ppo_utilities import encode_two_channel_agent_centric, select_opponent_action

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# small safety for evaluation "temperature" -- almost greedy, but avoids divide-by-zero
# NOTE(review): not referenced by any of the visible cells; the depth sweep passes
# temperature=0.0 directly -- confirm which value is intended for evaluation.
EVAL_TEMPERATURE = 1e-3


In [2]:
@torch.no_grad()
def play_vs_lookahead_fixed_side(
    policy,
    depth: int,
    agent_starts: bool,
    n_games: int = 200,
    temperature: float = 0.0,
    seed: int = 123,
    use_tqdm: bool = False,
    tqdm_desc: str | None = None,
):
    """
    Play PPO vs Lookahead-depth-D for n_games.

    agent_starts=True:  agent is +1 and moves first
    agent_starts=False: agent is +1 but moves second (opponent starts as -1)

    Returns dict with W/L/D counts and rates + score_rate = W + 0.5*D.
    """
    rng = np.random.RandomState(seed)
    policy.eval()

    wins = losses = draws = 0

    iterator = range(n_games)
    if use_tqdm:
        desc = tqdm_desc or f"L{depth} " + ("A-first" if agent_starts else "A-second")
        iterator = tqdm(iterator, desc=desc, leave=False)

    for g in iterator:
        env = Connect4Env()
        env.reset()

        # Decide who starts
        env.current_player = +1 if agent_starts else -1

        # play until terminal
        while not env.done:
            player = env.current_player
            legal = env.available_actions()
            if not legal:
                break

            if player == +1:
                # ---- PPO agent move ----
                enc = encode_two_channel_agent_centric(env.board, +1)
                action, _, _, _ = policy.act(
                    enc,
                    legal_actions=legal,
                    temperature=temperature,
                )
                action = int(action)
                _state, _reward, _done = env.step(action)

            else:
                # ---- Lookahead opponent move (player = -1) ----
                opp_action = select_opponent_action(
                    env.board.copy(),
                    player=-1,
                    depth=depth,
                )
                opp_action = int(opp_action)
                _state, _reward, _done = env.step(opp_action)

        # ----- game ended, score from agent(+1) POV -----
        if env.winner == +1:
            wins += 1
        elif env.winner == -1:
            losses += 1
        else:
            draws += 1

    total = wins + losses + draws
    if total == 0:
        return {
            "depth": depth,
            "agent_starts_flag": agent_starts,
            "games": 0,
            "wins": 0,
            "losses": 0,
            "draws": 0,
            "win_rate": 0.0,
            "draw_rate": 0.0,
            "loss_rate": 0.0,
            "score_rate": 0.0,
        }

    win_rate  = wins / total
    draw_rate = draws / total
    loss_rate = losses / total
    score_rate = (wins + 0.5 * draws) / total

    return {
        "depth": depth,
        "agent_starts_flag": agent_starts,
        "games": total,
        "wins": wins,
        "losses": losses,
        "draws": draws,
        "win_rate": win_rate,
        "draw_rate": draw_rate,
        "loss_rate": loss_rate,
        "score_rate": score_rate,
    }


In [3]:
# Example: load your policy here if not already in memory
# NOTE(review): the wildcard import hides which names this cell depends on; of the
# visible cells only load_policy_simple appears to come from it -- consider an explicit import.
from PPO.ppo_agent_eval import *
# NOTE(review): the file name says "PPQ" while the default suffix passed below is
# " PPO model.pt" -- confirm the checkpoint name is intended and not a typo.
model_name = "RND_1NS PPQ model.pt"
# load_policy_simple returns three values; only the first (the policy) is kept.
policy, _, _ = load_policy_simple(model_name, device=DEVICE, default_suffix=" PPO model.pt")


# Move the policy to DEVICE and switch it to eval mode. policy.eval() is the
# cell's last expression, so its return value is what the notebook displays.
policy.to(DEVICE)
policy.eval()


ActorCritic(
  (backbone): C4DirectionalBackbone(
    (h4): Conv2d(6, 16, kernel_size=(1, 4), stride=(1, 1), padding=(0, 1))
    (v4): Conv2d(6, 16, kernel_size=(4, 1), stride=(1, 1), padding=(1, 0))
    (k2): Conv2d(6, 16, kernel_size=(2, 2), stride=(1, 1))
    (shrink_h): AvgPool2d(kernel_size=(2, 1), stride=(1, 1), padding=0)
    (shrink_w): AvgPool2d(kernel_size=(1, 2), stride=(1, 1), padding=0)
    (mix): Conv2d(48, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (res): ResidualBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
    (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (gap): AdaptiveAvgPool2d(output_size=(1, 1))
  (fc): Sequential(
    (0): Linear(in_features=64, out_features=128, bias=True)
    (1): ReLU(inplace=True)
  )
  (policy_head): Linear(in_features=128, out_features=7, bias=True)
  (valu

In [4]:
depths = [1, 2, 3, 4, 5]
n_games = 100   # games played per starting side at each depth


def _describe_side(label: str, res: dict) -> str:
    """Render one side's result dict as the single summary line printed per depth."""
    return (
        f"  {label}  wr={res['win_rate']:.3f}, "
        f"dr={res['draw_rate']:.3f}, score={res['score_rate']:.3f} "
        f"(W/L/D={res['wins']}/{res['losses']}/{res['draws']})"
    )


rows = []

for d in depths:
    # Settings common to both runs at this depth.
    shared = dict(depth=d, n_games=n_games, temperature=0.0, use_tqdm=True)

    # Agent moves first ...
    res_start = play_vs_lookahead_fixed_side(
        policy, agent_starts=True, seed=42, tqdm_desc=f"L{d} A-first", **shared
    )
    # ... then the same depth with the opponent moving first.
    res_second = play_vs_lookahead_fixed_side(
        policy, agent_starts=False, seed=43, tqdm_desc=f"L{d} A-second", **shared
    )

    print(f"\n=== Depth L{d} ===")
    print(_describe_side("Agent starts:", res_start))
    print(_describe_side("Agent second:", res_second))

    # One table row per depth: score rate for each starting side.
    rows.append(
        dict(
            depth=d,
            agent_starts=res_start["score_rate"],
            agent_second=res_second["score_rate"],
        )
    )

df = pd.DataFrame(rows).set_index("depth")

print("\n=== Score rate by depth (1 = perfect, 0.5 = equal, 0 = always losing) ===")
display(df)


L1 A-first:   0%|          | 0/100 [00:00<?, ?it/s]

L1 A-second:   0%|          | 0/100 [00:00<?, ?it/s]


=== Depth L1 ===
  Agent starts:  wr=0.000, dr=0.000, score=0.000 (W/L/D=0/100/0)
  Agent second:  wr=0.000, dr=0.000, score=0.000 (W/L/D=0/100/0)


L2 A-first:   0%|          | 0/100 [00:00<?, ?it/s]

L2 A-second:   0%|          | 0/100 [00:00<?, ?it/s]


=== Depth L2 ===
  Agent starts:  wr=1.000, dr=0.000, score=1.000 (W/L/D=100/0/0)
  Agent second:  wr=0.000, dr=0.000, score=0.000 (W/L/D=0/100/0)


L3 A-first:   0%|          | 0/100 [00:00<?, ?it/s]

L3 A-second:   0%|          | 0/100 [00:00<?, ?it/s]


=== Depth L3 ===
  Agent starts:  wr=0.000, dr=0.000, score=0.000 (W/L/D=0/100/0)
  Agent second:  wr=0.000, dr=0.000, score=0.000 (W/L/D=0/100/0)


L4 A-first:   0%|          | 0/100 [00:00<?, ?it/s]

L4 A-second:   0%|          | 0/100 [00:00<?, ?it/s]


=== Depth L4 ===
  Agent starts:  wr=1.000, dr=0.000, score=1.000 (W/L/D=100/0/0)
  Agent second:  wr=0.000, dr=0.000, score=0.000 (W/L/D=0/100/0)


L5 A-first:   0%|          | 0/100 [00:00<?, ?it/s]

L5 A-second:   0%|          | 0/100 [00:00<?, ?it/s]


=== Depth L5 ===
  Agent starts:  wr=0.000, dr=0.000, score=0.000 (W/L/D=0/100/0)
  Agent second:  wr=0.000, dr=0.000, score=0.000 (W/L/D=0/100/0)

=== Score rate by depth (1 = perfect, 0.5 = equal, 0 = always losing) ===


Unnamed: 0_level_0,agent_starts,agent_second
depth,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,0.0
2,1.0,0.0
3,0.0,0.0
4,1.0,0.0
5,0.0,0.0
