In [62]:
# Imports
from ttt.env.state import TTTEnv
from ttt.agents.random import RandomAgent
from ttt.mcts.base import GenericMCTS, MCTSConfig
from ttt.eval.alphazero_factory import alphazero_agent_factory
from ttt.mcts.alphazero_strategy import infer
import torch

In [63]:
# Setup Blunder Board
# Play two moves on a fresh, seeded environment: square 4 (the middle) and
# then square 1 (the top edge), and show the resulting position.
env = TTTEnv(seed=15)
for square in (4, 1):
    env.step(square)
print(env.board)


[[ 0 -1  0]
 [ 0  1  0]
 [ 0  0  0]]


In [85]:
# Load agents
random = RandomAgent()

# One checkpoint per agent; all three share the same search settings
# (100 MCTS simulations, temperature 0.0).
checkpoint_paths = (
    "ttt/runs/run_20251028_112316/checkpoints/alphazero_epoch_5.pt",
    "ttt/runs/run_20251028_161942/checkpoints/alphazero_epoch_5.pt",
    "ttt/runs/run_20251028_173819/checkpoints/alphazero_epoch_25.pt",
)
az_factory_1, az_factory_2, az_factory_3 = [
    alphazero_agent_factory(checkpoint_path=path,
                            mcts_simulations=100,
                            temperature=0.0)
    for path in checkpoint_paths
]

# Each factory is called without arguments to build its agent.
az_1 = az_factory_1()
az_2 = az_factory_2()
az_3 = az_factory_3()

  checkpoint = torch.load(checkpoint_path, map_location=device)


In [86]:
# Run one temperature-0 MCTS search from the current `env` position for each
# loaded agent and print, per agent: the chosen action, the `actions` array,
# the `action_probs` array and the type of `action_probs`.
#
# Every search is built from the agent's own strategy AND its own MCTS config;
# configs must not be mixed across agents.
#
# NOTE(review): the recorded output assigns non-zero probability to squares 1
# and 4, which are occupied on the blunder board. Presumably this cell was last
# run after the `env.reset()` cell further down - confirm which position `env`
# holds before drawing conclusions from these numbers.
searchers = []
for agent in (az_1, az_2, az_3):
    agent.set_temperature(0.0)
    searcher = GenericMCTS(agent.strategy, agent.mcts_config)
    best_action, actions, action_probs = searcher.search(env)
    print(best_action)
    print(actions)
    print(action_probs)
    print(type(action_probs))
    searchers.append(searcher)

# Keep the per-agent search objects available under their original names.
mcts, mcts_2, mcts_3 = searchers

5
[0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0.05714491 0.19483472 0.0518972  0.18113902 0.06778155 0.17128688
 0.05741434 0.15702319 0.06147818]
<class 'numpy.ndarray'>
3
[0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0.00776152 0.27544039 0.00783674 0.27115073 0.00508452 0.25363364
 0.00915799 0.1634235  0.00651096]
<class 'numpy.ndarray'>
5
[0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0.01095626 0.24191925 0.01001736 0.23707223 0.00987254 0.23480858
 0.00929288 0.23538208 0.01067883]
<class 'numpy.ndarray'>


In [73]:
# Query each agent's network directly (no tree search) on the reset board.
env.reset()

obs = env._encode()  # Shape: (5, 3, 3)
legal_mask = env.legal_actions_mask()  # Shape: (3, 3)

raw_outputs = []
for agent in (az_1, az_2, az_3):
    strategy = agent.strategy

    # Build batched tensors (leading batch dimension of 1) on this agent's device;
    # the legal mask is flattened before batching.
    obs_tensor = torch.from_numpy(obs).unsqueeze(0).float().to(strategy.device)
    legal_mask_tensor = torch.from_numpy(legal_mask.flatten()).unsqueeze(0).bool().to(strategy.device)

    net = agent.strategy.network
    raw_outputs.append(net.predict_priors_value(obs_tensor, legal_mask_tensor))

# Unpack the (policy, value) pair returned for each agent, in agent order.
(policy_1, value_1), (policy_2, value_2), (policy_3, value_3) = raw_outputs

In [74]:
# Show the board the networks were evaluated on, followed by each agent's
# raw policy and value output in agent order (1, 2, 3).
print(env.board)

for policy, value in ((policy_1, value_1), (policy_2, value_2), (policy_3, value_3)):
    print(policy)
    print(value)

[[0 0 0]
 [0 0 0]
 [0 0 0]]
tensor([[0.0571, 0.1948, 0.0519, 0.1811, 0.0678, 0.1713, 0.0574, 0.1570, 0.0615]])
tensor([0.0251])
tensor([[0.0095, 0.2340, 0.0080, 0.1981, 0.0063, 0.2913, 0.0106, 0.2325, 0.0098]])
tensor([0.0073])
tensor([[0.0068, 0.2018, 0.0060, 0.2743, 0.0061, 0.2078, 0.0088, 0.2813, 0.0071]])
tensor([-0.0268])


It looks like my model is doing the opposite of learning. It seems motivated to put itself in bad positions by preferring edges over corners and the middle, and it continues to drive further in this direction with additional training.

You can replicate these findings by calling env.step(4), which simply takes the middle square for X. If O then takes an edge, it faces a forced loss; if it takes a corner, it can still tie. The model is "learning" to take the edge in this case, which is the opposite of the conclusion I would expect it to reach.

In [80]:
import json

# Load the recorded self-play games for epoch 1 of run 20251028_161942.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
training_games_path = 'ttt/runs/run_20251028_161942/training_ui_data/training_games_epoch_1.json'
with open(training_games_path, 'r', encoding='utf-8') as file:
    training_data = json.load(file)

# NOTE(review): this displays the entire loaded object; the 'meta' entry
# reports 1600 games, so consider showing training_data['meta'] only.
training_data


{'meta': {'epoch': 1,
  'timestamp': '2025-10-28_16-24-35',
  'total_examples': 13960,
  'total_games': 1600,
  'mcts_simulations': 200,
  'use_symmetry_augmentation': False,
  'temperature_threshold': 2},
 'games': [{'game_id': 0,
   'moves': [{'move_number': 1,
     'state': [[[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]],
      [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
      [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
      [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]],
      [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]],
     'policy': [0.004484304932735426,
      0.11061285500747384,
      0.007473841554559043,
      0.5949177877428998,
      0.007473841554559043,
      0.12257100149476831,
      0.029895366218236172,
      0.09267563527653214,
      0.029895366218236172],
     'value': 0.0,
     'agent_value': 0.025105619803071022,
     'player': 1,
     'piece_count': 0,
     'policy_max': 0.5949177877428998,
     'policy_sum': 1.0},
    {'m

In [89]:
# Tally the outcomes of the loaded training games: a 'winner' of 0 is counted
# as a draw, 1 as a win for X, and any other value as a win for O.
total = x_win = o_win = draw = 0
for game in training_data['games']:
    winner = game['winner']
    total += 1
    # Each comparison is a bool, which adds as 0 or 1.
    draw += winner == 0
    x_win += winner == 1
    o_win += winner not in (0, 1)

print(f'Total number of games: {total}\nTotal Wins by X: {x_win}\nTotal Wins by O: {o_win}\nTotal Draws: {draw}')

Total number of games: 1600
Total Wins by X: 112
Total Wins by O: 216
Total Draws: 1272


In [88]:
# Load a recorded tournament series and find the seed of the first game that
# side "A" lost (presumably A is the AZ-Epoch25 agent named first in the file
# name - confirm against the tournament writer).
with open('ttt/tournaments/series_AZ-Epoch25_vs_RandomAgent_2025-10-28_18-18-46.json', 'r', encoding='utf-8') as file:
    testing_data = json.load(file)

# Start from None so that a series without any loss yields None rather than a
# NameError (fresh kernel) or a stale value left over from an earlier run.
bad_game = None
for game in testing_data['games']:
    # A loss for A is a 'winner' of -1 when A played X, and 1 otherwise.
    losing_result = -1 if game['a_is_x'] else 1
    if game['winner'] == losing_result:
        bad_game = game['seed']
        break

bad_game

35