In [1]:
class TTTEnv:
    """Minimal two-player Tic-Tac-Toe environment.

    The board is a flat list of 9 cells in row-major order (index 0 is the
    top-left cell, index 8 the bottom-right). A cell holds 0 (empty),
    1 (X) or -1 (O). X moves first and the side to move flips automatically
    after every successful ``step``.
    """

    # Index triples that make a line; built once instead of on every check.
    WINNING_COMBINATIONS = (
        (0, 1, 2),
        (3, 4, 5),
        (6, 7, 8),  # rows
        (0, 3, 6),
        (1, 4, 7),
        (2, 5, 8),  # columns
        (0, 4, 8),
        (2, 4, 6),  # diagonals
    )

    def __init__(self):
        self.board = [0 for _ in range(9)]
        # 0: empty, x: 1, o: -1
        self.current_player = 1  # can be 1 (X) or -1 (O)

    def reset(self) -> list:
        """Start a new game and return the (empty) board.

        Both the board and the side to move are restored, so every episode
        begins with X regardless of how the previous one ended.

        Returns:
            The live board list (not a copy); later ``step`` calls mutate it.
        """
        self.board = [0 for _ in range(9)]
        # Without this, a game that ended after an odd number of moves would
        # hand the first move of the next game to O.
        self.current_player = 1
        return self.board

    def step(self, action: int) -> tuple:
        """Place the current player's mark on cell ``action``.

        Args:
            action: Board index in ``0..8``.

        Returns:
            A ``(board, reward, done)`` tuple. ``board`` is the live board
            list (not a copy). ``reward`` is the winner's mark as reported by
            ``check_winner`` (1 for X, -1 for O, 0 for none). ``done`` is True
            when someone has won or no empty cell remains.

        Raises:
            ValueError: If ``action`` is outside the board or the cell is
                already occupied. The state is left untouched in both cases.
        """
        # Reject out-of-range indices explicitly: a negative index would
        # otherwise wrap around and silently play a different cell.
        if not 0 <= action < len(self.board):
            raise ValueError("Invalid action: Cell index out of range")
        if self.board[action] != 0:
            raise ValueError("Invalid action: Cell already occupied")

        self.board[action] = self.current_player
        self.current_player *= -1

        # The reward is read from the board, so it is unaffected by the
        # player flip above.
        reward = self.check_winner()
        done = reward != 0 or all(cell != 0 for cell in self.board)
        return self.board, reward, done

    def check_winner(self) -> int:
        """Return 1 if X has three in a line, -1 if O has, otherwise 0."""
        for a, b, c in self.WINNING_COMBINATIONS:
            if self.board[a] == self.board[b] == self.board[c] != 0:
                return self.board[a]
        return 0  # No winner yet

    def render(self) -> None:
        """Print the board as three rows of X / O / blank, then a separator."""
        symbols = {1: "X", -1: "O", 0: " "}
        print(
            "\n".join(
                " | ".join(symbols[self.board[i]] for i in range(row, row + 3))
                for row in range(0, 9, 3)
            )
        )
        print("-" * 9)

    def available_actions(self) -> list:
        """Return the indices of all empty cells, in ascending order."""
        return [i for i, cell in enumerate(self.board) if cell == 0]


In [12]:
# Smoke test for TTTEnv: start from an empty board and fill the free cells in
# a random order (X and O alternate automatically) until the game is over.
import random

env = TTTEnv()
env.reset()
env.render()

actions = env.available_actions()
print("Available actions:", actions)

# Sampling the whole list without replacement yields a random permutation of
# the free cells, so no cell is attempted twice.
move_order = random.sample(actions, len(actions))
for action in move_order:
    try:
        board, reward, done = env.step(action)
        env.render()
        print("Action:", action, "Reward:", reward, "Done:", done)
    except ValueError as e:
        # step() signals a rejected move with ValueError; report it and go on.
        print(e)
        continue
    if done:
        break

  |   |  
  |   |  
  |   |  
---------
Available actions: [0, 1, 2, 3, 4, 5, 6, 7, 8]
  |   |  
  |   | X
  |   |  
---------
Action: 5 Reward: 0 Done: False
  |   |  
  | O | X
  |   |  
---------
Action: 4 Reward: 0 Done: False
  |   |  
  | O | X
  |   | X
---------
Action: 8 Reward: 0 Done: False
  |   |  
O | O | X
  |   | X
---------
Action: 3 Reward: 0 Done: False
  | X |  
O | O | X
  |   | X
---------
Action: 1 Reward: 0 Done: False
  | X |  
O | O | X
O |   | X
---------
Action: 6 Reward: 0 Done: False
  | X |  
O | O | X
O | X | X
---------
Action: 7 Reward: 0 Done: False
  | X | O
O | O | X
O | X | X
---------
Action: 2 Reward: -1 Done: True
