In [14]:
import gymnasium as gym
import numpy as np
from typing import Any, Dict, List, Tuple, Optional
from ray.rllib.env.multi_agent_env import MultiAgentEnv

In [None]:
from pettingzoo.atari import othello_v3

In [11]:
class OthelloEnv(MultiAgentEnv):
    """Simplified two-player Othello board exposed as an RLlib ``MultiAgentEnv``.

    Board encoding: 0 = empty, 1 = piece of ``"agent_1"``, 2 = piece of any
    other agent id (in practice ``"agent_2"``).

    Actions ``0..63`` address a square as ``row * 8 + col``. Action ``64`` is
    inside the declared ``Discrete(65)`` space but does not map to a square;
    it is treated as a no-op "pass" (assumption about the intended meaning of
    the extra action -- TODO confirm).

    Known limitations (unchanged from the original implementation): pieces are
    never flipped, the only legality check is "target square is empty", and
    turn order is not enforced.
    """

    # Side length of the square board.
    _SIZE = 8
    # Index of the extra action that does not address a board square.
    _PASS_ACTION = _SIZE * _SIZE
    # Characters used by ``render`` for each board value.
    _PIECE_CHARS = {0: '.', 1: 'X', 2: 'O'}

    def __init__(self, config: Dict[str, Any]):
        """Create the spaces and set up the initial position.

        Args:
            config: Environment config dict; currently unused.
        """
        super().__init__()

        self.action_space = gym.spaces.Discrete(8*8+1)
        self.observation_space = gym.spaces.Box(low=0, high=2, shape=(8, 8), dtype=np.int8)

        self.reset()

    def _observations(self) -> Dict[str, Any]:
        """Return per-agent board views; ``agent_2`` sees the board flipped on both axes."""
        return {"agent_1": self.board.copy(), "agent_2": np.flip(self.board.copy())}

    def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None):
        """Restore the standard four-piece opening position.

        Args:
            seed: Accepted for API compatibility; unused (the env is deterministic).
            options: Accepted for API compatibility; unused.

        Returns:
            ``(observations, info)`` where ``observations`` maps each agent id
            to an 8x8 ``int8`` array and ``info`` is an empty dict.
        """
        self.board = np.zeros((self._SIZE, self._SIZE), dtype=np.int8)
        # Initial position: two pieces per player on the central diagonals.
        self.board[3][3] = 1
        self.board[3][4] = 2
        self.board[4][3] = 2
        self.board[4][4] = 1
        info = {}
        return self._observations(), info

    def step(self, action_dict: Dict[str, int]):
        """Apply every submitted action and report the resulting state.

        Args:
            action_dict: Maps agent id to an action in ``0..64``.

        Returns:
            ``(observations, rewards, dones, infos)``. Rewards are the piece
            count difference from each agent's point of view, except that an
            agent who played onto an occupied square receives ``-1`` instead.
            ``dones`` only contains ``"__all__"``, true once the board is full.

        NOTE(review): ``reset`` follows the gymnasium ``(obs, info)``
        convention while this method keeps the original 4-tuple return.
        Gymnasium-based RLlib versions expect separate ``terminated`` and
        ``truncated`` dicts -- confirm the target RLlib version.
        """
        illegal_movers = set()
        for agent, action in action_dict.items():
            assert action in range(8*8+1), f"Invalid action {action} for agent {agent}"
            if action == self._PASS_ACTION:
                # The extra action has no square; previously this raised IndexError.
                continue
            row, col = divmod(action, self._SIZE)
            if self.board[row][col] == 0:
                self.board[row][col] = 1 if agent == "agent_1" else 2
            else:
                illegal_movers.add(agent)

        # Base reward: difference in the number of pieces, as plain ints.
        count_1 = int(np.sum(self.board == 1))
        count_2 = int(np.sum(self.board == 2))
        reward = {"agent_1": count_1 - count_2, "agent_2": count_2 - count_1}

        # Apply the penalty AFTER the base reward so it is not overwritten.
        for agent in illegal_movers:
            if agent in reward:
                reward[agent] = -1

        # The game is over when all squares are filled.
        # TODO: also end the game when neither player has a valid move.
        done = {"__all__": bool(np.all(self.board != 0))}

        # No additional info to supply
        info = {"agent_1": {}, "agent_2": {}}

        return self._observations(), reward, done, info

    def render(self):
        """Print the board as text: '.' empty, 'X' player 1, 'O' player 2."""
        lines = [
            '|'.join(self._PIECE_CHARS[int(cell)] for cell in row)
            for row in self.board
        ]
        # Every row ends with a newline; print() then adds one more blank line.
        board_str = ''.join(line + '\n' for line in lines)
        print(board_str)

In [12]:
# Smoke test: build the environment with an empty config and show what
# reset() returns for the opening position.
env = OthelloEnv(config={})
env.reset()

{'agent_1': array([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 2, 0, 0, 0],
        [0, 0, 0, 2, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]], dtype=int8),
 'agent_2': array([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 2, 0, 0, 0],
        [0, 0, 0, 2, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]], dtype=int8)}

In [13]:
# Scratch check: display the NumPy type object passed as `dtype` for the board.
np.int8

numpy.int8

In [16]:
# Scratch check: decode flat action 32 into (row, col) on an 8-wide board.
divmod(32, 8)

(4, 0)

In [17]:
# Scratch check: decode flat action 54 into (row, col) on an 8-wide board.
divmod(54, 8)

(6, 6)