In [2]:
import numpy as np
import random

In [141]:
class Environment:
    def __init__(self):
        self.board = np.full((3, 3), -1)
        self.current_player = 0  # 0 = X (human), 1 = O (agent)
        self.observation_space = 3 ** 9
        self.action_space = 9

    def step(self, move: int):
        self.make_move(move, self.current_player)

        win_current = self.is_done(self.current_player)
        win_opponent = self.is_done(self.get_opposite_player(self.current_player))
        rew = 1 if win_current else -1 if win_opponent else 0

        if not win_current and not win_opponent and np.all(self.board != -1):
            # Board full => draw
            return self.get_state(), 0, True, False, {}

        self.current_player = self.get_opposite_player(self.current_player)
        return self.get_state(), rew, done, False, {}

    def make_move(self, move, player):
        if self.board[move // 3, move % 3] != -1:
            raise ValueError("Invalid move: position already occupied.")
        coords = (move // 3, move % 3)
        self.board[coords] = player

    def get_random_move(self):
        available_coords = list(zip(*np.where(self.board == -1)))
        if not available_coords:
            return None
        return random.choice([r * 3 + c for r, c in available_coords])

    def agent_move(self):
        move = self.get_random_move()
        if move is not None:
            return self.step(move)
        return self.get_state(), 0, True, False, {}

    def get_valid_moves(self):
        available_coords = list(zip(*np.where(self.board == -1)))
        available = [r * 3 + c for r, c in available_coords]
        moves = np.zeros(self.action_space)
        for move in available:
            moves[move] = 1
        return moves

    def reset(self):
        self.board = np.full((3, 3), -1)
        self.current_player = 0
        return self.get_state()

    def get_opposite_player(self, player):
        return 1 - player

    def is_done(self, player):
        for i in range(3):
            if np.all(self.board[i, :] == player) or np.all(self.board[:, i] == player):
                return True
        if np.all(np.diag(self.board) == player) or np.all(np.diag(np.fliplr(self.board)) == player):
            return True
        return False

    def sample_action(self):
        return self.get_random_move()

    def print_board(self):
        for row in self.board:
            print("|".join(self.symbol(cell) for cell in row))
            print("-" * 5)

    def symbol(self, val):
        if val == 0:
            return "X"
        elif val == 1:
            return "O"
        else:
            return " "

    def get_state(self):
        flat = ((np.array(self.board).flatten()) + 1).astype(int)  # 0=empty, 1=X, 2=O
        state = 0
        for i in range(9):
            state += flat[i] * (3 ** i)
        return state

In [129]:
env = Environment()
state = env.reset()

done = False
# env.print_board()

while not done:
    action = env.sample_action()
    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated

    print(next_state)

    # env.print_board()
    # print(f"Reward: {reward}")

6561
8019
8262
8424
8425
8443
8470
8476
10663


In [142]:
# Set random seed
# np.random.seed()

# Variable creation and initialization
Q = np.zeros((env.observation_space, env.action_space))

training = True
epsilon = 0.1
alpha = 0.5
gamma = 0.5
done = False

rewards = []
episodes_count = 0

while training:
    # Perform episode
    state = env.reset()

    done = False
    reward_sum = 0
    while not done:
        if np.random.rand() < epsilon:
            action = env.sample_action()
        else:
            available_moves = env.get_valid_moves()
            q_values = Q[state]
            q_filtered = np.where(available_moves > 0, q_values, -np.inf)
            action = np.argmax(q_filtered)

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Update the action-value estimates
        Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])

        state = next_state
        reward_sum += reward

        if done:
            break

        next_state, reward, terminated, truncated, _ = env.agent_move()
        done = done or terminated or truncated
        state = next_state

    rewards.append(reward_sum)
    episodes_count += 1

    if episodes_count % 10 == 0:
        mean_return = np.mean(rewards[-100:])
        std_return = np.std(rewards[-100:])
        recent_returns = rewards[-10:]
        returns_str = " ".join(map(str, recent_returns))
        print(
            f"Episode {episodes_count}, mean 100-episode return {mean_return:.2f} +-{std_return:.2f}, returns {returns_str}")

    if episodes_count > 100000:
        break


Episode 10, mean 100-episode return 1.10 +-0.83, returns 1 1 0 2 2 2 2 1 0 0
Episode 20, mean 100-episode return 1.00 +-0.89, returns 0 0 2 0 0 0 1 2 2 2
Episode 30, mean 100-episode return 0.90 +-0.91, returns 0 2 0 2 0 0 0 2 1 0
Episode 40, mean 100-episode return 0.93 +-0.91, returns 2 2 1 2 1 2 0 0 0 0
Episode 50, mean 100-episode return 0.88 +-0.91, returns 0 2 0 0 0 0 2 2 0 1
Episode 60, mean 100-episode return 0.90 +-0.93, returns 2 2 2 0 2 0 0 0 2 0
Episode 70, mean 100-episode return 0.96 +-0.93, returns 0 1 2 2 2 0 2 2 2 0
Episode 80, mean 100-episode return 0.99 +-0.93, returns 0 2 2 0 1 2 2 1 2 0
Episode 90, mean 100-episode return 1.00 +-0.92, returns 1 1 0 2 2 0 2 2 0 1
Episode 100, mean 100-episode return 1.00 +-0.93, returns 2 0 2 2 0 0 2 2 0 0
Episode 110, mean 100-episode return 0.98 +-0.94, returns 0 2 2 0 2 0 1 0 0 2
Episode 120, mean 100-episode return 1.02 +-0.94, returns 1 0 2 2 2 2 0 0 2 2
Episode 130, mean 100-episode return 1.06 +-0.94, returns 2 0 2 1 2 2 2 0

KeyboardInterrupt: 

In [None]:
env = Environment(human=True)

env.reset()
while not done:
    human_move = int(input("Enter your move (0-8): "))

    available_moves = env.get_valid_moves()
    q_values = Q[state]
    q_filtered = np.where(available_moves > 0, q_values, -np.inf)
    action = np.argmax(q_filtered)