In [2]:
import numpy as np
import random

In [26]:
class Environment:
    """Tic-tac-toe environment: the caller plays against a random opponent.

    The board is a 3x3 integer array in which -1 marks an empty cell and
    0 / 1 mark cells taken by player 0 ("X") / player 1 ("O"). Actions are
    integers naming a cell in row-major order (``row * 3 + col``).
    """

    def __init__(self):
        self.board = np.full((3, 3), -1)
        self.current_player = 0
        self.observation_space = 9
        self.action_space = 9

    def step(self, move: int):
        """Play ``move`` for the current player, then let the opponent reply.

        The opponent's reply is a uniformly random free cell.

        Returns:
            A tuple ``(next_state, reward, terminated, truncated, info)``.
            ``next_state`` is a *copy* of the board, so observations kept by
            the caller are not modified by later moves. ``reward`` is 1 when
            ``move`` completes a line, -1 when the opponent's reply completes
            one, and 0 otherwise (including a draw). ``truncated`` is always
            False and ``info`` is always an empty dict.

        Raises:
            ValueError: if ``move`` names a cell that is already occupied.
        """
        self.make_move(move, self.current_player)

        if self.is_done():
            # Current player won
            return self.board.copy(), 1, True, False, {}

        self.current_player = self.get_opposite_player()
        other_move = self.get_random_move()
        if other_move is None:
            # Board is full and nobody completed a line: draw
            return self.board.copy(), 0, True, False, {}
        self.make_move(other_move, self.current_player)

        if self.is_done():
            # The other player won
            return self.board.copy(), -1, True, False, {}

        # Hand the turn back to the player who called step().
        self.current_player = self.get_opposite_player()
        # next_state, reward, terminated, truncated, _
        return self.board.copy(), 0, False, False, {}

    def make_move(self, move, player):
        """Mark cell ``move`` (row-major index) as taken by ``player``.

        Raises:
            ValueError: if the cell is not empty.
        """
        coords = (move // 3, move % 3)
        if self.board[coords] != -1:
            raise ValueError("Already occupied")
        self.board[coords] = player

    def get_random_move(self):
        """Return a uniformly random free cell index, or None if the board is full."""
        # flatnonzero walks the board in row-major order, so the positions it
        # returns are already the row * 3 + col action indices.
        available = np.flatnonzero(self.board == -1).tolist()
        if not available:
            return None
        return random.choice(available)

    def reset(self):
        """Clear the board, give the turn to player 0 and return a copy of the board."""
        self.board = np.full((3, 3), -1)
        self.current_player = 0
        return self.board.copy()

    def get_opposite_player(self):
        """Return the id (0 or 1) of the player who is not ``current_player``."""
        return 1 - self.current_player

    def is_done(self):
        """Return True if ``current_player`` owns a full row, column or diagonal.

        Only the current player's marks are checked; a full board without a
        line does not count as done here (``step`` detects draws separately).
        """
        for i in range(3):
            if np.all(self.board[i, :] == self.current_player):
                return True
            if np.all(self.board[:, i] == self.current_player):
                return True
        if np.all(np.diag(self.board) == self.current_player):
            return True
        if np.all(np.diag(np.fliplr(self.board)) == self.current_player):
            return True
        return False

    def sample_action(self):
        """Return a random legal action (None when no cell is free)."""
        return self.get_random_move()

    def print_board(self):
        """Print the board as three ``|``-separated rows, each followed by a rule."""
        for row in self.board:
            print("|".join(self.symbol(cell) for cell in row))
            print("-" * 5)

    def symbol(self, val):
        """Return the display character for a cell value: 0 -> X, 1 -> O, else blank."""
        if val == 0:
            return "X"
        elif val == 1:
            return "O"
        else:
            return " "

In [24]:
# Play one episode with uniformly random moves, printing the board after
# every step together with the reward that step produced.
env = Environment()
state = env.reset()
env.print_board()

done = False
while True:
    action = env.sample_action()
    next_state, reward, terminated, truncated, _ = env.step(action)

    env.print_board()
    print(f"Reward: {reward}")

    # The episode is over once the environment reports either flag.
    done = terminated or truncated
    if done:
        break

 | | 
-----
 | | 
-----
 | | 
-----
 | | 
-----
 | | 
-----
 |O|X
-----
Reward: 0
 | |X
-----
 | |O
-----
 |O|X
-----
Reward: 0
 |X|X
-----
 |O|O
-----
 |O|X
-----
Reward: 0
X|X|X
-----
 |O|O
-----
 |O|X
-----
Reward: 1


In [61]:
# Set random seed
# The environment draws every move from Python's `random` module (not from
# NumPy), so that is the generator to seed for a reproducible run.
# random.seed(0)


def encode_state(board):
    """Map a board with cells in {-1, 0, 1} to a unique integer row index.

    Each cell is shifted to a base-3 digit (0, 1 or 2) and the digits are
    combined positionally, giving a value in ``[0, 3 ** board.size)``.
    """
    digits = np.asarray(board).ravel() + 1
    return int(digits @ (3 ** np.arange(digits.size)))


# Variable creation and initialization
# One row per possible board (3 values for each of the 9 cells), one column
# per action. Indexing Q with the raw 3x3 board would instead treat the cell
# values as row indices and update several unrelated rows at once.
Q = np.zeros((3 ** env.observation_space, env.action_space))

training = True
epsilon = 0.1
alpha = 0.5
gamma = 0.5
done = False
max_episodes = 1000

rewards = []
episodes_count = 0

while training:
    # Perform episode
    state = encode_state(env.reset())
    done = False
    reward_sum = 0
    while not done:
        # Behaviour policy: uniformly random over the free cells. Q-learning
        # is off-policy, so the table is still updated towards greedy values.
        # NOTE: a greedy/epsilon-greedy choice must be restricted to free
        # cells, otherwise env.step raises on an occupied cell.
        action = env.sample_action()

        next_board, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        next_state = encode_state(next_board)

        # Update the action-value estimates
        if terminated:
            # No future return after a terminal state.
            target = reward
        else:
            # Bootstrap only from actions that are legal in the next state.
            legal = np.asarray(next_board).ravel() == -1
            target = reward + gamma * np.max(Q[next_state, legal])
        Q[state, action] += alpha * (target - Q[state, action])

        state = next_state
        reward_sum += reward

    rewards.append(reward_sum)
    episodes_count += 1

    if episodes_count % 10 == 0:
        mean_return = np.mean(rewards[-100:])
        std_return = np.std(rewards[-100:])
        recent_returns = rewards[-10:]
        returns_str = " ".join(map(str, recent_returns))
        print(
            f"Episode {episodes_count}, mean 100-episode return {mean_return:.2f} +-{std_return:.2f}, returns {returns_str}")

    if episodes_count > max_episodes:
        break

Episode 10, mean 100-episode return 0.70 +-0.46, returns 1 1 1 1 1 1 1 0 0 0
Episode 20, mean 100-episode return 0.60 +-0.66, returns 1 1 -1 1 1 -1 1 1 1 0
Episode 30, mean 100-episode return 0.33 +-0.83, returns 1 0 -1 0 1 -1 -1 -1 -1 1
Episode 40, mean 100-episode return 0.38 +-0.80, returns 0 1 1 1 -1 0 1 0 1 1
Episode 50, mean 100-episode return 0.36 +-0.82, returns -1 1 1 1 -1 1 -1 1 0 1
Episode 60, mean 100-episode return 0.28 +-0.84, returns -1 0 1 -1 -1 0 1 -1 1 0
Episode 70, mean 100-episode return 0.27 +-0.84, returns 1 0 -1 1 -1 1 0 1 1 -1
Episode 80, mean 100-episode return 0.26 +-0.86, returns 1 1 1 1 1 1 -1 -1 -1 -1
Episode 90, mean 100-episode return 0.21 +-0.89, returns -1 -1 1 1 1 -1 -1 1 -1 -1
Episode 100, mean 100-episode return 0.27 +-0.88, returns 1 1 1 1 1 -1 1 1 1 1
Episode 110, mean 100-episode return 0.27 +-0.89, returns 1 1 1 1 1 1 1 -1 1 0
Episode 120, mean 100-episode return 0.20 +-0.92, returns -1 -1 -1 1 -1 -1 1 1 -1 1
Episode 130, mean 100-episode return 