In [1]:
import numpy as np
import random

class TicTacToe:
    """A 3x3 Tic-Tac-Toe board with win and draw detection.

    Cells hold 0 (empty), 1 (Player 1, X) or -1 (Player 2, O).
    """

    def __init__(self):
        # 0: empty, 1: Player 1 (X), -1: Player 2 (O)
        self.board = np.zeros((3, 3), dtype=int)
        # True once a line has been completed or the board is full.
        self.done = False
        # None while undecided; 1 or -1 for the winning player; 0 for a draw.
        self.winner = None

    def reset(self):
        """Clear the board and the game status.

        Returns:
            The fresh board as a (3, 3) integer array. Note that this array is
            not hashable; use get_state() when a dictionary key is needed.
        """
        self.board = np.zeros((3, 3), dtype=int)
        self.done = False
        self.winner = None
        return self.board

    def is_valid_move(self, row, col):
        """Return whether the cell at (row, col) is still empty."""
        return self.board[row, col] == 0

    def make_move(self, row, col, player):
        """Place ``player`` (1 or -1) at (row, col) if that cell is empty.

        After a successful placement the win/draw status is re-evaluated.

        Returns:
            True if the mark was placed, False if the cell was occupied.
        """
        if self.is_valid_move(row, col):
            self.board[row, col] = player
            self.check_winner()
            return True
        return False

    def check_winner(self):
        """Update ``done`` and ``winner`` from the current board.

        A line summing to +3 is a win for Player 1 and a line summing to -3 is
        a win for Player 2. If no line is complete and no cell is empty, the
        game is a draw (``winner == 0``).
        """
        line_totals = []
        # Rows and columns, interleaved by index.
        for i in range(3):
            line_totals.append(int(self.board[i, :].sum()))
            line_totals.append(int(self.board[:, i].sum()))
        # Main diagonal, then anti-diagonal.
        line_totals.append(int(sum(self.board[i, i] for i in range(3))))
        line_totals.append(int(sum(self.board[i, 2 - i] for i in range(3))))

        for total in line_totals:
            if abs(total) == 3:
                self.done = True
                # Take the winner from the sign of the completed line itself,
                # so an anti-diagonal win is attributed to the right player.
                self.winner = 1 if total == 3 else -1
                return

        if not (self.board == 0).any():
            self.done = True
            self.winner = 0  # Draw

    def get_state(self):
        """Return the board flattened row by row as a hashable 9-tuple."""
        return tuple(self.board.reshape(9))


In [2]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy action policy.

    Q-values are stored in a dict keyed by ``(state, action)``; pairs that
    have never been updated are treated as 0.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=1.0, epsilon_decay=0.99, epsilon_min=0.0):
        """Create an agent with an empty Q-table.

        Args:
            alpha: Learning rate.
            gamma: Discount factor.
            epsilon: Initial exploration rate.
            epsilon_decay: Factor applied to epsilon after every update.
            epsilon_min: Lower bound for epsilon. The default of 0.0 leaves the
                decay unbounded, as before.
        """
        self.q_table = {}
        self.alpha = alpha      # Learning rate
        self.gamma = gamma      # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

    @staticmethod
    def _state_key(state):
        """Return a hashable key for ``state``.

        Hashable states are used unchanged. Unhashable ones (for example the
        board array returned by ``TicTacToe.reset()``) are flattened into a
        tuple, which matches the 9-tuple form produced by ``get_state()``.
        """
        try:
            hash(state)
        except TypeError:
            return tuple(np.ravel(state).tolist())
        return state

    def get_q_value(self, state, action):
        """Return the stored Q-value for (state, action), or 0 if unseen."""
        return self.q_table.get((self._state_key(state), action), 0)

    def choose_action(self, state, available_actions):
        """Pick an action from ``available_actions`` epsilon-greedily.

        With probability epsilon a uniformly random action is returned;
        otherwise one of the actions with the highest Q-value is returned,
        with ties broken at random.
        """
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(available_actions)  # Explore
        q_values = [self.get_q_value(state, action) for action in available_actions]
        max_q = max(q_values)
        max_actions = [action for action, q in zip(available_actions, q_values) if q == max_q]
        return random.choice(max_actions)  # Exploit

    def update_q_table(self, state, action, reward, next_state, next_available_actions):
        """Apply one Q-learning update and decay epsilon.

        Args:
            state: State in which ``action`` was taken.
            action: The action taken.
            reward: Reward observed after the action.
            next_state: Resulting state.
            next_available_actions: Actions available in ``next_state``; an
                empty sequence makes the bootstrap term 0.
        """
        max_next_q = max(
            (self.get_q_value(next_state, a) for a in next_available_actions),
            default=0,
        )
        current_q = self.get_q_value(state, action)
        new_q = current_q + self.alpha * (reward + self.gamma * max_next_q - current_q)
        self.q_table[(self._state_key(state), action)] = new_q
        # Epsilon decays once per update (i.e. per move), bounded below by epsilon_min.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


In [3]:
def train(agent, game, episodes=10000):
    """Train ``agent`` by letting it play both sides of ``game``.

    Each episode starts from a reset board with Player 1 to move. On every
    turn the agent picks a move for the current player, the move is applied,
    and the agent's Q-table is updated with the observed reward.

    Args:
        agent: Object providing ``choose_action(state, actions)`` and
            ``update_q_table(state, action, reward, next_state, next_actions)``.
        game: Object providing ``reset()``, ``get_state()``,
            ``is_valid_move(row, col)``, ``make_move(row, col, player)`` and the
            ``done`` / ``winner`` attributes.
        episodes: Number of games to play.
    """
    all_cells = [(i, j) for i in range(3) for j in range(3)]

    def open_cells():
        # Cells that are currently empty on the board.
        return [cell for cell in all_cells if game.is_valid_move(*cell)]

    for _ in range(episodes):
        game.reset()
        # Use get_state() rather than reset()'s return value: the state is used
        # as part of a Q-table dictionary key and therefore has to be hashable.
        state = game.get_state()
        current_player = 1  # Player 1 starts
        while not game.done:
            available_actions = open_cells()
            action = agent.choose_action(state, available_actions)
            game.make_move(*action, current_player)
            next_state = game.get_state()
            # NOTE(review): right after its own move the mover cannot have
            # lost, so the -1 branch is reached only for a draw (winner == 0).
            reward = 1 if game.winner == current_player else 0 if game.winner is None else -1
            # Bootstrap only over moves that are legal in next_state; a finished
            # game has no follow-up moves.
            next_available_actions = [] if game.done else open_cells()
            agent.update_q_table(state, action, reward, next_state, next_available_actions)
            state = next_state
            current_player = -current_player

In [4]:
def test(agent, game, games=100):
    wins, draws, losses = 0, 0, 0
    for _ in range(games):
        state = game.reset()
        current_player = 1
        while not game.done:
            available_actions = [(i, j) for i in range(3) for j in range(3) if game.is_valid_move(i, j)]
            if current_player == 1:
                action = agent.choose_action(state, available_actions)
            else:
                action = random.choice(available_actions)
            game.make_move(*action, current_player)
            state = game.get_state()
            current_player = -current_player

        if game.winner == 1:
            wins += 1
        elif game.winner == -1:
            losses += 1
        else:
            draws += 1

    print(f"Wins: {wins}, Draws: {draws}, Losses: {losses}")

In [7]:
# Baseline evaluation: the agent is constructed fresh here and no train() call
# appears in this cell, so its Q-table is empty when it is tested.
agent = QLearningAgent()
game = TicTacToe()
# Plays 100 games with the agent as Player 1 against a random opponent and
# prints the win/draw/loss tally.
test(agent, game, games=100)

Wins: 51, Draws: 15, Losses: 34
