In [10]:
import numpy as np
import random

class TicTacToe:
    def __init__(self, state=None):
        self.board = np.zeros((3, 3)) if state is None else np.array(state)
        self.players = ["X", "O"]
        self.current_player = self.players[0]
        self.winner = None
        self.game_over = False

    def make_move(self, move):
        if self.board[move[0]][move[1]] != 0:
            return self.board, -10, self.game_over  # Invalid move penalty
        self.board[move[0]][move[1]] = self.players.index(self.current_player) + 1
        self.check_winner()
        reward = 1 if self.winner else 0
        self.switch_player()
        return self.board, reward, self.game_over

    def reset(self):
        self.board = np.zeros((3, 3))
        self.current_player = self.players[0]
        self.winner = None
        self.game_over = False

    def available_moves(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i][j] == 0]

    def switch_player(self):
        self.current_player = self.players[1] if self.current_player == self.players[0] else self.players[0]

    def check_winner(self):
        for i in range(3):
            if self.board[i][0] == self.board[i][1] == self.board[i][2] != 0:
                self.winner = self.players[int(self.board[i][0] - 1)]
                self.game_over = True
        for j in range(3):
            if self.board[0][j] == self.board[1][j] == self.board[2][j] != 0:
                self.winner = self.players[int(self.board[0][j] - 1)]
                self.game_over = True
        if self.board[0][0] == self.board[1][1] == self.board[2][2] != 0:
            self.winner = self.players[int(self.board[0][0] - 1)]
            self.game_over = True
        if self.board[0][2] == self.board[1][1] == self.board[2][0] != 0:
            self.winner = self.players[int(self.board[0][2] - 1)]
            self.game_over = True
        if not any(0 in row for row in self.board):
            self.game_over = True


In [6]:
class QLearningAgent:
    def __init__(self, alpha, epsilon, discount_factor):
        self.Q = {}
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount_factor = discount_factor

    def get_Q_value(self, state, action):
        if (tuple(state.flatten()), action) not in self.Q:
            self.Q[(tuple(state.flatten()), action)] = 0.0
        return self.Q[(tuple(state.flatten()), action)]

    def choose_action(self, state, available_moves):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(available_moves)
        else:
            Q_values = [self.get_Q_value(state, action) for action in available_moves]
            max_Q = max(Q_values)
            best_moves = [action for action, Q_value in zip(available_moves, Q_values) if Q_value == max_Q]
            return random.choice(best_moves)

    def update_Q_value(self, state, action, reward, next_state, next_available_moves):
        current_Q = self.get_Q_value(state, action)
        if next_available_moves:
            next_Q = max([self.get_Q_value(next_state, next_action) for next_action in next_available_moves])
        else:
            next_Q = 0
        self.Q[(tuple(state.flatten()), action)] = current_Q + self.alpha * (reward + self.discount_factor * next_Q - current_Q)


In [7]:
def train(num_episodes, alpha, epsilon, discount_factor):
    agent = QLearningAgent(alpha, epsilon, discount_factor)
    for _ in range(num_episodes):
        game = TicTacToe()
        state = game.board
        while not game.game_over:
            available_moves = game.available_moves()
            action = agent.choose_action(state, available_moves)
            next_state, reward, game_over = game.make_move(action)
            agent.update_Q_value(state, action, reward, next_state, game.available_moves())
            state = next_state
    return agent


In [8]:
def test(agent, num_games):
    num_wins = 0
    num_draws = 0
    num_losses = 0

    for _ in range(num_games):
        game = TicTacToe()
        state = game.board
        while not game.game_over:
            if game.current_player == "X":  # Assume agent is always "X"
                action = agent.choose_action(state, game.available_moves())
            else:
                action = random.choice(game.available_moves())  # Random opponent

            state, reward, game_over = game.make_move(action)

        if game.winner == "X":  # Agent wins
            num_wins += 1
        elif game.winner == "O":  # Agent loses
            num_losses += 1
        else:  # No winner (draw)
            num_draws += 1

    win_rate = num_wins / num_games * 100
    draw_rate = num_draws / num_games * 100
    loss_rate = num_losses / num_games * 100

    return win_rate, draw_rate, loss_rate


In [11]:
# Parameters
num_episodes = 100000
alpha = 0.5
epsilon = 0.1
discount_factor = 1.0
num_games = 1000

# Train the Q-learning agent
agent = train(num_episodes, alpha, epsilon, discount_factor)

# Test the Q-learning agent
win_rate, draw_rate, loss_rate = test(agent, num_games)

# Display results
print(f"Win Rate: {win_rate:.2f}%")
print(f"Draw Rate: {draw_rate:.2f}%")
print(f"Loss Rate: {loss_rate:.2f}%")


Win Rate: 59.50%
Draw Rate: 13.70%
Loss Rate: 26.80%
