In [6]:
import random
import pickle
import numpy as np

class TicTacToe:
    def __init__(self):
        self.reset()

    def reset(self):
        self.board = [' '] * 9
        self.done = False
        self.winner = None
        return self.get_state()

    def get_state(self):
        return ''.join(self.board)

    def available_actions(self):
        return [i for i, v in enumerate(self.board) if v == ' ']

    def make_move(self, action, player):
        if self.board[action] == ' ':
            self.board[action] = player
            self.check_game_over()
            return True
        return False

    def check_game_over(self):
        win_combos = [(0,1,2),(3,4,5),(6,7,8),
                      (0,3,6),(1,4,7),(2,5,8),
                      (0,4,8),(2,4,6)]
        for a, b, c in win_combos:
            if self.board[a] == self.board[b] == self.board[c] != ' ':
                self.winner = self.board[a]
                self.done = True
                return
        if ' ' not in self.board:
            self.done = True

class QAgent:
    def __init__(self, player, epsilon=0.1, alpha=0.5, gamma=0.9):
        self.player = player
        self.q_table = {}  # state -> action values
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.last_state = None
        self.last_action = None

    def choose_action(self, state, actions):
        if random.random() < self.epsilon:
            return random.choice(actions)
        q_values = self.q_table.get(state, np.zeros(9))
        best_action = max(actions, key=lambda x: q_values[x])
        return best_action

    def update(self, state, reward):
        if self.last_state is not None:
            q_values = self.q_table.setdefault(self.last_state, np.zeros(9))
            next_q = self.q_table.get(state, np.zeros(9))
            q_values[self.last_action] += self.alpha * (reward + self.gamma * max(next_q) - q_values[self.last_action])

    def remember(self, state, action):
        self.last_state = state
        self.last_action = action

def train(episodes=10000):
    env = TicTacToe()
    agent_X = QAgent('X')
    agent_O = QAgent('O')

    for ep in range(episodes):
        state = env.reset()
        current_player = agent_X
        other_player = agent_O
        while not env.done:
            actions = env.available_actions()
            action = current_player.choose_action(state, actions)
            env.make_move(action, current_player.player)
            next_state = env.get_state()
            if env.done:
                if env.winner == current_player.player:
                    current_player.update(next_state, 1)
                    other_player.update(next_state, -1)
                elif env.winner is None:
                    current_player.update(next_state, 0.5)
                    other_player.update(next_state, 0.5)
                break
            current_player.remember(state, action)
            current_player.update(next_state, 0)
            state = next_state
            current_player, other_player = other_player, current_player
    return agent_X

def play(agent):
    env = TicTacToe()
    state = env.reset()
    while not env.done:
        print("Current board:")
        print_board(env.board)
        if env.board.count('X') <= env.board.count('O'):
            actions = env.available_actions()
            action = agent.choose_action(state, actions)
            env.make_move(action, 'X')
        else:
            action = int(input("Your move (0-8): "))
            env.make_move(action, 'O')
        state = env.get_state()

    print("Game Over!")
    print_board(env.board)
    if env.winner:
        print(f"Winner: {env.winner}")
    else:
        print("Draw!")

def print_board(board):
    print(f"{board[0]} | {board[1]} | {board[2]}")
    print("-" * 5)
    print(f"{board[3]} | {board[4]} | {board[5]}")
    print("-" * 5)
    print(f"{board[6]} | {board[7]} | {board[8]}")

# Example usage:
trained_agent = train(1000000)


In [7]:
play(trained_agent)


Current board:
  |   |  
-----
  |   |  
-----
  |   |  
Current board:
X |   |  
-----
  |   |  
-----
  |   |  
Current board:
X |   |  
-----
  | O |  
-----
  |   |  
Current board:
X |   | X
-----
  | O |  
-----
  |   |  
Current board:
X | O | X
-----
  | O |  
-----
  |   |  
Current board:
X | O | X
-----
  | O |  
-----
  | X |  
Current board:
X | O | X
-----
O | O |  
-----
  | X |  
Current board:
X | O | X
-----
O | O | X
-----
  | X |  
Current board:
X | O | X
-----
O | O | X
-----
  | X |  
Current board:
X | O | X
-----
O | O | X
-----
  | X | O
Game Over!
X | O | X
-----
O | O | X
-----
X | X | O
Draw!
