Import required libraries

In [None]:
import random
import numpy as np
from collections import defaultdict

Initialize Hyperparameters

In [14]:
alpha = 0.1
gamma = 0.9
epsilon = 0.1
epoch= 10000000000

Function to Check Winner in Tic-Tac-Toe

In [None]:
def check_winner(board):
    lines = [(0,1,2),(3,4,5),(6,7,8),
             (0,3,6),(1,4,7),(2,5,8),
             (0,4,8),(2,4,6)]
    for i,j,k in lines:
        if board[i] == board[j] == board[k] and board[i] != " ":
            return 1 if board[i] == "x" else -1
    if " " not in board:
        return 0
    return None

Initialize Q-Table

In [None]:
Q_table = defaultdict(lambda: np.zeros(9))

Epsilon-Greedy Action Selection

In [None]:
def choose_action(state, available_moves, epsilon=0.1):
   # With probability epsilon → explore (random move)
    if random.random() < epsilon:
        return random.choice(available_moves)
    state=tuple(state)
    q_vals = Q_table[state]
    return max(available_moves, key=lambda a: q_vals[a])

Training Function for Q-Learning Agent in Tic-Tac-Toe

In [None]:
def train(epoch, alpha, gamma, epsilon):
    for i in range(epoch):
        # Start with an empty board
        state = [" "] * 9
        done = False

        while not done:
            available_moves = [i for i in range(9) if state[i] == " "]
            if not available_moves:
                break  # No moves left

            # Save current state (as tuple for Q-table key)
            prev_state = tuple(state)

            # Agent (X) move
            actionx = choose_action(prev_state, available_moves, epsilon)
            state[actionx] = "x"

            # Check if agent’s move ends the game
            reward = check_winner(state)
            if reward is not None:
                old_val = Q_table[prev_state][actionx]
                # Update Q-value using  reward
                Q_table[prev_state][actionx] = old_val + alpha * (reward - old_val)
                done = True
                break

            #  Opponent (O) random move
            available_opp = [i for i in range(9) if state[i] == " "]
            opp_move = random.choice(available_opp)
            state[opp_move] = "o"

            # Check result after opponent’s move
            reward = check_winner(state)
            next_state = tuple(state)

            old_val = Q_table[prev_state][actionx]
            if reward is not None:
                Q_table[prev_state][actionx] = old_val + alpha * (reward - old_val)
                done = True
            else:
                # use Bellman update
                next_max = np.max(Q_table[next_state])
                Q_table[prev_state][actionx] = old_val + alpha * (0 + gamma * next_max - old_val)

Train the agent

In [None]:
train(epoch, alpha, gamma, epsilon)

Save Q-Table

In [15]:
import pickle
with open("q_table.pkl", "wb") as f:
 pickle.dump(dict(Q_table), f)

evaluate

In [None]:
def agent_move(state, Q_table):
    state_t = tuple(state)
    q_vals = Q_table[state_t]
    available = [i for i in range(9) if state[i] == " "]
    return max(available, key=lambda a: q_vals[a])

# ---- Play one game ----
def play_game(use_qtable=False, Q_table=None):
    state = [" "] * 9

    while True:
        # X move
        available = [i for i in range(9) if state[i] == " "]
        if not available:
            return 0  # draw
        if use_qtable:
            move = agent_move(state, Q_table)
        else:
            move = random.choice(available)
        state[move] = "x"
        result = check_winner(state)
        if result is not None:
            return result

        # O random move
        available = [i for i in range(9) if state[i] == " "]
        if not available:
            return 0
        move = random.choice(available)
        state[move] = "o"
        result = check_winner(state)
        if result is not None:
            return result

# ---- Evaluate accuracy ----
def evaluate(n_games=1000, Q_table=None):
    def run_eval(use_qtable):
        results = {1:0, -1:0, 0:0}
        for _ in range(n_games):
            res = play_game(use_qtable=use_qtable, Q_table=Q_table)
            results[res] += 1
        return {k: v/n_games for k,v in results.items()}

    rand_stats = run_eval(use_qtable=False)
    qtab_stats = run_eval(use_qtable=True)

    print("📊 Evaluation over", n_games, "games")
    print("\n-- Random X vs Random O --")
    print(f"Wins : {rand_stats[1]:.2%}")
    print(f"Draws: {rand_stats[0]:.2%}")
    print(f"Loss : {rand_stats[-1]:.2%}")

    print("\n-- Q-table X vs Random O --")
    print(f"Wins : {qtab_stats[1]:.2%}")
    print(f"Draws: {qtab_stats[0]:.2%}")
    print(f"Loss : {qtab_stats[-1]:.2%}")

In [None]:
evaluate(100000, Q_table=Q_table)

📊 Evaluation over 100000 games

-- Random X vs Random O --
Wins : 58.49%
Draws: 12.65%
Loss : 28.86%

-- Q-table X vs Random O --
Wins : 99.24%
Draws: 0.76%
Loss : 0.00%
