In [1]:
import numpy as np
import random
import pickle

class TicTacToe:
    """Minimal two-player tic-tac-toe environment on a flat 9-cell board.

    Cells are indexed 0-8 in row-major order and hold 'X', 'O' or ' '
    (empty). 'X' always moves first.
    """

    # Index triples forming a winning line: three rows, three columns,
    # then the two diagonals.
    _WIN_LINES = ((0, 1, 2), (3, 4, 5), (6, 7, 8),
                  (0, 3, 6), (1, 4, 7), (2, 5, 8),
                  (0, 4, 8), (2, 4, 6))

    def __init__(self):
        self.board = [' '] * 9  # 3x3 board in a 1D list
        self.current_player = 'X'

    def reset(self):
        """Clear the board, give the turn to 'X' and return the state tuple."""
        self.board = [' '] * 9
        self.current_player = 'X'
        return tuple(self.board)

    def get_available_actions(self):
        """Return the indices of all empty cells, in ascending order."""
        return [i for i in range(9) if self.board[i] == ' ']

    def make_move(self, action):
        """Place the current player's mark on cell ``action``.

        Returns:
            A ``(state, reward, done)`` tuple where ``state`` is the board as
            a tuple and ``reward`` is:
              *  1   if 'X' just completed a line (``done`` is True),
              * -1   if 'O' just completed a line (``done`` is True),
              *  0.5 if the board filled up without a winner (``done`` is True),
              *  0   for an ordinary move (``done`` is False, turn passes on),
              * -1   for an invalid move (``done`` is False, board and turn
                     are left untouched).
        """
        # Reject out-of-range indices explicitly: a negative index would
        # otherwise wrap around and silently mark a cell counted from the end
        # of the board, and an index >= 9 would raise IndexError.
        in_range = 0 <= action < len(self.board)
        if not in_range or self.board[action] != ' ':
            return tuple(self.board), -1, False  # Invalid move penalty

        self.board[action] = self.current_player
        if self.check_winner(self.current_player):
            reward = 1 if self.current_player == 'X' else -1
            return tuple(self.board), reward, True
        if ' ' not in self.board:
            return tuple(self.board), 0.5, True

        # Game continues: hand the turn to the other player.
        self.current_player = 'O' if self.current_player == 'X' else 'X'
        return tuple(self.board), 0, False

    def check_winner(self, player):
        """Return True if ``player`` occupies every cell of any winning line."""
        return any(all(self.board[i] == player for i in combo)
                   for combo in self._WIN_LINES)

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values live in a dict keyed by ``(state, action)``; pairs that have
    never been updated are treated as having a value of 0.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.2):
        self.alpha = alpha      # Learning rate
        self.gamma = gamma      # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_table = {}       # Q-values storage

    def get_q_value(self, state, action):
        """Look up the stored Q-value for ``(state, action)``, defaulting to 0."""
        key = (state, action)
        return self.q_table.get(key, 0)

    def choose_action(self, state, available_actions):
        """Pick an action: random with probability epsilon, otherwise greedy.

        When several actions share the highest Q-value, the one that appears
        first in ``available_actions`` is returned.
        """
        exploring = random.uniform(0, 1) < self.epsilon
        if not exploring:
            # Exploit: highest-valued action, first one wins on ties.
            return max(available_actions,
                       key=lambda candidate: self.get_q_value(state, candidate))
        # Explore: uniform random choice among the legal actions.
        return random.choice(available_actions)

    def update_q_table(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the given transition.

        The bootstrap term is the best Q-value over the empty cells of
        ``next_state`` (0 if there are none) and is zeroed out when ``done``
        is true.
        """
        current = self.get_q_value(state, action)
        best_next = max(
            (self.get_q_value(next_state, cell)
             for cell in range(9)
             if next_state[cell] == ' '),
            default=0,
        )
        discounted = self.gamma * best_next * (1 - int(done))
        td_target = reward + discounted
        td_error = td_target - current
        self.q_table[(state, action)] = current + self.alpha * td_error

def train_q_agent(episodes=10000):
    """Train a Q-learning agent by self-play and pickle its Q-table.

    A single agent plays both 'X' and 'O'. Each side learns from its own
    point of view:

    * the reward it receives is +1 for winning, -1 for losing and the
      environment's draw reward for a draw, regardless of which mark it plays;
    * the successor state used for bootstrapping is the board at that side's
      *next* turn (after the opponent has replied), so the max over next
      actions ranges over the same side's options.

    Feeding the environment's X-centric reward and the opponent-to-move
    state straight into the update would instead teach 'O' to avoid winning
    and to steer towards positions that are good for 'X'.

    Args:
        episodes: Number of self-play games to run.

    Side effects:
        Writes the learned Q-table to ``q_table.pkl`` in the working directory
        and prints a completion message.
    """
    env = TicTacToe()
    agent = QLearningAgent()

    for _ in range(episodes):
        state = env.reset()
        # Most recent (state, action) of each side that has not yet been
        # updated because its outcome was not known when the move was made.
        pending = {'X': None, 'O': None}
        done = False
        while not done:
            mover = env.current_player
            opponent = 'O' if mover == 'X' else 'X'

            # The mover is back on turn, so its previous move survived the
            # opponent's reply: credit it with reward 0 and this state as
            # the successor.
            if pending[mover] is not None:
                prev_state, prev_action = pending[mover]
                agent.update_q_table(prev_state, prev_action, 0, state, False)

            action = agent.choose_action(state, env.get_available_actions())
            next_state, reward, done = env.make_move(action)

            if done:
                # A game-ending move is either a win for the mover or a draw.
                mover_won = env.check_winner(mover)
                mover_reward = abs(reward) if mover_won else reward
                opponent_reward = -abs(reward) if mover_won else reward

                agent.update_q_table(state, action, mover_reward, next_state, True)
                # The opponent never gets another turn, so settle its last
                # move now with the final outcome.
                if pending[opponent] is not None:
                    opp_state, opp_action = pending[opponent]
                    agent.update_q_table(opp_state, opp_action,
                                         opponent_reward, next_state, True)
            else:
                pending[mover] = (state, action)

            state = next_state

    with open("q_table.pkl", "wb") as f:
        pickle.dump(agent.q_table, f)
    print("Training completed and Q-table saved!")

# Run training (which writes q_table.pkl) only when this code is executed
# directly, e.g. as a notebook cell or script, not when it is imported.
if __name__ == "__main__":
    train_q_agent()


Training completed and Q-table saved!


In [2]:
def play_against_q_agent():
    """Play an interactive console game: the human is 'X', the agent is 'O'.

    Loads the Q-table from ``q_table.pkl``, then alternates between prompting
    the human for a cell index (0-8) and letting the agent pick its move.
    The agent plays greedily (no random exploration). Invalid human input
    (non-numeric, out of range, or an occupied cell) is reported and the
    prompt is repeated. The result is printed once the game ends.
    """
    # SECURITY: pickle.load can execute arbitrary code from the file. Only
    # load a q_table.pkl that you generated yourself.
    with open("q_table.pkl", "rb") as f:
        q_table = pickle.load(f)

    env = TicTacToe()
    # epsilon=0 disables exploration so the agent always uses its best
    # learned move instead of playing randomly some of the time.
    agent = QLearningAgent(epsilon=0)
    agent.q_table = q_table  # Load trained Q-values

    def render_board():
        """Return the board as three space-separated rows."""
        return "\n".join(' '.join(env.board[i:i+3]) for i in range(0, 9, 3))

    state = env.reset()
    print("Your move! (Positions: 0-8)")

    while True:
        print("\n" + render_board())
        available = env.get_available_actions()
        if env.current_player == 'X':  # Human
            raw = input("Enter position: ")
            try:
                move = int(raw)
            except ValueError:
                print("Please enter a whole number from 0 to 8.")
                continue
            if move not in available:
                print(f"Position {move} is not available. Choose from: {available}")
                continue
        else:  # AI
            move = agent.choose_action(state, available)
            print(f"AI chooses: {move}")

        state, reward, done = env.make_move(move)
        if done:
            print("\nFinal Board:")
            print(render_board())
            if reward == 1:
                print("X wins!")
            elif reward == -1:
                print("O wins!")
            else:
                print("It's a draw!")
            break

# Start an interactive game; this blocks on input() until the game finishes
# and requires q_table.pkl to exist in the working directory.
play_against_q_agent()


Your move! (Positions: 0-8)

     
     
     


Enter position:  1



  X  
     
     
AI chooses: 7

  X  
     
  O  


Enter position:  0



X X  
     
  O  
AI chooses: 8

X X  
     
  O O


Enter position:  2



Final Board:
X X X
     
  O O
X wins!
