Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

In [12]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy
import random

from tqdm.auto import tqdm
import numpy as np

In [13]:
# Board position: `x` and `o` are the sets of MAGIC numbers claimed by each player.
State = namedtuple('State', 'x o')

In [14]:
# 3x3 magic square stored row by row: every row, column and diagonal sums
# to 15, so "three in a line" is the same as "three numbers summing to 15".
MAGIC = [
    2, 7, 6,
    9, 5, 1,
    4, 3, 8,
]

In [15]:
def print_board(pos):
    """Print the 3x3 board to stdout, one row per line, then a blank line.

    Each cell is identified by its MAGIC number: it is drawn as 'X' when
    that number is in ``pos.x``, as 'O' when it is in ``pos.o`` and as '.'
    when neither player holds it.
    """
    for row_start in range(0, 9, 3):
        symbols = []
        for number in MAGIC[row_start:row_start + 3]:
            if number in pos.x:
                symbols.append('X')
            elif number in pos.o:
                symbols.append('O')
            else:
                symbols.append('.')
        print(''.join(symbols))
    print()

In [16]:
def win(elements):
    """Return True when any three of *elements* sum to 15.

    Cells are numbered with the MAGIC square, whose rows, columns and
    diagonals each sum to 15, so this detects three in a line.
    """
    for triple in combinations(elements, 3):
        if sum(triple) == 15:
            return True
    return False

def state_value(pos: State, agent_player):
    """Score *pos* from the agent's point of view.

    Returns +1 when the side named by *agent_player* ('x' or 'o') holds a
    winning triple, -1 when the other side does, and 0 when nobody has won.
    X is checked before O.
    """
    if win(pos.x):
        winner = 'x'
    elif win(pos.o):
        winner = 'o'
    else:
        return 0
    return 1 if agent_player == winner else -1
    
    

**Model-free Q-Learning**

In [17]:
class QLearning:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Attributes:
        Q: defaultdict mapping ``(state_key, action)`` to the learned value;
            pairs that were never updated read as 0.0.
        alpha: learning rate used to blend the old value with the new target.
        epsilon: probability of choosing a uniformly random action.
        dis_factor: discount applied to the best value of the next state.
    """

    def __init__(self, alpha, epsilon, dis_factor):
        self.Q = defaultdict(float)
        self.alpha = alpha
        self.epsilon = epsilon
        self.dis_factor = dis_factor

    @staticmethod
    def _state_key(state):
        """Return a hashable key for *state* that ignores set iteration order.

        ``tuple(some_set)`` follows the set's internal layout, which can
        differ between two equal sets built in a different insertion order
        (e.g. {1, 9}); sorting guarantees one key per position.
        """
        return (tuple(sorted(state.x)), tuple(sorted(state.o)))

    def get_Q(self, state, action):
        """Return the current value estimate of playing *action* in *state*."""
        return self.Q[(self._state_key(state), action)]

    def choose_action(self, state, available):
        """Pick an action from *available* with an epsilon-greedy rule.

        With probability ``epsilon`` a random action is returned; otherwise
        one of the actions with the highest Q-value, ties broken at random.
        """
        if random.random() < self.epsilon:
            return random.choice(available)
        q_values = [self.get_Q(state, action) for action in available]
        best_value = max(q_values)
        best_actions = [
            action
            for action, value in zip(available, q_values)
            if value == best_value
        ]
        return random.choice(best_actions)

    def update_Q(self, state, action, reward, next_state, available):
        """Apply one Q-learning update for the transition out of *state*.

        Args:
            state: position in which *action* was played.
            action: the move that was played.
            reward: reward observed after the move.
            next_state: resulting position.
            available: actions playable in *next_state*; when empty the
                bootstrap term is 0.0.
        """
        key = (self._state_key(state), action)
        max_next_Q = max(
            (self.get_Q(next_state, next_action) for next_action in available),
            default=0.0,
        )
        target = reward + self.dis_factor * max_next_Q
        self.Q[key] = (1 - self.alpha) * self.Q[key] + self.alpha * target
    

def train(num_episodes, alpha, epsilon, disc_factor, agent_player):
    """Train a Q-learning agent against a uniformly random opponent.

    X always moves first. The agent starts on the side given by
    *agent_player* ('x' or 'o') and swaps sides after every episode so the
    same table learns both roles.

    Only the agent's own moves are learned: the update for a move is applied
    once the opponent has replied (reward 0, bootstrapping from the position
    the agent faces next) or once the game has ended (final reward from
    ``state_value``, no bootstrapping).

    Args:
        num_episodes: number of self-contained games to play.
        alpha: learning rate passed to ``QLearning``.
        epsilon: exploration probability passed to ``QLearning``.
        disc_factor: discount factor passed to ``QLearning``.
        agent_player: side played by the agent in the first episode.

    Returns:
        The trained ``QLearning`` agent.
    """
    agent = QLearning(alpha, epsilon, disc_factor)
    for _ in range(num_episodes):
        state = State(set(), set())
        available = list(range(1, 9 + 1))
        player_turn = 'x'  # X always opens the game
        pending = None  # (state, action) of the agent's last move, not yet learned

        # win() must be asked about each player's cells separately; calling it
        # on the State tuple itself can never report a win.
        while available and not (win(state.x) or win(state.o)):
            if player_turn == agent_player:
                if pending is not None:
                    # The opponent replied and the game goes on: learn the
                    # previous move from the position the agent faces now.
                    agent.update_Q(pending[0], pending[1], 0, state, available)
                action = agent.choose_action(state, available)
                # Snapshot the position: `state` is mutated in place below.
                pending = (deepcopy(state), action)
            else:
                # The adversary plays a random move.
                action = choice(available)

            if player_turn == 'x':
                state.x.add(action)
            else:
                state.o.add(action)
            available.remove(action)

            # Switching the player
            player_turn = 'o' if player_turn == 'x' else 'x'

        if pending is not None:
            # Terminal update: no further moves, so nothing to bootstrap from.
            reward = state_value(state, agent_player)
            agent.update_Q(pending[0], pending[1], reward, state, [])

        # Switching the agent's side to train the model on both sides
        agent_player = 'x' if agent_player == 'o' else 'o'

    return agent


In [18]:
def game(agent, agent_player):
    """Play one game between *agent* and a uniformly random opponent.

    X always moves first; *agent_player* ('x' or 'o') selects which side the
    agent controls, the other side plays random moves. The agent moves via
    ``agent.choose_action``.

    Returns:
        The list of State snapshots taken after every move. The game stops
        as soon as the side that just moved wins or the board is full. Any
        other value of *agent_player* yields an empty list.
    """
    trajectory = []
    state = State(set(), set())
    available = list(range(1, 9 + 1))
    if agent_player not in ('x', 'o'):
        return trajectory

    cells = {'x': state.x, 'o': state.o}
    turn = 'x'
    while available:
        if turn == agent_player:
            move = agent.choose_action(state, available)
        else:
            move = choice(available)
        cells[turn].add(move)
        trajectory.append(deepcopy(state))
        available.remove(move)
        if win(cells[turn]):
            break
        turn = 'o' if turn == 'x' else 'x'
    return trajectory

In [20]:
# Number of evaluation games played to measure an agent's win rate.
NUM_GAMES = 1000
# Number of training episodes used for every hyper-parameter combination.
NUM_EPISODES = 10000
# Side played by the agent: 'x' always moves first, 'o' always moves second.
agent_player = 'o' # tunable
# Candidate values explored by the hyper-parameter grid search.
epsilon_values = [0.1,0.3,0.5]
alpha_values = [0.2,0.5,0.9]
disc_factor_values = [0.3, 0.7,1.0]

In [21]:
def search_best_parameters(epsilon_values, alpha_values, disc_factor_values, agent_player, num_episodes, num_games=None):
    """Grid-search the Q-learning hyper-parameters and return the best agent.

    One agent is trained for every (epsilon, alpha, discount factor)
    combination and then evaluated over *num_games* games against the random
    player; the agent with the highest percentage of wins is kept (the first
    one found in case of ties).

    Args:
        epsilon_values: exploration probabilities to try.
        alpha_values: learning rates to try.
        disc_factor_values: discount factors to try.
        agent_player: side ('x' or 'o') the agent plays during evaluation
            and in the first training episode.
        num_episodes: training episodes per combination.
        num_games: evaluation games per combination; defaults to the
            module-level ``NUM_GAMES``.

    Returns:
        The best trained agent, or None only when the grid is empty.
    """
    if num_games is None:
        num_games = NUM_GAMES

    best_agent = None
    # Start below any achievable rate so an agent is returned even when
    # no candidate wins a single game.
    best_percentage_win_agent = -1.0

    for epsilon in epsilon_values:
        for alpha in alpha_values:
            for disc_factor in disc_factor_values:
                agent = train(
                    num_episodes=num_episodes,
                    alpha=alpha,
                    epsilon=epsilon,
                    disc_factor=disc_factor,
                    agent_player=agent_player,
                )

                num_win_agent = 0
                for _ in range(num_games):
                    trajectory = game(agent, agent_player)
                    if state_value(trajectory[-1], agent_player) == 1:
                        num_win_agent += 1

                # Guard against an empty evaluation run.
                if num_games:
                    percentage_win_agent = (num_win_agent / num_games) * 100
                else:
                    percentage_win_agent = 0.0

                if percentage_win_agent > best_percentage_win_agent:
                    best_percentage_win_agent = percentage_win_agent
                    best_agent = agent

    return best_agent



In [23]:
def _percentage(part, whole):
    """Return part/whole as a percentage, or NaN when *whole* is zero."""
    return (part / whole) * 100 if whole else float("nan")


# Train one agent per hyper-parameter combination and keep the best one.
best_agent = search_best_parameters(epsilon_values, alpha_values, disc_factor_values,agent_player,NUM_EPISODES)
if best_agent is None:
    raise RuntimeError("search_best_parameters returned no agent; check the hyper-parameter lists")

# Evaluate the selected agent on a fresh batch of games against the random player.
num_win_agent=0
num_win_random_player=0
num_draw=0
for _game in range(NUM_GAMES):
    trajectory=game(best_agent,agent_player)
    val_finished_game=state_value(trajectory[-1],agent_player)
    if val_finished_game == 1:
        num_win_agent+=1
    elif val_finished_game == -1:
        num_win_random_player+=1
    else:
        num_draw+=1

total_games = num_win_random_player + num_win_agent + num_draw

percentage_win_agent = _percentage(num_win_agent, total_games)
percentage_win_random_player = _percentage(num_win_random_player, total_games)
percentage_draw = _percentage(num_draw, total_games)
# Share of decisive games won by the agent; NaN when every game was a draw.
percentage_win_agent_respect_random_player = _percentage(num_win_agent, num_win_agent + num_win_random_player)

print("Best parameters found")
print(f"Epsilon: {best_agent.epsilon}")
print(f"Alpha: {best_agent.alpha}")
print(f"Discount Factor: {best_agent.dis_factor}")
print("\nResults:")
print(f"Percentage wins of the agent: {percentage_win_agent}%")
print(f"Percentage wins of the random player: {percentage_win_random_player}%")
print(f"Percentage of draws: {percentage_draw}%")
print(f"Percentage wins of the agent with respect to the random player: {percentage_win_agent_respect_random_player}%")