# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [1]:
import numpy as np
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy
from tqdm.auto import tqdm
from abc import ABC, abstractclassmethod

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
State = namedtuple('Position', ['x', 'o'])

### Player class
- Random player: plays always randomly (i.e. choses an action from the set of available actions).
- RL player: uses an $\epsilon$-greedy strategy. This means that it takes a random action with probability $\epsilon$, the best action (according to the values in the Q-table) with probability 1-$\epsilon$.

In [3]:
class TicTacToe_Player(ABC):
    @abstractclassmethod
    def next_action(self, current_state):
        pass

class Random_Player(TicTacToe_Player):
    def next_action(self, current_state):
        available_moves = set(range(1, 10)) - current_state.x - current_state.o
        return choice(list(available_moves))
    
class RL_Player(TicTacToe_Player):
    def __init__(self, epsilon, Q=None) -> None:
        if not Q:
            self.Q = defaultdict(float)
        else:
            self.Q = Q
        self.epsilon = epsilon

    def next_action(self, current_state):
        available_moves = list(set(range(1, 10)) - current_state.x - current_state.o)
        if np.random.random() < self.epsilon:
            return choice(available_moves)
        else:
            # generate action a as the best action we can take in state s
            s = (frozenset(current_state.x), frozenset(current_state.o))
            q_star_index = np.argmax([self.Q[(s, a)] for a in available_moves])
            return available_moves[q_star_index]
        

### Tic-Tac-Toe Class

In [4]:
class TicTacToe():
    def __init__(self) -> None:
        self.MAGIC = [2, 7, 6,
                      9, 5, 1,
                      4, 3, 8]

    def win(self, elements):
        """Checks if elements is winning"""
        return any(sum(c) == 15 for c in combinations(elements, 3))
    
    def state_action_value(self, state_action) -> int:
        """Evaluate position: +1 first player wins, -1 first player loses, 0 game not over"""

        state, action = state_action
        
        if len(state.x) > len(state.o):
            state.o.add(action)
        else:   
            state.x.add(action)
        
        if self.win(state.x):
            return 1
        elif self.win(state.o):
            return -1
        else:
            return 0
        
    def print_board(self, pos):
        """Print the board"""
        for r in range(3):
            for c in range(3):
                i = r * 3 + c
                if self.MAGIC[i] in pos.x:
                    print('X', end='')
                elif self.MAGIC[i] in pos.o:
                    print('O', end='')
                else:
                    print('.', end='')
            print()
        print()

    def play(self, player1: TicTacToe_Player, player2: TicTacToe_Player, verbose=False):
        trajectory = list()
        state = State(set(), set())
        available = set(range(1, 10)) - state.x - state.o
        
        while available:
            x = player1.next_action(state)
            trajectory.append((deepcopy(state), x))
            state.x.add(x)
            available.remove(x)
            if verbose:
                self.print_board(state)
            if self.win(state.x) or not available:
                break

            o = player2.next_action(state)
            trajectory.append((deepcopy(state), o))
            state.o.add(o)
            available.remove(o)
            if verbose:
                self.print_board(state)
            if self.win(state.o) or not available:
                break

        return trajectory

### Game example

In [5]:
game = TicTacToe()
player1 = Random_Player()
player2 = Random_Player()
trajectory = game.play(player1, player2, True)

X..
...
...

X..
O..
...

X..
O..
.X.

XO.
O..
.X.

XO.
O..
.XX

XO.
O.O
.XX

XOX
O.O
.XX

XOX
OOO
.XX



### Training of the agent
The agent is trained through model-free Q-learning, i.e., using the formula:
$$
Q^*_{t+1}(s,a) = (1-\alpha)Q^*_{t}(s,a) + \alpha(r + \gamma Q^*_{t}(s',a'))
$$

Representation adopted:
- state: tuple in the form ({positions of X}, {positions of O}) 
- action: int representing the next position occupied
- state/action: tuple in the form (state, action)

Thus, the Q-table contains the values associated to each state/action pair. The training is made over 2.000.000 episodes: 1.000.000 episodes playing the agent as first, the remaing playing the agent as second.

Note:
- s = state in which the agent is
- a = action executed by the agent
- s' = next state in which the agent is has to move again
- a' = best action to take in s', according to Q

In [6]:
EPSILON = 0.01
ALPHA = 0.9
DISCOUNT_FACTOR = 0.9
EPISODES = 2_000_000

In [7]:
game = TicTacToe()

player1 = Random_Player()
player2 = RL_Player(epsilon=EPSILON)

Q = player2.Q
starting_player = 0

for episode in tqdm(range(EPISODES)):
    # switch the role of the players
    player1, player2 = player2, player1

    # store all the states/actions taken in the game
    trajectory = game.play(player1, player2)

    # verify who won
    final_reward = game.state_action_value(trajectory[-1])

    # switch the role of the players (needed for Q table update)
    starting_player = 1 - starting_player

    # the reward must be inverted when the agent starts second (i.e. if player 1 loses -> agent wins)
    if starting_player == 1:
        final_reward = -final_reward
    
    # for every state/action taken by the agent ([start::2]) execept for the terminal state ([start:-1])
    for state, action in trajectory[starting_player:-1:2]:
        available_moves = list(set(range(1, 10)) - state.x - state.o)

        s = (frozenset(state.x), frozenset(state.o))
        a = action

        # generate state s'
        if len(state.x) < len(state.o):
            state.x.add(a)
        else:
            state.o.add(a)
        s1 = (frozenset(state.x), frozenset(state.o))

        # generate action a' as the best action we can take in state s'
        q_star_index = np.argmax([Q[(s1, action)] for action in available_moves])
        a1 = available_moves[q_star_index]

        # update the Q table using the formula
        Q[(s, a)] = (1 - ALPHA) * Q[(s, a)] +\
                           ALPHA * (final_reward + DISCOUNT_FACTOR * Q[(s1, a1)])

100%|██████████| 2000000/2000000 [05:04<00:00, 6564.83it/s]


In [8]:
print("Best states/actions")
sorted(Q.items(), key=lambda e: e[1], reverse=True)[:10]

Best states/actions


[(((frozenset({1, 3, 4}), frozenset({2, 8})), 6), 1.728),
 (((frozenset({2, 8}), frozenset({5})), 1), 1.7182790837090822),
 (((frozenset({2, 3, 9}), frozenset({6, 8})), 4), 1.7127990000000002),
 (((frozenset({3, 9}), frozenset({8})), 6), 1.7000001816216224),
 (((frozenset({2, 6, 9}), frozenset({3, 7})), 1), 1.700000000991009),
 (((frozenset({6, 7}), frozenset({8})), 4), 1.7),
 (((frozenset({1, 3, 9}), frozenset({2, 5})), 7), 1.6470000000000002),
 (((frozenset({1, 6}), frozenset({8})), 3), 1.6469375899065894),
 (((frozenset({5, 7, 9}), frozenset({3, 8})), 1), 1.6380000000000003),
 (((frozenset({1, 7}), frozenset({2})), 4), 1.6298917731798381)]

In [9]:
print("Worst states/actions")
sorted(Q.items(), key=lambda e: e[1], reverse=False)[:10]

Worst states/actions


[(((frozenset({1}), frozenset({8})), 6), -1.0),
 (((frozenset({1}), frozenset({8})), 7), -1.0),
 (((frozenset({1, 2}), frozenset({3, 8})), 5), -1.0),
 (((frozenset({1, 2}), frozenset({3, 8})), 6), -1.0),
 (((frozenset({1, 2}), frozenset({3, 8})), 9), -1.0),
 (((frozenset({1, 2, 4, 5}), frozenset({3, 8, 9})), 7), -1.0),
 (((frozenset({2, 6}), frozenset({1})), 5), -1.0),
 (((frozenset({2, 5, 6}), frozenset({1, 3})), 8), -1.0),
 (((frozenset({2, 5, 6}), frozenset({1, 3})), 9), -1.0),
 (((frozenset({2, 5, 6}), frozenset({1, 3})), 4), -1.0)]

### Testing the agent:
 - when starting first

In [10]:
N_GAMES = 1_000

wins = 0
losses = 0

# epsilon=0: always take the best action according to the Q table
player1 = RL_Player(Q=Q, epsilon=0)
player2 = Random_Player()

for i in range(N_GAMES):
    trajectory = game.play(player1, player2)
    final_reward = game.state_action_value(trajectory[-1])

    if final_reward == 1:
        wins += 1
    elif final_reward == -1:
        losses += 1

print(f"Results after {N_GAMES} games:")
print(f"Wins: {wins}\t Draws: {N_GAMES-wins-losses}\t Losses: {losses}")

Results after 1000 games:
Wins: 938	 Draws: 62	 Losses: 0


- when starting second

In [12]:
N_GAMES = 1_000

wins = 0
losses = 0

# epsilon = 0: always take the best action according to the Q table
player1 = Random_Player()
player2 = RL_Player(Q=Q, epsilon=0)

for i in range(N_GAMES):
    trajectory = game.play(player1, player2)
    final_reward = game.state_action_value(trajectory[-1])

    if final_reward == 1:
        losses += 1
    elif final_reward == -1:
        wins += 1

print(f"Results after {N_GAMES} games:")
print(f"Wins: {wins}\t Draws: {N_GAMES-wins-losses}\t Losses: {losses}")

Results after 1000 games:
Wins: 670	 Draws: 216	 Losses: 114
