Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [15]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy

from tqdm.auto import tqdm
import numpy as np

In [16]:
State = namedtuple("State", ["x", "o"])
StateMove = namedtuple("StaeMove", ["state", "move"])
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]
WINNING_REWARD = 10
LOSING_REWARD = -1
GAME_PLAYED = 1000
EPSILON = 0.001


In [18]:
def win(elements):
    """Checks is elements is winning"""
    return any(sum(c) == 15 for c in combinations(elements, 3))


def state_value(pos: State):
    """Evaluate state: +1 first player wins"""
    if win(pos.x):
        return WINNING_REWARD
    elif win(pos.o):
        return LOSING_REWARD
    else:
        return 0


def clever_player(state: State, player=1):
    """checks if he has a winnig move and if the opponent has a winng move player 
    if player=0 plays the first move of every turn"""
    if player == 1:
        state = deepcopy(state)
        available = set(range(1, 9 + 1)) - state.x - state.o  # create a set with the all possible moves
        for c in combinations(state.o, 2):
            if 15 - sum(c) in available:
                state.o.add(15 - sum(c))
                return state
        for c in combinations(state.x, 2):
            if 15 - sum(c) in available:
                state.o.add(15 - sum(c))
                return state
        rand = choice(list(available))
        state.o.add(rand)
        return state
    else:
        state = deepcopy(state)
        available = set(range(1, 9 + 1)) - state.x - state.o  # create a set with the all possible moves
        for c in combinations(state.x, 2):
            if 15 - sum(c) in available:
                state.x.add(15 - sum(c))
                return state
        for c in combinations(state.o, 2):
            if 15 - sum(c) in available:
                state.x.add(15 - sum(c))
                return state
        rand = choice(list(available))
        state.x.add(rand)
        return state
        

In [19]:
def clever_game():
    """play a game using the clever strategy"""
    trajectory = list()
    state = State(set(), set())
    available = set(range(1, 9 + 1))
    while available:
        new_state = clever_player(deepcopy(state), player=0)
        x = new_state.x - state.x
        x = x.pop()
        trajectory.append(StateMove(deepcopy(state), x))
        state.x.add(x)
        available.remove(x)
        if win(state.x) or not available:
            trajectory.append(StateMove(deepcopy(state), 100))#100 to signal that there is no move
            break
        new_state = clever_player(deepcopy(state), player=1)
        o = new_state.o - state.o
        o = o.pop()
        state.o.add(o)
        available.remove(o)
        if win(state.o):
            break
    return trajectory

In [20]:
def random_game(player=0):
    """play a game with legal and random move"""
    trajectory = list()
    state = State(set(), set())
    available = set(range(1, 9 + 1))
    while available:
        x = choice(list(available))
        if player == 0:
            trajectory.append(StateMove(deepcopy(state), x))
        state.x.add(x)
        available.remove(x)
        if win(state.x) or not available:
            if player == 0:
                trajectory.append(StateMove(deepcopy(state), x))
            break
        o = choice(list(available))
        if player == 1:
            trajectory.append(StateMove(deepcopy(state), x))
        state.o.add(o)
        available.remove(o)
        if win(state.o):
            break
    return trajectory

In [21]:
value_dictionary = defaultdict(float)

# add a new hashmap considering the acual state and the next state of the trajectory
for steps in tqdm(range(500_000)):
    trajectory = clever_game()
    final_reward = state_value(trajectory[-1].state)
    for state_move in trajectory:
        hashable_state = (frozenset(state_move.state.x), frozenset(state_move.state.o), state_move.move)
        value_dictionary[hashable_state] = value_dictionary[hashable_state] + EPSILON * (
            final_reward - value_dictionary[hashable_state]
        )

  0%|          | 280/500000 [00:00<11:03, 753.32it/s]

100%|██████████| 500000/500000 [07:10<00:00, 1162.48it/s]


In [23]:
def montcarlo_player(state: State):
    """this function chooses based on the dict build before during learning """
    state = deepcopy(state)
    optimal_value = -float('inf')  # smallest float in python
    optimal_move = 10  # just initializing the variable
    avaibles = set(range(1, 9 + 1)) - state.x - state.o
    for option in avaibles:
        hashable_state = (frozenset(state.x), frozenset(state.o), option)
        if value_dictionary[hashable_state] > optimal_value:
            optimal_value = value_dictionary[hashable_state]
            optimal_move = option
    state.x.add(optimal_move)
    return state

In [26]:
wins = 0
losses = 0
for _ in range(GAME_PLAYED):
    state = State(set(), set())
    while 1:
        state = montcarlo_player(state)
        if state_value(state) == WINNING_REWARD:
            wins += 1
            break
        if len(state.x) + len(state.o) == 9:
            break
        state = clever_player(state)
        if state_value(state) == LOSING_REWARD:
            losses += 1
            break
print(f"wins={wins} losses={losses}")

wins=869 losses=119
