Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [92]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice, random
from copy import deepcopy
from math import factorial

from tqdm.auto import tqdm
import numpy as np

In [93]:
State = namedtuple('State', ['x', 'o'])

In [94]:
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]

In [95]:
def print_board(pos):
    """Nicely prints the board"""
    for r in range(3):
        for c in range(3):
            i = r * 3 + c
            if MAGIC[i] in pos.x:
                print('X', end='')
            elif MAGIC[i] in pos.o:
                print('O', end='')
            else:
                print('.', end='')
        print()
    print()

In [96]:
def win(elements):
    """Checks is elements is winning"""
    return any(sum(c) == 15 for c in combinations(elements, 3))

def state_value(pos: State):
    """Evaluate state: +1 first player wins"""
    if win(pos.x):
        return 1
    elif win(pos.o):
        return -1
    else:
        return 0
    
    

In [97]:
def random_game():
    trajectory = list()
    state = State(set(), set())
    available = set(range(1, 9+1))
    while available:
        x = choice(list(available))
        state.x.add(x)
        trajectory.append(deepcopy(state))
        available.remove(x)
        if win(state.x) or not available:
            break

        o = choice(list(available))
        state.o.add(o)
        trajectory.append(deepcopy(state))
        available.remove(o)
        if win(state.o):
            break
    return trajectory

In [98]:
"""value_dictionary = defaultdict(float)
hit_state = defaultdict(int)
epsilon = 0.001

for steps in tqdm(range(500_000)):
    trajectory = random_game()
    final_reward = state_value(trajectory[-1])
    for state in trajectory:
        hashable_state = (frozenset(state.x), frozenset(state.o))
        hit_state[hashable_state] += 1
        value_dictionary[hashable_state] = value_dictionary[
            hashable_state
        ] + epsilon * (final_reward - value_dictionary[hashable_state])"""

'value_dictionary = defaultdict(float)\nhit_state = defaultdict(int)\nepsilon = 0.001\n\nfor steps in tqdm(range(500_000)):\n    trajectory = random_game()\n    final_reward = state_value(trajectory[-1])\n    for state in trajectory:\n        hashable_state = (frozenset(state.x), frozenset(state.o))\n        hit_state[hashable_state] += 1\n        value_dictionary[hashable_state] = value_dictionary[\n            hashable_state\n        ] + epsilon * (final_reward - value_dictionary[hashable_state])'

In [99]:
n_actions = 9
q_table = defaultdict(float)
state = State(set(), set())
hashable_state = (frozenset(state.x), frozenset(state.o))
print(q_table[hashable_state])
q_table[hashable_state] = np.zeros(n_actions)
print(q_table[hashable_state])

0.0
[0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [100]:
alpha = 0.1
gamma = 0.9
fixed_epsilon = 0.2
epsilon = fixed_epsilon

class agent:
  def __init__(self):
    states = factorial(9)
    # states = list(range(1, 9+1))
    actions = 9
    self.q_table = defaultdict(None)
  
  def get_action(self, state, available_actions):
    if np.random.rand() < epsilon:
      action =  choice(available_actions)
      available_actions.remove(action)
      return action, available_actions
    else:
      hashable_state = (frozenset(state.x), frozenset(state.o))
    
      if self.q_table[hashable_state] == None:
        self.q_table[hashable_state] = np.zeros(9)
      best_action = np.argmax(self.q_table[hashable_state])
      available_actions.remove(best_action)
      return best_action, available_actions
  
  def update(self, state, action, reward, next_state, done):
    if done:
      target_value = reward
    else:
      hashable_next_state = (frozenset(next_state.x), frozenset(next_state.o))
      target_value = reward + gamma * np.max(self.q_table[hashable_next_state])
    
    hashable_state = (frozenset(state.x), frozenset(state.o))
    self.q_table[hashable_state, action] = self.q_table[hashable_state, action] + alpha * (target_value - self.q_table[hashable_state, action])
    return
  
  def perform_action(action, state):
    if state.x:
      state.o.add(action)
    else:
      state.x.add(action)
    return state
  
  def get_reward(state):
    if win(state.x):
      return 1
    elif win(state.o):
      return -1
    else:
      return 0
  

In [101]:
a = agent()
done = False
available_actions = list(range(9))
while not done:
  action, available_actions = a.get_action(state, available_actions)
  next_state = a.perform_action(action, deepcopy(state))
  reward = a.get_reward(state)
  if reward == 1:
    done = True
  a.update(state, action, reward, next_state, done)
  

KeyError: (frozenset(), frozenset())