# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

In [158]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice, uniform, randint
from copy import deepcopy
from time import sleep
from tqdm.auto import tqdm
import numpy as np

In [111]:
State = namedtuple('State', ['x', 'o'])

In [112]:
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]

### *Our most significant additions to the proposed code*

In [113]:
## the following functions are used to to create and recognize different symmetries of the same position.
## In this way the training algorith is able to aggregate "different" positions (that are truly the same, just rotated) 
## thus improving the learning process

def apply_symmetry(state, x_changes, o_changes):
    set_x, set_o = state
    new_set_x = {(x + x_changes.get(x, 0)) for x in set_x}
    new_set_o = {(o + o_changes.get(o, 0)) for o in set_o}
    return new_set_x, new_set_o

def horizontal_symmetry(state):
    return apply_symmetry(state, {1: 2, 4: 2, 7: 2, 3: -2, 6: -2, 9: -2}, {1: 2, 4: 2, 7: 2, 3: -2, 6: -2, 9: -2})

def vertical_symmetry(state):
    return apply_symmetry(state, {1: 6, 2: 6, 3: 6, 7: -6, 8: -6, 9: -6}, {1: 6, 2: 6, 3: 6, 7: -6, 8: -6, 9: -6})

def d1_symmetry(state):
    return apply_symmetry(state, {2: 4, 4: 4, 1: 8, 6: -4, 8: -4, 9: -8}, {2: 4, 4: 4, 1: 8, 6: -4, 8: -4, 9: -8})

def d2_symmetry(state):
    return apply_symmetry(state, {2: 2, 6: 2, 3: 4, 4: -2, 8: -2, 7: -4}, {2: 2, 6: 2, 3: 4, 4: -2, 8: -2, 7: -4})

## checks if a position its actually new or just a rotated version of a previous one
def check_board(board, board_dict):
    board_symmetries = [horizontal_symmetry(board), vertical_symmetry(board), d1_symmetry(board), d2_symmetry(board)]
    
    for sym in board_symmetries:
        hashable_sym = (frozenset(sym[0]), frozenset(sym[1]))
        if hashable_sym in board_dict:
            return sym
    
    return board

def my_print_board(board):
    for row in range(3):
        for col in range(3):
            if row * 3 + col +1 in board[0]:
                print('X', end='')
            elif  row * 3 + col +1 in board[1]:
                print('O', end='')
            else:
                print('.', end='')
        print()


## the Player
class Agent():

    def __init__(self, dict) -> None:
        self.available = set(range(1,9+1))
        self.value_dictionary = deepcopy(dict)


    #change the pool of possible items to add
    def set_available(self, state):
        for x in state[0]:
            if x in self.available:
                self.available.remove(x)
        for o in state[1]:
            if o in self.available:
                self.available.remove(o)

    ### returns a list: every element is a list of a possible move with all its possible symmetries
    def find_moves(self, start_board: State) -> set:
        possible_moves = list()
        for i in self.available:
            symmetries = [] 
            
            cpy = deepcopy(start_board)
            cpy.x.add(i)
            true_cpy = deepcopy(cpy)
            sym = [vertical_symmetry(true_cpy), horizontal_symmetry(true_cpy), d1_symmetry(true_cpy),
                                    d2_symmetry(true_cpy)]
            symmetries.append(true_cpy)
            symmetries+=sym

            for i in range(len(symmetries)):
                symmetries[i] = (frozenset(symmetries[i][0]), frozenset(symmetries[i][1]))
            possible_moves.append(symmetries)
        return possible_moves

    #finds the best move in its dictionary for a given board
    def pick_move(self, board:State):
        moves_values = defaultdict(float)
        possible_moves = self.find_moves(board)
        
        # for each move, find it in the dictionary (could be any of its versions)
        ## 0: base board
        ## 1: vertical rotation
        ## 2: horizontal rotation
        ## 3: d1 rotation
        ## 4: d2 rotation
        found = False
        for move in possible_moves: 
            for i in range(5):
                if move[i] in self.value_dictionary:  
                    value = self.value_dictionary.get(move[i])
                    hashable_move = (frozenset(move[0][0]), frozenset(move[0][1]))   
                    moves_values[hashable_move] = value  
                    found = True
                    break

        #if it doesn't know the board, it plays randomly
        if not found:
            temp = []
            for move in possible_moves:
                temp.append(move[0])
            best = choice(temp)
        else:
            best =  max(moves_values.items(), key=lambda e: e[1])[0]
        self.set_available(best)
        return best



In [305]:
def print_board(pos):
    """Nicely prints the board"""
    for r in range(3):
        for c in range(3):
            i = r * 3 + c
            if MAGIC[i] in pos[0]:
                print('X', end='')
            elif MAGIC[i] in pos[1]:
                print('O', end='')
            else:
                print('.', end='')
        print()
    print()


In [306]:
def win(elements):
    """Checks is elements is winning"""
    return any(sum(c) == 15 for c in combinations(elements, 3))

#another way to check end game
def check_win(state):
    winning_combinations = [
        [1, 2, 3], [4, 5, 6], [7, 8, 9],  # Rows
        [1, 4, 7], [2, 5, 8], [3, 6, 9],  # Columns
        [1, 5, 9], [3, 5, 7]              # Diagonals
    ]

    for combo in winning_combinations:
        if all(position in state.x for position in combo):
            return 1
        elif all(position in state.o for position in combo):
            return -1
    if len(state.x) + len(state.o) == 9:
        return 0
    return None

def state_reward(pos: State):
    """Evaluate state: +1 first player wins"""
    if win(pos.x):
        return 1
    elif win(pos.o):
        return -1
    else:
        return 0    

In [116]:
def random_game():
    trajectory = list()
    state = State(set(), set())
    available = set(range(1, 9+1))
    while available:
        x = choice(list(available))
        state.x.add(x)
        trajectory.append(deepcopy(state))
        available.remove(x)
        if win(state.x) or not available:
            break

        o = choice(list(available))
        state.o.add(o)
        trajectory.append(deepcopy(state))
        available.remove(o)
        if win(state.o):
            break
   # print(state)
    return trajectory

### **TRAINING SECTION**

In [312]:
#moves dictionaries for different agents
value_dictionary = defaultdict(float)
value_dictionary2 =  defaultdict(float)
value_dictionary3 =  defaultdict(float)

epsilon = 0.001

#train agent 1
for steps in tqdm(range(5000)):
    trajectory = random_game()
    final_reward = state_reward(trajectory[-1])
    for state in trajectory:
        true_state = check_board(state, value_dictionary)
        hashable_state = (frozenset(true_state[0]), frozenset(true_state[1]))
       # hashable_state = (frozenset(state.x), frozenset(state.o))
        value_dictionary[hashable_state] = value_dictionary[
            hashable_state
        ] + epsilon * (final_reward - value_dictionary[hashable_state])

#train agent 2
for steps in tqdm(range(50_000)):
    trajectory = random_game()
    final_reward = state_reward(trajectory[-1])
    for state in trajectory:
        true_state = check_board(state, value_dictionary2)
        hashable_state = (frozenset(true_state[0]), frozenset(true_state[1]))
       # hashable_state = (frozenset(state.x), frozenset(state.o))
        hit_state[hashable_state] += 1
        value_dictionary2[hashable_state] = value_dictionary2[
            hashable_state
        ] + epsilon * (final_reward - value_dictionary2[hashable_state])

#train agent 3
for steps in tqdm(range(500_000)):
    trajectory = random_game()
    final_reward = state_reward(trajectory[-1])
    for state in trajectory:
        true_state = check_board(state, value_dictionary3)
        hashable_state = (frozenset(true_state[0]), frozenset(true_state[1]))
       # hashable_state = (frozenset(state.x), frozenset(state.o))
        hit_state[hashable_state] += 1
        value_dictionary3[hashable_state] = value_dictionary3[
            hashable_state
        ] + epsilon * (final_reward - value_dictionary3[hashable_state])

100%|██████████| 5000/5000 [00:01<00:00, 2605.83it/s]
100%|██████████| 50000/50000 [00:14<00:00, 3467.89it/s]
100%|██████████| 500000/500000 [01:54<00:00, 4353.80it/s]


### **TESTING SECTION**

In [321]:
#plays a game between two agents
def game(agent1, agent2):
    first_move = randint(1,9)
    game = State(set({first_move}), set())
    new_board = deepcopy(game)
   # my_print_board(new_board)
    
    while agent1.available:
        #agent2 moves
        reverse_game = State(set(new_board[1]), set(new_board[0]))
        new_board = agent2.pick_move(reverse_game)
        game = State(set(new_board[1]), set(new_board[0]))

     
        agent1.set_available(game)
        agent2.set_available(game)
        #my_print_board(game)
        #print(game)
        
        if check_win(game) == -1:
            my_print_board(game)
            print("agent2 won!!!")
            return -1    
        print()

        #agent1 move
        new_board = agent1.pick_move(game)
        game = State(set(new_board[0]), set(new_board[1]))
        #my_print_board(game)
        #print(game)
        agent1.set_available(game)
        agent2.set_available(game)
        if check_win(game) == 1:
            my_print_board(game)
            print("agent1 won!!!")
            return 1
        
a1_wins = 0
a2_wins = 0


## here we play 100 games and get average results
for _ in range(100):
    agent1 = Agent(value_dictionary3)
    agent2 = Agent(value_dictionary2)

    result = game(agent1, agent2)
    if result == 1:
        a1_wins+=1
    if result ==-1:
        a2_wins+=1  

print(f"a1 wins {a1_wins}   a2 wins {a2_wins}")





XX.
XOO
X.O
agent1 won!!!



XXX
XO.
O.O
agent1 won!!!



X.O
XOO
XX.
agent1 won!!!



XXX
XO.
O.O
agent1 won!!!



X.O
XOO
XX.
agent1 won!!!



XXX
XO.
O.O
agent1 won!!!



O.O
XO.
XXX
agent1 won!!!



O.O
XO.
XXX
agent1 won!!!



.OO
XO.
XXX
agent1 won!!!



OXX
.OX
O.X
agent1 won!!!




OOX
OXX
XOX
agent1 won!!!



OXX
.OX
O.X
agent1 won!!!



XXX
XO.
O.O
agent1 won!!!




OOX
OXX
XOX
agent1 won!!!



XX.
XOO
X.O
agent1 won!!!



O.O
XO.
XXX
agent1 won!!!



O.O
XO.
XXX
agent1 won!!!



O.O
XO.
XXX
agent1 won!!!



.OO
XO.
XXX
agent1 won!!!



XXX
XO.
.OO
agent1 won!!!



XXX
XO.
.OO
agent1 won!!!



XX.
XOO
X.O
agent1 won!!!



XXX
XO.
.OO
agent1 won!!!



O.O
XO.
XXX
agent1 won!!!



XXX
XO.
.OO
agent1 won!!!



OXX
.OX
O.X
agent1 won!!!



OXX
.OX
O.X
agent1 won!!!



X.O
XOO
XX.
agent1 won!!!



OXX
.OX
O.X
agent1 won!!!




OOX
OXX
XOX
agent1 won!!!



OXX
.OX
O.X
agent1 won!!!



XXX
XO.
O.O
agent1 won!!!



XXX
XO.
O.O
agent1 won!!!



XX.
XOO
X.O
agent1 won!!!




OOX
OXX