Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

## Imports:

In [1]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy
from tqdm.auto import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Definition of State:

In [246]:
# A board position: `x` and `o` hold the magic-square values (see MAGIC)
# of the cells claimed by each player. The game code fills both with sets.
State = namedtuple('State', ['x', 'o'])

## Definition of Magic Squares:

In [247]:
# 3x3 magic square laid out row by row: every row, column and diagonal sums
# to 15, so a player owns a winning line exactly when three of their cell
# values add up to 15.
MAGIC = [2, 7, 6,
         9, 5, 1,
         4, 3, 8]

## Function to Print the Board:

In [248]:
def print_board(pos):
    """Print the 3x3 board followed by a blank line.

    Each cell is rendered as 'X' or 'O' when its magic-square value is in
    `pos.x` or `pos.o` respectively, and as '.' otherwise.
    """
    for row_start in range(0, 9, 3):
        line = ''
        for cell in MAGIC[row_start:row_start + 3]:
            if cell in pos.x:
                line += 'X'
            elif cell in pos.o:
                line += 'O'
            else:
                line += '.'
        print(line)
    print()

## Functions to Check Win and Compute Rewards:

In [747]:
def win(elements):
    """Return True if any three values in `elements` sum to 15."""
    for triple in combinations(elements, 3):
        if sum(triple) == 15:
            return True
    return False

def state_value_X(pos: State):
    """Reward of a position from X's point of view.

    Returns 2 when X holds a winning triple, -1 when O does, and 0 otherwise.
    """
    if win(pos.x):
        return 2
    if win(pos.o):
        return -1
    return 0
    
def state_value_O(pos: State):
    """Reward of a position from O's point of view.

    Returns 3 when O holds a winning triple, -1 when X does, and 2 otherwise
    (so any non-losing outcome is rewarded positively).
    """
    if win(pos.o):
        return 3
    if win(pos.x):
        return -1
    return 2

def state_value_O2(pos: State):
    """Larger-scale reward of a position from O's point of view.

    Returns 300 when O holds a winning triple, -2 when X does, and 100
    otherwise.
    """
    if win(pos.o):
        return 300
    if win(pos.x):
        return -2
    return 100
    
def state_value_O3(pos: State):
    """Win-only reward of a position from O's point of view.

    Returns 3 when O holds a winning triple and 0 for every other outcome.
    An X win and a non-decisive position are both worth 0, so no separate
    check of `pos.x` is needed.
    """
    return 3 if win(pos.o) else 0

## Function to Make a RANDOM Move:

In [250]:
def make_a_move_random(available):
    """Return one element of `available`, picked at random."""
    candidates = list(available)
    return choice(candidates)

## Functions to Make INTELLIGENT Moves:

In [262]:
def make_a_move_intelligent_X(available, current_state, dictionary_best_moves_X):
    """Choose X's move that leads to the highest-valued known successor state.

    Args:
        available: Collection of free cells (magic-square values).
        current_state: Position exposing `.x` and `.o` sets of taken cells.
        dictionary_best_moves_X: Mapping from
            ``(frozenset(x_cells), frozenset(o_cells))`` to a learned value.

    Returns:
        The available move whose resulting state has the greatest non-zero
        value; a random available move when no successor has a non-zero value.
    """
    o_cells = frozenset(current_state.o)  # not affected by X's move
    best_value = float('-inf')
    best_move = None

    for move in available:
        key = (frozenset(current_state.x.union({move})), o_cells)

        # .get() rather than indexing: reading a defaultdict with [] would
        # insert a 0.0 entry for every probed state, and a plain dict would
        # raise KeyError on unseen states.
        value = dictionary_best_moves_X.get(key, 0.0)

        # A value of exactly 0.0 is treated as "never learned" and skipped,
        # so a known negative state is preferred over an unknown one.
        if value and value > best_value:
            best_value = value
            best_move = move

    return best_move if best_move is not None else choice(list(available))

def make_a_move_intelligent_O(available, current_state, dictionary_best_moves_O):
    """Choose O's move that leads to the highest-valued known successor state.

    Args:
        available: Collection of free cells (magic-square values).
        current_state: Position exposing `.x` and `.o` sets of taken cells.
        dictionary_best_moves_O: Mapping from
            ``(frozenset(x_cells), frozenset(o_cells))`` to a learned value.

    Returns:
        The available move whose resulting state has the greatest non-zero
        value; a random available move when no successor has a non-zero value.
    """
    x_cells = frozenset(current_state.x)  # not affected by O's move
    best_value = float('-inf')
    best_move = None

    for move in available:
        key = (x_cells, frozenset(current_state.o.union({move})))

        # .get() rather than indexing: reading a defaultdict with [] would
        # insert a 0.0 entry for every probed state, and a plain dict would
        # raise KeyError on unseen states.
        value = dictionary_best_moves_O.get(key, 0.0)

        # A value of exactly 0.0 is treated as "never learned" and skipped,
        # so a known negative state is preferred over an unknown one.
        if value and value > best_value:
            best_value = value
            best_move = move

    return best_move if best_move is not None else choice(list(available))

## Function to Simulate a Random Game:

In [252]:
def random_game():
    """Play one game in which both X and O move at random.

    Returns:
        A list with a snapshot of the position after every single move
        (X moves first); the last entry is the final position.
    """
    history = []
    board = State(set(), set())
    free = set(range(1, 10))

    def record(cells, move):
        # Apply the move for the side owning `cells`, then snapshot the board
        cells.add(move)
        history.append(deepcopy(board))
        free.remove(move)

    while free:
        record(board.x, make_a_move_random(free))
        if win(board.x) or not free:
            break

        record(board.o, make_a_move_random(free))
        if win(board.o):
            break

    return history

## Functions to Simulate Games:

In [253]:
def intelligent_game_X(dictionary_best_moves_X):
    """Play one game: X follows the given value table, O moves at random.

    Returns:
        A list with a snapshot of the position after every single move
        (X moves first); the last entry is the final position.
    """
    history = []
    board = State(set(), set())
    free = set(range(1, 10))

    def record(cells, move):
        # Apply the move for the side owning `cells`, then snapshot the board
        cells.add(move)
        history.append(deepcopy(board))
        free.remove(move)

    while free:
        record(board.x, make_a_move_intelligent_X(free, board, dictionary_best_moves_X))
        if win(board.x) or not free:
            break

        record(board.o, make_a_move_random(free))
        if win(board.o):
            break

    return history

def intelligent_game_O(dictionary_best_moves_O):
    """Play one game: X moves at random, O follows the given value table.

    Returns:
        A list with a snapshot of the position after every single move
        (X moves first); the last entry is the final position.
    """
    history = []
    board = State(set(), set())
    free = set(range(1, 10))

    def record(cells, move):
        # Apply the move for the side owning `cells`, then snapshot the board
        cells.add(move)
        history.append(deepcopy(board))
        free.remove(move)

    while free:
        record(board.x, make_a_move_random(free))
        if win(board.x) or not free:
            break

        record(board.o, make_a_move_intelligent_O(free, board, dictionary_best_moves_O))
        if win(board.o):
            break

    return history

def intelligent_game(dictionary_best_moves_X, dictionary_best_moves_O):
    """Play one game in which both sides follow their own value table.

    Returns:
        A list with a snapshot of the position after every single move
        (X moves first); the last entry is the final position.
    """
    history = []
    board = State(set(), set())
    free = set(range(1, 10))

    def record(cells, move):
        # Apply the move for the side owning `cells`, then snapshot the board
        cells.add(move)
        history.append(deepcopy(board))
        free.remove(move)

    while free:
        record(board.x, make_a_move_intelligent_X(free, board, dictionary_best_moves_X))
        if win(board.x) or not free:
            break

        record(board.o, make_a_move_intelligent_O(free, board, dictionary_best_moves_O))
        if win(board.o):
            break

    return history

### Functions to Simulate Games and Print Games:

In [254]:
def intelligent_game_with_printing_X(dictionary_best_moves_X):
    """Play and print one game: X follows the value table, O moves at random.

    The board is printed after every move, under a header naming the side
    that has just moved.

    Returns:
        A list with a snapshot of the position after every single move
        (X moves first); the last entry is the final position.
    """
    history = []
    board = State(set(), set())
    free = set(range(1, 10))

    def record(cells, move, header):
        # Apply the move, snapshot the board, then show it under `header`
        cells.add(move)
        history.append(deepcopy(board))
        free.remove(move)
        print(header)
        print_board(board)

    while free:
        record(board.x,
               make_a_move_intelligent_X(free, board, dictionary_best_moves_X),
               "X to move:")
        if win(board.x) or not free:
            break

        record(board.o, make_a_move_random(free), "O to move:")
        if win(board.o):
            break

    return history

def intelligent_game_with_printing_O(dictionary_best_moves_O):
    """Play and print one game: X moves at random, O follows the value table.

    The board is printed after every move, under a header naming the side
    that has just moved.

    Returns:
        A list with a snapshot of the position after every single move
        (X moves first); the last entry is the final position.
    """
    history = []
    board = State(set(), set())
    free = set(range(1, 10))

    def record(cells, move, header):
        # Apply the move, snapshot the board, then show it under `header`
        cells.add(move)
        history.append(deepcopy(board))
        free.remove(move)
        print(header)
        print_board(board)

    while free:
        record(board.x, make_a_move_random(free), "X to move:")
        if win(board.x) or not free:
            break

        record(board.o,
               make_a_move_intelligent_O(free, board, dictionary_best_moves_O),
               "O to move:")
        if win(board.o):
            break

    return history

## Training Phase Playing Random Moves:


### Training Playing with X:

In [199]:
# Value table for X: maps (frozenset(x_cells), frozenset(o_cells)) to a value.
position_dictionary_with_values = defaultdict(float)

# Step size of the incremental update below.
epsilon = 0.001

for steps in tqdm(range(500_000)):

    # Both sides play at random; every visited state is credited with the
    # reward of the final position as seen by X.
    state_list = random_game()
    final_reward = state_value_X(state_list[-1])

    for pos in state_list:
        hashable_state = (frozenset(pos.x), frozenset(pos.o))

        # Move the stored value a fraction `epsilon` towards the final reward
        position_dictionary_with_values[hashable_state] += epsilon * (
            final_reward - position_dictionary_with_values[hashable_state]
        )


  0%|          | 230/500000 [00:00<03:37, 2298.31it/s]

100%|██████████| 500000/500000 [03:04<00:00, 2714.86it/s]


### Training Playing with O:

In [754]:
# X cell sets of games the O agent lost; filled by the O evaluation cell
# and consumed by PHASE 3 of the O training.
lost_games = []

In [560]:
def lost_game(moves):
    """Play one game in which X tries to replay `moves` and O moves at random.

    On each of its turns X takes the next entry of `moves` if that cell is
    still free; if it is taken, or `moves` is exhausted, X moves at random.
    An entry is consumed on every X turn, whether or not it could be played.

    Returns:
        A list with a snapshot of the position after every single move
        (X moves first); the last entry is the final position.
    """
    history = []
    board = State(set(), set())
    free = set(range(1, 10))

    # One scripted entry is drawn per X turn; None marks exhaustion and is
    # never a free cell, so it falls through to a random move.
    scripted = iter(list(moves))

    def record(cells, move):
        # Apply the move for the side owning `cells`, then snapshot the board
        cells.add(move)
        history.append(deepcopy(board))
        free.remove(move)

    while free:
        wanted = next(scripted, None)
        x_move = wanted if wanted in free else make_a_move_random(free)

        record(board.x, x_move)
        if win(board.x) or not free:
            break

        record(board.o, make_a_move_random(free))
        if win(board.o):
            break

    return history

#### PHASE 1

In [815]:
# Value table for O: maps (frozenset(x_cells), frozenset(o_cells)) to a value.
position_dictionary_with_values_O = defaultdict(float)

# Step size of the incremental update below.
epsilon = 0.001

for steps in tqdm(range(500_000)):

    # Both sides play at random; every visited state is credited with the
    # reward of the final position as seen by O.
    state_list = random_game()
    final_reward = state_value_O(state_list[-1])

    for pos in state_list:
        hashable_state = (frozenset(pos.x), frozenset(pos.o))

        # Move the stored value a fraction `epsilon` towards the final reward
        position_dictionary_with_values_O[hashable_state] += epsilon * (
            final_reward - position_dictionary_with_values_O[hashable_state]
        )

100%|██████████| 500000/500000 [02:46<00:00, 3000.80it/s]


#### PHASE 2

In [455]:
# NOTE(review): relies on `epsilon` and on both value tables as left by
# previously executed cells.
for steps in tqdm(range(100_000)):

    # X plays from its trained table while O moves at random; states are
    # scored with the larger-scale O reward.
    state_list = intelligent_game_X(position_dictionary_with_values)
    final_reward = state_value_O2(state_list[-1])

    for pos in state_list:
        hashable_state = (frozenset(pos.x), frozenset(pos.o))

        # Move the stored value a fraction `epsilon` towards the final reward
        position_dictionary_with_values_O[hashable_state] += epsilon * (
            final_reward - position_dictionary_with_values_O[hashable_state]
        )


  0%|          | 262/100000 [00:00<00:38, 2615.48it/s]

100%|██████████| 100000/100000 [00:27<00:00, 3655.41it/s]


#### PHASE 3

In [756]:
# De-duplicate the recorded losses. Each one is a set of X cells, so the
# order in which X originally played them is not preserved.
# NOTE(review): relies on `lost_games` and `epsilon` as left by previously
# executed cells; with an empty `lost_games` this loop does nothing.
lost_games_list = list({frozenset(cells) for cells in lost_games})

for game in tqdm(lost_games_list):

    for steps in range(100_000):

        # X replays the losing cells where possible, O moves at random
        state_list = lost_game(game)
        final_reward = state_value_O3(state_list[-1])

        for pos in state_list:
            hashable_state = (frozenset(pos.x), frozenset(pos.o))

            # Move the stored value a fraction `epsilon` towards the reward
            position_dictionary_with_values_O[hashable_state] += epsilon * (
                final_reward - position_dictionary_with_values_O[hashable_state]
            )



0it [00:00, ?it/s]


## Training Phase Playing Intelligent Moves:

### Training Playing with X:

In [530]:
# Value table for X learned while X already plays from this same table.
position_dictionary_with_values_AI = defaultdict(float)

# Step size of the incremental update below (ten times the one used for the
# random-play training).
epsilon = 0.01

for steps in tqdm(range(500_000)):

    # X picks moves from the table being trained, O moves at random
    state_list = intelligent_game_X(position_dictionary_with_values_AI)
    final_reward = state_value_X(state_list[-1])

    for pos in state_list:
        hashable_state = (frozenset(pos.x), frozenset(pos.o))

        # Move the stored value a fraction `epsilon` towards the final reward
        position_dictionary_with_values_AI[hashable_state] += epsilon * (
            final_reward - position_dictionary_with_values_AI[hashable_state]
        )


  0%|          | 0/500000 [00:00<?, ?it/s]

100%|██████████| 500000/500000 [02:42<00:00, 3070.65it/s]


## Games VS Random:

### AI Player with X:

In [394]:
# Tally 1000 games of the random-trained X agent against a random O player.
win_counter_against_random = 0
loss_counter_against_random = 0
draw_counter_against_random = 0

for game in tqdm(range(1000)):

    state_list = intelligent_game_X(position_dictionary_with_values)

    # Only the final position decides the outcome
    if win(state_list[-1].x):
        win_counter_against_random += 1
    elif win(state_list[-1].o):
        loss_counter_against_random += 1
    else:
        draw_counter_against_random += 1

print("Statistics against the random player over 1000 games (Using X):")
print(f"Wins: {win_counter_against_random}")
print(f"Loss: {loss_counter_against_random}")
print(f"Draws: {draw_counter_against_random}")

 35%|███▌      | 351/1000 [00:00<00:00, 3479.25it/s]

100%|██████████| 1000/1000 [00:00<00:00, 2136.10it/s]

Statistics against the random player over 1000 games (Using X):
Wins: 995
Loss: 0
Draws: 5





### AI Player with O:

In [884]:
# Tally 1000 games of the O agent against a random X player, keeping the
# X cells of every lost game for later inspection and retraining.
lost_games = []

win_counter_against_random = 0
loss_counter_against_random = 0
draw_counter_against_random = 0

for game in tqdm(range(1000)):

    state_list = intelligent_game_O(position_dictionary_with_values_O)

    # Only the final position decides the outcome
    if win(state_list[-1].o):
        win_counter_against_random += 1
    elif win(state_list[-1].x):
        loss_counter_against_random += 1
        lost_games.append(state_list[-1].x)
    else:
        draw_counter_against_random += 1

print("Statistics against the random player over 1000 games (Using O):")
print(f"Wins: {win_counter_against_random}")
print(f"Loss: {loss_counter_against_random}")
print(f"Draws: {draw_counter_against_random}")

print(lost_games)

100%|██████████| 1000/1000 [00:00<00:00, 2765.56it/s]

Statistics against the random player over 1000 games (Using O):
Wins: 894
Loss: 5
Draws: 101
[{8, 9, 2, 4}, {9, 2, 4, 6}, {9, 2, 4, 6}, {8, 1, 4, 6}, {8, 9, 2, 4}]





## Games VS Random with Intelligent Training:

In [531]:
# Tally 1000 games of the self-guided-trained X agent against a random O.
win_counter_against_random = 0
loss_counter_against_random = 0
draw_counter_against_random = 0

for game in tqdm(range(1000)):

    state_list = intelligent_game_X(position_dictionary_with_values_AI)

    # Only the final position decides the outcome
    if win(state_list[-1].x):
        win_counter_against_random += 1
    elif win(state_list[-1].o):
        loss_counter_against_random += 1
    else:
        draw_counter_against_random += 1

print("Statistics against the random player over 1000 games (Using X):")
print(f"Wins: {win_counter_against_random}")
print(f"Loss: {loss_counter_against_random}")
print(f"Draws: {draw_counter_against_random}")

100%|██████████| 1000/1000 [00:00<00:00, 1813.93it/s]

Statistics against the random player over 1000 games (Using X):
Wins: 654
Loss: 277
Draws: 69





## Games AI vs AI:

In [568]:
# Tally 1000 games between the trained X agent and the trained O agent.
win_counter_X = 0
loss_counter_X = 0
draw_counter = 0

for game in tqdm(range(1000)):

    state_list = intelligent_game(position_dictionary_with_values, position_dictionary_with_values_O)

    # Only the final position decides the outcome
    if win(state_list[-1].x):
        win_counter_X += 1
    elif win(state_list[-1].o):
        loss_counter_X += 1
    else:
        draw_counter += 1

print("Statistics AI vs AI:")
print(f"Wins X: {win_counter_X}")
print(f"Loss X: {loss_counter_X}")
print(f"Draws: {draw_counter}")

100%|██████████| 1000/1000 [00:00<00:00, 1569.63it/s]

Statistics AI vs AI:
Wins X: 0
Loss X: 0
Draws: 1000





## Games VS Random with Moves Printed:

### AI Player with X:

In [None]:
# Play one printed game with the agent as X, then announce the outcome.
state_list = intelligent_game_with_printing_X(position_dictionary_with_values)

if win(state_list[-1].x):
    print("X WON!")
elif win(state_list[-1].o):
    print("O WON!")
else:
    print("DRAW!")

### AI Player with O:

In [None]:
# Play one printed game with the agent as O, then announce the outcome.
state_list = intelligent_game_with_printing_O(position_dictionary_with_values_O)

if win(state_list[-1].x):
    print("X WON!")
elif win(state_list[-1].o):
    print("O WON!")
else:
    print("DRAW!")

## Test your skills against AI!
### P.S. Moves are entered as the numbers 1 to 9 of the Magic Square: each number selects the box holding that value in `MAGIC`, not the n-th box of the board!

In [259]:
def play_against_intelligent_player_X():
    """Interactive game: the trained agent plays X (first), the human plays O.

    The agent picks its moves from `position_dictionary_with_values`. The
    human types moves as magic-square values (see `MAGIC`); anything that is
    not the number of a free cell, including non-numeric text, is rejected
    and asked again. The board is printed after every move and the result is
    announced on the console.
    """
    pos = State(set(), set())
    available = set(range(1, 9+1))

    print("Start the game!")
    print_board(pos)

    while available:

        # IA Player turn
        print("IA to move...")
        move_intelligent = make_a_move_intelligent_X(available, pos, position_dictionary_with_values)
        pos.x.add(move_intelligent)
        available.remove(move_intelligent)
        print_board(pos)

        if win(pos.x):
            print("IA won!")
            break
        elif not available:
            print("Draw!")
            break

        # Human Player turn
        print("Your turn...")
        while True:
            try:
                move_human = int(input("Make your move (1-9): "))
            except ValueError:
                # Non-numeric input is just another invalid move, not a crash
                move_human = None
            if move_human in available:
                break
            print("Invalid move. Please try again.")

        pos.o.add(move_human)
        available.remove(move_human)
        print_board(pos)

        if win(pos.o):
            print("Congratulations! You won!")
            break
        elif not available:
            print("Draw!")
            break

    print("End of Game!")


In [260]:
def play_against_intelligent_player_O():
    """Interactive game: the human plays X (first), the trained agent plays O.

    The agent picks its moves from `position_dictionary_with_values_O`. The
    human types moves as magic-square values (see `MAGIC`); anything that is
    not the number of a free cell, including non-numeric text, is rejected
    and asked again. The board is printed after every move and the result is
    announced on the console.
    """
    pos = State(set(), set())
    available = set(range(1, 9+1))

    print("Start the game!")
    print_board(pos)

    while available:

        # Human Player turn
        print("Your turn...")
        while True:
            try:
                move_human = int(input("Make your move (1-9): "))
            except ValueError:
                # Non-numeric input is just another invalid move, not a crash
                move_human = None
            if move_human in available:
                break
            print("Invalid move. Please try again.")

        pos.x.add(move_human)
        available.remove(move_human)
        print_board(pos)

        if win(pos.x):
            print("Congratulations! You won!")
            break
        elif not available:
            print("Draw!")
            break

        # IA Player turn
        print("IA to move...")
        move_intelligent = make_a_move_intelligent_O(available, pos, position_dictionary_with_values_O)
        pos.o.add(move_intelligent)
        available.remove(move_intelligent)
        print_board(pos)

        if win(pos.o):
            print("IA won!")
            break
        elif not available:
            print("Draw!")
            break

    print("End of Game!")

In [None]:
# Start an interactive console game in which the agent plays X and moves first.
play_against_intelligent_player_X()

In [None]:
# Start an interactive console game in which the human plays X and moves first.
play_against_intelligent_player_O()