# Artificial Neural Networks (CS-456)
## Miniproject 1: Tic Tac Toe
- Mickaël Achkar (322190)
- Yehya El Hassan (325932)


In [211]:
import numpy as np
from tic_env import TictactoeEnv, OptimalPlayer
from typing import Dict, List
import hashlib
import random
from tqdm import tqdm

In [212]:
# Single game environment instance; it is handed to train_rl_agent further down.
environment = TictactoeEnv()

In [213]:
class RlAgent():
    """Epsilon-greedy tabular agent for a 3x3 Tic Tac Toe board.

    The Q-table maps a state key (as produced by ``get_state_from_board``) to
    an array of 9 action values, one per board cell. Cell ``i`` sits at row
    ``i // 3`` and column ``i % 3``.

    Args:
        player: Symbol the agent plays with.
        epsilon: Probability of picking a random legal move in ``act``.
        learning_rate: Step size of the temporal-difference update.
        discount_factor: Weight given to the bootstrapped next value.
    """

    def __init__(self, player: str, epsilon: float, learning_rate: float = 0.05,
                 discount_factor: float = 0.99):
        self.player = player
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor

        # Q-table, filled lazily the first time each state is observed.
        self.q_table: Dict[str, np.ndarray] = {}

        # Indices (0 to 8) of the cells that are empty on the latest board.
        self.list_of_possible_actions: List[int] = []

        # Latest observation; all stay unset until the first `update` call.
        self.board = None
        self.current_state = None
        self.current_action = None

        # Most recent reward received from the environment.
        self.reward = 0

    def update(self, board, reward):
        """Record the latest board and reward received from the environment.

        Also registers the board's state key in the Q-table and refreshes the
        list of legal actions.
        """
        self._update_board(board)
        self._update_current_state(get_state_from_board(board))
        self._update_reward(reward)
        self._update_list_of_possible_actions()

    def update_q_table(self, previous_state, previous_action, current_action):
        """Apply one temporal-difference update to a previously played move.

        Args:
            previous_state: Board (not the hashed key) on which
                ``previous_action`` was played. It must already have been
                observed through ``update``.
            previous_action: Cell index (0 to 8) that was played on that board.
            current_action: Cell index whose value in the current state is
                used as the bootstrap term.
        """
        state_key = get_state_from_board(previous_state)
        self._apply_td_update(state_key, previous_action, current_action)

    def _apply_td_update(self, state_key, action, next_action):
        # Q(s, a) <- Q(s, a) + lr * (r + gamma * Q(s', a') - Q(s, a)).
        # Subtracting the old estimate is what makes the value converge to the
        # target instead of accumulating without bound.
        q_values = self.q_table[state_key]
        bootstrap = self.q_table[self.current_state][next_action]
        target = self.reward + self.discount_factor * bootstrap
        q_values[action] += self.learning_rate * (target - q_values[action])

    def act(self):
        """Return a cell index: random with probability epsilon, else greedy."""
        # Use the agent's own exploration rate, not a module-level variable.
        if self.epsilon > random.uniform(0, 1):
            return self._choose_random_action()
        return self._choose_best_action()

    def _update_board(self, board):
        # Keep the latest board configuration handed over by the game.
        self.board = board

    def _update_current_state(self, current_state):
        # Unseen states start with a value of zero for every action.
        if current_state not in self.q_table:
            self.q_table[current_state] = np.zeros(9)
        self.current_state = current_state

    def _update_current_action(self, current_action):
        self.current_action = current_action

    def _update_list_of_possible_actions(self):
        # A cell is playable when the board holds 0 at its (row, column).
        self.list_of_possible_actions = [
            cell for cell in range(9) if self.board[divmod(cell, 3)] == 0
        ]
        return self.list_of_possible_actions

    def _update_reward(self, reward):
        # Remember the most recent reward for the next Q-table update.
        self.reward = reward

    def _choose_best_action(self):
        # Only legal actions are considered; ties are broken uniformly at random.
        values = self.q_table[self.current_state][self.list_of_possible_actions]
        best_positions = np.flatnonzero(values == values.max())
        return self.list_of_possible_actions[np.random.choice(best_positions)]

    def _choose_random_action(self):
        return np.random.choice(self.list_of_possible_actions)


def get_state_from_board(board):
     # Convert the Board configuration (Matrix) into a unique key for the state
    return hashlib.sha1(board).hexdigest()


def logger(winner, player_1, player_2):
    """Print a short end-of-game report: the winner and each player's symbol."""
    report = (
        '-------------------------------------------',
        f'Game end, winner is player {winner!s}',
        f'Optimal player 1 = {player_1!s}',
        f'RL Agent player 2 = {player_2!s}',
    )
    for line in report:
        print(line)
    
def choose_players(index):
    """Alternate the symbols by episode parity.

    Returns a ``(player_1, player_2)`` pair: ``('X', 'O')`` when ``index`` is
    even and ``('O', 'X')`` when it is odd.
    """
    is_even = index % 2 == 0
    return ('X', 'O') if is_even else ('O', 'X')

def initialize_rl_moves():
    """Return the starting value (zero) for the agent's per-game move counter."""
    return 0

Implementing a Tic Tac Toe player using Q-learning. To do so, we create a Q-table representing all the possible states and actions, and we progressively update the values in the table.

In [214]:
def train_rl_agent(environment: TictactoeEnv,number_of_episodes: int, optimal_level : float, epsilon:float, verbose: bool = False):
    """Train an ``RlAgent`` by playing games against an ``OptimalPlayer``.

    The two players swap symbols every episode (see ``choose_players``).

    Args:
        environment: Game environment; it is reset at the start and at the
            end of every episode.
        number_of_episodes: Number of games to play.
        optimal_level: Passed to ``OptimalPlayer`` as its ``epsilon``.
        epsilon: Exploration rate handed to the ``RlAgent``.
        verbose: When true, log the result and render the board after each game.

    Returns:
        A ``(player_rl_agent, number_of_rl_wins)`` tuple: the trained agent
        and the number of episodes whose winner was the agent.
    """
    number_of_rl_wins = 0
    player_optimal = OptimalPlayer(epsilon=optimal_level, player="X")
    player_rl_agent = RlAgent(epsilon=epsilon, player="O")

    for episode in tqdm(range(number_of_episodes)):
        environment.reset()
        grid, _, __ = environment.observe()

        player_1, player_2 = choose_players(index=episode)
        player_optimal.player = player_1
        player_rl_agent.player = player_2

        # Most recent (board, move) played by the agent in this episode whose
        # Q-value is still waiting for its follow-up observation.
        pending_state = None
        pending_move = None
        rl_move = None

        # A game lasts at most 9 moves.
        for _turn in range(9):
            player_rl_agent.update(grid, environment.reward(player=player_2))

            if environment.current_player == player_optimal.player:
                move = player_optimal.act(grid)
            else:
                rl_move = player_rl_agent.act()

                # Every agent move closes the transition opened by its previous
                # move, so each played (state, action) pair gets updated, not
                # only every other one.
                if pending_state is not None:
                    player_rl_agent.update_q_table(pending_state, pending_move, rl_move)
                pending_state, pending_move = grid, rl_move

                move = (int(rl_move/3), rl_move%3)

            grid, end, winner = environment.step(move, print_grid=False)

            if end:
                # Feed the final board and reward to the agent, then credit
                # its last move with that outcome.
                player_rl_agent.update(grid, environment.reward(player=player_2))
                if pending_state is not None:
                    player_rl_agent.update_q_table(pending_state, pending_move, rl_move)

                if winner == player_rl_agent.player:
                    number_of_rl_wins += 1
                if verbose:
                    logger(winner, player_1, player_2)
                    environment.render()

                environment.reset()
                break
    return player_rl_agent, number_of_rl_wins


In [215]:
# RL Hyper-params
# number_of_episodes: how many games are played during training.
# epsilon: exploration rate passed to the RL agent.
number_of_episodes = 20000
epsilon = 0.1

# Train against an OptimalPlayer created with epsilon=0.5 (the optimal_level argument).
player_rl_agent, number_of_rl_wins = train_rl_agent(environment, number_of_episodes=number_of_episodes, optimal_level=0.5,epsilon=epsilon)

100%|██████████| 20000/20000 [01:01<00:00, 326.41it/s]


In [None]:
# Inspect the training outcome: number of games won by the RL agent,
# then the learned Q-table (state key -> array of 9 action values).
print(number_of_rl_wins)
print(player_rl_agent.q_table)
