In [1]:
import numpy as np
from itertools import permutations
from random import choice
from collections import defaultdict
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ALFA = 0.001
LEARNING_RATE = 0.1
CUT_DOWN = 0.5

In [3]:
def get_state(moves1: list[int], moves2: list[int]) -> str:  #get the string representing the state
    string_state = ""
    for m in moves1:
        string_state += str(m)
    string_state += '|'
    for m in moves2:
        string_state += str(m)
    return string_state

Q = defaultdict(lambda: defaultdict(lambda: 0.0))

In [4]:
class Player():
    def __init__(self):
        self.moves = []

    def make_move(self, available_moves):
        move = choice(available_moves)
        self.moves.append(move)
        return move
    
class MyPlayer(Player):
    def __init__(self, Q):
        super().__init__()
        self.Q = Q

    def make_move(self, available_moves: list[int]):
        # first I need to see which moves are already made by the opponent
        opponent_moves = list(set(list(range(9))) - set(available_moves) - set(self.moves))
        state = get_state(sorted(self.moves), sorted(opponent_moves))

        move = max(available_moves, key = lambda e: self.Q[state][str(e)])
        self.moves.append(move)
        return move

class TicTacToe():
    def __init__(self):
        self.board = np.array([[1, 6, 5], [8, 4, 0], [3, 2, 7]])
        self.available_moves = list(range(9))
        self.current_player = "X"

    def check_winner(self, player_moves):  # checks if a player has won the game
        if sum(sum(h) == 12 for h in permutations(player_moves, 3)):
            return True
        return False

    def play_game(self, players: list[Player], train=True, my_player=0):
        current_player = 0
        n_moves = 0
        if my_player == 1:
            n_moves += 1
            init_state = get_state(sorted(players[0].moves), sorted(players[1].moves)) # state before the player performs the move 
            move = players[current_player].make_move(self.available_moves)
            self.available_moves.remove(move)
            current_player = 1
        while len(self.available_moves) > 0:
            n_moves += 1
            init_state = get_state(sorted(players[0].moves), sorted(players[1].moves)) # state before the player performs the move 
            move = players[current_player].make_move(self.available_moves)
            self.available_moves.remove(move)
            next_state = get_state(sorted(players[0].moves), sorted(players[1].moves)) # state after the player has done its move
            reward = 1 if self.check_winner(players[current_player].moves) else 0
            # print(f"#{n_moves}, init_state: {init_state}, move: {move}")
            if reward == 1:
                if train:
                    Q[init_state][str(move)] = (1 - LEARNING_RATE) * Q[init_state][str(move)] + LEARNING_RATE * (reward + CUT_DOWN * max(Q[next_state].values(), default=0))
                return current_player
            
            if len(self.available_moves) == 0:
                Q[init_state][str(move)] = (1 - LEARNING_RATE) * Q[init_state][str(move)] + LEARNING_RATE * (reward + CUT_DOWN * max(Q[next_state].values(), default=0)) # reward is 0
                return None

            current_player = 1 - current_player
            n_moves += 1
            move = players[current_player].make_move(self.available_moves)
            self.available_moves.remove(move)
            reward = -1 if self.check_winner(players[current_player].moves) else 0
            if train:
                # print(f"#{n_moves}, init_state: {next_state}, move: {move}")
                Q[init_state][str(move)] = (1 - LEARNING_RATE) * Q[init_state][str(move)] + LEARNING_RATE * (reward + CUT_DOWN * max(Q[next_state].values(), default=0))
            if reward == -1:
                return current_player
            current_player = 1 - current_player
            
        return None


In [5]:
for _ in tqdm(range(1_000_000)):
   g = TicTacToe()
   ply1 = Player()
   ply2 = Player()
   # g.play_game([ply1, ply2], my_player=0)
   # g = TicTacToe()
   g.play_game([ply1, ply2], my_player=1)

  0%|          | 0/1000000 [00:00<?, ?it/s]

100%|██████████| 1000000/1000000 [00:58<00:00, 17188.00it/s]


In [6]:
n_wins = 0
n_draws = 0
n_losses = 0

for _ in range(100):
    mp = MyPlayer(Q)
    opp = Player()
    g = TicTacToe()
    winner = g.play_game([mp, opp], train=False, my_player=0)
    if winner is None:
        n_draws += 1
    elif winner == 0:
        n_wins += 1
    else:
        n_losses += 1
print(f"{n_wins}, {n_draws}, {n_losses}")


57, 34, 9


In [7]:
n_wins = 0
n_draws = 0
n_losses = 0

for _ in range(100):
    mp = MyPlayer(Q)
    opp = Player()
    g = TicTacToe()
    winner = g.play_game([opp, mp], train=False, my_player=1)
    if winner is None:
        n_draws += 1
    elif winner == 1:
        n_wins += 1
    else:
        n_losses += 1
print(f"{n_wins}, {n_draws}, {n_losses}")

11, 46, 43
