In [1]:
from tictactoe import Tictactoe
from minimax import minimax
from typing import List
import random
import pickle
import os
from tqdm import tqdm

In [3]:
class Qtable:
    """Tabular Q-value store: maps a state key to a list of nine action values.

    Attributes:
        qtable: dict from state key to a 9-element list of Q-values, indexed
            by action.
        alpha: learning rate used by ``update_value``.
        gamma: discount factor used by ``update_value``.
    """

    def __init__(self, alpha = 0.3, gamma = 0.95):
        self.qtable = dict()
        self.alpha = alpha
        self.gamma = gamma

    def set_value(self, state, action, value) -> None:
        """Store ``value`` for ``action`` in ``state``, creating a zeroed row if the state is new."""
        self.qtable.setdefault(state, [0] * 9)[action] = value

    def get_values(self, state) -> List:
        """Return the nine action values for ``state``.

        For a stored state this is the stored list itself (not a copy), so
        mutating it mutates the table. For an unknown state a fresh list of
        zeros is returned and the table is left unchanged.
        """
        return self.qtable.get(state, [0] * 9)

    def update_value(self, q0, q1, state, action, reward):
        """Apply one temporal-difference update to ``state``/``action``.

        Args:
            q0: action values of ``state``; ``q0[action]`` is overwritten in place.
            q1: action values of the successor state; only ``max(q1)`` is used.
            state: key under which the new value is stored.
            action: index of the action that was taken.
            reward: immediate reward for the transition.
        """
        q0[action] = q0[action] + self.alpha * (reward + self.gamma * max(q1) - q0[action])
        self.set_value(state, action, q0[action])

    def __str__(self):
        """Return one ``state: values`` line per stored state."""
        return ''.join(f'{key}: {values}\n' for key, values in self.qtable.items())

    def __repr__(self):
        """Same text as ``__str__`` so a bare notebook expression shows the table."""
        return self.__str__()

    def __len__(self):
        """Return the number of states stored in the table."""
        return len(self.qtable)

    def save(self, name = 'qtable'):
        """Pickle the underlying dict to the file ``name``."""
        with open(name, 'wb') as f:
            pickle.dump(self.qtable, f)

    def load(self, name = 'qtable'):
        """Replace the table with the dict pickled in the file ``name``.

        WARNING: unpickling can execute arbitrary code; only load files that
        were produced by ``save`` from a trusted source.
        """
        # Use a context manager so the file handle is closed even if
        # unpickling raises.
        with open(name, 'rb') as f:
            self.qtable = pickle.load(f)

In [35]:
#Agent stuff

def get_reward(result, player):
    """Translate a game result into a reward from ``player``'s point of view.

    A result of 1 or -1 is scored +1 when it favours ``player`` (player 1
    benefits from result 1, any other player value from result -1) and -1
    otherwise. A result of 0 scores 0.5 for either player, and any other
    result value scores 0.
    """
    is_first_player = player == 1

    if result == 1:
        return 1 if is_first_player else -1
    if result == -1:
        return -1 if is_first_player else 1
    if result == 0:
        return 0.5
    return 0
        
def choose_action(state, moves, qtable, epsilon = 0.7) -> int:
    """Pick a move with an epsilon-greedy policy.

    Args:
        state: key used to look up action values in ``qtable``.
        moves: sequence of candidate action indices.
        qtable: object exposing ``get_values(state)`` that returns a sequence
            indexable by every entry of ``moves``.
        epsilon: probability of exploring (choosing uniformly from ``moves``).

    Returns:
        A random element of ``moves`` when exploring; otherwise the first move
        with the strictly highest value. When exploiting with an empty
        ``moves`` the result is ``None``.

    Raises:
        IndexError: when exploring and ``moves`` is empty (from ``random.choice``).
    """
    if random.uniform(0, 1) < epsilon:
        # Exploration: ignore the learned values entirely.
        return random.choice(moves)

    # Exploitation: the row of values does not depend on the candidate move,
    # so fetch it once instead of once per move.
    values = qtable.get_values(state)
    best_value = -float("inf")
    best_move = None
    for move in moves:
        value = values[move]
        # Strict comparison keeps the earliest move on ties.
        if value > best_value:
            best_value = value
            best_move = move
    return best_move


In [42]:
player1 = 1
player2 = -1

# Create the two Q-tables only when the kernel does not already hold them:
# the cell then runs on a fresh kernel, while re-running it keeps training
# the existing tables instead of discarding them.
if 'qtableA' not in globals():
    qtableA = Qtable()
if 'qtableB' not in globals():
    qtableB = Qtable()

# Probability that player 2 answers with a minimax move instead of a uniformly
# random one. 0.0 means the opponent always plays randomly.
minimax_probability = 0.0

epsilon = 0.7
agent_wins = 0
agent_draws = 0
minimax_wins = 0
minimax_draws = 0
episodes = 100000
for episode in tqdm(range(episodes)):
    board = Tictactoe()
    history = []
    while True:
        # Player 1 (the learning agent): one table is picked at random to
        # choose and learn the move; the other supplies the successor values.
        a_or_b = random.choice([1, 0])
        learner, other = (qtableA, qtableB) if a_or_b else (qtableB, qtableA)
        current_state = board.get_state()
        q0 = learner.get_values(current_state)
        action = choose_action(current_state, board.available_moves(), learner, epsilon)
        board.move(action, player1)
        next_state = board.get_state()
        # NOTE(review): next_state is the position right after player 1's
        # move, while this loop only writes values for positions in which
        # player 1 is about to move -- q1 may therefore always be zeros.
        # Confirm whether the position after player 2's reply was intended.
        q1 = other.get_values(next_state)
        history.append((current_state, next_state, action))
        result = board.check()
        reward = get_reward(result, player1)
        learner.update_value(q0, q1, current_state, action, reward)
        if result != 3:
            if result == 1:
                # Replay the episode backwards through the table that made the
                # winning move, feeding each updated value back as the reward
                # of the preceding step.
                while history:
                    current_state, next_state, action = history.pop()
                    q0 = learner.get_values(current_state)
                    q1 = other.get_values(next_state)
                    learner.update_value(q0, q1, current_state, action, reward)
                    reward = q0[action]
                agent_wins += 1
            if result == 0:
                agent_draws += 1
            break

        # Player 2 (the opponent)
        if random.random() < minimax_probability:
            action = minimax(board, player2)[0]
        else:
            action = random.choice(board.available_moves())
        board.move(action, player2)
        result = board.check()
        if result != 3:
            # NOTE(review): no Q-value update happens when player 2 ends the
            # game, so this loop never applies a losing reward to the agent --
            # confirm whether that is intended.
            if result == -1:
                minimax_wins += 1
            if result == 0:
                minimax_draws += 1
            break
    # Decay exploration after every episode, never going below 0.1.
    epsilon = max(0.1, 0.999*epsilon)

100%|████████████████████████████████| 100000/100000 [00:07<00:00, 12741.29it/s]


In [43]:
# Show qtableA: as the cell's last expression it is rendered through its repr.
qtableA

000000000: [0.12564583331590795, 0.12402820723823307, 0.1093457570612881, 0.12332795155420377, 0.0742735136864719, 0.11713631149147814, 0.0769692749421583, 0.10190603977076555, 0.1955924246941814]
1000-10000: [0, 0.5641944414827091, 0.5043184246992665, 0.3301615544370188, 0, 0.1635714143520921, 0.407421414907588, 0.1566658582406684, 0.07372523923794394]
1-100-10001: [0, 0, 0.0919466966672762, 0.0889445539752334, 0, 0.2286404465993151, 0.2678542915749035, 0.1421216111799431, 0]
1-1-10-11001: [0, 0, 0, 0.0, 0, 0, 0.28202999999957806, 0.21902999396984205, 0]
100-100000: [0, 0.5643030122149149, 0.43806513012145654, 0, 0.391525326447687, 0.1196653202147224, 0.1453127632553496, 0.2652397960201398, 0.42225079880217686]
001000-100: [0.49471920853858464, 0.3815207832741668, 0, 0.03034322055451439, 0.2626477029351722, 0.34392326774646764, 0, 0.2962802973558273, 0.393755843443049]
00001-1000: [0.3613712562138477, 0.3895382320953347, 0.4064879843321393, 0.2903160532280222, 0, 0, 0.4045625767136496

In [44]:
# Show qtableB: as the cell's last expression it is rendered through its repr.
qtableB

000000000: [0.28821505440079165, 0.3346860836283109, 0.3299550877040048, 0.2727663903581534, 0.3430475880690146, 0.3233726751979194, 0.38891279653409666, 0.3166490963562205, 0.3061896540849794]
110-10-1000: [0, 0, 0.9999999999999999, 0, 0.3329015902642152, 0, 0.5024746206743773, 0.8623835182346521, 0.4470432014238638]
100-100000: [0, 0.37780586145636214, 0.2711076117501181, 0, 0.39654339880877576, 0.19308686248286355, 0.22914711151582198, 0.19112883132091904, 0.34088813858762074]
10100-1-100: [0, 0.9999999999999999, 0, 0.524579759554561, 0.31183383018018235, 0, 0, 0.29895250570727855, 0.6735889969484818]
001000-100: [0.3648985325853763, 0.3966576698282397, 0, 0.18336448827904875, 0.29786631915938977, 0.23651875289900443, 0, 0.05928099695216217, 0.4035201473599912]
0000001-10: [0.4076319169961833, 0.4383660076407494, 0.5093675469184122, 0.28348720950659656, 0.5503501427764499, 0.10764131640743711, 0, 0, 0.25555778895326187]
0-110001-10: [0.47136777947708464, 0, 0, 0.350444330781453, 0.9

In [45]:
# All recorded outcomes; shared denominator of both percentages below.
total_games = agent_wins + minimax_wins + agent_draws + minimax_draws

# Agent (player 1) results.
print(f'agent wins: {agent_wins}')
print(f'agent draws: {agent_draws}')
print(f'agent win%: {(agent_wins/total_games)*100}')

# Opponent (player 2) results.
print(f'minimax wins: {minimax_wins}')
print(f'minimax draws: {minimax_draws}')
print(f'minimax win%: {(minimax_wins/total_games)*100}')

agent wins: 91365
agent draws: 2540
agent win%: 91.365
minimax wins: 6095
minimax draws: 0
minimax win%: 6.095


In [46]:
# Size of qtableA as reported by len() -- presumably the number of stored states; qtableA is created outside this cell.
len(qtableA)

2364

In [47]:
# Size of qtableB as reported by len() -- presumably the number of stored states; qtableB is created outside this cell.
len(qtableB)

2371