In [1]:
from tictactoe import Tictactoe
from minimax import minimax
from typing import List
import random
import pickle
import os
from tqdm import tqdm

In [59]:
class Qtable:
    """Tabular Q-value store keyed by board state.

    Every stored state maps to a list of 9 Q-values, one per action index
    0-8. States that were never written read back as a list of zeros.

    Attributes:
        qtable: dict mapping state -> list of 9 Q-values.
        alpha: learning rate used by ``update_value``.
        gamma: discount factor used by ``update_value``.
    """

    def __init__(self, alpha = 0.3, gamma = 0.95):
        self.qtable = dict()
        self.alpha = alpha
        self.gamma = gamma

    def set_value(self, state, action, value) -> None:
        """Store ``value`` for ``(state, action)``, creating the row if needed."""
        self.qtable.setdefault(state, [0 for i in range(9)])[action] = value

    def get_values(self, state) -> List:
        """Return the 9 Q-values of ``state``.

        For a stored state this is the live list held by the table; for an
        unknown state it is a fresh all-zero list that is NOT inserted.
        """
        return self.qtable.get(state, [0 for i in range(9)])

    def update_value(self, q0, q1, state, action, reward):
        """Apply one Q-learning step to ``(state, action)``.

        Args:
            q0: Q-values of ``state``; ``q0[action]`` is overwritten in place
                with the new estimate, so callers can read it back afterwards.
            q1: Q-values of the successor state; only ``max(q1)`` is used.
            state: key under which the new value is stored.
            action: index (0-8) of the action being updated.
            reward: immediate reward for taking ``action``.
        """
        td_target = reward + self.gamma * max(q1)
        q0[action] = q0[action] + self.alpha * (td_target - q0[action])
        self.set_value(state, action, q0[action])

    def _render(self) -> str:
        """Format the table as one ``state: values`` line per stored state."""
        return ''.join(f'{key}: {values}\n' for key, values in self.qtable.items())

    def __str__(self):
        return self._render()

    def __repr__(self):
        return self._render()

    def __len__(self):
        """Number of states that have at least one stored value."""
        return len(self.qtable)

    def save(self, name = 'qtable'):
        """Pickle the underlying dict to the file ``name``."""
        with open(name, 'wb') as f:
            pickle.dump(self.qtable, f)

    def load(self, name = 'qtable'):
        """Replace the table with the dict pickled in the file ``name``.

        WARNING: unpickling can execute arbitrary code; only load files
        produced by ``save`` from a trusted source.
        """
        # Context manager so the file handle is closed even if unpickling fails.
        with open(name, 'rb') as f:
            self.qtable = pickle.load(f)

In [65]:
# Agent helpers: reward mapping and epsilon-greedy action selection

def get_reward(result, player):
    """Translate a game result into the reward seen by ``player``.

    Args:
        result: game status code. 1 and -1 are decisive results, 0 is a
            draw; any other value yields no reward.
        player: 1 for the side that benefits from result 1; every other
            value is treated as the side that benefits from result -1.

    Returns:
        2 when the decisive result favours ``player``, -2 when it favours
        the opponent, 0.5 for a draw and 0 otherwise.
    """
    if result == 0:
        return 0.5
    if result not in (1, -1):
        return 0

    # The result favours `player` when both refer to the same side.
    favours_player = (result == 1) == (player == 1)
    return 2 if favours_player else -2
        
def choose_action(state, moves, qtable, epsilon = 0.7) -> int:
    """Pick a move with an epsilon-greedy policy.

    Args:
        state: key used to look up Q-values in ``qtable``.
        moves: candidate action indices to choose from.
        qtable: object exposing ``get_values(state)`` that returns a
            sequence indexable by every entry of ``moves``.
        epsilon: probability of exploring (choosing a uniformly random move).

    Returns:
        A random element of ``moves`` when exploring; otherwise the move with
        the highest Q-value. On ties the LAST maximal move in ``moves`` wins.
        When exploiting with an empty ``moves`` the result is ``None``.

    Raises:
        IndexError: when exploring with an empty ``moves``.
    """
    if random.uniform(0, 1) < epsilon:
        # Exploration.
        return random.choice(moves)

    # Exploitation: fetch the row once instead of once per candidate move.
    values = qtable.get_values(state)
    best_value = -float("inf")
    best_move = None
    for move in moves:
        value = values[move]
        if value >= best_value:
            best_value = value
            best_move = move
    return best_move

In [71]:
player1 = 1
player2 = -1

# Create the Q-tables only when they do not exist yet: a fresh kernel
# (Restart & Run All) gets new tables, while re-running this cell keeps
# training the tables that are already in memory.
if "qtableX" not in globals():
    qtableX = Qtable(alpha = 0.9, gamma = 0.95)
if "qtableO" not in globals():
    qtableO = Qtable(alpha = 0.9, gamma = 0.95)


def _play_turn(board, qtable, history, player, epsilon):
    """Play one epsilon-greedy move for ``player`` and apply the immediate Q-update.

    The transition ``(state, next_state, action)`` is appended to ``history``.

    Returns:
        ``(result, reward)``: the value of ``board.check()`` after the move and
        the reward ``get_reward`` assigned to it for ``player``.
    """
    current_state = board.get_state()
    q0 = qtable.get_values(current_state)
    action = choose_action(current_state, board.available_moves(), qtable, epsilon)
    board.move(action, player)
    next_state = board.get_state()
    q1 = qtable.get_values(next_state)
    history.append((current_state, next_state, action))
    result = board.check()
    reward = get_reward(result, player)
    qtable.update_value(q0, q1, current_state, action, reward)
    return result, reward


def _replay_win(qtable, history, reward):
    """Walk ``history`` backwards, re-updating every recorded move of the winner.

    The first update uses ``reward``; every earlier move is then updated with
    the freshly written Q-value of the move that followed it. ``history`` is
    emptied in the process.
    """
    while history:
        current_state, next_state, action = history.pop()
        q0 = qtable.get_values(current_state)
        q1 = qtable.get_values(next_state)
        qtable.update_value(q0, q1, current_state, action, reward)
        # update_value wrote the new estimate into q0[action]; pass it on.
        reward = q0[action]


epsilon = 0.8
X_wins = 0
X_draws = 0
O_wins = 0
O_draws = 0
episodes = 100000
for episode in tqdm(range(episodes)):
    board = Tictactoe()
    historyX = []
    historyO = []
    # A result of 3 keeps the game going; any other value ends the episode.
    # NOTE(review): rewards are only computed right after a player's own move,
    # so the table of the side that did not make the final move gets no update
    # for the final result - confirm this is intended.
    while True:
        # player X
        result, reward = _play_turn(board, qtableX, historyX, player1, epsilon)
        if result != 3:
            if result == 1:
                _replay_win(qtableX, historyX, reward)
                X_wins += 1
            if result == 0:
                X_draws += 1
            break

        # player O
        result, reward = _play_turn(board, qtableO, historyO, player2, epsilon)
        if result != 3:
            if result == -1:
                _replay_win(qtableO, historyO, reward)
                O_wins += 1
            if result == 0:
                O_draws += 1
            break
    # Decay exploration after every episode, never dropping below 0.1.
    epsilon = max(0.1, 0.999*epsilon)

100%|█████████████████████████████████| 100000/100000 [00:11<00:00, 8769.29it/s]


In [72]:
# Display X's learned Q-table via Qtable.__repr__: one "state: [9 Q-values]"
# line per stored state.
qtableX

000000000: [0.15024943373044475, 0.0013123782015189312, 0.01651404958677402, 0.0013388724294211007, 1.4907275689419834, 0.016376998084828204, 0.001636365272740378, 0.00014611091072740524, 0.0013388431079024935]
0-10000001: [1.6234361999819753e-13, 0, 0.018181818181818188, 0.0014877520510910708, 1.6365141835021506, 1.8181818018150614e-17, 0.16499471074380168, 1.8181818181818181, 0]
0-1-1000011: [1.8181818181818181, 0, 0, 0.0, 0.17999999999999994, 0.1799999982, 2.0, 0, 0]
1-1-100-1011: [0, 0, 0, 0, 0, 0, 1.99999998, 0, 0]
0000-10001: [1.6200003239999961e-12, 1.6216362129599714e-17, 1.81619999999999e-24, 0.001487603453206629, 0, 1.6201620180000113e-13, 1.472728910591635e-12, 1.8162001800179918e-14, 0]
0000-10-111: [0.0, 0.0, 1.8180000000179966e-14, 0.0, 0, 1.7999999999999827e-32, 0, 0, 0]
0-100-11-111: [1.818181817999982, 0, 2.0, 0.00017819999999999945, 0, 0, 0, 0, 0]
-1-101-11-111: [0, 0, 2.0, 0, 0, 0, 0, 0, 0]
0000000-11: [1.8163639654545456, 0.14889719172036364, 1.8181818181818181, 1.5

In [73]:
# Display O's learned Q-table via Qtable.__repr__: one "state: [9 Q-values]"
# line per stored state.
qtableO

000000001: [1.487618033072534e-11, 1.636363489076327e-42, 1.6362000016199976e-05, 1.4876032807621419e-24, 1.5026282480841469, 1.4890909090909128e-26, 1.6214727272743423e-08, 1.6200000000016073e-38, 0]
0-10000011: [1.636363636054353e-39, 0, 1.79999999999999e-12, 1.636199999999982e-39, 1.6363636216363375e-42, 0.0, 1.6362163621621466e-39, 0, 0]
1-1-1000011: [0, 0, 0, 0.0, 0.0, 0.0, 0.0017999999999999995, 0, 0]
0000-10011: [1.636362163636204e-22, 0.0, 1.800000000180164e-22, 1.7999999999999893e-23, 0, 1.7999999999999876e-25, 0.01800016200000179, 0, 0]
0000-11-111: [0.0, 1.80000018, 2.0, 0.0, 0, 0, 0, 0, 0]
0-101-11-111: [0.0, 0, 2.0, 0, 0, 0, 0, 0, 0]
0010000-11: [1.6199999999999296e-131, 1.8000000000001572e-55, 0, 1.6361967599999303e-147, 1.8, 0.0, 0.0, 0, 0]
01100-10-11: [0.0, 0, 0, 0, 0, 0, 0.0, 0, 0]
01110-1-1-11: [0.0, 0, 0, 0, 0.0, 0, 0, 0, 0]
010000000: [1.6363652578380102e-17, 0, 1.6363636525785309e-16, 1.458000001623062e-15, 1.6363651079305732e-16, 1.636363636352305e-16, 1.47433126

In [74]:
# Report every outcome counter as an absolute count and as a percentage of
# all training episodes.
for label, count in (
    ("X wins", X_wins),
    ("X draws", X_draws),
    ("O wins", O_wins),
    ("O draws", O_draws),
):
    print(f"{label}: {count}, {round((count/episodes)*100, 2)}%")

X wins: 82641, 82.64%
X draws: 2859, 2.86%
O wins: 14500, 14.5%
O draws: 0, 0.0%


In [75]:
# Show the exploration rate left after training; the per-episode decay in the
# training cell never lets it drop below 0.1.
epsilon

0.1