In [2]:
from tictactoe import Tictactoe
from minimax import minimax
from typing import List
import random
import pickle
import os
from tqdm import tqdm

In [3]:
class Qtable:
    def __init__(self):
        self.qtable = dict()
        self.alpha = 0.9
        self.gamma = 0.9

    def set_value(self, state, action, value) -> None:
        self.qtable.setdefault(state, [0 for i in range(9)])[action] = value 

    def get_values(self, state) -> List:
        return self.qtable.get(state, [0 for i in range(9)])

    def update_value(self, q0, q1, state, action, reward):
        q0[action] = q0[action] + self.alpha * (reward + self.gamma * max(q1) - q0[action])
        self.set_value(state, action, q0[action])

    def save_qtable(self):
        pass

    def load_qtable(self):
        pass


    def __str__(self):
        table = ''
        for key in self.qtable:
            table += f'{key}: {self.qtable.get(key, [0 for i in range(9)])}\n'
        return table

In [4]:
def get_reward(result, player):
    if player == 1:
        if result == 1:
            return 1
        elif result == -1:
            return -1
        elif result == 0:
            return 0.5
        else:
            return 0
    else:
        if result == 1:
            return -1
        elif result == -1:
            return 1
        elif result == 0:
            return 0.5
        else:
            return 0

In [13]:
def choose_action(state, moves, qtable, epsilon = 0.7) -> int:
    if random.uniform(0, 1) < epsilon:
        #print("exploration")
        return random.choice(moves)
    else:
        #print("exploitation")
        best_value = -float("inf")
        best_move = None
        for move in moves:
            value = qtable.get_values(state)[move]
            if value > best_value:
                best_value = value
                best_move = move
        return best_move

In [110]:
board = Tictactoe()
qtableX = Qtable()
qtableO = Qtable()
player1 = 1
player2 = -1

In [120]:
qtableX = Qtable()
qtableO = Qtable()
player1 = 1
player2 = -1

epsilon = 0.9

episodes = 500000
for episode in tqdm(range(episodes)):
    board = Tictactoe()
    while True:
        #print('*'*30)
        #print(board)
        current_state = board.get_state()
        q0 = qtableX.get_values(current_state)
        #print(f"current state: {current_state}")
        #print(f"values of current state: {q0}")

        result = board.check()
        if result != 3:
            #game over, terminal state reached
            #update qtable without next state
            print("terminal state reached")
                
            break
        else:
            action = choose_action(board.get_state(), board.available_moves(), qtableX, epsilon)
            #print(f"chosen action: {action}")

            board.move(action, player1)

            next_state = board.get_state()
            q1 = qtableX.get_values(next_state)
            #print(f"next state: {next_state}")
            #print(f"values of next state: {q1}")
            #print(board)

            result = board.check()
            reward = get_reward(result, player1)
            #print(f"result: {result}")
            #print(f"reward: {reward}")
            qtableX.update_value(q0, q1, current_state, action, reward)
            #print('Qtable X')
            #print(qtableX)
            if result != 3:
                #print("end of the episode")
                qtableX.update_value(q0, q1, current_state, action, 1)
                qtableO.update_value(q0, q1, current_state, action, -1)
                #end the episode
                break
        #print('-'*30)
        #print(board)
        current_state = board.get_state()
        q0 = qtableO.get_values(current_state)
        #print(f"current state: {current_state}")
        #print(f"values of current state: {q0}")

        result = board.check()
        if result != 3:
            #game over, terminal state reached
            #update qtable without next state
            print("terminal state reached")
            break
        else:
            action = choose_action(board.get_state(), board.available_moves(), qtableO, epsilon)
            #print(f"chosen action: {action}")

            board.move(action, player2)

            next_state = board.get_state()
            q1 = qtableO.get_values(next_state)
            #print(f"next state: {next_state}")
            #print(f"values of next state: {q1}")
            #print(board)

            result = board.check()
            reward = get_reward(result, player2)
            #print(f"result: {result}")
            #print(f"reward: {reward}")

            qtableO.update_value(q0, q1, current_state, action, reward)
            #print('Q table O')
            #print(qtableO)
            if result != 3:
                #print("end of the episode")
                qtableO.update_value(q0, q1, current_state, action, 1)
                qtableX.update_value(q0, q1, current_state, action, -1)
                #end the episode
                break
        #print('*'*30)
    epsilon = max(0.1, 0.999*epsilon)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500000/500000 [01:14<00:00, 6745.91it/s]


In [121]:
board = Tictactoe()

In [128]:
print(board)
current_state = board.get_state()
q0 = qtableX.get_values(current_state)
print(f"current state: {current_state}")
print(f"values of current state: {q0}")
action = choose_action(board.get_state(), board.available_moves(), qtableX, epsilon = 0)
print(f"chosen action: {action}")
board.move(action, player1)
print(board)

X|O|X
O|X| 
O| | 

current state: 1-11-110-100
values of current state: [0, 0, 0, 0, 0, 0.0, 0, 0.0, 0.08675799086758001]
chosen action: 8
X|O|X
O|X| 
O| |X



In [127]:
print(board)
current_state = board.get_state()
q0 = qtableO.get_values(current_state)
print(f"current state: {current_state}")
print(f"values of current state: {q0}")
action = choose_action(board.get_state(), board.available_moves(), qtableO, epsilon = 0)
print(f"chosen action: {action}")
board.move(action, player2)
print(board)

X|O|X
O|X| 
 | | 

current state: 1-11-110000
values of current state: [0, 0, 0, 0, 0, 0.078082191780822, 0.07808219178082203, 0.07808219178082203, 0.07808219178082185]
chosen action: 6
X|O|X
O|X| 
O| | 



In [93]:
print(qtableX)

000000000: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-100000010: [0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0]
-1-10000110: [0, 0, 0.0, 0.07803774959794513, 0.07376260304212495, 0.07720687835283893, 0, 0, 0.08675799086757985]
-1-1010011-1: [0, 0, 0.07653898301178778, 0, 0.07802869141815084, 0.07808081000544795, 0, 0, 0]
-1-1010111-1: [0, 0, 0.08675799086737068, 0, 0.08675799086758001, 0, 0, 0, 0]
0000001-10: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0.0]
-1000001-11: [0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0]
-1100-101-11: [0, 0, 0.0, 0.0, 0, 0.0, 0, 0, 0]
-11-11-101-11: [0, 0, 0, 0, 0, -0.02511415525114158, 0, 0, 0]
-100010000: [0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.0]
-11-1010000: [0, 0, 0, 0.0, 0, 0.0, 0.0, 0.08675799086757985, 0.0]
-11-111-1000: [0, 0, 0, 0, 0, 0, 0.07803856883310616, 0.08675799086757985, 0.0]
-11-111-11-10: [0, 0, 0, 0, 0, 0, 0, 0, -0.02511415525114158]
1000000-10: [0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0]
10-10100-10: [0, 0.0, 0, 0.0, 0, 0.0, 0.0, 0, 0.08675799086757985]
1