In [1]:
from tictactoe import Tictactoe
from minimax import minimax
from typing import List
import random
import pickle
import os
from tqdm import tqdm

In [2]:
class Qtable:
    def __init__(self, alpha = 0.3, gamma = 0.95):
        self.qtable = dict()
        self.alpha = alpha
        self.gamma = gamma

    def set_value(self, state, action, value) -> None:
        self.qtable.setdefault(state, [0 for i in range(9)])[action] = value 

    def get_values(self, state) -> List:
        return self.qtable.get(state, [0 for i in range(9)])

    def update_value(self, q0, q1, state, action, reward):
        q0[action] = q0[action] + self.alpha * (reward + self.gamma * max(q1) - q0[action])
        self.set_value(state, action, q0[action])

    def __str__(self):
        table = ''
        for key in self.qtable:
            table += f'{key}: {self.qtable.get(key, [0 for i in range(9)])}\n'
        return table
    
    def __repr__(self):
        table = ''
        for key in self.qtable:
            table += f'{key}: {self.qtable.get(key, [0 for i in range(9)])}\n'
        return table
    def __len__(self):
        return len(self.qtable)

In [3]:
def get_reward(result, player):
    if player == 1:
        if result == 1:
            return 3
        elif result == -1:
            return -1.5
        elif result == 0:
            return 1
        else:
            return 0
    else:
        if result == 1:
            return -1
        elif result == -1:
            return 1
        elif result == 0:
            return 0.8
        else:
            return 0

In [4]:
def choose_action(state, moves, qtable, epsilon = 0.7) -> int:
    if random.uniform(0, 1) < epsilon:
        #print("exploration")
        return random.choice(moves)
    else:
        #print("exploitation")
        best_value = -float("inf")
        best_move = None
        for move in moves:
            value = qtable.get_values(state)[move]
            if value > best_value:
                best_value = value
                best_move = move
        return best_move

In [33]:
# Q vs 50%minimax

#qtableX = Qtable()
player1 = 1
player2 = -1

epsilon = 0.8
agent_wins = 0
agent_draws = 0
minimax_wins = 0
minimax_draws = 0
episodes = 100
for episode in tqdm(range(episodes)):
    board = Tictactoe()
    history = []
    while True:
        #print('*'*30)
        #print(board)
        ###Player 1
        current_state = board.get_state()
        q0 = qtableX.get_values(current_state)
        #print(f"current state: {current_state}")
        #print(f"values of current state: {q0}")

        result = board.check()
        if result != 3:
            #game over, terminal state reached
            #update qtable without next state
            print("terminal state reached")
            agent_wins += 1
            break
        else:
            action = choose_action(board.get_state(), board.available_moves(), qtableX, epsilon)
            #print(f"chosen action: {action}")

            board.move(action, player1)

            next_state = board.get_state()
            q1 = qtableX.get_values(next_state)
            #print(f"next state: {next_state}")
            #print(f"values of next state: {q1}")
            #print(board)
            history.append((current_state, next_state, action))
            result = board.check()
            reward = get_reward(result, player1)
            #print(f"result: {result}")
            #print(f"reward: {reward}")
            qtableX.update_value(q0, q1, current_state, action, reward)
            #print('Qtable X')
            #print(qtableX)
            if result != 3:
                #print("end of the episode")
                #end the episode
                if result == 1:
                    #qtableX.update_value(q0, q1, current_state, action, 1)
                    while len(history):
                        current_state, next_state, action = history.pop()
                        q0 = qtableX.get_values(current_state)
                        q1 = qtableX.get_values(next_state)
                        qtableX.update_value(q0, q1, current_state, action, reward)
                        reward = q0[action]
                    agent_wins += 1
                if result == 0:
                    #qtableX.update_value(q0, q1, current_state, action, 0.5)
                    agent_draws += 1
                break
        #print('-'*30)
        #print(board)
        
        #Player 2
        result = board.check()
        if result != 3:
            break
        else:
            if random.choice([1]) == 2:
                action = random.choice(board.available_moves())
            else:
                action = minimax(board, player2)[0]

            board.move(action, player2)
            result = board.check()

            if result != 3:
                if result == -1:
                    minimax_wins += 1
                if result == 0:
                    minimax_draws += 1
                break
        #print('*'*30)
    epsilon = max(0.1, 0.999*epsilon)

100%|█████████████████████████████████████████| 100/100 [02:05<00:00,  1.26s/it]


In [7]:
qtableX

000000000: [4.6342498821465476e-07, 3.880659580072147e-07, 7.634902191958977e-07, 5.690371831369058e-07, 5.099765283544923e-07, 5.029115182605165e-07, 6.75621754807187e-07, 2.9465956586788497e-07, 8.006457510169991e-07]
1000-10000: [0, 0.024707667949615396, 0.015550492774836503, 0.017863732021722645, 0, 0.009848266460536624, 0.029578252292052302, 0.010548764487524835, 0.002655191633113687]
1-100-10010: [0, 0, 0.06885329325769118, 0.39569171905961, 0, 0.3169231840368776, 0.5857076272492742, 0, 0.46921844544937413]
1-110-1001-1: [0, 0, 0, 0.01588156154875982, 0, 0.0, 0.01872382874454989, 0, 0]
1-11-1-1011-1: [0, 0, 0, 0, 0, 0.59, 0, 0, 0]
0-10001000: [0.3210529444381057, 0, 0.3292608059594815, 0.30823477962174484, 0.5828341454919229, 0, 0.29009463532491, 0.32857862421136697, 0.3597513393053524]
1-10-101000: [0, 0, 0.40575532105416756, 0, 0.41401293995624355, 0, 0.2905243093070312, 0.21853266407154665, 0.5881868961327575]
1-11-1010-10: [0, 0, 0, 0, 0.588235294117647, 0, 0.5708495584813862

In [35]:
len(qtableX)

2423

In [36]:
print(f'agent wins: {agent_wins}')
print(f'agent win%: {(agent_wins/(agent_wins + minimax_wins + agent_draws))*100}')
print(f'minimax wins: {minimax_wins}')
print(f'minimax win%: {(minimax_wins/(agent_wins + minimax_wins + agent_draws))*100}')

agent wins: 0
agent win%: 0.0
minimax wins: 67
minimax win%: 67.0


In [37]:
agent_draws

33

In [38]:
minimax_draws

0

In [279]:
import pickle

In [6]:
with open('qtableX', 'wb') as f:
    pickle.dump(qtableX, f)

In [5]:
qtableX = pickle.load(open('qtableX', 'rb'))

In [6]:
qtableX

000000000: [0.16669771871035602, 0.037849583904169565, 0.14108151586731218, 0.0736046624493234, 0.06596519759865883, 0.09149357756432266, 0.20080122527791067, 0.05444865850570766, 0.10081403498744304]
1000-10000: [0, 0.07203401734581749, 0.18882429666497674, 0.07440121625040667, 0, 0.04101735302181017, 0.0862339716969455, 0.08966301870415247, 0.06579812389791292]
1-100-10010: [0, 0, 0.06885329325769118, 0.39569171905961, 0, 0.3169231840368776, 0.5857076272492742, 0, 0.46921844544937413]
1-110-1001-1: [0, 0, 0, 0.022687945069656885, 0, 0.0, 0.026748326777928412, 0, 0]
1-11-1-1011-1: [0, 0, 0, 0, 0, 0.49999999999999994, 0, 0, 0]
0-10001000: [0.3210529444381057, 0, 0.3292608059594815, 0.30823477962174484, 0.5828341454919229, 0, 0.29009463532491, 0.32857862421136697, 0.3597513393053524]
1-10-101000: [0, 0, 0.40575532105416756, 0, 0.41401293995624355, 0, 0.2905243093070312, 0.21853266407154665, 0.5881868961327575]
1-11-1010-10: [0, 0, 0, 0, 0.588235294117647, 0, 0.5708495584813862, 0, 0.999

In [65]:
board = Tictactoe()

In [70]:
action = choose_action(board.get_state(), board.available_moves(), qtableX, 0)
board.move(action, 1)
print(board)

 | |X
O|X|O
 | |X



In [69]:
player_action = 5
board.move(player_action, -1)
print(board)

 | |X
O| |O
 | |X

