## Q-learning RL agent for the Tic-Tac-Toe environment

The [Tic-Tac-Toe](https://github.com/MauroLuzzatto/OpenAI-Gym-TicTacToe-Environment) environment is a simple game environment for training reinforcement learning agents.

In [1]:
# load the python modules
import time
import sys
import warnings

import gym
import numpy as np
from tqdm import tqdm
import gym_TicTacToe

from src.qagent import QLearningAgent
from src.play_tictactoe import play_tictactoe, play_tictactoe_with_random

from src.utils import (
    create_state_dictionary,
    reshape_state,
    save_qtable,
    load_qtable
)

# Ignore every warning for the rest of the session, but only when the
# interpreter was started without explicit -W options (sys.warnoptions is
# empty in that case), so a user-supplied warning configuration is respected.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
class Player:
    """One side of the game, identified by the marker ``color`` it plays with."""

    def __init__(self, color, episodes: int):
        """Create a player.

        Args:
            color: marker of this player; it is also used to build ``name``.
            episodes: length of the zero-initialised ``reward_array``.
        """
        self.name = f"Player {color}"
        self.color = color
        # one slot per episode, all zeros until something writes into it
        self.reward_array = np.zeros(episodes)
        self.reset_reward()

    def reset_reward(self):
        """Set the running ``reward`` back to zero; ``reward_array`` is left untouched."""
        self.reward = 0

In [3]:
# initialize the tictactoe environment
# NOTE(review): `small` and `large` are forwarded as keyword arguments to the
# "TTT-v0" environment; presumably they are the small/large reward values --
# confirm against the gym_TicTacToe package.
env = gym.envs.make("TTT-v0", small=-1, large=10)

In [4]:
# Lookup table from an encoded position (see reshape_state) to an integer state index.
state_dict = create_state_dictionary()

# Sizes handed to the agent when it is constructed: number of actions the
# environment exposes and number of entries in the state lookup table.
action_size = env.action_space.n
state_size = len(state_dict)

Number of legal states: 17906


In [5]:
# set training parameters
# number of games played by the training loop (also sizes the per-player reward arrays)
episodes = 260_000
# upper bound on the number of moves in a single game (the action space has 9 entries)
max_steps = 9

In [6]:
# Epsilon schedule settings passed to QLearningAgent.
# NOTE(review): how the agent combines these values is defined in src.qagent --
# presumably epsilon decays from max_epsilon towards min_epsilon; confirm there.
exploration_parameters = dict(
    max_epsilon=1.0,
    min_epsilon=0.0,
    decay_rate=0.00001,
)

In [7]:
# Build the tabular Q-learning agent from the exploration settings and the
# state/action counts computed above.
qagent = QLearningAgent(
    exploration_parameters,
    state_size,
    action_size,
    learning_rate=0.55,
    gamma=0.99,
)

In [8]:

def play(qagent: QLearningAgent, player: Player, state: int, action_space: np.ndarray) -> tuple:
    """Let ``player`` make one move chosen by the agent and update the Q-table.

    The agent is asked for an action given ``state`` and the remaining
    ``action_space``; the move is applied to the notebook-level ``env`` and the
    observed transition is written into ``qagent.qtable[state, action]``.

    Args:
        qagent: agent providing ``get_action``, ``update_qtable`` and ``qtable``.
        player: player making the move; ``player.color`` is sent to the env and
            appended to the observation to form the new state key.
        state: index (a value of ``state_dict``) of the position before the move.
        action_space: array of the actions that are still available.

    Returns:
        ``(new_state, action_space, done)``: the index of the resulting position,
        the action space without the chosen action, and the env's ``done`` flag.
    """
    action = qagent.get_action(state, action_space)

    # the chosen action is no longer available for the rest of the game
    remaining_actions = action_space[action_space != action]

    board, reward, done, _ = env.step((action, player.color))
    # TODO: maybe the marker should be switched to the other player after this agent's turn
    # the state key is the observation followed by the color of the player who just moved
    new_state = state_dict[reshape_state(np.append(board, player.color))]

    qagent.qtable[state, action] = qagent.update_qtable(
        state, new_state, action, reward, done
    )
    return new_state, remaining_actions, done

In [9]:
def play_random(qagent: QLearningAgent, player: Player, state: int, action_space: np.ndarray) -> tuple:
    """Let ``player`` make a random move without touching the Q-table.

    The action is drawn with ``np.random.choice`` from ``action_space`` and
    applied to the notebook-level ``env``.

    Args:
        qagent: unused; kept so the signature matches ``play``.
        player: player making the move; ``player.color`` is sent to the env and
            appended to the observation to form the new state key.
        state: unused; the returned state is derived from the env observation.
        action_space: array of the actions that are still available.

    Returns:
        ``(new_state, action_space, done)``: the index of the resulting position,
        the action space without the chosen action, and the env's ``done`` flag.
    """
    action = np.random.choice(action_space)
    # the chosen action is no longer available for the rest of the game
    remaining_actions = action_space[action_space != action]
    board, _reward, done, _ = env.step((action, player.color))
    # the state key is the observation followed by the color of the player who just moved
    new_state = state_dict[reshape_state(np.append(board, player.color))]
    return new_state, remaining_actions, done

In [10]:
# Visit counter with one row per state index; the training loop increments
# the row of every state reached after a move.
visited_states = np.zeros((state_size, 1))

In [11]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline
import random

start_time = time.time()

player_1 = Player(color=1, episodes=episodes)
player_2 = Player(color=2, episodes=episodes)

track_progress = np.zeros(episodes)

# evaluation score, recorded every 1_000 episodes
win_history = []

rewards = []
for episode in tqdm(range(episodes)):

    # every game starts with all nine actions available
    action_space = np.arange(9)

    player_1.reset_reward()
    player_2.reset_reward()

    # randomly pick the marker stored with the empty board; it also decides who
    # opens the game: start == 2 -> player_1 moves first, start == 1 -> player_2
    start = np.random.choice([1, 2])

    state, _ = env.reset()
    state = np.append(state, start)
    state = state_dict[reshape_state(state)]

    for _step in range(start, max_steps + start):

        # alternate turns: even steps belong to player_1, odd steps to player_2
        if _step % 2 == 0:
            # alternative (disabled): let player_1 move randomly instead of learning
            # state, action_space, done = play_random(qagent, player_1, state, action_space)
            state, action_space, done = play(qagent, player_1, state, action_space)
        else:
            state, action_space, done = play(qagent, player_2, state, action_space)
        visited_states[state] += 1
        if done:
            break

    # reduce epsilon for the exploration-exploitation tradeoff
    qagent.update_epsilon(episode)

    # periodically evaluate the current Q-table with play_tictactoe_with_random
    if episode % 1_000 == 0:
        num_games = 50
        cur_win_rate, reward = play_tictactoe_with_random(env, qagent.qtable, state_dict, num_test_games=num_games)
        # NOTE(review): the recorded output contains negative "WinRate" values, so
        # the entries of cur_win_rate are presumably signed per-game results rather
        # than 0/1 wins -- confirm against play_tictactoe_with_random.
        win_rate = sum(cur_win_rate) / num_games
        win_history.append(win_rate)
        print("WinRate:", win_rate)
        # optional live plot of the evaluation reward (disabled):
        # rewards.append(reward)
        # clear_output(True)
        # # plt.title('eps = {:e}, mean reward = {:.1f}'.format(agent.epsilon, np.mean(rewards[-10:])))
        # plt.plot(rewards)
        # plt.show()
    if episode % 25_000 == 0:

        sum_q_table = np.sum(qagent.qtable)
        time_passed = round((time.time() - start_time) / 60.0, 2)

        # adjacent f-strings keep the source indentation out of the printed line
        print(
            f"episode: {episode}, "
            f"epsilon: {round(qagent.epsilon, 2)}, "
            f"sum q-table: {sum_q_table}, "
            f"elapsed time [min]: {time_passed}, "
            f"done [%]: {episode / episodes * 100}"
        )


  0%|          | 64/260000 [00:00<12:09, 356.35it/s]

WinRate: 0.24
episode: 0,             epsilon: 1.0,             sum q-table: 103.12211567998595,             elapsed time [min]: 0.0,              done [%]: 0.0             


  0%|          | 1170/260000 [00:02<07:45, 556.08it/s]

WinRate: 0.2


  1%|          | 2146/260000 [00:03<08:03, 533.18it/s]

WinRate: 0.12


  1%|          | 3155/260000 [00:05<08:06, 527.46it/s]

WinRate: 0.08


  2%|▏         | 4180/260000 [00:07<07:52, 540.94it/s]

WinRate: 0.24


  2%|▏         | 5087/260000 [00:09<08:02, 528.81it/s]

WinRate: 0.0


  2%|▏         | 6097/260000 [00:11<08:17, 510.36it/s]

WinRate: 0.04


  3%|▎         | 7061/260000 [00:12<08:10, 516.13it/s]

WinRate: 0.2


  3%|▎         | 8092/260000 [00:14<08:41, 483.12it/s]

WinRate: 0.0


  3%|▎         | 9097/260000 [00:16<08:17, 504.33it/s]

WinRate: 0.0


  4%|▍         | 10096/260000 [00:18<08:44, 476.12it/s]

WinRate: 0.2


  4%|▍         | 11075/260000 [00:20<08:24, 493.05it/s]

WinRate: 0.44


  5%|▍         | 12053/260000 [00:21<08:28, 487.26it/s]

WinRate: 0.28


  5%|▌         | 13072/260000 [00:23<07:50, 525.06it/s]

WinRate: -0.08


  5%|▌         | 14086/260000 [00:25<08:04, 507.81it/s]

WinRate: 0.4


  6%|▌         | 15069/260000 [00:27<08:16, 493.01it/s]

WinRate: -0.16


  6%|▌         | 16062/260000 [00:29<08:13, 494.31it/s]

WinRate: 0.4


  7%|▋         | 17109/260000 [00:30<07:52, 513.88it/s]

WinRate: 0.28


  7%|▋         | 18071/260000 [00:32<07:49, 515.19it/s]

WinRate: 0.12


  7%|▋         | 19117/260000 [00:34<07:53, 508.96it/s]

WinRate: 0.0


  8%|▊         | 20104/260000 [00:36<08:19, 480.30it/s]

WinRate: 0.0


  8%|▊         | 21084/260000 [00:38<08:02, 495.59it/s]

WinRate: 0.24


  9%|▊         | 22103/260000 [00:40<08:13, 481.68it/s]

WinRate: -0.08


  9%|▉         | 23072/260000 [00:42<08:10, 483.15it/s]

WinRate: 0.4


  9%|▉         | 24116/260000 [00:43<07:41, 511.10it/s]

WinRate: 0.36


 10%|▉         | 25065/260000 [00:45<07:46, 503.77it/s]

WinRate: 0.16
episode: 25000,             epsilon: 0.78,             sum q-table: 135087.84679239147,             elapsed time [min]: 0.76,              done [%]: 9.615384615384617             


 10%|█         | 26098/260000 [00:47<08:32, 456.64it/s]

WinRate: 0.24


 10%|█         | 27066/260000 [00:49<07:59, 485.37it/s]

WinRate: 0.4


 11%|█         | 28144/260000 [00:51<07:49, 493.52it/s]

WinRate: 0.4


 11%|█         | 29121/260000 [00:53<07:33, 508.74it/s]

WinRate: 0.24


 12%|█▏        | 30077/260000 [00:55<07:48, 491.03it/s]

WinRate: 0.24


 12%|█▏        | 31100/260000 [00:57<07:52, 484.48it/s]

WinRate: 0.16


 12%|█▏        | 32116/260000 [00:58<07:31, 505.20it/s]

WinRate: 0.4


 13%|█▎        | 33147/260000 [01:00<07:28, 505.60it/s]

WinRate: 0.0


 13%|█▎        | 34100/260000 [01:02<08:05, 465.70it/s]

WinRate: 0.4


 13%|█▎        | 35089/260000 [01:04<07:41, 487.60it/s]

WinRate: 0.24


 14%|█▍        | 36072/260000 [01:06<07:50, 475.75it/s]

WinRate: 0.32


 14%|█▍        | 37091/260000 [01:08<07:39, 485.21it/s]

WinRate: 0.44


 15%|█▍        | 38149/260000 [01:10<07:31, 491.49it/s]

WinRate: 0.24


 15%|█▌        | 39149/260000 [01:12<07:26, 494.81it/s]

WinRate: 0.48


 15%|█▌        | 40160/260000 [01:14<07:25, 493.90it/s]

WinRate: 0.32


 16%|█▌        | 41056/260000 [01:16<08:57, 407.39it/s]

WinRate: 0.32


 16%|█▌        | 42069/260000 [01:18<07:47, 466.11it/s]

WinRate: 0.16


 17%|█▋        | 43106/260000 [01:20<07:46, 464.73it/s]

WinRate: 0.24


 17%|█▋        | 44167/260000 [01:22<07:14, 496.74it/s]

WinRate: 0.12


 17%|█▋        | 45090/260000 [01:24<08:47, 407.46it/s]

WinRate: 0.56


 18%|█▊        | 46049/260000 [01:26<07:53, 451.47it/s]

WinRate: 0.48


 18%|█▊        | 47083/260000 [01:28<07:40, 462.46it/s]

WinRate: 0.4


 19%|█▊        | 48102/260000 [01:30<07:36, 464.41it/s]

WinRate: 0.16


 19%|█▉        | 49083/260000 [01:32<07:45, 452.63it/s]

WinRate: 0.44


 19%|█▉        | 50102/260000 [01:34<07:30, 466.23it/s]

WinRate: 0.56
episode: 50000,             epsilon: 0.61,             sum q-table: 203255.40008461542,             elapsed time [min]: 1.57,              done [%]: 19.230769230769234             


 20%|█▉        | 51071/260000 [01:36<07:26, 468.43it/s]

WinRate: 0.36


 20%|██        | 52118/260000 [01:38<07:19, 473.02it/s]

WinRate: 0.44


 20%|██        | 53090/260000 [01:40<07:31, 457.93it/s]

WinRate: 0.24


 21%|██        | 54054/260000 [01:42<07:33, 453.98it/s]

WinRate: 0.4


 21%|██        | 55110/260000 [01:44<07:06, 480.28it/s]

WinRate: 0.56


 22%|██▏       | 56098/260000 [01:46<07:32, 450.74it/s]

WinRate: 0.36


 22%|██▏       | 57100/260000 [01:48<07:52, 429.03it/s]

WinRate: 0.32


 22%|██▏       | 58080/260000 [01:50<07:05, 474.23it/s]

WinRate: 0.28


 23%|██▎       | 59064/260000 [01:51<07:27, 448.59it/s]

WinRate: 0.6


 23%|██▎       | 60074/260000 [01:53<07:30, 444.03it/s]

WinRate: 0.56


 23%|██▎       | 61086/260000 [01:56<07:06, 465.93it/s]

WinRate: 0.36


 24%|██▍       | 62069/260000 [01:58<07:30, 439.24it/s]

WinRate: 0.48


 24%|██▍       | 63059/260000 [02:00<07:19, 448.06it/s]

WinRate: 0.64


 25%|██▍       | 64127/260000 [02:02<07:03, 463.04it/s]

WinRate: 0.48


 25%|██▌       | 65100/260000 [02:04<06:56, 467.84it/s]

WinRate: 0.52


 25%|██▌       | 66103/260000 [02:06<07:06, 454.64it/s]

WinRate: 0.64


 26%|██▌       | 67076/260000 [02:08<06:41, 480.48it/s]

WinRate: 0.64


 26%|██▌       | 68073/260000 [02:10<06:39, 480.16it/s]

WinRate: 0.48


 27%|██▋       | 69092/260000 [02:12<07:39, 415.41it/s]

WinRate: 0.72


 27%|██▋       | 70073/260000 [02:14<07:11, 440.37it/s]

WinRate: 0.2


 27%|██▋       | 71099/260000 [02:16<07:23, 426.39it/s]

WinRate: 0.72


 28%|██▊       | 72059/260000 [02:18<07:06, 440.65it/s]

WinRate: 0.6


 28%|██▊       | 73064/260000 [02:20<06:56, 448.41it/s]

WinRate: 0.52


 28%|██▊       | 74098/260000 [02:22<07:02, 440.01it/s]

WinRate: 0.4


 29%|██▉       | 75077/260000 [02:24<08:21, 368.59it/s]

WinRate: 0.48
episode: 75000,             epsilon: 0.47,             sum q-table: 219942.57753026995,             elapsed time [min]: 2.4,              done [%]: 28.846153846153843             


 29%|██▉       | 76071/260000 [02:26<06:49, 449.28it/s]

WinRate: 0.6


 30%|██▉       | 77115/260000 [02:28<06:33, 464.44it/s]

WinRate: 0.6


 30%|███       | 78095/260000 [02:30<06:56, 437.01it/s]

WinRate: 0.48


 30%|███       | 79079/260000 [02:32<07:28, 403.18it/s]

WinRate: 0.52


 31%|███       | 80073/260000 [02:34<07:15, 413.09it/s]

WinRate: 0.64


 31%|███       | 81062/260000 [02:37<07:01, 424.67it/s]

WinRate: 0.52


 32%|███▏      | 82079/260000 [02:39<06:54, 429.05it/s]

WinRate: 0.48


 32%|███▏      | 83104/260000 [02:41<06:26, 457.11it/s]

WinRate: 0.64


 32%|███▏      | 84081/260000 [02:43<06:52, 426.72it/s]

WinRate: 0.44


 33%|███▎      | 85080/260000 [02:45<06:39, 437.55it/s]

WinRate: 0.56


 33%|███▎      | 86067/260000 [02:47<06:27, 449.18it/s]

WinRate: 0.56


 34%|███▎      | 87128/260000 [02:49<06:19, 455.97it/s]

WinRate: 0.6


 34%|███▍      | 88115/260000 [02:51<06:11, 463.13it/s]

WinRate: 0.48


 34%|███▍      | 89100/260000 [02:53<06:25, 443.41it/s]

WinRate: 0.4


 35%|███▍      | 90103/260000 [02:55<06:15, 452.47it/s]

WinRate: 0.64


 35%|███▌      | 91126/260000 [02:57<06:15, 449.16it/s]

WinRate: 0.52


 35%|███▌      | 92122/260000 [02:59<05:48, 481.93it/s]

WinRate: 0.44


 36%|███▌      | 93073/260000 [03:01<06:21, 437.85it/s]

WinRate: 0.6


 36%|███▌      | 94102/260000 [03:03<06:26, 429.72it/s]

WinRate: 0.48


 37%|███▋      | 95081/260000 [03:05<06:18, 435.99it/s]

WinRate: 0.36


 37%|███▋      | 96084/260000 [03:08<06:15, 436.03it/s]

WinRate: 0.56


 37%|███▋      | 97101/260000 [03:10<05:50, 464.87it/s]

WinRate: 0.48


 38%|███▊      | 98084/260000 [03:12<06:08, 439.49it/s]

WinRate: 0.84


 38%|███▊      | 99077/260000 [03:14<06:00, 446.04it/s]

WinRate: 0.48


 38%|███▊      | 100053/260000 [03:16<06:02, 440.89it/s]

WinRate: 0.52
episode: 100000,             epsilon: 0.37,             sum q-table: 225267.92700841345,             elapsed time [min]: 3.27,              done [%]: 38.46153846153847             


 39%|███▉      | 101086/260000 [03:18<06:06, 434.10it/s]

WinRate: 0.72


 39%|███▉      | 102063/260000 [03:20<05:53, 446.87it/s]

WinRate: 0.68


 40%|███▉      | 103059/260000 [03:22<05:54, 443.18it/s]

WinRate: 0.64


 40%|████      | 104090/260000 [03:24<05:54, 439.19it/s]

WinRate: 0.48


 40%|████      | 105072/260000 [03:26<05:49, 443.27it/s]

WinRate: 0.52


 41%|████      | 106158/260000 [03:28<05:24, 473.43it/s]

WinRate: 0.6


 41%|████      | 107064/260000 [03:30<05:40, 449.55it/s]

WinRate: 0.48


 42%|████▏     | 108056/260000 [03:32<05:42, 443.47it/s]

WinRate: 0.72


 42%|████▏     | 109106/260000 [03:34<05:37, 446.66it/s]

WinRate: 0.64


 42%|████▏     | 110146/260000 [03:36<05:30, 453.17it/s]

WinRate: 0.56


 43%|████▎     | 111084/260000 [03:38<05:29, 451.55it/s]

WinRate: 0.56


 43%|████▎     | 112076/260000 [03:40<05:35, 441.33it/s]

WinRate: 0.6


 43%|████▎     | 113083/260000 [03:42<05:29, 446.46it/s]

WinRate: 0.72


 44%|████▍     | 114057/260000 [03:44<05:35, 434.38it/s]

WinRate: 0.56


 44%|████▍     | 115153/260000 [03:47<05:18, 454.31it/s]

WinRate: 0.56


 45%|████▍     | 116082/260000 [03:48<05:30, 435.60it/s]

WinRate: 0.72


 45%|████▌     | 117062/260000 [03:51<05:26, 438.29it/s]

WinRate: 0.6


 45%|████▌     | 118115/260000 [03:53<05:05, 464.38it/s]

WinRate: 0.92


 46%|████▌     | 119103/260000 [03:55<05:17, 443.19it/s]

WinRate: 0.72


 46%|████▌     | 120068/260000 [03:57<05:23, 432.27it/s]

WinRate: 0.68


 47%|████▋     | 121109/260000 [03:59<04:51, 476.39it/s]

WinRate: 0.56


 47%|████▋     | 122095/260000 [04:01<05:10, 444.05it/s]

WinRate: 0.4


 47%|████▋     | 123072/260000 [04:03<05:11, 439.33it/s]

WinRate: 0.48


 48%|████▊     | 124053/260000 [04:05<05:11, 436.42it/s]

WinRate: 0.6


 48%|████▊     | 125130/260000 [04:07<04:58, 452.27it/s]

WinRate: 0.72
episode: 125000,             epsilon: 0.29,             sum q-table: 227456.0477772408,             elapsed time [min]: 4.12,              done [%]: 48.07692307692308             


 49%|████▊     | 126112/260000 [04:09<04:46, 466.60it/s]

WinRate: 0.44


 49%|████▉     | 127062/260000 [04:11<05:04, 436.15it/s]

WinRate: 0.52


 49%|████▉     | 128073/260000 [04:13<05:09, 425.61it/s]

WinRate: 0.52


 50%|████▉     | 129112/260000 [04:15<04:45, 459.13it/s]

WinRate: 0.64


 50%|█████     | 130087/260000 [04:17<04:59, 433.71it/s]

WinRate: 0.6


 50%|█████     | 131054/260000 [04:19<04:50, 443.84it/s]

WinRate: 0.88


 51%|█████     | 132101/260000 [04:22<04:48, 443.23it/s]

WinRate: 0.8


 51%|█████     | 133074/260000 [04:24<05:22, 393.57it/s]

WinRate: 0.64


 52%|█████▏    | 134051/260000 [04:26<05:07, 409.03it/s]

WinRate: 0.68


 52%|█████▏    | 135113/260000 [04:28<04:50, 430.30it/s]

WinRate: 0.64


 52%|█████▏    | 136066/260000 [04:30<04:53, 422.46it/s]

WinRate: 0.64


 53%|█████▎    | 137079/260000 [04:32<04:47, 427.02it/s]

WinRate: 0.64


 53%|█████▎    | 138149/260000 [04:35<04:27, 456.28it/s]

WinRate: 0.76


 53%|█████▎    | 139067/260000 [04:37<04:53, 412.06it/s]

WinRate: 0.92


 54%|█████▍    | 140069/260000 [04:39<04:38, 430.99it/s]

WinRate: 0.8


 54%|█████▍    | 141046/260000 [04:41<04:52, 406.77it/s]

WinRate: 0.88


 55%|█████▍    | 142092/260000 [04:43<05:02, 390.27it/s]

WinRate: 0.72


 55%|█████▌    | 143074/260000 [04:45<04:24, 442.65it/s]

WinRate: 0.44


 55%|█████▌    | 144123/260000 [04:47<04:21, 443.43it/s]

WinRate: 0.48


 56%|█████▌    | 145082/260000 [04:49<04:26, 430.54it/s]

WinRate: 0.88


 56%|█████▌    | 146097/260000 [04:52<04:14, 447.07it/s]

WinRate: 0.56


 57%|█████▋    | 147075/260000 [04:54<04:18, 437.27it/s]

WinRate: 0.6


 57%|█████▋    | 148059/260000 [04:56<04:24, 423.31it/s]

WinRate: 0.84


 57%|█████▋    | 149114/260000 [04:58<04:03, 455.15it/s]

WinRate: 0.64


 58%|█████▊    | 150067/260000 [05:00<04:02, 453.91it/s]

WinRate: 0.56
episode: 150000,             epsilon: 0.22,             sum q-table: 228570.2190379467,             elapsed time [min]: 5.01,              done [%]: 57.692307692307686             


 58%|█████▊    | 151126/260000 [05:02<03:53, 466.63it/s]

WinRate: 0.64


 59%|█████▊    | 152111/260000 [05:04<03:52, 464.76it/s]

WinRate: 0.52


 59%|█████▉    | 153053/260000 [05:06<04:06, 434.74it/s]

WinRate: 0.72


 59%|█████▉    | 154077/260000 [05:08<04:07, 428.39it/s]

WinRate: 0.52


 60%|█████▉    | 155120/260000 [05:10<03:54, 447.83it/s]

WinRate: 0.6


 60%|██████    | 156102/260000 [05:12<03:53, 445.45it/s]

WinRate: 0.72


 60%|██████    | 157076/260000 [05:14<04:07, 416.22it/s]

WinRate: 0.76


 61%|██████    | 158061/260000 [05:16<04:00, 423.88it/s]

WinRate: 0.72


 61%|██████    | 159074/260000 [05:19<04:20, 388.06it/s]

WinRate: 0.84


 62%|██████▏   | 160069/260000 [05:21<04:09, 401.29it/s]

WinRate: 0.68


 62%|██████▏   | 161069/260000 [05:23<04:15, 386.90it/s]

WinRate: 0.6


 62%|██████▏   | 162066/260000 [05:25<03:51, 423.32it/s]

WinRate: 0.8


 63%|██████▎   | 163040/260000 [05:28<04:36, 350.47it/s]

WinRate: 0.6


 63%|██████▎   | 164043/260000 [05:30<04:23, 364.43it/s]

WinRate: 0.72


 63%|██████▎   | 165092/260000 [05:32<03:42, 427.14it/s]

WinRate: 0.8


 64%|██████▍   | 166093/260000 [05:34<03:46, 415.44it/s]

WinRate: 0.52


 64%|██████▍   | 167102/260000 [05:37<03:35, 430.93it/s]

WinRate: 0.68


 65%|██████▍   | 168068/260000 [05:39<03:34, 429.40it/s]

WinRate: 0.84


 65%|██████▌   | 169070/260000 [05:41<03:28, 435.42it/s]

WinRate: 0.6


 65%|██████▌   | 170075/260000 [05:43<03:25, 438.54it/s]

WinRate: 0.76


 66%|██████▌   | 171074/260000 [05:45<03:24, 433.82it/s]

WinRate: 0.52


 66%|██████▌   | 172128/260000 [05:47<03:07, 468.16it/s]

WinRate: 0.72


 67%|██████▋   | 173106/260000 [05:49<03:13, 449.09it/s]

WinRate: 0.88


 67%|██████▋   | 174124/260000 [05:51<03:07, 457.58it/s]

WinRate: 0.76


 67%|██████▋   | 175076/260000 [05:53<03:17, 430.44it/s]

WinRate: 0.6
episode: 175000,             epsilon: 0.17,             sum q-table: 229210.16331054483,             elapsed time [min]: 5.9,              done [%]: 67.3076923076923             


 68%|██████▊   | 176050/260000 [05:56<03:21, 416.82it/s]

WinRate: 0.48


 68%|██████▊   | 177102/260000 [05:58<03:08, 439.89it/s]

WinRate: 0.4


 68%|██████▊   | 178072/260000 [06:00<03:09, 431.29it/s]

WinRate: 0.84


 69%|██████▉   | 179148/260000 [06:02<03:00, 448.88it/s]

WinRate: 0.6


 69%|██████▉   | 180146/260000 [06:04<02:58, 447.32it/s]

WinRate: 0.68


 70%|██████▉   | 181113/260000 [06:06<02:52, 458.53it/s]

WinRate: 0.64


 70%|███████   | 182063/260000 [06:08<03:03, 424.01it/s]

WinRate: 0.68


 70%|███████   | 183118/260000 [06:11<02:54, 440.44it/s]

WinRate: 0.64


 71%|███████   | 184076/260000 [06:13<02:57, 427.36it/s]

WinRate: 0.56


 71%|███████   | 185146/260000 [06:15<02:51, 436.87it/s]

WinRate: 0.8


 72%|███████▏  | 186052/260000 [06:17<02:50, 432.71it/s]

WinRate: 0.68


 72%|███████▏  | 187143/260000 [06:19<02:39, 456.51it/s]

WinRate: 0.64


 72%|███████▏  | 188097/260000 [06:21<02:47, 428.00it/s]

WinRate: 0.76


 73%|███████▎  | 189070/260000 [06:23<02:47, 424.05it/s]

WinRate: 0.76


 73%|███████▎  | 190104/260000 [06:25<02:37, 443.71it/s]

WinRate: 0.8


 73%|███████▎  | 191058/260000 [06:27<02:38, 435.22it/s]

WinRate: 0.64


 74%|███████▍  | 192078/260000 [06:30<02:40, 423.55it/s]

WinRate: 0.6


 74%|███████▍  | 193053/260000 [06:32<02:48, 396.26it/s]

WinRate: 0.6


 75%|███████▍  | 194059/260000 [06:34<02:30, 437.68it/s]

WinRate: 0.72


 75%|███████▌  | 195114/260000 [06:36<02:25, 445.75it/s]

WinRate: 0.52


 75%|███████▌  | 196075/260000 [06:38<02:24, 441.27it/s]

WinRate: 0.64


 76%|███████▌  | 197088/260000 [06:40<02:23, 437.65it/s]

WinRate: 0.76


 76%|███████▌  | 198104/260000 [06:42<02:19, 444.62it/s]

WinRate: 0.72


 77%|███████▋  | 199136/260000 [06:44<02:11, 463.69it/s]

WinRate: 0.88


 77%|███████▋  | 200132/260000 [06:46<02:11, 455.88it/s]

WinRate: 0.84
episode: 200000,             epsilon: 0.14,             sum q-table: 229578.07862390782,             elapsed time [min]: 6.78,              done [%]: 76.92307692307693             


 77%|███████▋  | 201074/260000 [06:48<02:06, 466.09it/s]

WinRate: 0.76


 78%|███████▊  | 202057/260000 [06:50<02:12, 438.34it/s]

WinRate: 0.76


 78%|███████▊  | 203052/260000 [06:53<02:15, 420.22it/s]

WinRate: 0.8


 78%|███████▊  | 204079/260000 [06:55<02:07, 437.55it/s]

WinRate: 0.56


 79%|███████▉  | 205090/260000 [06:57<02:03, 446.14it/s]

WinRate: 0.72


 79%|███████▉  | 206054/260000 [06:59<02:03, 436.29it/s]

WinRate: 0.56


 80%|███████▉  | 207080/260000 [07:01<02:01, 434.29it/s]

WinRate: 0.68


 80%|████████  | 208070/260000 [07:03<02:06, 410.25it/s]

WinRate: 0.72


 80%|████████  | 209137/260000 [07:05<01:55, 441.65it/s]

WinRate: 0.76


 81%|████████  | 210151/260000 [07:08<01:50, 449.46it/s]

WinRate: 0.56


 81%|████████  | 211106/260000 [07:10<01:46, 459.49it/s]

WinRate: 0.76


 82%|████████▏ | 212083/260000 [07:12<01:56, 411.48it/s]

WinRate: 0.68


 82%|████████▏ | 213144/260000 [07:14<01:42, 457.86it/s]

WinRate: 0.72


 82%|████████▏ | 214060/260000 [07:16<01:45, 435.87it/s]

WinRate: 0.6


 83%|████████▎ | 215079/260000 [07:18<01:40, 445.87it/s]

WinRate: 0.84


 83%|████████▎ | 216056/260000 [07:20<01:38, 445.36it/s]

WinRate: 0.76


 83%|████████▎ | 217088/260000 [07:22<01:35, 451.49it/s]

WinRate: 0.84


 84%|████████▍ | 218068/260000 [07:24<01:39, 422.74it/s]

WinRate: 0.76


 84%|████████▍ | 219127/260000 [07:27<01:30, 453.82it/s]

WinRate: 0.84


 85%|████████▍ | 220091/260000 [07:29<01:34, 423.01it/s]

WinRate: 0.56


 85%|████████▌ | 221048/260000 [07:31<01:29, 436.56it/s]

WinRate: 0.64


 85%|████████▌ | 222083/260000 [07:33<01:24, 447.50it/s]

WinRate: 0.88


 86%|████████▌ | 223068/260000 [07:35<01:28, 418.20it/s]

WinRate: 0.76


 86%|████████▌ | 224078/260000 [07:37<01:22, 434.41it/s]

WinRate: 0.68


 87%|████████▋ | 225094/260000 [07:39<01:23, 420.30it/s]

WinRate: 0.76
episode: 225000,             epsilon: 0.11,             sum q-table: 229728.4158340501,             elapsed time [min]: 7.66,              done [%]: 86.53846153846155             


 87%|████████▋ | 226087/260000 [07:41<01:19, 427.21it/s]

WinRate: 0.68


 87%|████████▋ | 227090/260000 [07:44<01:19, 413.46it/s]

WinRate: 0.72


 88%|████████▊ | 228075/260000 [07:46<01:12, 437.65it/s]

WinRate: 0.76


 88%|████████▊ | 229070/260000 [07:48<01:14, 416.84it/s]

WinRate: 0.64


 88%|████████▊ | 230065/260000 [07:50<01:11, 420.78it/s]

WinRate: 0.44


 89%|████████▉ | 231097/260000 [07:52<01:05, 442.49it/s]

WinRate: 0.6


 89%|████████▉ | 232131/260000 [07:54<01:00, 457.51it/s]

WinRate: 0.6


 90%|████████▉ | 233103/260000 [07:56<01:00, 445.60it/s]

WinRate: 0.64


 90%|█████████ | 234075/260000 [07:58<01:01, 424.72it/s]

WinRate: 0.8


 90%|█████████ | 235131/260000 [08:01<00:56, 438.39it/s]

WinRate: 0.6


 91%|█████████ | 236063/260000 [08:03<00:55, 428.79it/s]

WinRate: 0.72


 91%|█████████ | 237054/260000 [08:05<00:51, 441.89it/s]

WinRate: 0.64


 92%|█████████▏| 238079/260000 [08:07<00:52, 420.85it/s]

WinRate: 0.52


 92%|█████████▏| 239085/260000 [08:09<00:51, 406.62it/s]

WinRate: 0.8


 92%|█████████▏| 240079/260000 [08:11<00:47, 422.15it/s]

WinRate: 0.72


 93%|█████████▎| 241093/260000 [08:13<00:43, 439.22it/s]

WinRate: 0.6


 93%|█████████▎| 242141/260000 [08:16<00:40, 446.24it/s]

WinRate: 0.6


 93%|█████████▎| 243065/260000 [08:17<00:39, 432.47it/s]

WinRate: 0.76


 94%|█████████▍| 244122/260000 [08:20<00:34, 458.31it/s]

WinRate: 0.76


 94%|█████████▍| 245090/260000 [08:22<00:34, 434.30it/s]

WinRate: 0.6


 95%|█████████▍| 246050/260000 [08:24<00:32, 429.39it/s]

WinRate: 0.88


 95%|█████████▌| 247062/260000 [08:26<00:30, 431.12it/s]

WinRate: 0.68


 95%|█████████▌| 248074/260000 [08:28<00:26, 444.19it/s]

WinRate: 0.6


 96%|█████████▌| 249128/260000 [08:30<00:24, 443.27it/s]

WinRate: 0.64


 96%|█████████▌| 250095/260000 [08:32<00:22, 435.53it/s]

WinRate: 0.64
episode: 250000,             epsilon: 0.08,             sum q-table: 229799.2214184917,             elapsed time [min]: 8.54,              done [%]: 96.15384615384616             


 97%|█████████▋| 251076/260000 [08:34<00:19, 456.25it/s]

WinRate: 0.8


 97%|█████████▋| 252129/260000 [08:36<00:17, 455.41it/s]

WinRate: 0.64


 97%|█████████▋| 253114/260000 [08:38<00:15, 454.96it/s]

WinRate: 0.48


 98%|█████████▊| 254104/260000 [08:40<00:13, 453.34it/s]

WinRate: 0.64


 98%|█████████▊| 255119/260000 [08:42<00:10, 466.37it/s]

WinRate: 0.8


 99%|█████████▊| 256121/260000 [08:44<00:08, 448.77it/s]

WinRate: 0.84


 99%|█████████▉| 257074/260000 [08:46<00:06, 453.20it/s]

WinRate: 0.84


 99%|█████████▉| 258105/260000 [08:48<00:04, 453.28it/s]

WinRate: 0.52


100%|█████████▉| 259102/260000 [08:50<00:02, 442.35it/s]

WinRate: 0.72


100%|██████████| 260000/260000 [08:52<00:00, 488.05it/s]


In [12]:
# Final evaluation of the trained Q-table over a larger batch of games; the
# score is stored in win_history and then echoed from there.
num_games = 1000
cur_win_rate, _ = play_tictactoe_with_random(
    env, qagent.qtable, state_dict, num_test_games=num_games
)
win_history.append(sum(cur_win_rate) / num_games)
print("WinRate:", win_history[-1])

WinRate: 0.678


In [43]:
# Persist the trained Q-table via save_qtable.
# NOTE(review): the arguments are presumably the target folder and the file
# name (the recorded output reports "q_table_070.npy saved!") -- confirm in src.utils.
qtable = qagent.qtable
save_qtable(qtable, 'tables', "q_table_070")

q_table_070.npy saved!


In [None]:
# Load a previously stored Q-table into `q_table`.
# NOTE(review): this reads "q_table_067", not the "q_table_070" file written by
# the save cell above -- presumably an earlier checkpoint; confirm this is intended.
q_table = load_qtable('tables', "q_table_067")

In [78]:
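# Sanity-check the Q-table: show the board, the turn marker and the Q-values of a random state (un-comment the lines below to run)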

# state = np.random.choice(np.arange(env.observation_space.n))
# # state_dict[state]
# print(state)

# key = list(filter(lambda x: state_dict[x] == state, state_dict))[0]
# print(np.array(key[:-1]).reshape(3,3))
# print("Turn was:", key[-1])
# print(np.round(qagent.qtable[state].reshape(3,3),1))

4919
[[1 2 0]
 [0 2 1]
 [0 0 2]]
Turn was: 2
[[0.  0.  7.2]
 [7.2 0.  0. ]
 [7.7 5.8 0. ]]
