## RL agent Q-learning for TicTacToe env

The [Tic-Tac-Toe](https://github.com/MauroLuzzatto/OpenAI-Gym-TicTacToe-Environment) environment is a simple game environment for training reinforcement learning agents. This notebook contains an implementation of Q-learning with an epsilon-greedy strategy for the TicTacToe environment.

In [86]:
# load the python modules
import time
import sys
import warnings

import gym
import numpy as np
from tqdm import tqdm
import gym_TicTacToe

from src.qagent import QLearningAgent
from src.play_tictactoe import play_tictactoe, play_tictactoe_with_random

from src.utils import (
    create_state_dictionary,
    reshape_state,
    save_qtable,
    load_qtable
)

# Silence all warnings for this notebook run, unless warning options were
# passed explicitly to the interpreter (in which case they are respected).
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")

In [87]:
class Player:
    """One side of a tic-tac-toe game, identified by its marker color.

    ``__init__`` sets the marker ``color``, a display ``name``, a zero-filled
    ``reward_array`` with one slot per episode, and the running ``reward``.
    """

    def __init__(self, color, episodes: int):
        self.name = f"Player {color}"
        self.color = color
        # one entry per training episode, all zeros to begin with
        self.reward_array = np.zeros(episodes)
        self.reset_reward()

    def reset_reward(self):
        """Set the running reward back to zero."""
        self.reward = 0

In [91]:

# initialize the tictactoe environment
# NOTE(review): `small` and `large` are forwarded to the "TTT-v0" environment;
# presumably the per-move and terminal reward magnitudes - confirm in gym_TicTacToe.
env = gym.envs.make("TTT-v0", small=-1, large=10)

In [15]:
# Build the state lookup once; both sizes are later passed to QLearningAgent.
state_dict = create_state_dictionary()
action_size = env.action_space.n
state_size = len(state_dict)

Number of legal states: 17906


In [129]:
# training budget
max_steps = 9  # upper bound on moves per game (3x3 board)
episodes = 660_000  # games played per hyper-parameter setting

In [17]:
# epsilon schedule handed to the Q-learning agent
exploration_parameters = dict(
    max_epsilon=1.0,
    min_epsilon=0.0,
    decay_rate=1e-5,
)

In [154]:
# Stand-alone agent; the grid-search cell further down builds its own agent
# for every hyper-parameter setting and rebinds this name.
qagent = QLearningAgent(
    exploration_parameters,
    state_size,
    action_size,
    learning_rate=0.1,
    gamma=0.99,
)

In [149]:

def play(qagent: QLearningAgent, player: Player, state: int, action_space: np.ndarray) -> tuple:
    """Let the Q-learning agent make one move for ``player`` and learn from it.

    Reads the module-level ``env`` and ``state_dict``.

    Args:
        qagent: agent whose ``get_action`` picks the move; its ``qtable`` entry
            for ``(state, action)`` is overwritten with the updated value.
        player: side to move; ``player.color`` is sent to the environment and
            appended to the resulting board as the marker.
        state: integer state index (a ``state_dict`` value) before the move.
        action_space: array of actions still available in this game; boolean
            masking is applied to it, so it must be a NumPy array.

    Returns:
        ``(next_state, remaining_actions, done)``: the index of the state
        after the move, the action array without the chosen action, and the
        ``done`` flag reported by ``env.step``.
    """
    action = qagent.get_action(state, action_space)

    # the chosen action is removed so it cannot be picked again in this game
    remaining_actions = action_space[action_space != action]

    board, reward, done, _ = env.step((action, player.color))
    # TODO: maybe should change a marker after this agent turn
    # the board is tagged with the color that just moved, then mapped to its index
    next_state = state_dict[reshape_state(np.append(board, player.color))]

    qagent.qtable[state, action] = qagent.update_qtable(
        state, next_state, action, reward, done
    )
    return next_state, remaining_actions, done

In [94]:
def play_random(qagent: QLearningAgent, player: Player, state: int, action_space: np.ndarray) -> tuple:
    """Make one uniformly random move for ``player``; no Q-table update happens.

    Reads the module-level ``env`` and ``state_dict``. ``qagent`` and ``state``
    are not read by this function; they are accepted so the call signature is
    the same as ``play``.

    Args:
        qagent: unused.
        player: side to move; ``player.color`` is sent to the environment and
            appended to the resulting board as the marker.
        state: unused.
        action_space: array of actions still available in this game; boolean
            masking is applied to it, so it must be a NumPy array.

    Returns:
        ``(next_state, remaining_actions, done)``: the index of the state
        after the move, the action array without the chosen action, and the
        ``done`` flag reported by ``env.step``.
    """
    action = np.random.choice(action_space)
    remaining_actions = action_space[action_space != action]

    # the reward is ignored on purpose: nothing is learned from a random move
    board, _reward, done, _ = env.step((action, player.color))
    next_state = state_dict[reshape_state(np.append(board, player.color))]
    return next_state, remaining_actions, done

In [150]:
# Visit counter with one row per state index; the training loop increments
# visited_states[state] after every move.
visited_states = np.zeros((state_size, 1))

In [163]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline
import random

start_time = time.time()

# one Player object per marker color
player_1, player_2 = (Player(color=c, episodes=episodes) for c in (1, 2))

# bookkeeping containers
track_progress = np.zeros(episodes)
win_history, rewards = [], []

# Hyper-parameter grid: every learning rate is combined with every gamma.
# Win rates noted from earlier runs:
#   lr = 0.4,  gamma = 0.9 -> 0.64
#   lr = 0.5,  gamma = 0.9 -> 0.72
#   lr = 0.6,  gamma = 0.9 -> 0.5
#   lr = 0.7,  gamma = 0.9 -> 0.66
#   lr = 0.65, gamma = 0.8 -> 0.76
#   lr = 0.8,  gamma = 0.9 -> 0.8   (best)
lear_rate = [0.55, 0.65, 0.8, 0.9]
gamma = [0.8, 0.85, 0.9, 0.99]


# Number of evaluation games played after training each setting; used both
# for the evaluation call and for turning the win count into a rate.
num_eval_games = 100
# (learning rate, gamma, win rate) of every finished setting, in training order
grid_results = []

for lr in lear_rate:
    for g in gamma:
        # a fresh agent is trained from scratch for every (lr, gamma) pair
        qagent = QLearningAgent(exploration_parameters, state_size, action_size, learning_rate=lr, gamma=g)
        # qagent_old = qagent
        for episode in tqdm(range(episodes)):

            action_space = np.arange(9)

            player_1.reset_reward()
            player_2.reset_reward()

            # randomly change the order players
            start = np.random.choice([1, 2])

            state, _ = env.reset()
            state = np.append(state, start)
            state = state_dict[reshape_state(state)]

            # if episode % 10_000 == 0:
                # save_qtable(qagent.qtable, 'tables', "q_table_old")

            # starting the counter at `start` makes its parity decide who moves first
            for _step in range(start, max_steps + start):

                # change a turn
                if _step % 2 == 0:
                    #state, action_space, done = play_random(qagent, player_1, state, action_space)
                    # qagent_old.qtable = load_qtable('tables', "q_table_old")
                    state, action_space, done = play(qagent, player_1, state, action_space)
                else:
                    state, action_space, done = play(qagent, player_2, state, action_space)
                # NOTE: visited_states is not reset between settings, so the
                # counts aggregate over the whole grid search
                visited_states[state] += 1
                if done:
                    break

            # reduce epsilon for exporation-exploitation tradeoff
            qagent.update_epsilon(episode)

        cur_win_rate, _ = play_tictactoe_with_random(env, qagent.qtable, state_dict, num_test_games=num_eval_games)
        win_rate = sum(cur_win_rate) / num_eval_games
        grid_results.append((lr, g, win_rate))
        print(f"Learning rate: {lr}, Win rate: {win_rate}, Gamma: {g}")

# After the loops `qagent` is the agent of the LAST setting, not the best one;
# consult grid_results to see which (lr, gamma) pair scored highest.

            #check how good is agent
            # if episode % 1_000 == 0:
            #     num_games = 50
            #     cur_win_rate, reward = play_tictactoe_with_random(env, qagent.qtable, state_dict, num_test_games=num_games)
            #     win_history.append(sum(cur_win_rate)/num_games)
            #     print("WinRate:", sum(cur_win_rate)/num_games)
            #     # rewards.append(reward)
            #     # clear_output(True)
            #     # # plt.title('eps = {:e}, mean reward = {:.1f}'.format(agent.epsilon, np.mean(rewards[-10:])))
            #     # plt.plot(rewards)
            #     # plt.show()
            # if episode % 25_000 == 0:

            #     sum_q_table = np.sum(qagent.qtable)
            #     time_passed = round((time.time() - start_time) / 60.0, 2)

            #     print(
            #         f"episode: {episode}, \
            #         epsilon: {round(qagent.epsilon, 2)}, \
            #         sum q-table: {sum_q_table}, \
            #         elapsed time [min]: {time_passed},  \
            #         done [%]: {episode / episodes * 100} \
            #         "
            #     )


  0%|          | 0/660000 [00:00<?, ?it/s]

100%|██████████| 660000/660000 [14:46<00:00, 744.83it/s]


Learning rate: 0.55, Win rate: 0.68, Gamma: 0.8


100%|██████████| 660000/660000 [15:02<00:00, 731.08it/s]


Learning rate: 0.55, Win rate: 0.6, Gamma: 0.85


100%|██████████| 660000/660000 [15:08<00:00, 726.39it/s]


Learning rate: 0.55, Win rate: 0.66, Gamma: 0.9


100%|██████████| 660000/660000 [15:12<00:00, 723.67it/s]


Learning rate: 0.55, Win rate: 0.58, Gamma: 0.99


100%|██████████| 660000/660000 [15:24<00:00, 713.92it/s]


Learning rate: 0.65, Win rate: 0.76, Gamma: 0.8


100%|██████████| 660000/660000 [15:15<00:00, 720.81it/s]


Learning rate: 0.65, Win rate: 0.72, Gamma: 0.85


100%|██████████| 660000/660000 [15:15<00:00, 720.64it/s]


Learning rate: 0.65, Win rate: 0.7, Gamma: 0.9


100%|██████████| 660000/660000 [15:30<00:00, 709.44it/s]


Learning rate: 0.65, Win rate: 0.58, Gamma: 0.99


100%|██████████| 660000/660000 [15:51<00:00, 693.43it/s]


Learning rate: 0.8, Win rate: 0.68, Gamma: 0.8


100%|██████████| 660000/660000 [15:45<00:00, 697.88it/s]


Learning rate: 0.8, Win rate: 0.58, Gamma: 0.85


100%|██████████| 660000/660000 [15:43<00:00, 699.79it/s]


Learning rate: 0.8, Win rate: 0.8, Gamma: 0.9


100%|██████████| 660000/660000 [15:44<00:00, 698.98it/s]


Learning rate: 0.8, Win rate: 0.7, Gamma: 0.99


100%|██████████| 660000/660000 [15:50<00:00, 694.34it/s]


Learning rate: 0.9, Win rate: 0.66, Gamma: 0.8


100%|██████████| 660000/660000 [15:31<00:00, 708.80it/s]


Learning rate: 0.9, Win rate: 0.76, Gamma: 0.85


100%|██████████| 660000/660000 [15:24<00:00, 713.84it/s]


Learning rate: 0.9, Win rate: 0.62, Gamma: 0.9


100%|██████████| 660000/660000 [15:22<00:00, 715.39it/s]


Learning rate: 0.9, Win rate: 0.72, Gamma: 0.99


In [132]:
# share of state indices whose visit counter is above zero
n_states = visited_states.shape[0]
n_visited = np.sum(visited_states > 0)
print("Percent:", 100 * n_visited / n_states)

Percent: 61.17502513124092


In [152]:
# score the current agent with play_tictactoe_with_random over a larger sample
num_games = 1000
cur_win_rate, _ = play_tictactoe_with_random(env, qagent.qtable, state_dict, num_test_games=num_games)
win_rate = sum(cur_win_rate) / num_games
win_history.append(win_rate)
print("WinRate:", win_rate)

WinRate: 0.382


In [64]:
# Store the current agent's Q-table via save_qtable; the recorded run
# reported "q_table_with_penalty.npy saved!".
qtable = qagent.qtable
save_qtable(qtable, 'tables', "q_table_with_penalty")

q_table_with_penalty.npy saved!


In [None]:
# NOTE(review): the result is bound to `q_table`, which no other visible cell
# reads (later cells use qagent.qtable) - confirm whether it should be
# assigned to the agent instead.
q_table = load_qtable('tables', "q_table_067")

In [146]:
# Sanity check of the Q-table: draw a random state index, then show its board,
# its turn marker and the learned action values laid out like the board.
state = np.random.choice(np.arange(env.observation_space.n))
print(state)

# reverse lookup: first state_dict key that maps to the sampled index
matching_keys = [candidate for candidate in state_dict if state_dict[candidate] == state]
key = matching_keys[0]

# all key entries but the last form the 3x3 board; the last one is the marker
board = np.array(key[:-1]).reshape(3, 3)
print(board)
print("Turn was:", key[-1])

action_values = qagent.qtable[state].reshape(3, 3)
print(np.round(action_values, 1))

5055
[[0 0 0]
 [0 1 0]
 [0 0 2]]
Turn was: 2
[[5.1 5.6 6.3]
 [6.4 0.  6.4]
 [6.1 6.5 0. ]]


In [125]:
# Play a single game with the current Q-table; the recorded output below shows
# the agent and a human alternating moves.
play_tictactoe(env, qagent.qtable, state_dict, num_test_games=1)

Agent beginns
--------------------
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛
--------------------
move Agent
Action: 7


╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ - │ O │ - │
╘═══╧═══╧═══╛
--------------------
Move Human
Action: 4
-1


╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ - │ X │ - │
├───┼───┼───┤
│ - │ O │ - │
╘═══╧═══╧═══╛
--------------------
move Agent
Action: 0


╒═══╤═══╤═══╕
│ O │ - │ - │
├───┼───┼───┤
│ - │ X │ - │
├───┼───┼───┤
│ - │ O │ - │
╘═══╧═══╧═══╛
--------------------
Move Human
Action: 2
-1


╒═══╤═══╤═══╕
│ O │ - │ X │
├───┼───┼───┤
│ - │ X │ - │
├───┼───┼───┤
│ - │ O │ - │
╘═══╧═══╧═══╛
--------------------
move Agent
Action: 6


╒═══╤═══╤═══╕
│ O │ - │ X │
├───┼───┼───┤
│ - │ X │ - │
├───┼───┼───┤
│ O │ O │ - │
╘═══╧═══╧═══╛
--------------------
Move Human
Action: 8
-1


╒═══╤═══╤═══╕
│ O │ - │ X │
├───┼───┼───┤
│ - │ X │ - │
├───┼───┼───┤
│ O │ O │ X │
╘═══╧═══╧═══╛
------