In [None]:
import numpy as np
from random import randrange
from IPython.display import clear_output

class TicTacToe:
    """Two-player tic-tac-toe environment with a reset/step/render/close API.

    The board is held in ``grid`` (3x3 integer array) and ``board`` (flat
    9-element view of the same memory, so a write to one is visible in the
    other). Cell values are 0 (empty), 1 ('X') and 2 ('O').

    Every move returns a 4-tuple ``(state, reward, done, info)`` where
    ``state`` is ``(board, last_player)`` and ``info`` is ``{'grid': str(self)}``.
    Note that ``state[0]`` is the live board view, not a copy.

    Attributes:
        winner: -1 while the game is running, 0 for a draw, otherwise the
            winning symbol (1 or 2).
        last_player: symbol that made the latest move, -1 before the first move.
        symbol: symbol that ``step`` will play next.
        force_log: when True, every move logs game-over messages (set by ``render``).
        action_space: object with ``n == 9`` and a ``sample()`` method.
    """

    class Symbol:
        """Integer values stored in the board cells."""
        EMPTY = 0
        CROSS = 1
        CIRCLE = 2

    class ActionSpace:
        """Discrete action space made of the integers ``0 .. n-1``."""

        def __init__(self, n):
            self.n = n

        def sample(self):
            """Return a random action index in ``[0, n)``."""
            return randrange(self.n)

    # Text shown for each cell value.
    _GLYPHS = {Symbol.EMPTY: ' ', Symbol.CROSS: 'X', Symbol.CIRCLE: 'O'}

    # Flat board indices of the eight winning alignments. Rows are listed in
    # display order: indices 6-8 are drawn at the top, 0-2 at the bottom.
    _WIN_LINES = (
        (6, 7, 8), (3, 4, 5), (0, 1, 2),   # rows: top, middle, bottom
        (6, 3, 0), (7, 4, 1), (8, 5, 2),   # columns: left, middle, right
        (6, 4, 2), (8, 4, 0),              # diagonals
    )

    def __init__(self):
        self.force_log = False
        self.action_space = self.ActionSpace(9)
        # Initialises grid, board, winner, last_player and symbol.
        self.start_game()

    @classmethod
    def string_symbol(cls, symbol):
        """Return the character drawn for a cell value (' ', 'X' or 'O').

        Raises:
            IndexError: if ``symbol`` is not 0, 1 or 2.
        """
        try:
            return cls._GLYPHS[symbol]
        except (KeyError, TypeError):
            raise IndexError("Wrong symbol", symbol)

    def __str__(self):
        """Render the board as ASCII art, cells 6-8 on the top row and 0-2 at the bottom."""
        # The template is assembled from explicit pieces so that the trailing
        # blanks of each line are visible in the source.
        gap = "            |    |     "
        row = "         {}  | {}  | {}"
        rule = "         ------------     "
        template = "\n".join(["", gap, row, gap, rule, gap, row, gap, rule, gap, row, gap])
        cells = [self.string_symbol(self.board[i]) for i in (6, 7, 8, 3, 4, 5, 0, 1, 2)]
        return template.format(*cells)

    def render(self):
        """Turn on logging of game-over messages for all subsequent moves."""
        self.force_log = True

    def close(self):
        """No-op; there is nothing to release."""
        pass

    def reset(self):
        """Start a new game and return its initial ``(board, last_player)`` state."""
        state, _, _, _ = self.start_game()
        return state

    def start_game(self):
        """Clear the board and return the initial ``(state, reward, done, info)`` tuple."""
        self.winner = -1
        self.last_player = -1
        self.grid = np.zeros((3, 3), dtype=int)
        # reshape of the freshly allocated contiguous grid yields a view, which
        # is what keeps `board` and `grid` in sync.
        self.board = self.grid.reshape(9,)
        self.symbol = 1
        return self._result(0, False)

    def _result(self, reward, done):
        """Build the ``(state, reward, done, info)`` tuple for the current position."""
        return (self.board, self.last_player), reward, done, {'grid': str(self)}

    def _action_to_cell(self, action):
        """Convert a flat action index (0-8) into ``(x, y)`` grid coordinates.

        Raises:
            IndexError: if ``action`` is outside 0-8.
        """
        if action < 0 or action > 8:
            raise IndexError(action)
        return np.unravel_index(action, self.grid.shape)

    def step(self, action, log=False):
        """Play ``action`` (0-8) for the player whose turn it is (``self.symbol``).

        Returns the ``(state, reward, done, info)`` tuple produced by ``play``.
        """
        x, y = self._action_to_cell(action)
        return self.play(x, y, self.symbol, log)

    def play_symbol(self, action, symbol, log = False):
        """Play ``action`` (0-8) for an explicitly given ``symbol`` (1 or 2).

        Returns the ``(state, reward, done, info)`` tuple produced by ``play``.
        """
        x, y = self._action_to_cell(action)
        return self.play(x, y, symbol, log)

    def play(self, x, y, symbol, log=False):
        """Place ``symbol`` at grid cell ``(x, y)`` and report the outcome.

        If the game is already over, nothing is played or validated and the
        final result is returned from the point of view of ``symbol``:
        reward 0 for a draw, 1 if ``symbol`` won, -1 otherwise.

        Otherwise the reward is 1 when this move wins the game and 0 in every
        other case; ``done`` is True on a win or when the board is full.

        Raises:
            IndexError: coordinates out of range, cell already taken, symbol
                not 1 or 2, or the same symbol playing twice in a row.
        """
        log = log or self.force_log

        if self.winner != -1:
            if self.winner == 0:
                reward = 0
            elif self.winner == symbol:
                reward = 1
            else:
                reward = -1
            return self._result(reward, True)

        if x < 0 or y < 0 or x >= 3 or y >= 3:
            raise IndexError(x,y)
        if self.grid[x, y] != 0:
            raise IndexError("Already played", x, y, self.grid)
        # Reject anything but the two player symbols: accepting 0 (or a
        # negative value) would make an empty line count as a win.
        if symbol not in (self.Symbol.CROSS, self.Symbol.CIRCLE):
            raise IndexError("Wrong symbol", symbol)
        if symbol == self.last_player:
            raise IndexError("Same player playing twice")

        self.grid[x, y] = symbol
        self.last_player = symbol
        self.symbol = 3 - symbol   # 1 -> 2, 2 -> 1

        if self.has_winner(symbol):
            if log: print("Winner {} !".format(symbol))
            self.winner = symbol
            return self._result(1, True)

        if self.isBoardFull():
            if log: print("Draw !")
            self.winner = 0
            return self._result(0, True)

        return self._result(0, False)

    def has_winner(self, symbol):
        """Return True if ``symbol`` fills a complete row, column or diagonal."""
        return any(
            all(self.board[i] == symbol for i in line)
            for line in self._WIN_LINES
        )

    def isBoardFull(self):
        """Return True if every cell is taken, False otherwise.

        Side effect kept from the original behaviour: when the board is full
        and no result has been recorded yet, ``winner`` is set to 0 (draw).
        """
        if (self.board == 0).any():
            return False
        if self.winner == -1:
            self.winner = 0
        return True

    def play_against(self, computer, adversary, log=True, player=-1):
        """Run one full game between ``computer`` and ``adversary``.

        Args:
            computer: object whose ``q_trained_action(state)`` returns a
                0-based action; it plays through ``step``.
            adversary: callable ``adversary(state)`` returning a 1-based
                action (1-9); it plays the ``player`` symbol.
            log: when True, print the board after every move and the
                game-over message, clearing the cell output between moves.
            player: symbol of the adversary (1 moves first, 2 moves second);
                -1 asks for it on standard input.

        Returns:
            The final ``winner`` value: 0 for a draw, else the winning symbol.

        Raises:
            IndexError: if the chosen player symbol is not 1 or 2.
        """
        if player == -1:
            user = int(input("Which user 1-2 ?"))
        else:
            user = player
        if user <= 0 or user > 2:
            raise IndexError("Wrong user", user)

        state = self.reset()
        if log: print(self)
        done = False
        nturn = 0
        while not done:
            if nturn % 2 == user - 1:
                action = adversary(state)
                state, _, done, _ = self.play_symbol(action - 1, user, log)
            else:
                # `log` is passed explicitly instead of calling render(), so a
                # logged game does not leave force_log switched on afterwards.
                state, _, done, _ = self.step(computer.q_trained_action(state), log)
            nturn += 1
            # Only touch the notebook output when this game is being displayed.
            if log and not done:
                clear_output()
            if log: print(self)
        return self.winner

In [None]:
import importlib
import qlearning
importlib.reload(qlearning)
import numpy as np

def state_function(state):
    """Convert an environment state into a hashable ``(cells, last_player)`` pair.

    ``state[0]`` (the board cells) is frozen into a tuple; ``state[1]`` is
    passed through unchanged.
    """
    cells = tuple(state[0])
    last_player = state[1]
    return (cells, last_player)
# One environment instance; both GamePlayer objects below are built on this same `env`.
env = TicTacToe()
# `game` is the player that the following cells train and then play against.
game = qlearning.GamePlayer(env, state_function)
# NOTE(review): `adversary` is not referenced by any other cell visible here — confirm it is still needed.
adversary = qlearning.GamePlayer(env, state_function)

In [None]:
def random(state):
    """Choose one empty cell of the board in ``state[0]`` with ``np.random.choice``.

    Returns the 0-based index of the chosen cell.

    Raises:
        IndexError: if the board has no empty cell, does not have exactly
            nine entries, or the chosen cell turns out not to be empty.
    """
    cells = np.array(state[0])
    free_cells = np.nonzero(cells == 0)[0]
    playable = len(free_cells) != 0 and len(cells) == 9
    if not playable:
        # The module-level `env` is only used here, to add the board drawing to the error.
        raise IndexError(str(env), cells, free_cells)
    chosen = np.random.choice(free_cells)
    # Sanity check against the caller's own board object.
    if state[0][chosen] != 0:
        raise IndexError(chosen, state)
    return chosen

# Hyper-parameters passed to game.adversarial_q_train below.
total_episodes = 20000
alpha = 0.3                 # presumably the Q-learning learning rate — TODO confirm against qlearning
gamma = 0.9                 # Discounting rate
decay_rate = 0.0005          # Exponential decay rate for exploration prob
epsilon = 0.7                 # Exploration rate
#game.erase_training()
# Train `game` with the random-move function above supplied as `adversary_function`.
# NOTE(review): `logEvery` looks like a progress-report interval in episodes — confirm in qlearning.
rewards = game.adversarial_q_train(total_episodes, alpha, gamma, epsilon, decay_rate, logEvery=4000,
                                   adversary_function=random)
print("Total reward average:", np.mean(rewards))

In [None]:
def random_challenger(state):
    """Random opponent for ``play_against``: a random empty cell as a 1-based action.

    ``random`` yields a 0-based index, while ``play_against`` subtracts 1 from
    the value its adversary returns, hence the shift.
    """
    zero_based_action = random(state)
    return zero_based_action + 1

# Evaluation: 100 silent games of the trained player against the random challenger.
env = TicTacToe()

my_win = computer_win = draw_counter = 0
for i in range(100):
    # The challenger alternates between symbol 1 (moves first) and symbol 2.
    me = 1 + i % 2
    winner = env.play_against(game, random_challenger, log=False, player=me)
    if winner == 0:
        draw_counter += 1
    elif winner == me:
        my_win += 1
    else:
        computer_win += 1
print("Computer: {}, Me: {}, Draw: {}".format(computer_win, my_win, draw_counter))

In [None]:
def human(state):
    """Ask the human player for a move until a playable one is entered.

    Args:
        state: ``(board, last_player)`` pair as handed over by
            ``TicTacToe.play_against``; ``state[0]`` is indexed with 0-8 and
            a value of 0 marks an empty cell.

    Returns:
        The chosen cell as a 1-based action (1-9); ``play_against`` converts
        it to a 0-based index.
    """
    board = state[0]
    while True:
        answer = input("Action 1-9 ?")
        # Re-prompt instead of letting a typo abort the whole game.
        try:
            action = int(answer)
        except ValueError:
            print("Please enter a whole number between 1 and 9.")
            continue
        if not 1 <= action <= 9:
            print("Please enter a number between 1 and 9.")
            continue
        if board[action - 1] != 0:
            print("That cell is already taken.")
            continue
        return action

# Interactive game against the trained player: with the default player=-1,
# play_against first asks which symbol (1 or 2) the human plays.
env.play_against(game, human)