In [17]:
import numpy as np

class Board:
    def __init__(self, x_player, o_player):
        self.board = np.zeros([3,3])
        self.moves = 0
        self.x_player = x_player
        self.o_player = o_player
        
    # returns the max and min of sum of each axis + each diagonal
    def max_min(self):
        col_sum = np.sum(self.board,0)
        row_sum = np.sum(self.board,1)
        maxs = np.maximum(col_sum,row_sum)
        mins = np.minimum(col_sum,row_sum)
        diag0 = self.board.trace(0)
        diag1 = np.flip(self.board,0).trace(0)
        return max(max(maxs), diag0, diag1), min(min(mins), diag0, diag1)

    # returns winner (if one player had won), else None
    def winner(self):
        x,o = self.max_min()
        if x == 3:
            return 1
        elif o == -3:
            return -1
        else:
            return 0

    def game_over(self):
        return self.winner() != 0 or self.moves == 9
    
    # record a move; return reward with respect to ai player
    def move(self, action, player):
        row = action // 3
        col = action % 3
        if self.board[row][col] == 0:
            self.board[row][col] = player
            self.moves += 1
        else:
            raise ValueError("Illegal move.")
    
    def is_empty(self, row_col):
        return self.board[row_col // 3,row_col % 3] == 0
    
    # return a random legal move (do not call if all 9 squares are taken!)
    def sample(self):
        if self.moves == 9:
            raise ValueError('cannot sample; board is full')
        while True:
            row = np.random.randint(3)  # 0,1,2
            col = np.random.randint(3)
            if self.board[row][col] == 0:
                return row * 3 + col

    def state(self):
        i = 0
        for row in range(3):
            for col in range(3):
                i += (self.board[row][col] + 1) * (3 ** (row*3+col))
        return int(i)
    
    def play(self, verbose=False):
        while not self.game_over():
            x_state = self.state()
            self.move(self.x_player.move(self, x_state),1)
            if verbose:
                print(self)
            if not self.game_over():
                self.move(self.o_player.move(self, self.state()),-1)
                if verbose:
                    print(self)
            self.x_player.update(self, x_state, self.winner())
            if verbose:
                print('--->',x_state,self.winner(),self.x_player)
            self.o_player.update(self, self.state(), -self.winner())
            #if verbose:
             #   print(self.x_player)
             #   print(self.o_player)
            if verbose and self.game_over():
                print("WINNER = ", self.winner())
               
    def __str__(self):
        return self.board.__str__()

class RandomPlayer:     
    def move(self, board, state):
        return board.sample()
    
    def update(self, board, state, reward):
        pass
    
class QPlayer:
    alpha = 0.10
    explore = 0.01

    def __init__(self):
        self.q_table = np.zeros([3**9,9])
        
    def move(self, board, state):
        self.action = np.argmax(self.q_table[state])
        if not board.is_empty(self.action) or np.random.random() < QPlayer.explore:
            self.action = board.sample()
        return self.action
   
    def update(self, board, state, reward):
        future_state = board.state()
        #self.q_table[state] = (1-QPlayer.alpha) * self.q_table[state] + QPlayer.alpha * (reward + max(self.q_table[future_state]))
        self.q_table[state][self.action] += reward
       
    def __str__(self):
        return str(np.sum(self.q_table))
        
class HumanPlayer:
    
    def move(self, board, state):
        return int(input())
        
    def update(self, board, state, reward):
        pass

In [None]:
q = QPlayer()
h = RandomPlayer()
q_wins = 0
for i in range(100000):
    b = Board(q,h)
    b.play()
    if b.winner() == 1:
        q_wins += 1
print(q_wins)
q_wins = 0
for i in range(100):
    b = Board(q,h)
    b.play()
    if b.winner() == 1:
        q_wins += 1
print(q_wins)

while True:
    b = Board(q, HumanPlayer())
    b.play(verbose=True)

8067
86
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [None]:
q = QPlayer()
while True:
    b = Board(q, HumanPlayer())
    b.play(verbose=True)