In [5]:
import numpy as np

class Board:
    """3x3 tic-tac-toe board driven by two player objects.

    Cells hold 1 for the X player, -1 for the O player and 0 when empty.
    Squares are addressed by a flat, row-major index 0..8
    (``index == row * 3 + col``).
    """

    def __init__(self, x_player, o_player):
        """Create an empty board.

        Args:
            x_player: object with ``move(board, state)``; moves first in `step`.
            o_player: object with ``move(board, state)``; moves second in `step`.
        """
        self.board = np.zeros([3,3])
        self.moves = 0  # number of squares filled so far
        self.x_player = x_player
        self.o_player = o_player

    def max_min(self):
        """Return ``(largest, smallest)`` line sum over rows, columns and both diagonals."""
        col_sum = np.sum(self.board, 0)
        row_sum = np.sum(self.board, 1)
        diag0 = self.board.trace(0)
        # Flipping the rows turns the anti-diagonal into the main diagonal.
        diag1 = np.flip(self.board, 0).trace(0)
        line_sums = np.concatenate((col_sum, row_sum, [diag0, diag1]))
        return line_sums.max(), line_sums.min()

    def winner(self):
        """Return 1 if X has a full line, -1 if O has one, otherwise None."""
        highest, lowest = self.max_min()
        if highest == 3:
            return 1
        if lowest == -3:
            return -1
        return None

    def game_over(self):
        """Return True once somebody has won or all nine squares are filled."""
        return self.winner() is not None or self.moves == 9

    def move(self, row_col, player):
        """Place `player`'s mark on square `row_col` and return the reward.

        The reward is relative to the player who just moved: 1 if the board now
        shows that player as the winner, -1 if it shows the other player as the
        winner (only reachable when moving on an already decided board), and 0
        for a draw or an unfinished game.

        Raises:
            ValueError: if `row_col` is outside 0..8 or the square is occupied.
        """
        # Without this check a negative index would silently wrap around
        # (e.g. -1 -> square 8) and an index >= 9 would raise IndexError.
        if not 0 <= row_col < 9:
            raise ValueError("Illegal move.")
        row, col = divmod(row_col, 3)
        if self.board[row][col] != 0:
            raise ValueError("Illegal move.")

        self.board[row][col] = player
        self.moves += 1

        winner = self.winner()
        if winner is None:
            # Either the game goes on or it ended in a draw.
            return 0
        return 1 if winner == player else -1

    def is_empty(self, row_col):
        """Return True if square `row_col` exists (0..8) and is unoccupied."""
        if not 0 <= row_col < 9:
            return False
        row, col = divmod(row_col, 3)
        return bool(self.board[row, col] == 0)

    def sample(self):
        """Return the flat index of a randomly chosen empty square.

        Raises:
            ValueError: if there is no empty square left.
        """
        # Row-major flat indices of the empty cells, i.e. row * 3 + col.
        empty_squares = np.flatnonzero(self.board == 0)
        if self.moves == 9 or empty_squares.size == 0:
            raise ValueError('cannot sample; board is full')
        # Drawing directly from the empty squares always terminates, unlike
        # retrying random squares until an empty one is hit.
        return int(np.random.choice(empty_squares))

    def state(self):
        """Encode the board as an int in ``[0, 3**9)``.

        Each cell contributes ``(cell + 1) * 3**index``, i.e. the board is read
        as a base-3 number with digit 0 for O, 1 for empty and 2 for X.
        """
        digits = (self.board + 1).astype(int).ravel()
        weights = 3 ** np.arange(9)
        return int(np.dot(digits, weights))

    def step(self):
        """Let X move, then O (unless X's move ended the game); return `winner()`."""
        self.x_player.move(self, self.state())
        if not self.game_over():
            self.o_player.move(self, self.state())
        return self.winner()

    def __str__(self):
        return str(self.board)

class RandomPlayer:
    """Player that plays whichever square the board's `sample()` hands back."""

    def __init__(self):
        # Mark this player writes onto the board.
        self.value = -1

    def move(self, board, state):
        """Play one sampled square on `board` and return the reward from `board.move`.

        `state` is accepted so the signature matches the other players; it is not used.
        """
        chosen_square = board.sample()
        outcome = board.move(chosen_square, self.value)
        return outcome
    
class QPlayer:
    """Tabular Q-learning player that plays the 1 (X) side.

    `q_table[state, action]` holds the learned value of playing square
    `action` (0..8) in the board encoding `state` (0..3**9 - 1).
    """

    alpha = 0.10    # learning rate
    explore = 0.01  # probability of playing a sampled square instead of the greedy one

    def __init__(self):
        self.value = 1
        self.q_table = np.zeros([3**9,9])

    def move(self, board, state):
        """Choose a square, play it on `board`, and update the Q-table.

        Args:
            board: object providing `is_empty`, `sample`, `move` and `state`.
            state: encoding of the board before this move (row index into `q_table`).

        Returns:
            The reward reported by `board.move`.
        """
        action = int(np.argmax(self.q_table[state]))
        # Fall back to a sampled square when the greedy one is taken, and
        # occasionally explore even when it is free.
        if not board.is_empty(action) or np.random.random() < QPlayer.explore:
            action = board.sample()

        reward = board.move(action, self.value)
        future_state = board.state()
        # NOTE(review): future_state is read right after our own move, i.e.
        # before the opponent replies in Board.step -- confirm this is the
        # intended successor state for the bootstrap term.
        target = reward + np.max(self.q_table[future_state])
        # Update only the entry for the action actually taken; assigning to the
        # whole row would give every action in this state the same value.
        self.q_table[state, action] = (
            (1 - QPlayer.alpha) * self.q_table[state, action] + QPlayer.alpha * target
        )
        return reward

    def __str__(self):
        """Return the sum of all Q-values as a string (a rough progress indicator)."""
        return str(np.sum(self.q_table))
        
class HumanPlayer:
    """Interactive player that reads its moves from standard input."""

    def __init__(self, value=-1):
        """Args:
            value: mark this player writes onto the board (defaults to -1).
        """
        self.value = value

    def move(self, board, state):
        """Show `board`, read a square number from stdin and play it.

        Keeps asking until the input is an integer that `board.move` accepts.
        `state` is accepted so the signature matches the other players; it is not used.

        Returns:
            The reward reported by `board.move`.
        """
        print("++++++++++++++++++++++++")
        print(board)
        while True:
            try:
                # Play with this player's own mark rather than a hard-coded -1.
                return board.move(int(input()), self.value)
            except (ValueError, IndexError):
                # Non-numeric input, or a square the board rejected: ask again
                # instead of aborting the whole game.
                print("Illegal move; enter the number (0-8) of an empty square.")

In [6]:
q = QPlayer()
h = HumanPlayer()
# Play game after game against the same QPlayer so its Q-table keeps learning;
# interrupt the kernel (KeyboardInterrupt) to stop.
try:
    while True:
        b = Board(q,h)
        while not b.game_over():
            b.step()
            print('ai',q)
        winner = b.winner()
        if winner is None:
            # A full board without a winner is a draw, not "None wins".
            print('draw')
        else:
            print(winner,'wins')
except KeyboardInterrupt:
    print('stopped')
        

++++++++++++++++++++++++
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
8
ai 

TypeError: __str__ returned non-string (type numpy.float64)