In [24]:
import numpy as np

class Board:
    def __init__(self, ai_player='o'):
        self.board = np.zeros([3,3])
        self.moves = 0
        self.ai_player = ai_player
        
    # returns the max and min of sum of each axis + each diagonal
    def max_min(self):
        col_sum = np.sum(self.board,0)
        row_sum = np.sum(self.board,1)
        maxs = np.maximum(col_sum,row_sum)
        mins = np.minimum(col_sum,row_sum)
        diag0 = self.board.trace(0)
        diag1 = np.flip(self.board,0).trace(0)
        return max(max(maxs), diag0, diag1), min(min(mins), diag0, diag1)

    # returns winner (if one player had won), else None
    def winner(self):
        x,o = self.max_min()
        if x == 3:
            return 'x'
        elif o == -3:
            return 'o'
        else:
            return None

    def game_over(self):
        return self.winner() != None or self.moves == 9
    
    # record a move; return True if move is legal, else False
    def move(self, row_col, player):
        row = row_col // 3
        col = row_col % 3
        if self.board[row][col] == 0:
            self.board[row][col] = player
            self.moves += 1
            if self.game_over():
                winner = self.winner()
                if winner == None:
                    return 0
                elif winner == self.ai_player:
                    return 1
                else:
                    return -1
            else:
                return 0
        else:
            return -1
    
    def is_empty(self, row_col):
        return self.board[row_col // 3,row_col % 3] == 0
    
    # return a random legal move (do not call if all 9 squares are taken!)
    def sample(self):
        if self.moves == 9:
            raise ValueError('cannot sample; board is full')
        while True:
            row = np.random.randint(3)  # 0,1,2
            col = np.random.randint(3)
            if self.board[row][col] == 0:
                return row * 3 + col

    def state(self):
        i = 0
        for row in range(3):
            for col in range(3):
                i += (self.board[row][col] + 1) * (3 ** (row*3+col))
        return int(i)
                
    def __str__(self):
        return self.board.__str__()

class RandomPlayer:
    def __init__(self, player='x'):
        self.player = player
        self.value = 1 if self.player == 'x' else -1
        
    def move(self, board):
        return board.move(board.sample(), self.value)
    
class QPlayer:
    
    explore = 0.01
    
    def __init__(self, player='o'):
        self.player = player
        self.value = 1 if self.player == 'x' else -1
        self.q_table = np.zeros([3**9,9])
        
    def move(self, board):
        state = board.state()
        action = np.argmax(self.q_table[state])
        if not board.is_empty(action) or np.random.random() < QPlayer.explore:
            action = board.sample()

        reward = board.move(action, self.value)
        future_state = board.state()
        self.q_table[state] += reward + max(self.q_table[future_state])

In [25]:
q = QPlayer()
r = RandomPlayer()

board = Board()
while not board.game_over():
    r.move(board)
    if not board.game_over():
        q.move(board)
    print(board)
    
for i in range(100000):
    board = Board()
    while not board.game_over():
        r.move(board)
        if not board.game_over():
            q.move(board)

board = Board()
while not board.game_over():
    r.move(board)
    if not board.game_over():
        q.move(board)
    print(board)

[[ 1.  0.  0.]
 [ 0.  0.  0.]
 [-1.  0.  0.]]
[[ 1. -1.  0.]
 [ 0.  0.  0.]
 [-1.  0.  1.]]
[[ 1. -1.  0.]
 [ 0. -1.  0.]
 [-1.  1.  1.]]
[[ 1. -1. -1.]
 [ 1. -1.  0.]
 [-1.  1.  1.]]
[[-1.  0.  0.]
 [ 0.  0.  0.]
 [ 1.  0.  0.]]
[[-1.  0.  0.]
 [ 0. -1.  1.]
 [ 1.  0.  0.]]
[[-1.  1.  0.]
 [-1. -1.  1.]
 [ 1.  0.  0.]]
[[-1.  1.  0.]
 [-1. -1.  1.]
 [ 1. -1.  1.]]
[[-1.  1.  1.]
 [-1. -1.  1.]
 [ 1. -1.  1.]]


In [15]:
q = QPlayer()
r = RandomPlayer()
for i in range(1000000):
    board = Board()
    while not board.game_over():
        r.move(board)
        if not board.game_over():
            q.move(board)
x_wins = 0
for i in range(1000):
    board = Board()
    while not board.game_over():
        r.move(board)
        if not board.game_over():
            q.move(board) 
    if board.winner() == 'x':
        x_wins += 1
print(x_wins)

574
