In [195]:
import numpy as np
import time

In [413]:
class ttt:
    def __init__(self,n=3):
        self.n = n
        # nxn board is represented as a n*n flattened array
        self.board = np.zeros((n,n)).reshape(n*n)
        # Players are represented by markers.  P1 is +1 and P2 is -1.  Each turn the market is flipped
        self.marker = 1
        # Experimental Rate: Exploration / Exploitation Tradeoff Factor
        self.experimental_rate = 0.33
        self.lr = 0.2
        self.decay = 0.75
        # Board state values
        self.p1_board_value = {}
        self.p2_board_value = {}
        
    def update_board(self,action):
        """
        Action is action(index,marker)
        Index is position on flattened board
        Marker is 1 or -1 based on which player is taking the action
        """    
        self.board[action[0]] = action[1]

    def reset_board(self):
        self.board = np.zeros((n,n)).reshape(n*n)
        self.marker = 1

    def show_board(self):
        b = self.board.reshape(n,n)
        print(b)

    def available_positions(self):
        available_idx = np.nonzero(self.board==0)[0]
        return available_idx

    def set_board_value(self):
        if self.marker == 1:
            board_value = self.p1_board_value.copy()
        if self.marker == -1:
            board_value = self.p2_board_value.copy()
        return board_value

    def choose_action(self,available_idx,no_exploration=False):
        if self.marker == 1:
            board_value = self.p1_board_value.copy()
        if self.marker == -1:
            board_value = self.p2_board_value.copy()
        #available_idx = available_positions()
        if (np.random.uniform(0,1) <= self.experimental_rate)&(no_exploration==False):
            action_ = (np.random.choice(available_idx),self.marker)
        else:
            max_value = -1*float('inf')
            for idx in available_idx:
                board_tplus1 = self.board.copy()
                board_tplus1[idx] = self.marker
                if board_value.get(str(board_tplus1)) is None:
                    value = 0
                else:
                    value = board_value.get(str(board_tplus1))
                if value >= max_value:
                    max_value = value
                    action_ = (idx,self.marker)
        return action_
  
    def switch_turn(self):
        self.marker *= -1
    
    def check_win(self):
        b = self.board.reshape(n,n)
        win = 0

        # Check Rows
        for r in np.sum(b,axis=1):
            if r==3:
                win = 1
            if r==-3:
                win = -1

        # Check Columns
        for c in np.sum(b,axis=0):
            if c==3:
                win = 1
            if c==-3:
                win = -1

        # Check Diagonals
        # Diagonal 1
        d1 = np.sum([b[i][i] for i in range(n)])
        d2 = np.sum([b[i][n-i-1] for i in range(n)])
        if d1 == 3:
            win = 1
        if d1 == -3:
            win = -1
        if d2 == 3:
            win = 1
        if d2 == -3:
            win = -1

        if len(available_positions(self.board)) == 0:
            win = np.nan

        return win

    def play_turn(self,no_exploration=False):
        #board_value = set_board_value()
        available_idx = self.available_positions()
        action = self.choose_action(available_idx,no_exploration)
        self.update_board(action)
        self.switch_turn()

    def assign_reward(self,board_value,board_history,reward):
        for b in reversed(board_history):
            if board_value.get(b) is None:
                board_value[b] = 0
            else:
                board_value[b] += self.lr * (self.decay*reward - board_value[b])
                reward = board_value[b]
        return board_value   

    def train(self,N):
        i = 0
        while i<=N: 
            win = 0
            board_history = []
            while win == 0:
                self.play_turn()
                board_history.append(str(self.board))
                win = self.check_win()
            if win == 1:
                self.p1_board_value = self.assign_reward(self.p1_board_value,board_history,1)
                self.p2_board_value = self.assign_reward(self.p2_board_value,board_history,0)
            if win == -1:
                self.p2_board_value = self.assign_reward(self.p2_board_value,board_history,1)
                self.p1_board_value = self.assign_reward(self.p1_board_value,board_history,0)
            if win is np.nan:
                self.p1_board_value = self.assign_reward(self.p1_board_value,board_history,0.1)
                self.p2_board_value = self.assign_reward(self.p2_board_value,board_history,0.5)  
            self.reset_board()
            i+=1
            if i%1000==0:
                print(("Training Iterations: ",i))

    def human_input(self,available_idx):
        while True:
            row = int(input("Row:"))
            col = int(input("Col:"))
            z = np.zeros((3,3))
            z[row,col] = 1 
            z = z.reshape(n*n)
            action = (np.nonzero(z)[0][0],-1)
            if action[0] in available_idx:
                return action
            else:
                print("Not available")

    def play_human(self):
        self.reset_board()
        win = 0
        board_history = []
        while win == 0:
            self.play_turn(no_exploration=True)
            print("Computers Move")
            win = self.check_win()
            if win == 1:
                print("Computer Wins")
            if win == -1:
                print("Human Wins")
            if win is np.nan:
                print("Draw")
            self.show_board()
            print("Human turn:")
            available_idx = self.available_positions()
            human_action = self.human_input(available_idx)
            self.update_board(human_action)
            self.show_board()
            print('\n')
            win = self.check_win()
            if win == 1:
                print("Computer Wins")
            if win == -1:
                print("Human Wins")
            if win is np.nan:
                print("Draw")
            self.switch_turn()


### Test Class

In [414]:
m = ttt()

In [415]:
m.train(50000)

('Training Iterations: ', 1000)
('Training Iterations: ', 2000)
('Training Iterations: ', 3000)
('Training Iterations: ', 4000)
('Training Iterations: ', 5000)
('Training Iterations: ', 6000)
('Training Iterations: ', 7000)
('Training Iterations: ', 8000)
('Training Iterations: ', 9000)
('Training Iterations: ', 10000)
('Training Iterations: ', 11000)
('Training Iterations: ', 12000)
('Training Iterations: ', 13000)
('Training Iterations: ', 14000)
('Training Iterations: ', 15000)
('Training Iterations: ', 16000)
('Training Iterations: ', 17000)
('Training Iterations: ', 18000)
('Training Iterations: ', 19000)
('Training Iterations: ', 20000)
('Training Iterations: ', 21000)
('Training Iterations: ', 22000)
('Training Iterations: ', 23000)
('Training Iterations: ', 24000)
('Training Iterations: ', 25000)
('Training Iterations: ', 26000)
('Training Iterations: ', 27000)
('Training Iterations: ', 28000)
('Training Iterations: ', 29000)
('Training Iterations: ', 30000)
('Training Iteratio

In [421]:
m.play_human()


Computers Move
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
Human turn:
Row:1
Col:1
[[ 0.  0.  0.]
 [ 0. -1.  0.]
 [ 0.  0.  1.]]


Computers Move
[[ 0.  0.  0.]
 [ 0. -1.  1.]
 [ 0.  0.  1.]]
Human turn:
Row:0
Col:2
[[ 0.  0. -1.]
 [ 0. -1.  1.]
 [ 0.  0.  1.]]


Computers Move
[[ 0.  0. -1.]
 [ 0. -1.  1.]
 [ 1.  0.  1.]]
Human turn:
Row:0
Col:0
[[-1.  0. -1.]
 [ 0. -1.  1.]
 [ 1.  0.  1.]]


Computers Move
Computer Wins
[[-1.  0. -1.]
 [ 0. -1.  1.]
 [ 1.  1.  1.]]
Human turn:


KeyboardInterrupt: ignored

In [None]:
2