## Tic Tac Toe

### <u>Description</u>
In this implementation of a 4x4 tic-tac-toe game, we use two reinforcement learning (RL) algorithms: Q-learning and temporal-difference (TD) learning. We begin by describing the major components.

- **Agent** <br>
Player 1 and player 2.

- **Environment** <br> 
The board is initialized as a 4x4 grid containing only zeroes. When a player places a piece, that position is updated to 1 if the move came from player 1 and to -1 if the move came from player 2.

- **State** <br>
The board state (current piece placements and available spaces) of the agent and its opponent. 

- **Actions** <br>
The positions that a player can choose based on the current board state. A player can place a piece on a position only if it is still open (i.e., not already occupied by either player). Players take turns placing pieces and continue until a terminal state is reached. The position where a piece is placed is randomly selected from the open positions.

- **Terminal state** <br>
Players cannot move anymore (the board is filled and/or a win/lose/draw condition has been reached). 

- **Reward** <br>
The player receives +1 reward if they win, -1 reward if they lose and 0 reward if they draw. 

### <u>Environment</u>

In [5]:
import numpy as np 
import matplotlib.pyplot as plt 
import random 

In [25]:
BOARD_ROWS = 4
BOARD_COLS = 4

# Cell encoding used everywhere below: 0 = empty, 1 = player 1 ('x'),
# -1 = player 2 ('o').
PLAYER_X = 1
PLAYER_0 = -1  # must differ from 0, which marks an empty cell


class Environment:
    """Tic-tac-toe board together with the self-play training loop.

    The board is a ``BOARD_ROWS x BOARD_COLS`` NumPy array whose cells hold
    0 (empty), ``PLAYER_X`` or ``PLAYER_0``.

    ``p1`` and ``p2`` are duck-typed player objects. ``play`` and
    ``give_reward`` call the following methods on them:

    * ``choose_action(positions, board, symbol)`` -> ``(row, col)``
    * ``addState(board_hash)``
    * ``feedReward(reward)``
    * ``reset()``

    NOTE(review): the rewards handed out by ``give_reward`` (1/0 for
    win/loss, 0.1/0.5 for a draw) differ from the +1/-1/0 scheme described
    in the notebook introduction -- confirm which one is intended.
    """

    def __init__(self, p1=None, p2=None, state=None):
        """Create an empty board with player 1 to move.

        Args:
            p1: Player object moving first (may be None if only the board
                logic is needed).
            p2: Player object moving second (may be None, see above).
            state: Opaque value stored as ``self.state``; the environment
                itself never reads it.
        """
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.board_state = None
        self.state = state
        self.terminal = False  # True once winner() has detected a win or draw
        self.player_symbol = PLAYER_X
        self.p1 = p1
        self.p2 = p2

    def get_position(self, x, y):
        """Return the value stored at row ``x``, column ``y``."""
        return self.board[x][y]

    def set_position(self, x, y, player_symbol):
        """Write ``player_symbol`` into row ``x``, column ``y``."""
        self.board[x][y] = player_symbol

    def print_board(self):
        """Print the raw NumPy board."""
        print(self.board)

    def save_state(self):
        """Flatten the board into a string usable as a dictionary key.

        The string is also cached in ``self.board_state``.
        """
        self.board_state = str(self.board.reshape(BOARD_COLS * BOARD_ROWS))
        return self.board_state

    def check_open(self, x, y):
        """Return True if the cell at row ``x``, column ``y`` is empty."""
        return bool(self.board[x][y] == 0)

    def _lines(self):
        """Yield every row, every column and both diagonals as 1-D arrays."""
        yield from self.board
        yield from self.board.T
        yield np.diag(self.board)
        yield np.diag(np.fliplr(self.board))

    def winner(self):
        """Evaluate the board and update ``self.terminal``.

        Returns:
            ``PLAYER_X`` (1) or ``PLAYER_0`` (-1) if that player owns a full
            line, 0 for a draw (board full, no line), or None while the game
            is still in progress.
        """
        for line in self._lines():
            total = line.sum()
            # A line is won only when every cell carries the same non-zero
            # symbol, i.e. the sum reaches +len or -len. A sum of 0 means
            # empty or mixed and is never a win.
            if abs(total) == len(line):
                self.terminal = True
                return PLAYER_X if total > 0 else PLAYER_0

        # Checked after the lines so that a winning last move is not
        # reported as a draw.
        if not self.open_positions():
            self.terminal = True
            return 0

        self.terminal = False
        return None

    def open_positions(self):
        """Return the list of ``(row, col)`` tuples of all empty cells."""
        return [
            (x, y)
            for x in range(BOARD_ROWS)
            for y in range(BOARD_COLS)
            if self.board[x, y] == 0
        ]

    def reset(self):
        """Clear the board and give the first move back to player 1."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.board_state = None
        self.terminal = False
        self.player_symbol = PLAYER_X

    def update_state(self, position):
        """Place the current player's piece at ``position`` and switch turns.

        Args:
            position: ``(row, col)`` of the cell to fill.
        """
        # tuple() guards against a list being interpreted by NumPy as
        # fancy indexing (which would overwrite whole rows).
        self.board[tuple(position)] = self.player_symbol
        self.player_symbol = (
            PLAYER_0 if self.player_symbol == PLAYER_X else PLAYER_X
        )

    def show_board(self):
        """Pretty-print the board with 'x' for player 1 and 'o' for player 2."""
        tokens = {PLAYER_X: 'x', PLAYER_0: 'o', 0: ' '}
        # Each cell renders as "t | " after the leading "|", so the rule is
        # sized from the column count instead of being hard-coded.
        rule = '-' * (4 * BOARD_COLS + 1)
        for row in self.board:
            print(rule)
            print('| ' + ''.join(
                tokens.get(int(cell), '?') + ' | ' for cell in row))
        print(rule)

    def give_reward(self):
        """Feed the end-of-game reward to both players.

        Nothing is fed while the game is still in progress. Players that
        are None are skipped.

        Returns:
            The value of ``winner()``: 1, -1, 0 or None.
        """
        result = self.winner()
        if result == PLAYER_X:
            rewards = (1, 0)
        elif result == PLAYER_0:
            rewards = (0, 1)
        elif result == 0:
            rewards = (0.1, 0.5)
        else:
            # Game not over yet: no reward.
            return result

        for player, reward in zip((self.p1, self.p2), rewards):
            if player is not None:
                player.feedReward(reward)
        return result

    def _play_episode(self):
        """Play one full game between p1 and p2, then reward and reset."""
        while True:
            for player in (self.p1, self.p2):
                positions = self.open_positions()
                action = player.choose_action(
                    positions, self.board, self.player_symbol)
                self.update_state(action)
                player.addState(self.save_state())

                if self.winner() is not None:
                    self.give_reward()
                    self.p1.reset()
                    self.p2.reset()
                    self.reset()
                    return

    def play(self, rounds=100):
        """Train by letting p1 and p2 play ``rounds`` games against each other.

        Raises:
            ValueError: if either player has not been set.
        """
        if self.p1 is None or self.p2 is None:
            raise ValueError("play() requires both p1 and p2 to be set")

        for i in range(rounds):
            # Progress indicator every thousand games.
            if i % 1000 == 0:
                print("Rounds {}".format(i))
            self._play_episode()

In [27]:
# Exploration rate: choose_action takes a random move when a uniform draw
# from [0, 1) is <= EPSILON, otherwise the greedy move.
EPSILON = 0.3
# Discount factor multiplied onto the reward while it is propagated back
# through the recorded states.
GAMMA = 0.9
# Step size of each value update during back-propagation.
LEARNING_RATE = 0.2


class Player:
    def __init__(self, name="player"):
        """Set up the agent's hyperparameters, its environment and its value table.

        Args:
            name: Label used when the player reports its moves.
        """
        self.name = name
        self.states = []  # stores action and position taken
        self.Env = Environment()
        self.gamma = GAMMA
        self.epsilon = EPSILON
        self.learning_rate = LEARNING_RATE
        # Snapshot of the flag at construction time; it is a plain bool and
        # does not follow later changes of self.Env.terminal.
        self.terminal = self.Env.terminal

        # Value table, initialised with one entry per board coordinate that
        # maps every currently open position to 0.
        # NOTE(review): choose_action looks entries up by board-state string,
        # not by coordinate -- confirm the intended key scheme.
        self.q_values = {}
        open_cells = self.Env.open_positions()
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.q_values[(i, j)] = {p: 0 for p in open_cells}
                    
        
        
    def choose_action(self, positions, current_board, symbol):
        #choose action with most expected value 
        next_reward = 0
        
        #take a random action, store that action into positions 
        if (np.random.uniform(0,1) <= self.epsilon):
            index = np.random.choice(len(positions))
            action = positions[index]
        
        #take a greedy action
        else: 
            max_value = -999
            for p in positions:
                #keep copy of current states 
                next_board = current_board.copy()
                next_board[p] = player_symbol
                next_board_state = self.save_state(next_board)
                
                current_position = self.Env.state 
                next_reward = self.q_values[current_position][p]
                
                if self.q_values.get(next_board_state) is None:
                    value = 0
                else: 
                    self.q_values.get(next_board_state)
                print("value", value)
                
                if value >= max_value:
                    max_value = value
                    action = p
      
        print("{} takes action{}".format(self.name, action))
        return action 
        
    def play(self, rounds=10):
        i=0
        
        while i < rounds:
            
            if self.Env.terminal:

                #back propagate reward 
                reward = self.Env.give_reward()

                for p in self.positions:
                    self.q_values[self.Env.state][p] = reward
                print("Game End Reward", reward)

                for s in reversed(self.states):
                    current_q_value = self.q_values[s[0]][s[1]]
                    reard = current_q_value + self.learning_rate * (self.gamma * reward - current_q_value)
                    self.q_values[s[0]][s[1]] = round(reward, 3)

                self.reset()
                i+=1 