In [1]:
import numpy as np
import pickle

In [2]:
ROWS = 3
COLS = 3

In [3]:
class State:
    def __init__(self, player1, player2):
        self.board     = np.zeros((ROWS, COLS))
        self.player1   = player1
        self.player2   = player2
        self.isFin     = False
        self.boardDict = None
        self.playerSymbol   = 1

    def getDict(self):
        self.boardDict = str(self.board.reshape(COLS * ROWS))
        return self.boardDict

    def availablePositions(self):
        positions = []
        for i in range(ROWS):
            for j in range(COLS):
                if self.board[i,j] == 0:
                    positions.append((i,j))
        return positions

    def updateState(self, position):
        self.board[position] = self.playerSymbol
        
        # switch to another player
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    # function to check if the game is finished and judge the winner
    def winner(self):
        # For vertical win
        for i in range(ROWS):
            if sum(self.board[i, :]) == 3:
                self.isFin = True
                return 1
            if sum(self.board[i, :]) == -3:
                self.isFin = True
                return -1

        # For horizontal win
        for i in range(COLS):
            if sum(self.board[:, i]) == 3:
                self.isFin = True
                return 1
            if sum(self.board[:, i]) == -3:
                self.isFin = True
                return -1

        # For diagonal win
        sum1    = sum([self.board[i,i] for i in range(COLS)])
        sum2    = sum([self.board[i, COLS-i-1] for i in range(COLS)])
        dia_sum = max(abs(sum1), abs(sum2))

        if dia_sum == 3:
            self.isFin = True
            if sum1 == 3 or sum2 == 3:
                return 1
            else:
                return -1

        # For the game ties
        if len(self.availablePositions()) == 0:
            self.isFin = True
            return 0

        self.isFin = False
        return None

    # Give reward only when game ends
    # Reward is between 0 and 1
    def giveReward(self):
        res = self.winner()

        if res == 1:
            self.player1.feedReward(1)
            self.player2.feedReward(0)
        elif res == -1:
            self.player1.feedReward(0)
            self.player2.feedReward(1)
        else:
            self.player1.feedReward(0.4)
            self.player2.feedReward(0.8)

    def play(self, rounds=100):
        for i in range(rounds):
            if i%1000 == 0:
                print('Rounds {}'.format(i))
            while not self.isFin:
                # Player1
                positions = self.availablePositions()
                p1_action = self.player1.chooseAction(positions, self.board, self.playerSymbol)
                self.updateState(p1_action)
                board_dict = self.getDict()
                self.player1.addState(board_dict)

                win = self.winner()
                if win is not None:
                    self.giveReward()
                    self.player1.reset()
                    self.player2.reset()
                    self.reset()
                    break
                else:
                    positions = self.availablePositions()
                    p2_action = self.player2.chooseAction(positions, self.board, self.playerSymbol)
                    self.updateState(p2_action)
                    board_dict = self.getDict()
                    self.player2.addState(board_dict)

                    win = self.winner()
                    if win is not None:
                        self.giveReward()
                        self.player1.reset()
                        self.player2.reset()
                        self.reset()
                        break

    def playWithHuman(self):
        while not self.isFin:
            # player1
            positions = self.availablePositions()
            p1_action = self.player1.chooseAction(positions, self.board, self.playerSymbol)
            self.updateState(p1_action)
            self.showBoard()

            win = self.winner()
            if win is not None:
                if win == 1:
                    print(self.player1.name, "wins!")
                else:
                    print("Game has tied!")
                self.reset()
                break
            else:
                # player2
                positions = self.availablePositions()
                p2_action = self.player2.chooseAction(positions)
                self.updateState(p2_action)
                self.showBoard()
                
                win = self.winner()
                if win is not None:
                    if win == -1:
                        print(self.player2.name, " wins!")
                    else:
                        print("Game has tied!")
                    self.reset()
                    break



    # Display the game board
    def showBoard(self):
        # p1: x  p2: o
        for i in range(0, ROWS):
            print('-------------')
            out = '| '
            for j in range(0, COLS):
                if self.board[i, j] == 1:
                    token = 'x'
                if self.board[i, j] == -1:
                    token = 'o'
                if self.board[i, j] == 0:
                    token = ' '
                out += token + ' | '
            print(out)
        print('-------------')

    # board reset
    def reset(self):
        self.board = np.zeros((ROWS, COLS))
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1
    

In [4]:
class Players:
    def __init__(self, name, exp_rate=0.3):
        self.name         = name
        self.states       = []
        self.alpha        = 0.2
        self.exp_rate     = exp_rate
        self.gamma        = 0.9
        self.states_value = {}

    def chooseAction(self, positions, current_game, symbol):
        if np.random.uniform(0,1) <= self.exp_rate:
            idx    = np.random.choice(len(positions))
            action = positions[idx]
        else:
            max = -999
            for p in positions:
                next_game     = current_game.copy()
                next_game[p]  = symbol
                next_gameDict = self.getDict(next_game)

                value = 0 if self.states_value.get(next_gameDict) is None else self.states_value.get(next_gameDict)

                if value >= max:
                    max    = value
                    action = p
        
        return action

    def feedReward(self, reward):
        for state in reversed(self.states):
            if self.states_value.get(state) is None:
                self.states_value[state] = 0
            self.states_value[state] += self.alpha*(self.gamma*reward - self.states_value[state])

            reward = self.states_value[state]

    def getDict(self, board):
        boardDict = str(board.reshape(COLS*ROWS))

    def addState(self, state):
        self.states.append(state)

    def reset(self):
        self.states = []

    def savePolicy(self):
        fwrite = open('policy_'+str(self.name), 'wb')
        pickle.dump(self.states_value, fwrite)
        fwrite.close()

    def loadPolicy(self, file):
        fread = open(file, 'rb')
        self.states_value = pickle.load(fread)
        fread.close()

In [5]:
class HumanPlayer:
    def __init__(self, name):
        self.name = name

    def chooseAction(self, positions):
        while True:
            row = int(input("Input your action row:"))
            col = int(input("Input your action col:"))
            action = (row, col)
            if action in positions:
                return action

    def addState(self, state):
        pass

    def feedReward(self, reward):
        pass

    def reset(self):
        pass

In [6]:
p1 = Players("p1")
p2 = Players("p2")

st = State(p1, p2)
print("training...")
st.play(50000)

training...
Rounds 0
Rounds 1000
Rounds 2000
Rounds 3000
Rounds 4000
Rounds 5000
Rounds 6000
Rounds 7000
Rounds 8000
Rounds 9000
Rounds 10000
Rounds 11000
Rounds 12000
Rounds 13000
Rounds 14000
Rounds 15000
Rounds 16000
Rounds 17000
Rounds 18000
Rounds 19000
Rounds 20000
Rounds 21000
Rounds 22000
Rounds 23000
Rounds 24000
Rounds 25000
Rounds 26000
Rounds 27000
Rounds 28000
Rounds 29000
Rounds 30000
Rounds 31000
Rounds 32000
Rounds 33000
Rounds 34000
Rounds 35000
Rounds 36000
Rounds 37000
Rounds 38000
Rounds 39000
Rounds 40000
Rounds 41000
Rounds 42000
Rounds 43000
Rounds 44000
Rounds 45000
Rounds 46000
Rounds 47000
Rounds 48000
Rounds 49000


In [7]:
p1.savePolicy()
p2.savePolicy()

In [8]:
p1.loadPolicy('policy_p1')

In [9]:
p1 = Players("computer", exp_rate=0)
p1.loadPolicy("policy_p1")

p2 = HumanPlayer("human")

st = State(p1, p2)
st.playWithHuman()

-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   |   | x | 
-------------
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   | o | x | 
-------------
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
| x | o | x | 
-------------
-------------
|   |   |   | 
-------------
|   | o |   | 
-------------
| x | o | x | 
-------------
-------------
|   |   |   | 
-------------
|   | o | x | 
-------------
| x | o | x | 
-------------
-------------
|   | o |   | 
-------------
|   | o | x | 
-------------
| x | o | x | 
-------------
human  wins!
