In [2]:
import numpy as np
from random import randint
import math
import unittest
import operator
import numbers
import sys
import random

In [197]:
class TicTacToe:
    """Tic-tac-toe board plus a tabular Q-value store learned from self-play.

    Board cells hold ``player_X`` (1), ``player_O`` (-1) or 0 for a free cell.
    Actions are the integers 1..9, mapped to (row, col) board indices by
    ``actions``. Q values are kept in a dict keyed by the string form of
    ``state()``; each entry is a float array with one slot per action.
    """

    # There are 5812 legal board states that can be reached before there is a winner
    # http://brianshourd.com/posts/2012-11-06-tilt-number-of-tic-tac-toe-boards.html

    __asStr = True  # passed as `tostr` to state() so that Q keys are strings
    __bad_move_game_is_over = -1
    __bad_move_action_already_played = -2
    __bad_move_no_consecutive_plays = -3
    __good_move = 0 # value of a free action space on board
    __play = 0.01 # reward for playing an action
    # NOTE(review): the draw reward is larger than the win reward - confirm this is intended.
    __draw = 0.1 # reward for playing to end but no one wins
    __win = 0.05 # reward for winning a game
    __loss = -0.1 # reward (penalty) for losing a game
    __no_player = -2 # id of a non existent player i.e. used to record id of player that has not played
    __win_mask = np.full((1, 3),3,np.int8)
    actions =  {1:(0,0), 2:(0,1), 3:(0,2), 4:(1,0), 5:(1,1), 6:(1,2), 7:(2,0), 8:(2,1), 9:(2,2)}
    __num_actions = 9
    player_X = 1
    player_O = -1

    def reset(self):
        """Return the game to its initial state: empty board, nobody has played.

        The learned Q values are deliberately kept because learning spans
        games; use ``forget_learning`` to discard them.
        """
        self.__board = np.zeros((3, 3),np.int8)
        self.__last_board = np.zeros((3, 3),np.int8)
        self.__game_over = False
        self.__game_drawn = False
        self.__player = TicTacToe.__no_player
        self.__last_player = TicTacToe.__no_player

    def __init__(self):
        """Create an un-played game with an empty Q table."""
        self.__Q = {}
        self.reset()

    def transfer_learning(self,QV):
        """Adopt ``QV`` (dict: state string -> per-action value array) as the Q table.

        The dict is stored by reference, not copied, and its size is printed.
        """
        self.__Q = QV
        print("Learned Games:" + str(len(self.__Q)))

    def __player_to_str(self,player):
        """Return "X" or "O" for the two player ids, "?" for anything else."""
        if(player == TicTacToe.player_X): return "X"
        if(player == TicTacToe.player_O): return "O"
        return "?"

    def __str__(self):
        """Return a displayable version of the entire game."""
        s = ""
        s += "Game Over: " + str(self.__game_over) +"\n"
        s += "Player :" + self.__player_to_str(self.__player) + "\n"
        s += "Current Board : \n" + str(self.__board)+ "\n"
        s += "Prev Player :" + self.__player_to_str(self.__last_player) + "\n"
        s += "Prev Current Board : \n" + str(self.__last_board)+ "\n"
        s += "State" + str(self.state()) + "\n"
        return s

    def Q_Vals_for_state(self,state):
        """Return the learned Q values for ``state``, or None if it is unknown.

        The stored array itself is returned (no copy), so modifying it in
        place changes the Q table.
        """
        return self.__Q.get(state)

    def list_actions(self):
        """Return the list of action ids (1..9)."""
        return list(self.actions)

    def game_over(self):
        """Return True once a line is complete or the board is full."""
        return (self.__game_won() or not self.moves_left_to_take())

    def __make_move(self, action, player):
        """Apply an already validated move.

        The board and player from before the move are kept in
        ``__last_board`` / ``__last_player``.
        """
        self.__last_board = np.copy(self.__board)
        self.__last_player = self.__player
        self.__player = player
        self.__board[TicTacToe.actions[action]] = player
        return

    def __valid_move(self,action):
        """Return True when the cell for ``action`` is ALREADY occupied.

        Despite the name this reports that the move is not available;
        ``move`` relies on that meaning, so it is kept as is.
        """
        return self.__board[TicTacToe.actions[action]] != self.__good_move

    def move(self, action, player):
        """Play ``action`` (1..9) for ``player`` if the move is legal.

        Returns a two element array ``[reward for mover, reward for observer]``
        when the move is made: win/loss, draw/draw, or play/play.
        When the move is rejected a negative int code is returned instead:
        -1 game already won, -2 cell already played, -3 same player twice in
        a row. The board is left untouched in that case.
        """
        if(self.__game_won()) : return TicTacToe.__bad_move_game_is_over
        if(self.__valid_move(action)): return TicTacToe.__bad_move_action_already_played
        if(player == self.__player): return TicTacToe.__bad_move_no_consecutive_plays

        self.__make_move(action,player)

        if(self.__game_won()):
            self.__game_over = True
            self.__game_drawn = False
            return np.array([TicTacToe.__win,TicTacToe.__loss])

        if(not self.moves_left_to_take()):
            self.__game_over = True
            self.__game_drawn = True
            return np.array([TicTacToe.__draw,TicTacToe.__draw])

        return np.array([TicTacToe.__play,TicTacToe.__play])

    def detailed_state(self):
        """Return a flat list describing the game.

        Layout: game-ended flag (1/0), last player, current player, the nine
        cells of the previous board, then the nine cells of the current board.
        """
        flattened_state = [1 if self.__game_over else 0, self.__last_player, self.__player]
        flattened_state.extend(np.reshape(self.__last_board,9).tolist())
        flattened_state.extend(np.reshape(self.__board,9).tolist())
        return flattened_state

    def state(self,tostr=False,plyr=None):
        """Return the player id followed by the nine board cells.

        ``plyr`` replaces the stored player (the one who moved last) when
        given. With ``tostr`` the values are concatenated into one string,
        which is the key format of the Q table; otherwise a list is returned.
        """
        flattened_state = [self.__player if plyr is None else plyr]
        flattened_state.extend(np.reshape(self.__board,9).tolist())
        if not tostr:
            return flattened_state
        return ''.join(str(e) for e in flattened_state)

    def board(self):
        """Return the current board array (the live object, not a copy)."""
        return self.__board

    def __game_won(self):
        """Return True if any row, column or diagonal is all X or all O."""
        rows = np.abs(np.sum(self.__board,axis=1))
        cols = np.abs(np.sum(self.__board,axis=0))
        diagLR = np.abs(np.sum(self.__board.diagonal()))
        diagRL = np.abs(np.sum(np.rot90(self.__board).diagonal()))

        # Cells are +1/-1/0, so a line belongs to one player exactly when the
        # absolute value of its sum is 3.
        return bool(np.any(rows == 3) or np.any(cols == 3) or diagLR == 3 or diagRL == 3)

    def forget_learning(self):
        """Discard all learned Q values."""
        self.__Q = {}

    def next_player(self,current_player):
        """Return the player who moves after ``current_player``.

        Any value other than ``player_O`` is followed by ``player_O``.
        """
        if(current_player == TicTacToe.player_O):
            return  TicTacToe.player_X
        else:
            return  TicTacToe.player_O

    def moves_left_to_take(self):
        """Return True while at least one cell is still free."""
        return bool(np.any(self.__board == 0))

    def random_move(self):
        """Return a uniformly random free action, or None if the board is full."""
        valid_moves = [key for key in self.list_actions()
                       if self.__board[TicTacToe.actions[key]] == self.__good_move]
        if not valid_moves:
            return None
        return valid_moves[randint(0, len(valid_moves)-1)]

    @staticmethod
    def __action_to_index(action):
        """Convert a 1-based action id to a 0-based array index."""
        return (int(action)-1)

    def what_are_valid_moves(self, bd = None):
        """Return a length-9 float array: 1.0 where the action's cell is free, else 0.0.

        ``bd`` is the board to inspect; the current game board is used when it
        is omitted.
        """
        # `is None` is required: `bd == None` on an array is an element-wise
        # comparison whose truth value raises ValueError.
        if bd is None: bd = self.__board
        vm = np.zeros(len(TicTacToe.actions))
        for actn,index in TicTacToe.actions.items():
            vm[int(actn)-1] = bd[index] == 0
        return vm

    def informed_move(self):
        """Return the best known free action for the current state.

        The state key is ``state(tostr=True)``, i.e. it uses the stored player.
        If that state has learned values and the best value among the free
        actions is positive, one of the best actions is picked (ties broken at
        random). Otherwise a random free action is returned. Returns None when
        no cell is free.
        """
        valid_moves = self.what_are_valid_moves()

        # Are there any moves ?
        if(np.sum(valid_moves) == 0):
            return None

        all_actions = np.arange(1,self.__num_actions+1,1)
        best_action = None

        # Is there info learned for this state ?
        state_key = self.state(tostr=True)
        if state_key in self.__Q:
            # Mask into a NEW array; an in-place `*=` would wipe the learned
            # values of occupied cells from the Q table itself.
            informed_actions = self.__Q[state_key] * valid_moves
            best_value = np.max(informed_actions)
            if(best_value > 0):
                candidates = all_actions[np.where(informed_actions == best_value)]
                best_action = candidates[randint(0, candidates.size-1)]

        # If we found a good action then return that
        # else pick a random action
        if best_action is None:
            actions = valid_moves*all_actions
            actions = actions[np.where(actions > 0)]
            best_action = actions[randint(0,actions.size-1)]

        return int(best_action)

    def add_states_if_missing(self,s1,s2,sp1,sp2):
        """Create an all-zero Q entry for each of the given states that has none."""
        for key in (s1, sp1, s2, sp2):
            if key not in self.__Q:
                self.__Q[key] = np.zeros((self.__num_actions))

    def update_Q_Values_for_player(self,mv,s,sp,reward,learning_rate,discount_rate):
        """Update Q[s][mv-1] from ``reward`` and the best value of the next state ``sp``.

        Both ``s`` and ``sp`` must already be present in the Q table.
        """
        # NOTE(review): the old value is weighted by `learning_rate` and the new
        # target by `1 - learning_rate`, the reverse of the usual Q-learning
        # form. Left unchanged because it alters every learned value - confirm.
        (self.__Q[s])[mv-1] = learning_rate * (self.__Q[s])[mv-1] + (1-learning_rate) * (reward + discount_rate * np.max(self.__Q[sp]))

    def estimate_Q_values(self,num_simulations,canned_moves=None):
        """Play ``num_simulations`` games and update the Q table after every move.

        Without ``canned_moves`` each game starts with a random player and
        both sides play random free moves. With ``canned_moves`` game ``i``
        replays ``canned_moves[i]``, a mapping step -> (player, action).
        After each move the Q entries of both the mover and the observer are
        updated with their respective rewards. A running win/draw percentage
        is printed every 1000 games.

        Returns the Q table (the live dict). Zero simulations is a no-op.
        """
        learning_rate0 = 0.05
        learning_rate_decay = 0.1
        discount_rate = 0.95
        reward = 0
        sim = 0
        game_step = 0
        plyr = None
        nxt_plyr = None
        score = {TicTacToe.__draw:0,TicTacToe.__win:0}
        while(sim < num_simulations):
            self.reset()
            if canned_moves is None:
                plyr = (TicTacToe.player_X,TicTacToe.player_O)[randint(0,1)] # Random player to start
                nxt_plyr = self.next_player(plyr)
            mv = None
            while(not self.game_over()):

                if canned_moves is None:
                    # Pure random exploration.
                    mv = self.random_move()
                else:
                    plyr,mv = (canned_moves[sim])[game_step]
                    nxt_plyr = self.next_player(plyr)

                # State keys before and after the move, from each player's view.
                s1 = self.state(self.__asStr,plyr)
                s2 = self.state(self.__asStr,nxt_plyr)
                reward = self.move(mv,plyr)
                sp1 = self.state(self.__asStr,plyr)
                sp2 = self.state(self.__asStr,nxt_plyr)

                learning_rate = learning_rate0 / (1 + (sim * learning_rate_decay))

                self.add_states_if_missing(s1,s2,sp1,sp2)

                self.update_Q_Values_for_player(mv,s1,sp1,reward[0],learning_rate,discount_rate)
                self.update_Q_Values_for_player(mv,s2,sp2,reward[1],learning_rate,discount_rate)
                if canned_moves is None:
                    plyr = nxt_plyr
                    nxt_plyr = self.next_player(plyr)
                game_step += 1
            sim += 1
            game_step = 0
            # The last reward of a finished game is either the win or the draw reward.
            score[reward[0]] += 1
            if (sim % 1000) == 0 :
                print(str(sim)+" Win : "+str(round((score[TicTacToe.__win]/sim)*100,0))+"% Draw: " + str(round((score[TicTacToe.__draw]/sim)*100,0))+"%")
        return self.__Q
        

In [198]:
# Seed the stdlib RNG (used by `randint` throughout the notebook) and NumPy's
# global RNG, then create the game object shared by the cells below.
random.seed(42)
np.random.seed(42)
game = TicTacToe()
# NOTE(review): `QV` is not assigned by any cell above this one in the file; it is
# only produced further down by `game.estimate_Q_values(...)`. This line presumably
# relies on a value left in the kernel by an earlier run - confirm before
# Restart & Run All.
game.transfer_learning(QV)


Learned Games:17066


In [382]:
# Train on 5,000,000 random self-play games; QV receives the game's Q table.
# The recorded output of this cell ends in KeyboardInterrupt, in which case the
# assignment to QV never happened (the table inside `game` is still updated).
QV = game.estimate_Q_values(5000000)

1000 Win : 87.0% Draw: 13.0%
2000 Win : 88.0% Draw: 12.0%
3000 Win : 87.0% Draw: 13.0%
4000 Win : 87.0% Draw: 13.0%
5000 Win : 87.0% Draw: 13.0%
6000 Win : 88.0% Draw: 12.0%
7000 Win : 88.0% Draw: 12.0%
8000 Win : 88.0% Draw: 12.0%
9000 Win : 88.0% Draw: 12.0%
10000 Win : 88.0% Draw: 12.0%
11000 Win : 88.0% Draw: 12.0%
12000 Win : 87.0% Draw: 13.0%
13000 Win : 87.0% Draw: 13.0%
14000 Win : 87.0% Draw: 13.0%
15000 Win : 87.0% Draw: 13.0%
16000 Win : 87.0% Draw: 13.0%
17000 Win : 87.0% Draw: 13.0%
18000 Win : 87.0% Draw: 13.0%
19000 Win : 87.0% Draw: 13.0%
20000 Win : 87.0% Draw: 13.0%
21000 Win : 87.0% Draw: 13.0%
22000 Win : 87.0% Draw: 13.0%
23000 Win : 87.0% Draw: 13.0%
24000 Win : 87.0% Draw: 13.0%
25000 Win : 87.0% Draw: 13.0%
26000 Win : 87.0% Draw: 13.0%
27000 Win : 87.0% Draw: 13.0%
28000 Win : 87.0% Draw: 13.0%
29000 Win : 87.0% Draw: 13.0%
30000 Win : 87.0% Draw: 13.0%
31000 Win : 87.0% Draw: 13.0%
32000 Win : 87.0% Draw: 13.0%
33000 Win : 87.0% Draw: 13.0%
34000 Win : 87.0% D

269000 Win : 87.0% Draw: 13.0%
270000 Win : 87.0% Draw: 13.0%
271000 Win : 87.0% Draw: 13.0%
272000 Win : 87.0% Draw: 13.0%
273000 Win : 87.0% Draw: 13.0%
274000 Win : 87.0% Draw: 13.0%
275000 Win : 87.0% Draw: 13.0%
276000 Win : 87.0% Draw: 13.0%
277000 Win : 87.0% Draw: 13.0%
278000 Win : 87.0% Draw: 13.0%
279000 Win : 87.0% Draw: 13.0%
280000 Win : 87.0% Draw: 13.0%
281000 Win : 87.0% Draw: 13.0%
282000 Win : 87.0% Draw: 13.0%
283000 Win : 87.0% Draw: 13.0%
284000 Win : 87.0% Draw: 13.0%
285000 Win : 87.0% Draw: 13.0%
286000 Win : 87.0% Draw: 13.0%
287000 Win : 87.0% Draw: 13.0%
288000 Win : 87.0% Draw: 13.0%
289000 Win : 87.0% Draw: 13.0%
290000 Win : 87.0% Draw: 13.0%
291000 Win : 87.0% Draw: 13.0%
292000 Win : 87.0% Draw: 13.0%
293000 Win : 87.0% Draw: 13.0%
294000 Win : 87.0% Draw: 13.0%
295000 Win : 87.0% Draw: 13.0%
296000 Win : 87.0% Draw: 13.0%
297000 Win : 87.0% Draw: 13.0%
298000 Win : 87.0% Draw: 13.0%
299000 Win : 87.0% Draw: 13.0%
300000 Win : 87.0% Draw: 13.0%
301000 W

534000 Win : 87.0% Draw: 13.0%
535000 Win : 87.0% Draw: 13.0%
536000 Win : 87.0% Draw: 13.0%
537000 Win : 87.0% Draw: 13.0%
538000 Win : 87.0% Draw: 13.0%
539000 Win : 87.0% Draw: 13.0%
540000 Win : 87.0% Draw: 13.0%
541000 Win : 87.0% Draw: 13.0%
542000 Win : 87.0% Draw: 13.0%
543000 Win : 87.0% Draw: 13.0%
544000 Win : 87.0% Draw: 13.0%
545000 Win : 87.0% Draw: 13.0%
546000 Win : 87.0% Draw: 13.0%
547000 Win : 87.0% Draw: 13.0%
548000 Win : 87.0% Draw: 13.0%
549000 Win : 87.0% Draw: 13.0%
550000 Win : 87.0% Draw: 13.0%
551000 Win : 87.0% Draw: 13.0%
552000 Win : 87.0% Draw: 13.0%
553000 Win : 87.0% Draw: 13.0%
554000 Win : 87.0% Draw: 13.0%
555000 Win : 87.0% Draw: 13.0%
556000 Win : 87.0% Draw: 13.0%
557000 Win : 87.0% Draw: 13.0%
558000 Win : 87.0% Draw: 13.0%
559000 Win : 87.0% Draw: 13.0%
560000 Win : 87.0% Draw: 13.0%
561000 Win : 87.0% Draw: 13.0%
562000 Win : 87.0% Draw: 13.0%
563000 Win : 87.0% Draw: 13.0%
564000 Win : 87.0% Draw: 13.0%
565000 Win : 87.0% Draw: 13.0%
566000 W

799000 Win : 87.0% Draw: 13.0%
800000 Win : 87.0% Draw: 13.0%
801000 Win : 87.0% Draw: 13.0%
802000 Win : 87.0% Draw: 13.0%
803000 Win : 87.0% Draw: 13.0%
804000 Win : 87.0% Draw: 13.0%
805000 Win : 87.0% Draw: 13.0%
806000 Win : 87.0% Draw: 13.0%
807000 Win : 87.0% Draw: 13.0%
808000 Win : 87.0% Draw: 13.0%
809000 Win : 87.0% Draw: 13.0%
810000 Win : 87.0% Draw: 13.0%
811000 Win : 87.0% Draw: 13.0%
812000 Win : 87.0% Draw: 13.0%
813000 Win : 87.0% Draw: 13.0%
814000 Win : 87.0% Draw: 13.0%
815000 Win : 87.0% Draw: 13.0%
816000 Win : 87.0% Draw: 13.0%
817000 Win : 87.0% Draw: 13.0%
818000 Win : 87.0% Draw: 13.0%
819000 Win : 87.0% Draw: 13.0%
820000 Win : 87.0% Draw: 13.0%
821000 Win : 87.0% Draw: 13.0%
822000 Win : 87.0% Draw: 13.0%
823000 Win : 87.0% Draw: 13.0%
824000 Win : 87.0% Draw: 13.0%
825000 Win : 87.0% Draw: 13.0%
826000 Win : 87.0% Draw: 13.0%
827000 Win : 87.0% Draw: 13.0%
828000 Win : 87.0% Draw: 13.0%
829000 Win : 87.0% Draw: 13.0%
830000 Win : 87.0% Draw: 13.0%
831000 W

1062000 Win : 87.0% Draw: 13.0%
1063000 Win : 87.0% Draw: 13.0%
1064000 Win : 87.0% Draw: 13.0%
1065000 Win : 87.0% Draw: 13.0%
1066000 Win : 87.0% Draw: 13.0%
1067000 Win : 87.0% Draw: 13.0%
1068000 Win : 87.0% Draw: 13.0%
1069000 Win : 87.0% Draw: 13.0%
1070000 Win : 87.0% Draw: 13.0%
1071000 Win : 87.0% Draw: 13.0%
1072000 Win : 87.0% Draw: 13.0%
1073000 Win : 87.0% Draw: 13.0%
1074000 Win : 87.0% Draw: 13.0%
1075000 Win : 87.0% Draw: 13.0%
1076000 Win : 87.0% Draw: 13.0%
1077000 Win : 87.0% Draw: 13.0%
1078000 Win : 87.0% Draw: 13.0%
1079000 Win : 87.0% Draw: 13.0%
1080000 Win : 87.0% Draw: 13.0%
1081000 Win : 87.0% Draw: 13.0%
1082000 Win : 87.0% Draw: 13.0%
1083000 Win : 87.0% Draw: 13.0%
1084000 Win : 87.0% Draw: 13.0%
1085000 Win : 87.0% Draw: 13.0%
1086000 Win : 87.0% Draw: 13.0%
1087000 Win : 87.0% Draw: 13.0%
1088000 Win : 87.0% Draw: 13.0%
1089000 Win : 87.0% Draw: 13.0%
1090000 Win : 87.0% Draw: 13.0%
1091000 Win : 87.0% Draw: 13.0%
1092000 Win : 87.0% Draw: 13.0%
1093000 

1319000 Win : 87.0% Draw: 13.0%
1320000 Win : 87.0% Draw: 13.0%
1321000 Win : 87.0% Draw: 13.0%
1322000 Win : 87.0% Draw: 13.0%
1323000 Win : 87.0% Draw: 13.0%
1324000 Win : 87.0% Draw: 13.0%
1325000 Win : 87.0% Draw: 13.0%
1326000 Win : 87.0% Draw: 13.0%
1327000 Win : 87.0% Draw: 13.0%
1328000 Win : 87.0% Draw: 13.0%
1329000 Win : 87.0% Draw: 13.0%
1330000 Win : 87.0% Draw: 13.0%
1331000 Win : 87.0% Draw: 13.0%
1332000 Win : 87.0% Draw: 13.0%
1333000 Win : 87.0% Draw: 13.0%
1334000 Win : 87.0% Draw: 13.0%
1335000 Win : 87.0% Draw: 13.0%
1336000 Win : 87.0% Draw: 13.0%
1337000 Win : 87.0% Draw: 13.0%
1338000 Win : 87.0% Draw: 13.0%
1339000 Win : 87.0% Draw: 13.0%
1340000 Win : 87.0% Draw: 13.0%
1341000 Win : 87.0% Draw: 13.0%
1342000 Win : 87.0% Draw: 13.0%
1343000 Win : 87.0% Draw: 13.0%
1344000 Win : 87.0% Draw: 13.0%
1345000 Win : 87.0% Draw: 13.0%
1346000 Win : 87.0% Draw: 13.0%
1347000 Win : 87.0% Draw: 13.0%
1348000 Win : 87.0% Draw: 13.0%
1349000 Win : 87.0% Draw: 13.0%
1350000 

1576000 Win : 87.0% Draw: 13.0%
1577000 Win : 87.0% Draw: 13.0%
1578000 Win : 87.0% Draw: 13.0%
1579000 Win : 87.0% Draw: 13.0%
1580000 Win : 87.0% Draw: 13.0%
1581000 Win : 87.0% Draw: 13.0%
1582000 Win : 87.0% Draw: 13.0%
1583000 Win : 87.0% Draw: 13.0%
1584000 Win : 87.0% Draw: 13.0%
1585000 Win : 87.0% Draw: 13.0%
1586000 Win : 87.0% Draw: 13.0%
1587000 Win : 87.0% Draw: 13.0%
1588000 Win : 87.0% Draw: 13.0%
1589000 Win : 87.0% Draw: 13.0%
1590000 Win : 87.0% Draw: 13.0%
1591000 Win : 87.0% Draw: 13.0%
1592000 Win : 87.0% Draw: 13.0%
1593000 Win : 87.0% Draw: 13.0%
1594000 Win : 87.0% Draw: 13.0%
1595000 Win : 87.0% Draw: 13.0%
1596000 Win : 87.0% Draw: 13.0%
1597000 Win : 87.0% Draw: 13.0%
1598000 Win : 87.0% Draw: 13.0%
1599000 Win : 87.0% Draw: 13.0%
1600000 Win : 87.0% Draw: 13.0%
1601000 Win : 87.0% Draw: 13.0%
1602000 Win : 87.0% Draw: 13.0%
1603000 Win : 87.0% Draw: 13.0%
1604000 Win : 87.0% Draw: 13.0%
1605000 Win : 87.0% Draw: 13.0%
1606000 Win : 87.0% Draw: 13.0%
1607000 

1833000 Win : 87.0% Draw: 13.0%
1834000 Win : 87.0% Draw: 13.0%
1835000 Win : 87.0% Draw: 13.0%
1836000 Win : 87.0% Draw: 13.0%
1837000 Win : 87.0% Draw: 13.0%
1838000 Win : 87.0% Draw: 13.0%
1839000 Win : 87.0% Draw: 13.0%
1840000 Win : 87.0% Draw: 13.0%
1841000 Win : 87.0% Draw: 13.0%
1842000 Win : 87.0% Draw: 13.0%
1843000 Win : 87.0% Draw: 13.0%
1844000 Win : 87.0% Draw: 13.0%
1845000 Win : 87.0% Draw: 13.0%
1846000 Win : 87.0% Draw: 13.0%
1847000 Win : 87.0% Draw: 13.0%
1848000 Win : 87.0% Draw: 13.0%
1849000 Win : 87.0% Draw: 13.0%
1850000 Win : 87.0% Draw: 13.0%
1851000 Win : 87.0% Draw: 13.0%
1852000 Win : 87.0% Draw: 13.0%
1853000 Win : 87.0% Draw: 13.0%
1854000 Win : 87.0% Draw: 13.0%
1855000 Win : 87.0% Draw: 13.0%
1856000 Win : 87.0% Draw: 13.0%
1857000 Win : 87.0% Draw: 13.0%
1858000 Win : 87.0% Draw: 13.0%
1859000 Win : 87.0% Draw: 13.0%
1860000 Win : 87.0% Draw: 13.0%
1861000 Win : 87.0% Draw: 13.0%
1862000 Win : 87.0% Draw: 13.0%
1863000 Win : 87.0% Draw: 13.0%
1864000 

KeyboardInterrupt: 

In [180]:
def informed_move(game,st,rnd):
    """Choose the next action (1..9) for the state key ``st``.

    game -- object providing ``what_are_valid_moves()`` (length-9 0/1 array)
            and ``Q_Vals_for_state(key)`` (length-9 value array or None).
    st   -- state key used to look up the learned Q values.
    rnd  -- when true the Q values are ignored and a random free action is
            returned.

    When ``rnd`` is false and the best learned value among the free actions
    is positive, one of the best actions is returned (ties broken at random);
    otherwise a random free action is returned. Returns None if no cell is
    free.
    """
    # What moves are possible at this stage
    valid_moves = game.what_are_valid_moves()

    # Are there any moves ?
    if(np.sum(valid_moves) == 0):
        return None

    all_actions = np.arange(1,9+1,1)
    best_action = None
    if(not rnd):
        # Is there info learned for this state ?
        q_values = game.Q_Vals_for_state(st)
        if q_values is not None:
            # Build a masked COPY: `Q_Vals_for_state` hands back the stored
            # array, so an in-place `*=` would erase learned values.
            masked = q_values * valid_moves
            best_value = np.max(masked)
            if(best_value > 0):
                candidates = all_actions[np.where(masked == best_value)]
                best_action = candidates[randint(0, candidates.size-1)]

    # If we found a good action then return that
    # else pick a random action
    if best_action is None:
        actions = valid_moves*all_actions
        actions = actions[np.where(actions > 0)]
        best_action = actions[randint(0,actions.size-1)]

    return int(best_action)

In [181]:
def play(game):
    """Play one verbose game: X uses the learned Q values, O plays at random.

    The starting player is random. Before every move the board, the state
    key, its Q values, the best value and the best-value mask are printed;
    after the move the new board is printed. States without learned values
    print ``None`` and skip the best-value lines instead of failing.
    """
    game.reset()
    plyr = (TicTacToe.player_X,TicTacToe.player_O)[randint(0,1)] # Random player to start
    print(plyr)
    mv = None
    state_key = None
    while(not game.game_over()):
        print("--")
        print(game.board())
        state_key = game.state(True,plyr)
        print(state_key)
        q_values = game.Q_Vals_for_state(state_key)
        print(q_values)
        if q_values is not None:
            best_value = np.max(q_values)
            print(best_value)
            print(q_values*(q_values==best_value))
        # Only X consults the Q table; O always moves at random.
        if(plyr == TicTacToe.player_X):
            mv = informed_move(game,state_key,False)
        else:
            mv = informed_move(game,state_key,True)
        print("Move:" +str(mv))
        game.move(mv,plyr)
        print(game.board())
        plyr = game.next_player(plyr)
        print("--\n")
    print("-")
    print(game.board())
    print(game.Q_Vals_for_state(state_key))
    return

In [182]:
def _game_won(bd,plyr=None):
    
    if not plyr is None: bd = (bd==plyr)*1
        
    rows = np.abs(np.sum(bd,axis=1))
    cols = np.abs(np.sum(bd,axis=0))
    diagLR = np.abs(np.sum(bd.diagonal()))
    diagRL = np.abs(np.sum(np.rot90(bd).diagonal()))        
    
    if(np.sum(rows == 3) > 0):
        return True
    if(np.sum(cols == 3) > 0):
        return True
    if((np.mod(diagLR,3)) == 0) and diagLR > 0:
        return True
    if((np.mod(diagRL,3)) == 0) and diagRL > 0:
        return True
    return False

In [183]:
def _play(game):
    """Play one silent game (X informed, O random) and return its move profile.

    X always starts. The profile is the concatenation of ``"<player>:<action>~"``
    for every move, e.g. ``"1:5~-1:3~"``.
    """
    game.reset()
    plyr = TicTacToe.player_X
    mv = None
    profile= ""
    while(not game.game_over()):
        state_key = game.state(True,plyr)
        # Only X consults the Q table; O always moves at random.
        if(plyr == TicTacToe.player_X):
            mv = informed_move(game,state_key,False)
        else:
            mv = informed_move(game,state_key,True)
        game.move(mv,plyr)
        profile += str(plyr)+":"+str(mv)+"~"
        plyr = game.next_player(plyr)
    return profile

In [184]:
def st(bd):
    flattened_state = []
    for itm in np.reshape(bd,9).tolist() : flattened_state.append(itm)            
    return ''.join(str(e) for e in flattened_state)

def record_game_stats(D,profile):
    """Increment the count stored in ``D`` for ``profile`` (starting at 1)."""
    D[profile] = D[profile] + 1 if profile in D else 1
    return

In [238]:
def play_many(game,num):
    """Play ``num`` silent games and summarise who won.

    Each game is played with ``_play`` (X informed, O random). A counter is
    printed every 100 games, followed by totals and percentages for informed
    wins, random wins and draws, and the number of distinct game profiles.

    Returns ``(I, R, D)``: dicts mapping move profile -> number of games,
    for games won by X, won by O, and drawn. With ``num == 0`` all three are
    empty and the percentages are reported as 0.0.
    """
    def percent(count):
        # Guard the empty run: there is nothing to divide by.
        return round((count/num)*100,0) if num else 0.0

    informed_wins = 0
    random_wins = 0
    draws = 0
    informed_games = {}
    random_games = {}
    drawn_games = {}
    distinct_games = {}
    for x in range(0, num):
        profile = _play(game)
        if profile not in distinct_games: distinct_games[profile]=""
        if _game_won(game.board(),TicTacToe.player_X):
            informed_wins += 1
            record_game_stats(informed_games,profile)
        elif _game_won(game.board(),TicTacToe.player_O):
            random_wins +=1
            record_game_stats(random_games,profile)
        else:
            record_game_stats(drawn_games,profile)
            draws += 1
        if(x % 100) == 0 : print (str(x))
    print("Informed :" +  str(informed_wins)+" : " + str(percent(informed_wins)))
    print("Random :" +  str(random_wins)+" : " + str(percent(random_wins)))
    print("Draw :" + str(draws)+" : " + str(percent(draws)))
    print("Diff Games :" +  str(len(distinct_games)))
    return (informed_games,random_games,drawn_games)

In [400]:
# Iterative refinement: 50 rounds of "evaluate, then re-learn from the losses".
GI = {}
GR = {}
GD = {}
for j in range (0,50):
    # 500 evaluation games; GI/GR/GD count the move profiles of games won by
    # the informed player (X), won by the random player (O), and drawn.
    GI,GR,GD = play_many(game,500)
    # Turn the games X lost into canned move sequences...
    # NOTE(review): `moves_to_dict` is defined in a cell further down the file,
    # so this cell depends on execution order - confirm before Restart & Run All.
    GRD = moves_to_dict(GR)
    print("re learn")
    # ...and replay each of them once through the Q-value update.
    QV = game.estimate_Q_values(len(GRD),GRD)


0
100
200
300
400
Informed :309 : 62.0
Random :80 : 16.0
Draw :111 : 22.0
Diff Games :255
re learn
0
100
200
300
400
Informed :326 : 65.0
Random :60 : 12.0
Draw :114 : 23.0
Diff Games :268
re learn
0
100
200
300
400
Informed :316 : 63.0
Random :81 : 16.0
Draw :103 : 21.0
Diff Games :227
re learn
0
100
200
300
400
Informed :344 : 69.0
Random :51 : 10.0
Draw :105 : 21.0
Diff Games :248
re learn
0
100
200
300
400
Informed :324 : 65.0
Random :66 : 13.0
Draw :110 : 22.0
Diff Games :263
re learn
0
100
200
300
400
Informed :289 : 58.0
Random :92 : 18.0
Draw :119 : 24.0
Diff Games :264
re learn
0
100
200
300
400
Informed :310 : 62.0
Random :66 : 13.0
Draw :124 : 25.0
Diff Games :253
re learn
0
100
200
300
400
Informed :315 : 63.0
Random :87 : 17.0
Draw :98 : 20.0
Diff Games :243
re learn
0
100
200
300
400
Informed :312 : 62.0
Random :77 : 15.0
Draw :111 : 22.0
Diff Games :248
re learn
0
100
200
300
400
Informed :309 : 62.0
Random :82 : 16.0
Draw :109 : 22.0
Diff Games :254
re learn
0
100
200
3

KeyboardInterrupt: 

In [397]:
# One evaluation batch of 500 games: GI, GR and GD receive the move-profile
# counts for games won by X (informed), won by O (random), and drawn.
GI = {}
GR = {}
GD = {}
GI,GR,GD = play_many(game,500)

0
100
200
300
400
Informed :263 : 53.0
Random :186 : 37.0
Draw :51 : 10.0
Diff Games :366


In [375]:
# Hand-written game profile(s) in the "<player>:<action>~" format.
GM = {}
#GM['1:1~-1:4~1:3~-1:9~1:2~']=0
GM['1:9~-1:3~1:1~-1:4~1:5~']=0
# NOTE(review): GM is built above but never used; the canned games come from
# GR (left over from an earlier `play_many` cell). Possibly `moves_to_dict(GM)`
# was intended - confirm.
GMD = moves_to_dict(GR)
# Replay the same canned games 1000 times through the Q-value update.
for i in range (0,1000):
    QV = game.estimate_Q_values(len(GMD),GMD)


In [390]:
# Clear the board for manual stepping below; reset() keeps the learned Q values.
#game.transfer_learning(QV)
game.reset()

In [396]:
# Manual single step for X: show the state key, the best learned value and a
# mask holding that value at the best action(s), then play the informed move.
plyr = TicTacToe.player_X
# NOTE(review): this rebinds the notebook-level name `st`, hiding the helper
# function `st(bd)` defined in an earlier cell, and `QV` no longer refers to
# the full Q table after the next line.
st = game.state(True,plyr)
QV = game.Q_Vals_for_state(st)
mx = np.max(game.Q_Vals_for_state(st))
print(st)
print(mx)
print((QV==mx)*mx)
# The return value of move() is discarded, so a rejected move goes unnoticed here.
game.move(informed_move(game,st,False),plyr)
print(game.board())


1-10-110001-1
0.064625
[ 0.        0.        0.        0.        0.064625  0.        0.        0.
  0.      ]
[[-1  0 -1]
 [ 1  1  0]
 [ 0  1 -1]]


In [395]:
# Manual move for O on action 3 (row 0, col 2). move() reports a rejected move
# through a negative int return code, which is discarded here, so the printed
# board is the only evidence of whether the move was applied.
game.move(3,TicTacToe.player_O)
print(game.board())

[[-1  0 -1]
 [ 1  0  0]
 [ 0  1 -1]]


In [155]:
def move_str_to_array(moves_as_str):
    """Parse a ``"<player>:<action>~..."`` profile into ``{step: (player, action)}``.

    Steps are the positions of the ``~``-separated segments; empty segments
    (such as the one after a trailing ``~``) produce no entry but still
    occupy a position.
    """
    parsed = {}
    for position, segment in enumerate(moves_as_str.split('~')):
        if not segment:
            continue
        player, action = segment.split(":")
        parsed[position] = (int(player), int(action))
    return parsed

def moves_to_dict(D):
    """Convert a ``{profile: count}`` dict into ``{game index: parsed moves}``.

    Games are numbered 0, 1, 2, ... in the iteration order of ``D``; the
    counts are ignored.
    """
    games = {}
    for game_index, (profile, _count) in enumerate(D.items()):
        games[game_index] = move_str_to_array(profile)
    return games
    

In [None]:
#@staticmethod
def state_to_board(st):
    """Rebuild a 3x3 int8 board from a state string such as ``"-1-10-1010110"``.

    The leading player id ("1", or "-" plus one digit) is skipped; every
    following digit is one cell in row-major order, negated when preceded
    by "-".
    """
    # Drop the player prefix: two characters when it is negative, else one.
    cells = st[2:] if st[0] == '-' else st[1:]
    bd = np.zeros((3, 3),np.int8)
    filled = 0
    sign = 1
    for ch in cells:
        if ch == '-':
            sign = -1
            continue
        bd[filled // 3, filled % 3] = int(ch) * sign
        sign = 1
        filled += 1
    return bd

def valid_moves(board):
    """Return a length-9 float array: 1.0 where the action's cell in ``board`` is free.

    ``board`` is indexed with the (row, col) tuples from ``TicTacToe.actions``;
    a cell equal to 0 counts as free.
    """
    vm = np.zeros(len(TicTacToe.actions))
    for actn,index in TicTacToe.actions.items():
        # Read the argument, not whatever global `bd` happens to exist.
        vm[int(actn)-1] = board[index] == 0
    return vm

In [None]:
# Demo: decode a sample state string into a board and show which actions are
# still free. `bd` and `vm` become notebook-level globals.
#alla = {1:(0,0), 2:(0,1), 3:(0,2), 4:(1,0), 5:(1,1), 6:(1,2), 7:(2,0), 8:(2,1), 9:(2,2)}
print(type(TicTacToe.actions))
bd = state_to_board("-1-10-1010110")
vm = valid_moves(bd)
print(bd)
print(vm)

In [None]:
# Same check as a one-liner: free-action mask for the decoded sample state.
print(valid_moves(state_to_board("-1-10-1010110")))

In [None]:
#
# Run a single test, selected by method name.
# NOTE(review): `testTicTacToe` is not defined anywhere in the visible part of
# this notebook - presumably a unittest.TestCase subclass from another cell;
# confirm it exists before running.
#
test_to_run = "test_one"
suite = unittest.TestSuite()
suite.addTest(testTicTacToe(test_to_run))
runner = unittest.TextTestRunner()
runner.run(suite)

In [None]:
#
# Run all tests.
# NOTE(review): `loadTestsFromModule` is documented as taking a module, but it is
# given a `testTicTacToe` instance here; `loadTestsFromTestCase(testTicTacToe)`
# looks like the intended call - confirm. `testTicTacToe` itself is not defined
# in the visible part of this notebook.
#
tests = testTicTacToe()
suite = unittest.TestLoader().loadTestsFromModule(tests)
unittest.TextTestRunner().run(suite)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

In [None]:
class playTicTacToe:
    """Placeholder for a network-based player; nothing is implemented yet."""

    __actor_network_name = "net/actor"
    __critic_network_name = "net/critic"
    __x_units = 21

    def __init__(self):
        """Nothing to initialise yet."""
        return

    #
    # Build a four layer fully connected Network.
    # 21 -> relu -> 21 -> relu -> 21 -> relu -> 21 -> relu
    #
    @staticmethod
    def construct_network(X_state,network_name):
        """Stub for the network builder described above; currently returns None.

        Declared static because the signature has no ``self``: this keeps
        ``playTicTacToe.construct_network(x, name)`` working and makes the
        same call valid on an instance.
        """
        return

In [383]:
# Number of entries in QV - the count of distinct state keys, assuming QV still
# holds the Q table returned by `estimate_Q_values` (other cells rebind QV).
print(len(QV))

17066
