In [1]:
import numpy as np
from random import randint
import math
import unittest
import operator
import numbers
import sys
import random

In [2]:
class TicTacToe:

    # There are 5812 legal board states that can be reached before there is a winner
    # http://brianshourd.com/posts/2012-11-06-tilt-number-of-tic-tac-toe-boards.html
    
    __asStr = True
    __bad_move_game_is_over = -1
    __bad_move_action_already_played = -2
    __bad_move_no_consecutive_plays = -3
    __good_move = 0 # value of a free action space on board
    __play = 0.01 # reward for playing an action
    __draw = 0.1 # reward for playing to end but no one wins
    __win = 0.05 # reward for winning a game
    __loss = -0.1 # reward (penalty) for losing a game
    __no_player = -2 # id of a non existent player i.e. used to record id of player that has not played
    __win_mask = np.full((1, 3),3,np.int8)
    actions =  {1:(0,0), 2:(0,1), 3:(0,2), 4:(1,0), 5:(1,1), 6:(1,2), 7:(2,0), 8:(2,1), 9:(2,2)}
    __num_actions = 9
    player_X = 1
    player_O = -1

    #
    # Return game to intial state, where no one has played
    # and the board contains no moves.
    #
    def reset(self):
        self.__board = np.zeros((3, 3),np.int8)
        self.__last_board = np.zeros((3, 3),np.int8)
        self.__game_over = False
        self.__game_drawn = False
        self.__player = TicTacToe.__no_player
        self.__last_player = TicTacToe.__no_player
        
    #
    # Constructor has no arguments as it just sets the game
    # to an intial up-played set-up
    #
    def __init__(self):
        self.__board = np.zeros((3, 3),np.int8)
        self.__last_board = np.zeros((3, 3),np.int8)
        self.__game_over = False
        self.__game_drawn = False
        self.__player = TicTacToe.__no_player
        self.__last_player = TicTacToe.__no_player
    
    #
    # return player as string "X" or "O"
    #
    def __player_to_str(self,player):
        if(player == TicTacToe.player_X): return "X"
        if(player == TicTacToe.player_O): return "O"
        return "?"
        
    #
    # Return a displayable version of the entire game.
    #
    def __str__(self):
        s = ""
        s += "Game Over: " + str(self.__game_over) +"\n"
        s += "Player :" + self.__player_to_str(self.__player) + "\n"
        s += "Current Board : \n" + str(self.__board)+ "\n"
        s += "Prev Player :" + self.__player_to_str(self.__last_player) + "\n"
        s += "Prev Current Board : \n" + str(self.__last_board)+ "\n"
        s += "State" + str(self.state()) + "\n"
        return s
    
    #
    # Return the list of valid actions
    #
    def list_actions(self):
        return list(self.actions)
    
    #
    # Assume the move has been validated by move method
    # Make a copy of board before move is made and the last player
    #
    def __make_move(self, action, player):
        self.__last_board = np.copy(self.__board)
        self.__last_player = self.__player
        self.__player = player
        self.__board[TicTacToe.actions[action]] = player
        return
    
    #
    # Has a player already moved using the given action.
    #
    def __valid_move(self,action):
        return self.__board[TicTacToe.actions[action]] != self.__good_move
    
    #
    # If the proposed action is a valid move and the game is not
    # over. Make the given move (action) on behalf of the given 
    # player and update the game status.
    #
    # return the rawards (Player who took move, Observer)
    #
    def move(self, action, player):
        if(self.game_won()) : return TicTacToe.__bad_move_game_is_over
        if(self.__valid_move(action)): return TicTacToe.__bad_move_action_already_played 
        if(player == self.__player): return TicTacToe.__bad_move_no_consecutive_plays 
        
        self.__make_move(action,player)

        if(self.game_won()):
            self.__game_over = True
            self.__game_drawn = False
            return np.array([TicTacToe.__win,TicTacToe.__loss])
            
        if(not self.moves_left_to_take()):
            self.__game_over = True
            self.__game_drawn = True
            return np.array([TicTacToe.__draw,TicTacToe.__draw])

        return np.array([TicTacToe.__play,TicTacToe.__play])

    #
    # Return (flattened) Game Ended, Last Player, Last Board, Player, Board
    #
    def detailed_state(self):
        flattened_state = []
        if(self.__game_over):
            flattened_state.append(1)
        else:
            flattened_state.append(0)
        flattened_state.append(self.__last_player)
        flattened_state.append(self.__player)
        for itm in np.reshape(self.__last_board,9).tolist() : flattened_state.append(itm)
        for itm in np.reshape(self.__board,9).tolist() : flattened_state.append(itm)
            
        return flattened_state

    #
    # Return state of current board as simple vector or string
    #
    def state(self,tostr=False,plyr=None):
        flattened_state = []
        if(plyr == None):
            flattened_state.append(self.__player)
        else:
            flattened_state.append(plyr)
        
        for itm in np.reshape(self.__board,9).tolist() : flattened_state.append(itm)            
        if not tostr:
            return flattened_state
        else:
            return ''.join(str(e) for e in flattened_state)

    #
    # Show return the current board contents
    #
    def board(self):
        return self.__board
     
    #
    # Any row, column or diagonal with all player X or player O. If a
    # player is given then it answers has that specific player won
    #
    def game_won(self,board=None,plyr=None):
    
        if board is None: bd = self.__board
        if not plyr is None: bd = (bd==plyr)*1
        
        rows = np.abs(np.sum(bd,axis=1))
        cols = np.abs(np.sum(bd,axis=0))
        diagLR = np.abs(np.sum(bd.diagonal()))
        diagRL = np.abs(np.sum(np.rot90(bd).diagonal()))        
    
        if(np.sum(rows == 3) > 0):
            return True
        if(np.sum(cols == 3) > 0):
            return True
        if((np.mod(diagLR,3)) == 0) and diagLR > 0:
            return True
        if((np.mod(diagRL,3)) == 0) and diagRL > 0:
            return True
        return False

    #
    # Is the game over ?
    #
    def game_over(self):
        return (self.game_won() or not self.moves_left_to_take())
    
    #
    # Return which player goes next given the current player
    #
    @staticmethod
    def next_player(current_player):
        if(current_player == TicTacToe.player_O):
            return  TicTacToe.player_X
        else:
            return  TicTacToe.player_O

    #
    # Are there any remaining moves to be taken >
    #
    def moves_left_to_take(self):
        return (self.__board[np.where(self.__board == 0)]).size > 0
    
    #
    # What moves are valid given for board or if not
    # for the current game board.
    #
    def what_are_valid_moves(self, bd = None):
        if bd == None: bd = self.__board
        vm = np.zeros(len(TicTacToe.actions))
        best_action = None
        for actn,index in TicTacToe.actions.items():
            if(bd[index] == 0):
                vm[int(actn)-1] = True
            else:
                vm[int(actn)-1] = False
        return vm


In [12]:
class PlayTicTacToe:

    #
    # Constructor has no arguments as it just sets the game
    # to an intial up-played set-up
    #
    def __init__(self):
        self.__game = TicTacToe()
        self.__Q = {}
    
    #
    # Set leared state to given QValues.
    #
    def transfer_learning(self,QV):
        self.__Q = QV
        print("Learned Games:" + str(len(self.__Q)))
        
    #
    # The learned Q Values for a given state if they exist
    #
    def Q_Vals_for_state(self,state):
        if(state in self.__Q):
            return(self.__Q[state])
        else:
            return(None)

    #
    # Forget learning
    #
    def forget_learning(self):
        self.__Q = {}

    #
    # Add states to Q Value dictionary if not present
    #
    def add_states_if_missing(self,s1,s2,sp1,sp2):
        if s1 not in self.__Q:
            self.__Q[s1] = np.zeros((self.__num_actions))
        if sp1 not in self.__Q:
            self.__Q[sp1] = np.zeros((self.__num_actions))
        if s2 not in self.__Q:
            self.__Q[s2] = np.zeros((self.__num_actions))
        if sp2 not in self.__Q:
            self.__Q[sp2] = np.zeros((self.__num_actions))

    #
    # Update the Q values for the given player state and
    # the given reward
    #
    def update_Q_Values_for_player(self,mv,s,sp,reward,learning_rate,discount_rate):
        (self.__Q[s])[mv-1] = learning_rate * (self.__Q[s])[mv-1] + (1-learning_rate) * (reward + discount_rate * np.max(self.__Q[sp]))
    
    #
    # Run simulation to estimate Q values for state, action pairs. Random exploration policy
    # which should be tractable with approx 6K valid board states.
    #
    def train_Q_values(self,num_simulations,canned_moves=None):
        exploration = 1.0
        decay = (1.0/num_simulations)
        learning_rate0 = 0.05
        learning_rate_decay = 0.1
        discount_rate = 0.95
        reward = 0
        s = None
        sp = None
        sim = 0
        game_step = 0
        plyr = None
        nxt_plyr = None
        score = {TicTacToe.__draw:0,TicTacToe.__win:0}
        while(sim < num_simulations):
            self.reset()
            if canned_moves is None:
                plyr = (TicTacToe.player_X,TicTacToe.player_O)[randint(0,1)] # Random player to start
                nxt_plyr = self.next_player(plyr)
            mv = None
            while(not self.__game.game_over()):
                
                if canned_moves is None:
                    #random.random() < (exploration-(decay*sim))):
                    mv = self.random_move()
                else:
                    plyr,mv = (canned_moves[sim])[game_step]
                    nxt_plyr = self.next_player(plyr)
                    
                s1 = self.state(self.__asStr,plyr)
                s2 = self.state(self.__asStr,nxt_plyr)
                reward = self.move(mv,plyr)
                sp1 = self.state(self.__asStr,plyr)
                sp2 = self.state(self.__asStr,nxt_plyr)
                
                learning_rate = learning_rate0 / (1 + (sim * learning_rate_decay))
                
                self.add_states_if_missing(s1,s2,sp1,sp2)
                
                self.update_Q_Values_for_player(mv,s1,sp1,reward[0],learning_rate,discount_rate)
                self.update_Q_Values_for_player(mv,s2,sp2,reward[1],learning_rate,discount_rate)
                if canned_moves is None:
                    plyr = nxt_plyr
                    nxt_plyr = self.next_player(plyr)
                game_step += 1
            sim += 1
            game_step = 0
            score[reward[0]] += 1
            if (sim % 1000) == 0 : 
                print(str(sim)+" Win : "+str(round((score[TicTacToe.__win]/sim)*100,0))+"% Draw: " + str(round((score[TicTacToe.__draw]/sim)*100,0))+"%")
        return self.__Q
    
    #
    # Given current state and lerned Q Values (if any) suggest
    # the move that is expected to yield the highest reward.
    #
    def informed_move(self,st,rnd):
        # What moves are possible at this stage
        valid_moves = self.__game.what_are_valid_moves()
        
        # Are there any moves ? 
        if(np.sum(valid_moves*np.full(9,1)) == 0):
            return None
    
        best_action = None
        if(not rnd):
            # Is there info learned for this state ?
            informed_actions = self.Q_Vals_for_state(st)
            if not informed_actions is None:
                informed_actions *= valid_moves
                best_action = np.max(informed_actions)
                if(best_action > 0):
                    informed_actions = np.arange(1,9+1,1)[np.where(informed_actions == best_action)]
                    best_action = informed_actions[randint(0, informed_actions.size-1)]
                else:
                    best_action = None

        # If we found a good action then return that 
        # else pick a random action
        if best_action == None:
            actions = valid_moves*np.arange(1,9+1,1)
            actions = actions[np.where(actions > 0)]
            best_action = actions[randint(0,actions.size-1)]

        return int(best_action)
        
    #
    # Play an automated game between a random player and an
    # informed player. 
    # Return the move sequence for the entire game as s string.
    #
    def play(self):
        self.__game.reset()
        plyr = (TicTacToe.player_X,TicTacToe.player_O)[randint(0,1)] # Chose random player to start
        mv = None
        profile= ""
        while(not self.__game.game_over()):
            st = self.__game.state(True,plyr)
            QV = self.Q_Vals_for_state(st)
            mx = np.max(self.Q_Vals_for_state(st))
            if(plyr == TicTacToe.player_X):
                mv = self.informed_move(st,False) # Informed Player
            else:
                mv = self.informed_move(st,True) # Random Player
            self.__game.move(mv,plyr)
            profile += str(plyr)+":"+str(mv)+"~"
            plyr = TicTacToe.next_player(plyr)
        return profile
    
    #
    # Add the game profile to the given game dictionary and
    # up the count for the number of times that games was played
    #
    @staticmethod    
    def record_game_stats(D,profile):
        if profile in D:
            D[profile] += 1
        else:
            D[profile] = 1
        return
    
    def play_many(self,num):
        informed_wins = 0
        random_wins = 0
        draws = 0
        I = {}
        R = {}
        D = {}
        G = {}
        profile = ""
        for x in range(0, num):
            profile = self.play()
            if profile not in G: G[profile]=""
            if self.__game.game_won(self.__game.board(),TicTacToe.player_X):
                informed_wins += 1
                record_game_stats(I,profile)
            else:
                if _game_won(self.__game.board(),TicTacToe.player_O):
                    random_wins +=1
                    record_game_stats(R,profile)
                else: 
                    record_game_stats(D,profile)
                    draws += 1
            if(x % 100) == 0 : print (str(x))
        print("Informed :" +  str(informed_wins)+" : " + str(round((informed_wins/num)*100,0)))
        print("Random :" +  str(random_wins)+" : " + str(round((random_wins/num)*100,0)))
        print("Draw :" + str(draws)+" : " + str(round((draws/num)*100,0)))
        print("Diff Games :" +  str(len(G)))
        return (I,R,D)
    
    #
    # Convert a game profile string returned from play method
    # into an array that can be passed as a canned-move to
    # training. (Q learn)
    #
    def move_str_to_array(moves_as_str):
        mvd = {}
        mvc = 0
        mvs = moves_as_str.split('~')
        for mv in mvs:
            if(len(mv)>0):
                pl,ps = mv.split(":")
                mvd[mvc]=(int(pl),int(ps))
            mvc +=1
        return mvd

    #
    # Convert a dictionary or game profiles returned from play_many
    # to a dictionary of canned moves that can be passed to training (Q Learn)
    #
    def moves_to_dict(D):
        MD = {}
        i = 0
        for mvss,cnt in D.items():
            MD[i] = move_str_to_array(mvss)
            i+=1
        return MD

In [13]:
random.seed(42)
np.random.seed(42)
play = PlayTicTacToe()


In [14]:
GI = {}
GR = {}
GD = {}
GI,GR,GD = play.play_many(10)

UnboundLocalError: local variable 'bd' referenced before assignment

In [None]:
GI = {}
GR = {}
GD = {}
for j in range (0,2):
    GI,GR,GD = play_many(game,500)
    QV = play.train_Q_values(len(GR),moves_to_dict(GR))
    QV = play.train_Q_values(len(GI),moves_to_dict(GI))
    QV = play.train_Q_values(len(GD),moves_to_dict(GD))


In [None]:
#
# Run A Single Test
#
test_to_run = "test_one"
suite = unittest.TestSuite()
suite.addTest(testTicTacToe(test_to_run))
runner = unittest.TextTestRunner()
runner.run(suite)

In [None]:
#
# Run All Tests.
#
tests = testTicTacToe()
suite = unittest.TestLoader().loadTestsFromModule(tests)
unittest.TextTestRunner().run(suite)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

In [None]:
class playTicTacToe:
    __actor_network_name = "net/actor"
    __critic_network_name = "net/critic"
    __x_units = 21

    def __init__(self):
        return
    
    #
    # Build a four layer fully connected Network.
    # 21 -> relu -> 21 -> relu -> 21 -> relu -> 21 -> relu
    #
    def construct_network(X_state,network_name):
        return

In [None]:
print(len(QV))