In [3]:
import numpy as np
from random import randint
import math
import unittest
import operator
import numbers
import sys
import random

In [4]:
class TicTacToe:

    # There are 5812 legal board states that can be reached before there is a winner
    # http://brianshourd.com/posts/2012-11-06-tilt-number-of-tic-tac-toe-boards.html
    
    __bad_move_game_is_over = -1
    __bad_move_action_already_played = -2
    __bad_move_no_consecutive_plays = -3
    __play = float(0) # reward for playing an action
    __draw = float(2) # reward for playing to end but no one wins
    __win = float(1) # reward for winning a game
    __loss = float(-2) # reward (penalty) for losing a game
    __rewards = {"Play":0,"Draw":2,"Win":1,"Loss":-2}
    __no_player = -2 # id of a non existent player i.e. used to record id of player that has not played
    __win_mask = np.full((1, 3),3,np.int8)
    __actions = {1:(0,0), 2:(0,1), 3:(0,2), 4:(1,0), 5:(1,1), 6:(1,2), 7:(2,0), 8:(2,1), 9:(2,2)}
    player_X = 1 # numerical value of player X on the board
    player_O = -1 # numerical value of player O on the board
    empty_cell = 0 # value of a free action space on board
    asStr = True

    #
    # Return game to intial state, where no one has played
    # and the board contains no moves.
    #
    def reset(self):
        self.__board = np.zeros((3, 3),np.int8)
        self.__last_board = np.zeros((3, 3),np.int8)
        self.__game_over = False
        self.__game_drawn = False
        self.__player = TicTacToe.__no_player
        self.__last_player = TicTacToe.__no_player
        
    #
    # Constructor has no arguments as it just sets the game
    # to an intial up-played set-up
    #
    def __init__(self):
        self.__board = np.zeros((3, 3),np.int8)
        self.__last_board = np.zeros((3, 3),np.int8)
        self.__game_over = False
        self.__game_drawn = False
        self.__player = TicTacToe.__no_player
        self.__last_player = TicTacToe.__no_player
    
    #
    # Return a displayable version of the entire game.
    #
    def __str__(self):
        s = ""
        s += "Game Over: " + str(self.__game_over) +"\n"
        s += "Player :" + TicTacToe.__player_to_str(self.__player) + "\n"
        s += "Current Board : \n" + str(self.__board)+ "\n"
        s += "Prev Player :" + TicTacToe.__player_to_str(self.__last_player) + "\n"
        s += "Prev Current Board : \n" + str(self.__last_board)+ "\n"
        s += "State" + str(self.state()) + "\n"
        return s
    
    #
    # return player as string "X" or "O"
    #
    @classmethod
    def __player_to_str(cls,self,player):
        if(player == TicTacToe.player_X): return "X"
        if(player == TicTacToe.player_O): return "O"
        return "?"
        
    #
    # Return the actions as a list of integers.
    #
    @classmethod
    def num_actions(cls):
        return len(TicTacToe.__actions)

    #
    # Return the maximum number of moves per game.
    #
    @classmethod
    def max_moves_per_game(cls):
        return len(TicTacToe.__actions)

    #
    # Return the actions as a list of integers.
    #
    @classmethod
    def actions(cls):
        return list(map(lambda a: int(a), list(TicTacToe.__actions.keys())))

    #
    # Return the board index (i,j) of a given action
    #
    @classmethod
    def board_index(cls,action):
        return TicTacToe.__actions[action]

    #
    # Return rewards as dictionary where key is name of reward
    # and the value is the reward
    #
    @classmethod
    def rewards(cls):
        return TicTacToe.__rewards
    
    #
    # Assume the move has been validated by move method
    # Make a copy of board before move is made and the last player
    #
    def __make_move(self, action, player):
        self.__last_board = np.copy(self.__board)
        self.__last_player = self.__player
        self.__player = player
        self.__board[TicTacToe.board_index(action)] = player
        return
    
    #
    # Has a player already moved using the given action.
    #
    def __valid_move(self,action):
        return self.__board[TicTacToe.board_index(action)] != TicTacToe.empty_cell
    
    #
    # If the proposed action is a valid move and the game is not
    # over. Make the given move (action) on behalf of the given 
    # player and update the game status.
    #
    # return the rawards (Player who took move, Observer)
    #
    def move(self, action, player):
        if(TicTacToe.game_won(self.__board)) : return TicTacToe.__bad_move_game_is_over
        if(self.__valid_move(action)): return TicTacToe.__bad_move_action_already_played 
        if(player == self.__player): return TicTacToe.__bad_move_no_consecutive_plays 
        
        self.__make_move(action,player)

        if(TicTacToe.game_won(self.__board)):
            self.__game_over = True
            self.__game_drawn = False
            return np.array([TicTacToe.__win,TicTacToe.__loss])
            
        if(not TicTacToe.moves_left_to_take(self.__board)):
            self.__game_over = True
            self.__game_drawn = True
            return np.array([TicTacToe.__draw,TicTacToe.__draw])

        return np.array([TicTacToe.__play,0])

    #
    # Return (flattened) Game Ended, Last Player, Last Board, Player, Board
    #
    def detailed_state(self):
        flattened_state = []
        if(self.__game_over):
            flattened_state.append(1)
        else:
            flattened_state.append(0)
        flattened_state.append(self.__last_player)
        flattened_state.append(self.__player)
        for itm in np.reshape(self.__last_board,9).tolist() : flattened_state.append(itm)
        for itm in np.reshape(self.__board,9).tolist() : flattened_state.append(itm)
            
        return flattened_state

    #
    # Show return the current board contents
    #
    def board(self):
        return self.__board
     
    #
    # Any row, column or diagonal with all player X or player O. If a
    # player is given then it answers has that specific player won
    #
    @classmethod
    def game_won(cls,bd,plyr=None):
    
        if not plyr is None: bd = (bd==plyr)*1
        
        rows = np.abs(np.sum(bd,axis=1))
        cols = np.abs(np.sum(bd,axis=0))
        diagLR = np.abs(np.sum(bd.diagonal()))
        diagRL = np.abs(np.sum(np.rot90(bd).diagonal()))        
    
        if(np.sum(rows == 3) > 0):
            return True
        if(np.sum(cols == 3) > 0):
            return True
        if((np.mod(diagLR,3)) == 0) and diagLR > 0:
            return True
        if((np.mod(diagRL,3)) == 0) and diagRL > 0:
            return True
        return False

    #
    # Are there any remaining moves to be taken >
    #
    @classmethod
    def moves_left_to_take(cls,board):
        return (board[np.where(board == 0)]).size > 0
    
    #
    # Board is in a gamne over state, with a winner or a draw
    #
    @classmethod
    def board_game_over(cls,board):
        return (TicTacToe.game_won(board) or not TicTacToe.moves_left_to_take(board))
        
    #
    # Is the game over ?
    #
    def game_over(self):
        return TicTacToe.board_game_over(self.__board)
    
    #
    # Return which player goes next given the current player
    #
    @staticmethod
    def other_player(current_player):
        if(current_player == TicTacToe.player_O):
            return  TicTacToe.player_X
        else:
            return  TicTacToe.player_O

    #
    # What moves are valid for the given board
    #
    @classmethod
    def valid_moves(cls,board):
        vm = np.zeros(TicTacToe.num_actions())
        best_action = None
        for actn in TicTacToe.actions():
            if(board[TicTacToe.board_index(actn)] == 0):
                vm[int(actn)-1] = True
            else:
                vm[int(actn)-1] = False
        return vm
    
    #
    # What moves are valid given for board or if not
    # for the current game board.
    #
    def what_are_valid_moves(self):
        return TicTacToe.valid_moves(self.__board)


In [120]:
class PlayTicTacToe:

    #
    # Constructor has no arguments as it just sets the game
    # to an intial up-played set-up
    #
    def __init__(self):
        self.__game = TicTacToe()
        self.__Q = {}
    
    #
    # Return the current game
    #
    def game(self):
        return self.__game
    
    #
    # Set leared state to given QValues.
    #
    def transfer_learning(self,QV):
        self.__Q = QV
        print("Learned Games:" + str(len(self.__Q)))
        
    #
    # The learned Q Values for a given state if they exist
    #
    def Q_Vals_for_state(self,state):
        if(state in self.__Q):
            return(self.__Q[state])
        else:
            return(None)

    #
    # Expose the current class instance learning in terms of Q Values.
    #
    def Q_Vals(self):
        return(self.__Q)
    
    #
    # Forget learning
    #
    def forget_learning(self):
        self.__Q = {}

    #
    # Add states to Q Value dictionary if not present
    #
    def add_states_if_missing(self,s1):
        if s1 not in self.__Q:
            self.__Q[s1] = np.zeros(TicTacToe.num_actions())

    #
    # Update the Q values for the given player state and
    # the given reward
    #
    def update_Q_Values_for_player(self,mv,s,sp,reward,learning_rate,discount_rate):
        
        actn = mv-1 # action is indexed from zero, moves are 1..9
        (self.__Q[s])[actn] = learning_rate * (self.__Q[s])[actn] + (1-learning_rate) * (reward + discount_rate * -np.max(self.__Q[sp]))
        return
    
    #
    # Keep Score of players as Q Val Trains.
    #
    @classmethod
    def __init_score(cls):
        score = {}
        score[TicTacToe.player_X]={}
        score[TicTacToe.player_O]={}
        for rn,rv in TicTacToe.rewards().items():
            score[TicTacToe.player_X][rv] = 0
            score[TicTacToe.player_O][rv] = 0
        return score

    @classmethod
    def __keep_score(cls,score,plyr,nxt_plyr,reward):                
        (score[plyr])[reward[0]] += 1
        (score[nxt_plyr])[reward[1]] += 1
        return

    #
    # Return the State, Action Key from the perspective of given player
    #
    @classmethod
    def state(cls,player,board):                
        sa = ""
        sa += str(player)
        for cell in np.reshape(board,9).tolist() : sa+= str(cell)            
        return sa

    #
    # Given q values for move to a given state select the
    # max of what player stands to win over what player stands to lose
    #
    @classmethod
    def best_outcome(cls,q):
        stand_to_lose = np.max((q>=0)*q)
        stand_to_win = -np.min((q<0)*q)
        return max(stand_to_win, stand_to_lose)

    #
    # Given q values for move to a given state select the
    # max of what player stands to win over what player stands to lose
    #
    @classmethod
    def signed_best_outcome(cls,q):
        stand_to_win = np.max((q>=0)*q)
        stand_to_lose = -np.min((q<0)*q)
        if(stand_to_win > stand_to_lose):
            return stand_to_win
        else:
            return -stand_to_lose

    #
    # Run simulation to estimate Q values for state, action pairs. Random exploration policy
    # which should be tractable with approx 6K valid board states.
    #
    def train_Q_values(self,num_episodes,canned_moves):
        
        # Simulation defaults.
        decay = (1.0/TicTacToe.max_moves_per_game())
        learning_rate0 = 0.05
        learning_rate_decay = 0.1
        discount_rate = 0.8
        
        # Initalization
        reward = 0
        sim = 0
        game_step = 0
        score = PlayTicTacToe.__init_score()
        
        # Iterate over and play
        while(sim < num_episodes):
            self.__game.reset()
            plyr = None
            prev_plyr = None
            s = None
            mv = None
            prev_mv = None
            prev_s = None
            mv = None
            
            while(not self.__game.game_over()):
                
                prev_mv = mv
                plyr,mv = (canned_moves[sim])[game_step]
                prev_plyr = TicTacToe.other_player(plyr)
                prev_s = s
                
                s = PlayTicTacToe.state(plyr,self.__game.board())
                reward = self.__game.move(mv,plyr)
                learning_rate = learning_rate0 / (1 + (sim * learning_rate_decay))
                 
                self.add_states_if_missing(s)

                # Update Q Values for both players based on last play reward.
                (self.__Q[s])[mv-1] = ((learning_rate * (self.__Q[s])[mv-1])) + ((1-learning_rate) * reward[0])
                if(not prev_s is None):
                    (self.__Q[prev_s])[prev_mv - 1] -= (discount_rate * self.best_outcome(self.__Q[s]))
                game_step += 1
            sim += 1
            game_step = 0
            
            PlayTicTacToe.__keep_score(score,plyr,prev_plyr,reward)
            
            if ((sim % 1000) == 0) or (sim == num_episodes) : 
                smX = "Player X : " + str(sim) + " : "
                smO = "Player O : " + str(sim) + " : "
                for rn,rv in TicTacToe.rewards().items():
                    smX += rn+" : "+str(round(((score[TicTacToe.player_X])[rv]/sim)*100,0))+"% "
                    smO += rn+" : "+str(round(((score[TicTacToe.player_O])[rv]/sim)*100,0))+"% "
                print(smX)
                print(smO)
        return self.__Q

    #
    # Run simulation to estimate Q values for state, action pairs. Random exploration policy
    # which should be tractable with approx 6K valid board states.
    #
    def train_Q_values_R(self,num_simulations):
        exploration = 1.0
        decay = (1.0/num_simulations)
        learning_rate0 = 0.05
        learning_rate_decay = 0.1
        discount_rate = 0.95
        reward = 0
        sim = 0
        game_step = 0
        score = PlayTicTacToe.__init_score()
            
        while(sim < num_simulations):
            self.__game.reset()
            plyr = None
            prev_plyr = None
            s = None
            mv = None
            prev_mv = None
            prev_s = None
            plyr = (TicTacToe.player_X,TicTacToe.player_O)[randint(0,1)] # Random player to start
            nxt_plyr = TicTacToe.other_player(plyr)
            mv = None
            while(not self.__game.game_over()):
                
                prev_mv = mv
                mv = self.random_move()
                
                prev_s = s
                s = PlayTicTacToe.state(plyr,self.__game.board())
                reward = self.__game.move(mv,plyr)

                learning_rate = learning_rate0 / (1 + (sim * learning_rate_decay))
                 
                self.add_states_if_missing(s)

                # Update Q Values for both players based on last play reward.
                (self.__Q[s])[mv-1] = ((learning_rate * (self.__Q[s])[mv-1])) + ((1-learning_rate) * reward[0])
                if(not prev_s is None):
                    (self.__Q[prev_s])[prev_mv - 1] -= (discount_rate * self.best_outcome(self.__Q[s]))
            
                plyr = TicTacToe.other_player(plyr)
                prev_plyr = plyr
                game_step += 1
            sim += 1
            game_step = 0
            
            PlayTicTacToe.__keep_score(score,plyr,prev_plyr,reward)
            
            if ((sim % 1000) == 0) or (sim == num_simulations) : 
                smX = "Player X : " + str(sim) + " : "
                smO = "Player O : " + str(sim) + " : "
                for rn,rv in TicTacToe.rewards().items():
                    smX += rn+" : "+str(round(((score[TicTacToe.player_X])[rv]/sim)*100,0))+"% "
                    smO += rn+" : "+str(round(((score[TicTacToe.player_O])[rv]/sim)*100,0))+"% "
                print(smX)
                print(smO)
        return self.__Q
    
    #
    # Return a random action (move) that is still left
    # to make
    #
    def random_move(self):
        valid_moves = []
        random_action = None
        for actn in self.__game.actions():
            if(self.__game.board()[TicTacToe.board_index(actn)] == TicTacToe.empty_cell):
                valid_moves.append(actn)
         
        num_poss_moves = len(valid_moves)
        if(num_poss_moves > 0):
            random_action = valid_moves[randint(0, num_poss_moves-1)]
            return random_action
        else:
            return None
        
    #
    # Given current state and lerned Q Values (if any) suggest
    # the move that is expected to yield the highest reward.
    #
    def informed_move(self,st,rnd):
        # What moves are possible at this stage
        valid_moves = self.__game.what_are_valid_moves()
        
        # Are there any moves ? 
        if(np.sum(valid_moves*np.full(9,1)) == 0):
            return None
    
        best_action = None
        if(not rnd):
            # Is there info learned for this state ?
            informed_actions = self.Q_Vals_for_state(st)
            if not informed_actions is None:
                informed_actions *= valid_moves
                best_action = PlayTicTacToe.signed_best_outcome(informed_actions)
                if(best_action > 0):
                    informed_actions = np.arange(1,TicTacToe.num_actions()+1,1)[np.where(informed_actions == best_action)]
                    best_action = informed_actions[randint(0, informed_actions.size-1)]
                else:
                    best_action = None

        # If we found a good action then return that 
        # else pick a random action
        if best_action == None:
            actions = valid_moves*np.arange(1,TicTacToe.num_actions()+1,1)
            actions = actions[np.where(actions > 0)]
            best_action = actions[randint(0,actions.size-1)]

        return int(best_action)        
    #
    # Play an automated game between a random player and an
    # informed player. 
    # Return the move sequence for the entire game as s string.
    #
    def play(self):
        self.__game.reset()
        plyr = (TicTacToe.player_X,TicTacToe.player_O)[randint(0,1)] # Chose random player to start
        mv = None
        profile= ""
        while(not self.__game.game_over()):
            st = PlayTicTacToe.state(plyr,self.__game.board())
            QV = self.Q_Vals_for_state(st)
            mx = np.max(self.Q_Vals_for_state(st))
            if(plyr == TicTacToe.player_X):
                mv = self.informed_move(st,False) # Informed Player
            else:
                mv = self.informed_move(st,True) # Random Player
            self.__game.move(mv,plyr)
            profile += str(plyr)+":"+str(mv)+"~"
            plyr = TicTacToe.other_player(plyr)
        return profile
    
    #
    # Add the game profile to the given game dictionary and
    # up the count for the number of times that games was played
    #
    @classmethod    
    def record_game_stats(cls,D,profile):
        if profile in D:
            D[profile] += 1
        else:
            D[profile] = 1
        return
    
    def play_many(self,num):
        informed_wins = 0
        random_wins = 0
        draws = 0
        I = {}
        R = {}
        D = {}
        G = {}
        profile = ""
        for x in range(0, num):
            profile = self.play()
            if profile not in G: G[profile]=""
            if self.__game.game_won(self.__game.board(),TicTacToe.player_X):
                informed_wins += 1
                PlayTicTacToe.record_game_stats(I,profile)
            else:
                if self.__game.game_won(self.__game.board(),TicTacToe.player_O):
                    random_wins +=1
                    PlayTicTacToe.record_game_stats(R,profile)
                else: 
                    PlayTicTacToe.record_game_stats(D,profile)
                    draws += 1
            if(x % 100) == 0 : print (str(x))
        print("Informed :" +  str(informed_wins)+" : " + str(round((informed_wins/num)*100,0)))
        print("Random :" +  str(random_wins)+" : " + str(round((random_wins/num)*100,0)))
        print("Draw :" + str(draws)+" : " + str(round((draws/num)*100,0)))
        print("Diff Games :" +  str(len(G)))
        return (I,R,D)
    
    #
    # move_str is of form "1:8~-1:1~1:6~-1:3~1:9~-1:2~"
    # plyr:action~.. repreat players must be alternate X,O (1,-1..)
    # there is always a trailing ~
    
    #
    # Convert a game profile string returned from play method
    # into an array that can be passed as a canned-move to
    # training. (Q learn)
    #
    @classmethod    
    def move_str_to_array(cls,moves_as_str):
        mvd = {}
        mvc = 0
        mvs = moves_as_str.split('~')
        for mv in mvs:
            if(len(mv)>0):
                pl,ps = mv.split(":")
                mvd[mvc]=(int(pl),int(ps))
            mvc +=1
        return mvd

    #
    # Convert a game profile string returned from play method
    # into an array that can be passed as a canned-move to
    # training. (Q learn)
    #
    @classmethod    
    def move_str_to_board(cls,moves_as_str):
        mvd = {}
        mvc = 0
        mvs = moves_as_str.split('~')
        bd = np.zeros((3*3),np.int8)
        for mv in mvs:
            if(len(mv)>0):
                pl,ps = mv.split(":")
                bd[int(ps)-1] = int(pl)
            mvc +=1
        return np.reshape(bd, (3, 3))

    #
    # Convert a dictionary of game profiles returned from play_many
    # to a dictionary of canned moves that can be passed to training (Q Learn)
    #
    @classmethod    
    def moves_to_dict(cls,D):
        MD = {}
        i = 0
        for mvss,cnt in D.items():
            MD[i] = PlayTicTacToe.move_str_to_array(mvss)
            i+=1
        return MD
    
    #
    # All possible endings. Generate moves str's for all the possible endings of the
    # game from the perspective of the prev player. 
    #
    # The given moves must be the moves of a valid game that played to either win/draw
    # including the last move that won/drew the game.
    #
    @classmethod
    def all_possible_endings(cls,moves_as_str,exclude_current_ending=True):
        APE = {}
        mvs = PlayTicTacToe.move_str_to_array(moves_as_str)
        
        terminal_move = mvs[len(mvs)-1] # The move that won, drew
        last_move = mvs[len(mvs)-2] # the move we will replace with all other options
        
        t_plyr=terminal_move[0]
        t_actn=terminal_move[1]

        l_plyr=last_move[0]
        l_actn=last_move[1]
        
        base_game = "~".join(moves_as_str.split("~")[:-3]) # less Trailing ~ + terminal & last move
        bd = PlayTicTacToe.move_str_to_board(base_game)
        vmvs = TicTacToe.valid_moves(bd) 
        a=1
        for vm in vmvs:
            poss_end = base_game
            if(vm):
                if(a != t_actn): # don't include the terminal action as we will add that back on.
                    if(not (exclude_current_ending and a == l_actn)):
                        poss_end += "~"+str(l_plyr)+":"+str(a)
                        poss_end += "~"+str(t_plyr)+":"+str(t_actn)+"~"
                        APE[poss_end] = 0
            a+=1            
            
        return(APE)

In [121]:
qv=np.array((-1.12074287e-03  ,-8.15610660e-01,  -8.15281066e-01 , -8.13555037e-01,  -8.20507102e-01 , -2.22765039e-06 , -8.24740712e-01 , -8.15325657e-01,   -8.18110732e-01))
print(PlayTicTacToe.signed_best_outcome(qv))

-0.824740712


In [119]:
def pqv(qv):
    print("[ %3.5f %3.5f %3.5f :: %3.5f %3.5f %3.5f :: %3.5f %3.5f %3.5f ]" % (qv[0],qv[1],qv[2],qv[3],qv[4],qv[5],qv[6],qv[7],qv[8]))

In [122]:
random.seed(42)
np.random.seed(42)
play = PlayTicTacToe()
play.forget_learning()
print(play.Q_Vals())

{}


In [105]:
print(pqv(play.Q_Vals()["-1000000000"]))
print(pqv(play.Q_Vals()["1000000000"]))
print(pqv(play.Q_Vals()["10000000-10"]))
print(pqv(play.Q_Vals()["-11000000-10"]))
print(pqv(play.Q_Vals()["110000-10-10"]))
print(pqv(play.Q_Vals()["-110100-10-10"]))
print(pqv(play.Q_Vals()["110100-1-1-10"]))

[ -0.81515 -0.81495 -0.81494 :: -0.81488 -0.81638 -0.81645 :: -0.81503 -0.81526 -0.81489 ]
None
[ -0.81502 -0.81498 -0.81517 :: -0.81493 -0.81510 -0.81494 :: -0.81490 -0.81513 -1.32390 ]
None
[ -0.85765 -0.85743 -0.85776 :: -0.85768 -0.85755 -0.85756 :: -0.85808 0.00000 -0.85769 ]
None
[ 0.00000 -0.90270 -0.81429 :: 0.00000 -0.81483 -0.85747 :: 0.00000 0.00000 -0.85687 ]
None
[ 0.00000 -0.90249 -0.85633 :: -0.90229 0.00000 0.00000 :: -0.90271 0.00000 0.00000 ]
None
[ 0.00000 -0.90246 0.00000 :: 0.00000 0.00000 0.00000 :: -0.85737 0.00000 0.00000 ]
None
[ 0.00000 1.00000 0.00000 :: 0.00000 -0.94985 0.00000 :: 0.00000 0.00000 -0.90250 ]
None


In [24]:
APE = PlayTicTacToe.all_possible_endings('1:8~-1:1~1:6~-1:3~1:7~-1:2~',False)
#APE['-1:8~1:1~-1:6~1:3~-1:7~1:2~']=0
QV = play.train_Q_values(len(APE),PlayTicTacToe.moves_to_dict(APE))
print(play.Q_Vals())

Player X : 4 : Play : 0.0% Draw : 0.0% Loss : 100.0% Win : 0.0% 
Player O : 4 : Play : 0.0% Draw : 0.0% Loss : 0.0% Win : 100.0% 
{'-1000000000': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]), '11010-1-10-10': array([ 0.  ,  0.95,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ]), '10000000-10': array([-0.00062359,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ]), '110000-10-10': array([ 0.       ,  0.       , -0.6378042,  0.       ,  0.       ,
        0.       ,  0.       ,  0.       ,  0.       ]), '1000000000': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]), '-1-10-1011010': array([ 0.        ,  0.96153846,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ]), '110100-1-1-10': array([ 0.        ,  0.96153846,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ]), '-110100-10-10': array([ 0.        ,  0.        ,  0.  

In [84]:
GI = {}
GR = {}
GD = {}
APE = {}
for i in (0,1):
    GI,GR,GD = play.play_many(1000)
    print(GR)
    #if(len(GI)>0):
    #    for key, value in GI.items():
    #        APE = PlayTicTacToe.all_possible_endings(key)
    #        if(len(APE) > 0):
    #            QV = play.train_Q_values(len(APE),PlayTicTacToe.moves_to_dict(APE))
    if(len(GR)>0):
        for key, value in GR.items():
            APE = PlayTicTacToe.all_possible_endings(key)
            if(len(APE) > 0):
                QV = play.train_Q_values(len(APE),PlayTicTacToe.moves_to_dict(APE))


0
100
200
300
400
500
600
700
800
900
Informed :651 : 65.0
Random :270 : 27.0
Draw :79 : 8.0
Diff Games :977
{'-1:2~1:6~-1:4~1:8~-1:5~1:1~-1:3~1:9~-1:7~': 1, '-1:5~1:2~-1:9~1:4~-1:6~1:1~-1:3~': 1, '-1:8~1:1~-1:7~1:4~-1:5~1:6~-1:2~': 1, '-1:1~1:9~-1:3~1:6~-1:8~1:2~-1:7~1:5~-1:4~': 1, '-1:3~1:4~-1:2~1:9~-1:1~': 1, '1:7~-1:5~1:2~-1:3~1:6~-1:9~1:4~-1:1~': 1, '-1:3~1:7~-1:1~1:6~-1:2~': 1, '-1:8~1:2~-1:1~1:4~-1:9~1:5~-1:7~': 1, '-1:9~1:2~-1:3~1:1~-1:4~1:8~-1:6~': 1, '-1:9~1:2~-1:1~1:4~-1:6~1:3~-1:5~': 1, '1:8~-1:5~1:6~-1:9~1:2~-1:1~': 1, '-1:3~1:4~-1:1~1:5~-1:2~': 1, '1:7~-1:3~1:6~-1:8~1:9~-1:1~1:4~-1:2~': 1, '1:8~-1:4~1:7~-1:9~1:1~-1:5~1:3~-1:6~': 1, '1:4~-1:1~1:6~-1:5~1:7~-1:3~1:9~-1:2~': 1, '1:3~-1:7~1:2~-1:1~1:9~-1:6~1:5~-1:4~': 1, '-1:9~1:3~-1:1~1:4~-1:7~1:6~-1:5~': 1, '-1:1~1:5~-1:6~1:3~-1:7~1:8~-1:4~': 1, '-1:1~1:2~-1:4~1:3~-1:7~': 1, '-1:2~1:4~-1:5~1:9~-1:8~': 1, '-1:5~1:4~-1:2~1:6~-1:1~1:9~-1:8~': 1, '-1:4~1:2~-1:6~1:7~-1:9~1:8~-1:3~': 1, '-1:4~1:5~-1:9~1:1~-1:8~1:2~-1:7~': 1, '1:4~

Player O : 4 : Play : 0.0% Draw : 0.0% Loss : 0.0% Win : 100.0% 
Player X : 1 : Play : 0.0% Draw : 0.0% Loss : 100.0% Win : 0.0% 
Player O : 1 : Play : 0.0% Draw : 0.0% Loss : 0.0% Win : 100.0% 
Player X : 1 : Play : 0.0% Draw : 0.0% Loss : 100.0% Win : 0.0% 
Player O : 1 : Play : 0.0% Draw : 0.0% Loss : 0.0% Win : 100.0% 
Player X : 1 : Play : 0.0% Draw : 0.0% Loss : 100.0% Win : 0.0% 
Player O : 1 : Play : 0.0% Draw : 0.0% Loss : 0.0% Win : 100.0% 
Player X : 1 : Play : 0.0% Draw : 0.0% Loss : 100.0% Win : 0.0% 
Player O : 1 : Play : 0.0% Draw : 0.0% Loss : 0.0% Win : 100.0% 
Player X : 2 : Play : 0.0% Draw : 0.0% Loss : 100.0% Win : 0.0% 
Player O : 2 : Play : 0.0% Draw : 0.0% Loss : 0.0% Win : 100.0% 
Player X : 2 : Play : 0.0% Draw : 0.0% Loss : 100.0% Win : 0.0% 
Player O : 2 : Play : 0.0% Draw : 0.0% Loss : 0.0% Win : 100.0% 
Player X : 4 : Play : 0.0% Draw : 0.0% Loss : 100.0% Win : 0.0% 
Player O : 4 : Play : 0.0% Draw : 0.0% Loss : 0.0% Win : 100.0% 
Player X : 4 : Play : 0.0

In [126]:
GI = {}
GR = {}
GD = {}
GI,GR,GD = play.play_many(100)


ba: -0.85781667664
ia: [ 0.         -0.85781204 -0.85781668 -0.85749161 -0.85741369 -0.85764168
 -0.85770281 -0.85773659 -0.85772602]
ba: -0.902485250583
ia: [  0.00000000e+00   0.00000000e+00   0.00000000e+00  -9.02485251e-01
  -9.00403716e-01   0.00000000e+00  -8.57373840e-01   0.00000000e+00
  -1.15484307e-04]
ba: -0.949820619335
ia: [ 0.          0.          0.          0.          0.          0.
 -0.94982062  0.          0.        ]
ba: 0.999859865471
ia: [ 0.         -0.94999998  0.          0.          0.99985987  0.          0.
  0.          0.        ]
0
ba: -0.857950889031
ia: [-0.85755264 -0.85795089 -0.85767858 -0.85768761 -0.8575607  -0.85781714
  0.         -0.85739608 -0.85787864]
ba: -0.857173671954
ia: [-0.85716884 -0.85717367  0.          0.          0.          0.          0.
  0.          0.        ]
ba: -0.949892412231
ia: [ 0.          0.          0.          0.          0.         -0.94989241
  0.          0.          0.        ]
ba: -1.90023924143
ia: [ 0.      

In [125]:
QV = play.train_Q_values_R(5000)
print(len(play.Q_Vals()))
QVV = play.Q_Vals()

Player X : 1000 : Play : 0.0% Draw : 12.0% Loss : 45.0% Win : 45.0% 
Player O : 1000 : Play : 0.0% Draw : 10.0% Loss : 44.0% Win : 44.0% 
Player X : 2000 : Play : 0.0% Draw : 12.0% Loss : 47.0% Win : 47.0% 
Player O : 2000 : Play : 0.0% Draw : 12.0% Loss : 42.0% Win : 42.0% 
Player X : 3000 : Play : 0.0% Draw : 12.0% Loss : 46.0% Win : 46.0% 
Player O : 3000 : Play : 0.0% Draw : 12.0% Loss : 42.0% Win : 42.0% 
Player X : 4000 : Play : 0.0% Draw : 12.0% Loss : 46.0% Win : 46.0% 
Player O : 4000 : Play : 0.0% Draw : 12.0% Loss : 42.0% Win : 42.0% 
Player X : 5000 : Play : 0.0% Draw : 13.0% Loss : 46.0% Win : 46.0% 
Player O : 5000 : Play : 0.0% Draw : 12.0% Loss : 42.0% Win : 42.0% 
8052


In [None]:
play.game().reset()
print((play.Q_Vals()))

In [21]:
def play_ml(playg):
    st = PlayTicTacToe.state(TicTacToe.player_X,playg.game().board())
    QV = playg.Q_Vals_for_state(st)
    mx = np.max(playg.Q_Vals_for_state(st))
    print(playg.game().board())
    print(st)
    print(QV)
    print((QV-mx))
    print((QV==mx)*mx)
    
    #mv = self.informed_move(st,True) # Random Player
    playg.game().move(playg.informed_move(st,False),TicTacToe.player_X)
    print(playg.game().board())

    return

def play_me(playg,mv):
    st = PlayTicTacToe.state(TicTacToe.player_X,playg.game().board())
    QV = playg.Q_Vals_for_state(st)
    mx = np.max(playg.Q_Vals_for_state(st))
    print(playg.game().board())
    playg.game().move(mv,TicTacToe.player_O)
    print(playg.game().board())

    return


In [56]:
play.game().reset()

In [63]:
play_ml(play)

[[-1  0  1]
 [ 0  0  1]
 [-1  1 -1]]
1-101001-11-1
[ 0.         -0.95001249  0.         -0.95001127 -0.95001025  0.          0.
  0.          0.        ]
[ 0.         -0.95001249  0.         -0.95001127 -0.95001025  0.          0.
  0.          0.        ]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.]
[[-1  0  1]
 [ 1  0  1]
 [-1  1 -1]]


In [64]:
play_me(play,1)

[[-1  0  1]
 [ 1  0  1]
 [-1  1 -1]]
[[-1  0  1]
 [ 1  0  1]
 [-1  1 -1]]


In [None]:
qv = 0
for key, value in play.Q_Vals().items():
    if(np.sum((value > 0)*1)>0):
        print(key)
        print(value)
        qv += value
print("-----")
print(qv)

In [None]:

-110100-10-10
[ 0.          0.          0.         -0.9025     -0.91041667  0.
 -0.91346154  0.         -0.90681818]
-----
[ 0.          0.          0.         -0.9025     -0.91041667  0.
 -0.91346154  0.         -0.90681818]