# Black Jack

In [1]:
class Card:
    
    def __init__(self, rank, face, value, suit):
        self.rank = rank
        self.face = face
        self.value = value
        self.suit = suit

    def get_rank(self):
        return self.rank
    
    def get_face(self):
        return self.face
    
    def get_value(self):
        return self.value
    
    def get_suit(self):
        return self.suit
        
    def __str__(self):
        return f"{self.value}"
    
    def __repr__(self):
        if self.face is not None:
            return f"{self.face} of {self.suit}, Value: {self.value}"
        return f"{self.rank} of {self.suit}, Value: {self.value}"

In [2]:
class Deck:
    
    def __init__(self):
        self.value = 11
        self.faces = {1: "Ace", 11: "Jack", 12: "Queen", 13: "King"}
        self.ranks = [i for i in range(1, 14)]
        self.suits = ["Hearts","Diamonds", "Clubs", "Spades"]
            
    def card(self):
        """
        Create and return card from infinite deck with replacement
        """
        import random as rn
        
        rank = rn.choice(self.ranks)
        face = None
        value = rank
        suit = rn.choice(self.suits)
        
        if rank == 1:
            value = self.value
            face = self.faces[rank]
        
        elif rank > 10:
            value = self.value - 1
            face = self.faces[rank]
        
        return Card(rank, face, value, suit)

In [3]:
class Dealer:
    
    def __init__(self):
        self.deck = Deck()
        self.hand = self.deal()
        self.ace_positions = []
        self.current_score = 0
        self.ace_in_hand = False
        self.soft = True
        self.player_wins = False
        self.player_busts = False
        self.player_sticks = False
        self.player_loses = self.player_busts == True
        
    def deal(self):
        """
        First deal of game, returns agent's hand
        """
        # deal dealer's hand
        self.hand = [self.deck.card() for i in range(2)]
        
        # deal agent's hand
        player = [self.deck.card() for i in range(2)]
        
        return player
    
    def get_hand(self):
        """
        Return's dealer's hand
        """
        
        return self.hand
        
    def show(self):
        """
        Returns dealer's show card
        """
        
        return self.hand[0]
    
    def reveal(self):
        """
        Reveals hidden card
        """
        
        return self.hand[1]
    
    def reset(self):
        """
        Resets hand after game
        """
        
        self.hand.clear()
    
    def hit(self):
        """
        Hit and deal card to self
        """
        
        self.hand.append(self.deck.card())
        
    def stick(self):
        """
        Returns boolean to stay
        """
        
        return self.score() >= 17
        
    def hit_player(self):
        """
        Deal one card to player
        """
        
        return self.deck.card()
    
    def useable(self):
        """
        Return whether ace is useable
        """
        return self.soft and self.current_score < 17
    
    def score(self):
        """
        Calculate dealer's card sum
        """
        
        # reset to sum hand
        total = 0
        
        # iterate through hand
        for i in range(len(self.hand)):
            
            # sum hand
            total += self.hand[i].get_value()
            
            # check if ace in hand
            if self.hand[i].get_value() == 11:
                
                # ace found
                self.ace_in_hand = True
                
                # keep track of position in hand
                self.ace_positions.append(i)
                
        # keep track of running total
        self.current_score = total
        
        # ace makes sum too large
        if self.ace_in_hand and self.useable() and total > 21:
            
            # check if aces left
            if len(self.ace_positions) > 0:
                
                # change ace value from 11 to 1
                self.hand[self.ace_positions.pop()].value = 1
            
                # no lonrger soft/useable
                self.soft = False
            
        return total
    
    def strategy(self):
        """
        Dealer's game strategy
        """
        
        # player wins return to start a new episode
        if self.player_wins:
            return False
        
        # dealer busts return to start a new episode
        if self.score() > 21:
            self.player_wins = True
            return False
        
        # dealer's turn
        elif self.player_sticks:
            
            # hand sum 17 - 20
            if self.stick():
                return False
            
            # ace is useable/soft
            elif self.useable():
                return True
            
            # otherwise hit until at least 17
            elif self.score() < 17:
                return True


In [4]:
class Agent:
    
    def __init__(self):
        self.hand = []
        self.ace_in_hand = False
        self.useable = False
        self.action = 0
        self.ace_positions = []
        self.policy = self.create_policy()
        
    def reset(self):
        """
        Reset hand after game
        """
        self.hand.clear()
        
    def create_policy(self):
        """
        Creates agent's arbitrary policy 
        """
        
        # tabular solutions method
        policy = {}
        
        # agent hit for all sums under 20
        for hand in range(12, 20):
          
            # create policy
            for show in range(2, 12):
                
                # Non-Useable ace
                policy[(hand, show, False)] = 1
                
                # Useable ace
                policy[(hand, show, True)] = 1
                
                # Stick policy
                if hand >= 19:
                    
                    # Non-Useable ace
                    policy[(20, show, False)] = 0
                
                    # Useable ace
                    policy[(20, show, True)] = 0
                    
                    # Non-Useable ace
                    policy[(21, show, False)] = 0
                
                    # Useable ace
                    policy[(21, show, True)] = 0
                    
                    # Non-Useable ace
                    policy[(22, show, False)] = 0
                
                    # Useable ace
                    policy[(22, show, True)] = 0
            
        return policy
    
    def get_policy(self):
        """
        Returns agent's policy
        """
        
        return self.policy
    
    def get_hand(self):
        """
        Returns agent's hand
        """
        
        return self.hand
    
    def set_hand(self, hand):
        """
        Dealer's initial deal
        """
        
        self.hand = hand
    
    def hit(self, card):
        """
        Add's hit card to hand
        """
        
        self.hand.append(card)
    
    def score(self):
        """
        Calculate and return agent's sum of cards
        """
        
        # reset to keep track of current sum 
        total = 0
        
        # sum hand and keep track of aces
        for i in range(len(self.hand)):
            
            # sum hand
            total += self.hand[i].get_value()
            
            # track aces in hand
            if self.hand[i].get_value() == 11:
                
                # ace found
                self.ace_in_hand = True
                self.useable = True
                
                # keep track of position in hand
                self.ace_positions.append(i)
        
        # utility condition for agent's policy
        if self.ace_in_hand and self.useable and total > 21:
            
            # check if aces left
            if len(self.ace_positions) > 0:
                
                # change ace value from 11 to 1
                self.hand[self.ace_positions.pop(0)].value = 1
            
                # ace is no longer useable
                self.useable = False
              
                
        return total
    
    def is_useable(self):
        """
        Checks and returns if Ace is useable
        """
        
        return self.ace_in_hand and self.useable
    
    def decision(self, score, show_card, useable):
        """
        Returns agent's decision: score, show card 
        """
        
        return self.policy[(score, show_card, useable)]

In [5]:
class Game:
    
    def __init__(self):
        self.player_turn = True
        self.dealer_turn = False
        self.game_over = False
        self.episode = []
        self.reward = 0
        self.score = 0
        self.start = True
        self.win = 21        
        self.agent = Agent()
        self.dealer = Dealer()
        
    def on(self, initial_state = None):
        """
        Game Environment
        Returns Episode: (state, action, reward)
        """
        
        # GAME LOOP
        while not self.game_over:
            
            # initial start of game
            if self.start:
                
                # MC First Visit - first deal
                self.agent.set_hand(self.dealer.deal())
                
                # Check for MC Exploration Starts
                if initial_state is not None:
                    
                    # first deal
                    self.agent.set_hand(initial_state)

                # Check if natural
                if self.agent.score() == self.win:
                    
                    # GAME OVER
                    self.dealer.player_wins = True
                    
                    # return episode
                    self.episode.append(((self.agent.score(), self.dealer.show().get_value(), self.agent.is_useable()), self.agent.action, self.reward))
                    return self.episode
                
                # game in play
                self.start = False
                
            # Agent's turn
            if self.player_turn:
                
                # keep track of current score
                self.score = self.agent.score()
                
                # Check if 21 reached
                if self.score == self.win:
                    
                    # Agent's turn over
                    self.player_turn = False
                    
                    # Dealer's turn
                    self.dealer_turn = True
                    
                # check if agent busts
                elif self.score > self.win:
                    
                    # Agent's turn over
                    self.player_turn = False
                    
                    # Dealer's turn
                    self.dealer_turn = False
                    
                    # Agent loses
                    self.dealer.player_busts = True
                    
                    # Game Over
                    self.game_over = True
                    
                    # set score to 22 for bust
                    self.score = 22
                    
                    # Consequence to increase regret
                    self.reward = -1
                
                # check agent sticks
                elif self.agent.decision(self.score, self.dealer.show().get_value(), self.agent.is_useable()) == 0:
                    
                    # Agent's turn over
                    self.player_turn = False; self.dealer.player_sticks = True
                    
                    # Dealer's turn
                    self.dealer_turn = True
                    
                    # set action
                    self.agent.action = 0
                    
                # otherwise hit
                elif self.agent.decision(self.score, self.dealer.show().get_value(), self.agent.is_useable()) == 1:
                    
                    # add card to hand
                    self.agent.hit(self.dealer.hit_player())
                    
                    # set action
                    self.agent.action = 1
                
                # Add episode
                if self.score >= 12 and self.score < 22:
                    
                    # Episode following pi: S0, A0, R1, ..., St-1, At-1, Rt
                    self.episode.append(((self.score, self.dealer.show().get_value(), self.agent.is_useable()), self.agent.action, self.reward))
                
            # Dealer's turn
            if self.dealer_turn and not self.game_over:
                
                # keep track of dealers score
                dealer_score = self.dealer.score()
                
                # Dealer's strategy
                if self.dealer.strategy():
                    self.dealer.hit()
                    
                # Dealer's turn over
                else:
                    
                    # Dealer's turn
                    self.dealer_turn = False
                    
                    # game ends
                    self.game_over = True
                    
                    # if dealer busts
                    if dealer_score > self.win:
                        
                        # reset for terminal state
                        dealer_score = 0
                    
                    # State
                    state = (self.score, self.dealer.show().get_value(), self.agent.is_useable())
                    
                    # Reward 
                    self.reward = self.terminal(dealer_score, self.score)
                    
                    # Episode following pi: S0, A0, R1, ..., St-1, At-1, Rt 
                    self.episode.append((state, self.agent.action, self.reward))
                    
            # GAME OVER
            if self.game_over:

                # return episode
                return self.episode[::-1]
            
        
    def terminal(self, dealer_score, agent_score):
        """
        Terminal state
        Returns reward: [-1, 0, 1]
        """
        
        # Lose
        if dealer_score > agent_score or agent_score > self.win:
            return -1
        
        # Draw
        if dealer_score == agent_score:
            return 0
        
        # Win
        if dealer_score < agent_score:
            return 1


In [6]:
class MonteCarlo:
    
    def __init__(self):
        self.game = Game()
        self.policy = self.game.agent.get_policy()
        self.V = {}
        self.Returns = {}
        self.Q = {}
        self.gamma = 1.0
        self.epsilon = 0.01
        self.probability = {}
        self.appears = []
        self.G = 0
        self.S = 0
        self.A = 1
        self.R = 2
        self.W = 1
        
    def average(self, returns):
        """
        Returns average(Returns(St, At))
        """
        
        return sum(returns)/len(returns)
    
    def first_visit(self, episodes):
        """
        First-visit MC prediction, for estimating V ⇡ v⇡
        """
        
        #########
        # Input #
        #########
        
        # Input Policy
        self.policy = self.game.agent.get_policy()
        
        ##############
        # Initialize #
        ##############
        
        # Initialize V(s) and Returns(s) arbitrarily
        for state in self.policy: 
        
            # State-Value Function V(s)
            self.V[state] = 0.0

            # Returns(St)
            self.Returns[state] = []
        
        ################
        # Loop Forever #
        ################
        
        # Loop for each episode
        for i in range(episodes):
            
            # Generate an episode following p⇡: S0, A0, R1, ... ,ST-1, AT-1, RT
            self.game = Game(); episode = self.game.on()
            
            # G <- 0
            self.G = 0
            
            # Loop for each step of episode, t = T-1, T-2, ..., 0:
            for step in episode:
                
                # G <- Gamma*G + Rt+1
                self.G = self.gamma*self.G + step[self.R]
                
                if step[self.S] not in self.appears:
              
                    # Append G to Returns(St)
                    self.Returns[step[self.S]].append(self.G)

                    # V(St) <- average(Returns(St))
                    self.V[step[self.S]] = self.average(self.Returns[step[self.S]])
                    
                    # state appeared
                    self.appears.append(step[self.S])

        # V = v_pi
        return self.V
    
    
    def exploration_starts(self, episodes):
        """
        Monte Carlo ES (Exploring Starts), for estimating Pi = pi*
        """
        
        ##############
        # Initialize #
        ##############
          
        # arbitrary policy
        policy = self.policy.copy()
        
        # Initialize Q(s,a) and Returns(s,a) arbitrarily
        for state in policy: 
        
            # State-Action-Value Function Q(s,a)
            self.Q[(state, 0)] = 0.0
            self.Q[(state, 1)] = 0.0

            # Returns(s,a)
            self.Returns[(state, 0)] = []
            self.Returns[(state, 1)] = []
        
        ################
        # Loop Forever #
        ################
        
        # Loop for each episode
        for i in range(episodes):
            
            # Generate an episode following p⇡: S0, A0, R1, ... ,ST-1, AT-1, RT
            self.game = Game(); episode = self.game.on(initial_state = self.game.dealer.deal())
            
            # G <- 0
            self.G = 0
            
            # Loop for each step of episode, t = T-1, T-2, ..., 0:
            for step in episode:
                
                # G <- Gamma*G + Rt+1
                self.G = self.gamma*self.G + step[self.R]
              
                # Append G to Returns(St, At)
                self.Returns[(step[self.S], step[self.A])].append(self.G)

                # Q(St, At) <- average(Returns(St, At))
                self.Q[(step[self.S], step[self.A])] = self.average(self.Returns[(step[self.S], step[self.A])])
                
                # pi⇡(St) argmax a Q(St, a)
                self.policy[step[self.S]] = [state[1] for state, value in self.Q.items() if value == max(self.Q[step[self.S], 0], self.Q[step[self.S], 1])][0]
                
        # Pi = pi*
        return self.policy
    
    def on_policy(self):
        """
        On-policy first-visit MC control (for "e-soft policies)
        """
        
        ##############
        # Initialize #
        ##############
          
        # arbitrary policy
        policy = self.policy.copy()
        
        # Initialize Q(s,a) and Returns(s,a) arbitrarily
        for state in policy: 
        
            # State-Action-Value Function Q(s,a)
            self.Q[(state, 0)] = 0.0
            self.Q[(state, 1)] = 0.0

            # Returns(s,a)
            self.Returns[(state, 0)] = []
            self.Returns[(state, 1)] = []
        
        ################
        # Loop Forever #
        ################
        
        # Loop for each episode
        for i in range(episodes):
            
            # Generate an episode following p⇡: S0, A0, R1, ... ,ST-1, AT-1, RT
            self.game = Game(); episode = self.game.on()
            
            # G <- 0
            self.G = 0
            
            # Loop for each step of episode, t = T-1, T-2, ..., 0:
            for step in episode:
                
                # G <- Gamma*G + Rt+1
                self.G = self.gamma*self.G + step[self.R]
              
                # Append G to Returns(St, At)
                self.Returns[(step[self.S], step[self.A])].append(self.G)

                # Q(St, At) <- average(Returns(St, At))
                self.Q[(step[self.S], step[self.A])] = self.average(self.Returns[(step[self.S], step[self.A])])
                
                # pi⇡(St) argmax a Q(St, a)
                A = [state[1] for state, value in self.Q.items() if value == max(self.Q[step[self.S], 0], self.Q[step[self.S], 1])][0]
                
                # For all a of A(St):
                for a in self.policy.values():
                    
                    # pi(a|St) <-- 1 - e + e/|A(St)| if a = A* 
                    self.probability[step[self.S]] = 1 - self.epsilon + self.epsilon/len(list(self.policy.values()))
                    
                    # if a != A*
                    if a != A:
                        
                        # pi(a|St) <-- e/|A(St)|
                        self.probability[step[self.S]] = self.epsilon/len(list(self.policy.values()))
                    
        # Pi = pi*
        return self.policy
    
    def off_policy(self):
        """
        Off-policy MC prediction (policy evaluation) for estimating Q = q pi
        """
        
        ##############
        # Initialize #
        ##############
          
        # arbitrary policy
        policy = self.policy.copy()
        
        # Initialize Q(s,a) and Returns(s,a) arbitrarily
        for state in policy: 
        
            # State-Action-Value Function Q(s,a)
            self.Q[(state, 0)] = 0.0
            self.Q[(state, 1)] = 0.0

            # Returns(s,a)
            self.C[(state, 0)] = 0
            self.C[(state, 1)] = 0
        
        ################
        # Loop Forever #
        ################
        
        # Loop for each episode
        for i in range(episodes):
            
            # Generate an episode following p⇡: S0, A0, R1, ... ,ST-1, AT-1, RT
            self.game = Game(); episode = self.game.on()
            
            # G <- 0
            self.G = 0
            
            # W <- 0
            self.W = 1
            
            # Loop for each step of episode, t = T-1, T-2, ..., 0:
            for step in episode:
                
                # while W != 0
                while self.W != 0:
                    
                    # G <- Gamma*G + Rt+1
                    self.G = self.gamma*self.G + step[self.R]

                    # C(St, At) <-- C(St, At) + W
                    self.C[(step[self.S], step[self.A])] += self.W

                    # Q(St, At) <-- Q(St, At) + W/C(St, At)[G - Q(St, At)]  
                    self.Q[(step[self.S], step[self.A])] = (self.W/self.C[(step[self.S], step[self.A])])*(self.G - self.Q[(step[self.S], step[self.A])])

                    # W <-- W pi(a|s)/b(a|s)
                    self.W *= (self.probability[step[self.S]]/0.5)
        
        # return Q estimate of q-pi
        return self.Q
    
    def off_control(self):
        """
        Off-policy MC control, for estimating pi = p⇡*
        """
        
        ##############
        # Initialize #
        ##############
          
        # pi(s) argmax a Q(s, a) (with ties broken consistently)
        policy = self.policy.copy()
        
        # Initialize Q(s,a) and Returns(s,a) arbitrarily
        for state in policy: 
        
            # State-Action-Value Function Q(s,a)
            self.Q[(state, 0)] = 0.0
            self.Q[(state, 1)] = 0.0

            # Returns(s,a)
            self.C[(state, 0)] = 0
            self.C[(state, 1)] = 0
        
        ################
        # Loop Forever #
        ################
        
        # Loop for each episode
        for i in range(episodes):
            
            # Generate an episode following p⇡: S0, A0, R1, ... ,ST-1, AT-1, RT
            self.game = Game(); episode = self.game.on()
            
            # G <- 0
            self.G = 0
            
            # W <- 0
            self.W = 1
            
            # Loop for each step of episode, t = T-1, T-2, ..., 0:
            for step in episode:
 
                # G <- Gamma*G + Rt+1
                self.G = self.gamma*self.G + step[self.R]

                # C(St, At) <-- C(St, At) + W
                self.C[(step[self.S], step[self.A])] += self.W

                # Q(St, At) <-- Q(St, At) + W/C(St, At)[G - Q(St, At)]  
                self.Q[(step[self.S], step[self.A])] = (self.W/self.C[(step[self.S], step[self.A])])*(self.G - self.Q[(step[self.S], step[self.A])])

                # pi(St) argmax a Q(St, a) (with ties broken consistently)
                self.policy[step[self.S]] = [state[1] for state, value in self.Q.items() if value == max(self.Q[step[self.S], 0], self.Q[step[self.S], 1])][0]

                # If At != pi(St) 
                if step[self.A] != self.policy[step[self.S]]:
                    
                    # then exit inner Loop (proceed to next episode)
                    break

                # W <-- W pi(a|s)/b(a|s)
                self.W *= (1/0.5)
        
        # return Q estimate of q-pi
        return self.Q

In [7]:
if __name__ == "__main__":
    import pprint
    
    # create terminal printer instance
    pp = pprint.PrettyPrinter(width=160, compact=True)
    
    # Monte Carlo Algorithms
    mc = MonteCarlo()
    game = Game()
    
    print("First Visit Episodes: 10,000")
    pp.pprint(mc.first_visit(10000))
    pp.pprint(game.on())
    

First Visit Episodes: 10,000


KeyError: (9, 4, False)