In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from first_visit_MC import first_visit_mc_policy_evaluation

# Blackjack

The object of the popular casino card game of *blackjack* is to obtain cards the sum of whose numerical values is as great as possible without exceeding 21. All face cards count as 10, and an ace can count either as 1 or as 11. The game begins with two cards dealt to both dealer and player. One of the dealer’s cards is face up and the other is face down. If the player has 21 immediately (an ace and a 10-card), it is called a natural. He then wins unless the dealer also has a natural, in which case the game is a draw. If the player does not have a natural, then he can request additional cards, one by one (hits), until he either stops (sticks) or exceeds 21 (goes bust). If he goes bust, he loses; if he sticks, then it becomes the dealer’s turn. The dealer hits or sticks according to a fixed strategy without choice: he sticks on any sum of 17 or greater, and hits otherwise. If the dealer goes bust, then the player wins; otherwise, the outcome—win, lose, or draw—is determined by whose final sum is closer to 21.

Playing blackjack is naturally formulated as an episodic finite MDP. Each game of blackjack is an episode. Rewards of +1, -1, and 0 are given for winning, losing, and drawing, respectively. All rewards within a game are zero, and we do not discount ($\gamma = 1$); therefore these terminal rewards are also the returns. The player’s actions are to hit or to stick. The states depend on the player’s cards and the dealer’s showing card. We assume that cards are dealt from an infinite deck (i.e., with replacement) so that there is no advantage to keeping track of the cards already dealt. If the player holds an ace that he could count as 11 without going bust, then the ace is said to be usable. In this case it is always counted as 11 because counting it as 1 would make the sum 11 or less, in which case there is no decision to be made because, obviously, the player should always hit. Thus, the player makes decisions on the basis of three variables: his current sum (12–21), the dealer’s one showing card (ace–10), and whether or not he holds a usable ace.

Consider the policy that sticks if the player’s sum is 20 or 21, and otherwise hits.

In [2]:
# Enumerate every decision state, then attach the fixed policy to each one.
# A state is (player's sum 12-21, dealer's showing card 1-10, usable-ace flag);
# the order is player sum, then dealer card, then True before False.
states = [
    (player_total, showing_card, has_usable_ace)
    for player_total in range(12, 22)
    for showing_card in range(1, 11)
    for has_usable_ace in (True, False)
]

ACTIONS = ['hits', 'sticks']

# Stick (ACTIONS[1]) when the player's sum is 20 or 21, otherwise hit (ACTIONS[0]).
policy = {
    state: (ACTIONS[1] if state[0] >= 20 else ACTIONS[0])
    for state in states
}

In [3]:
# Create the function to generate an episode: a list of (state, action, reward)
def generate_episode_blackjack(policy: dict, rng=None) -> list:
    """
    Simulate one game of blackjack and record it as an episode.

    Cards are drawn with replacement (ranks 1-13, face cards count as 10).
    The player is dealt cards automatically until his sum is at least 12,
    then follows ``policy``. Once he sticks, the dealer draws until reaching
    17 or more and the game is scored.

    Args:
        policy: mapping from a state ``(player_sum, dealer_showing,
            usable_ace)`` to ``'hits'`` or ``'sticks'``. Any action other
            than ``'hits'`` is treated as sticking. A state missing from the
            mapping falls back to "stick on 20 or more, otherwise hit".
        rng: optional random source exposing ``integers(low, high)``, for
            example ``np.random.default_rng(seed)``. Pass one to make the
            episode reproducible. When ``None`` (the default) the global
            ``np.random.randint`` is used.

    Returns:
        episode: a list of (state, action, reward) tuples. Every decision
        made by the player is recorded with reward 0. The last tuple has
        action ``None`` and carries the outcome of the game (+1 win,
        -1 loss, 0 draw) together with the player's final state, whose sum
        exceeds 21 when he went bust.
    """
    hit, stick = 'hits', 'sticks'

    def draw_card() -> int:
        # Ranks 1..13 are equally likely; 11-13 are face cards worth 10.
        rank = np.random.randint(1, 14) if rng is None else rng.integers(1, 14)
        return min(int(rank), 10)

    def draw_hand() -> list:
        return [draw_card(), draw_card()]

    def usable_ace(hand: list) -> bool:
        # An ace (stored as 1) is usable when counting it as 11 does not bust.
        return 1 in hand and sum(hand) + 10 <= 21

    def sum_hand(hand: list) -> int:
        return (sum(hand) + 10) if usable_ace(hand) else sum(hand)

    def is_bust(hand: list) -> bool:
        return sum_hand(hand) > 21

    def is_natural(hand: list) -> bool:
        # A natural is exactly two cards: an ace and a 10-valued card.
        return sorted(hand) == [1, 10]

    def observe() -> tuple:
        return (sum_hand(player_hand), dealer_showing, usable_ace(player_hand))

    # Deal: the player's two cards first, then the dealer's; the dealer's
    # first card is the one shown to the player.
    player_hand = draw_hand()
    dealer_hand = draw_hand()
    dealer_showing = dealer_hand[0]

    episode = []

    # Below 12 a hit can never bust, so those draws are not decisions and
    # are not recorded in the episode.
    while sum_hand(player_hand) < 12:
        player_hand.append(draw_card())

    # Player's turn.
    while True:
        state = observe()
        fallback_action = stick if state[0] >= 20 else hit
        action = policy.get(state, fallback_action)
        episode.append((state, action, 0))

        if action != hit:
            break

        player_hand.append(draw_card())
        if is_bust(player_hand):
            episode.append((observe(), None, -1))
            return episode

    # Dealer's turn: fixed strategy, hit until the sum is 17 or greater.
    while sum_hand(dealer_hand) < 17:
        dealer_hand.append(draw_card())

    player_total = sum_hand(player_hand)
    dealer_total = sum_hand(dealer_hand)

    if dealer_total > 21:
        reward = 1
    elif is_natural(player_hand) and not is_natural(dealer_hand):
        # A natural beats any dealer hand that is not itself a natural,
        # including a 21 built from three or more cards.
        reward = 1
    elif player_total > dealer_total:
        reward = 1
    elif player_total < dealer_total:
        reward = -1
    else:
        reward = 0

    episode.append((observe(), None, reward))
    return episode

In [7]:
# Run the Blackjack lab: evaluate the fixed policy from 1000 simulated games.
# gamma=1 leaves returns undiscounted; the call's return value is shown as
# the cell output.
first_visit_mc_policy_evaluation(
    states=states,
    policy=policy,
    generate_episode_fn=generate_episode_blackjack,
    gamma=1,
    num_episodes=1000,
)

{(12, 1, True): 0.0,
 (12, 1, False): np.float64(-0.5454545454545454),
 (12, 2, True): 0.0,
 (12, 2, False): np.float64(-0.4),
 (12, 3, True): 0.0,
 (12, 3, False): np.float64(-0.25),
 (12, 4, True): 0.0,
 (12, 4, False): np.float64(-0.45454545454545453),
 (12, 5, True): 0.0,
 (12, 5, False): np.float64(-0.6363636363636364),
 (12, 6, True): np.float64(-1.0),
 (12, 6, False): np.float64(-0.5555555555555556),
 (12, 7, True): 0.0,
 (12, 7, False): np.float64(-0.75),
 (12, 8, True): 0.0,
 (12, 8, False): np.float64(-0.3333333333333333),
 (12, 9, True): np.float64(1.0),
 (12, 9, False): np.float64(0.0),
 (12, 10, True): np.float64(-1.0),
 (12, 10, False): np.float64(-0.7631578947368421),
 (13, 1, True): np.float64(0.0),
 (13, 1, False): np.float64(-0.8181818181818182),
 (13, 2, True): np.float64(1.0),
 (13, 2, False): np.float64(-0.45454545454545453),
 (13, 3, True): np.float64(0.0),
 (13, 3, False): np.float64(-0.5555555555555556),
 (13, 4, True): np.float64(1.0),
 (13, 4, False): np.float