In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from MC_methods import first_visit_mc_policy_evaluation

## Blackjack

The object of the popular casino card game of *blackjack* is to obtain cards the sum of whose numerical values is as great as possible without exceeding 21. All face cards count as 10, and an ace can count either as 1 or as 11. The game begins with two cards dealt to both dealer and player. One of the dealer’s cards is face up and the other is face down. If the player has 21 immediately (an ace and a 10-card), it is called a natural. He then wins unless the dealer also has a natural, in which case the game is a draw. If the player does not have a natural, then he can request additional cards, one by one (hits), until he either stops (sticks) or exceeds 21 (goes bust). If he goes bust, he loses; if he sticks, then it becomes the dealer’s turn. The dealer hits or sticks according to a fixed strategy without choice: he sticks on any sum of 17 or greater, and hits otherwise. If the dealer goes bust, then the player wins; otherwise, the outcome—win, lose, or draw—is determined by whose final sum is closer to 21.

Playing blackjack is naturally formulated as an episodic finite MDP. Each game of blackjack is an episode. Rewards of +1, -1, and 0 are given for winning, losing, and drawing, respectively. All rewards within a game are zero, and we do not discount ($\gamma$ = 1); therefore these terminal rewards are also the returns. The player’s actions are to hit or to stick. The states depend on the player’s cards and the dealer’s showing card. We assume that cards are dealt from an infinite deck (i.e., with replacement) so that there is no advantage to keeping track of the cards already dealt. If the player holds an ace that he could count as 11 without going bust, then the ace is said to be usable. In this case it is always counted as 11 because counting it as 1 would make the sum 11 or less, in which case there is no decision to be made because, obviously, the player should always hit. Thus, the player makes decisions on the basis of three variables: his current sum (12–21), the dealer’s one showing card (ace–10), and whether or not he holds a usable ace.

Consider the policy that sticks if the player’s sum is 20 or 21, and otherwise hits.

In [2]:
# Enumerate every state as (player_sum, dealer_showing_card, usable_ace):
# player sums 12-21, dealer cards ace(1)-10, with and without a usable ace.
states = [
    (player_total, dealer_card, has_usable_ace)
    for player_total in range(12, 22)
    for dealer_card in range(1, 11)
    for has_usable_ace in (True, False)
]

ACTIONS = ['hits', 'sticks']

# Fixed policy under evaluation: stick on a sum of 20 or 21, hit otherwise.
policy = {
    state: (ACTIONS[1] if state[0] >= 20 else ACTIONS[0])
    for state in states
}

In [None]:
# Create the function to generate an episode: a list of (state, action, reward)
def generate_episode_blackjack(policy: dict):
    """
    Simulate one game of blackjack in which the player follows ``policy``.

    Cards are drawn with replacement (infinite deck). An ace is stored in a
    hand as 1 and is counted as 11 whenever doing so does not bust the hand
    (a "usable" ace).

    Args:
        policy: maps a state ``(player_sum, dealer_showing, usable_ace)`` to
            ``'hits'`` or ``'sticks'``. For a state missing from ``policy``
            the player sticks on 20 or 21 and hits otherwise.

    Returns:
        episode: a list of (state, action, reward) tuples. Every decision
        step is recorded with reward 0; the last entry is
        ``(final_state, None, reward)`` where ``reward`` is +1 (win),
        -1 (loss or bust) or 0 (draw). After a bust the final state carries
        a player sum above 21.
    """

    def draw_card():
        # 1 = ace, 2-10 = pip cards, 11-13 = face cards, which count as 10.
        card = np.random.randint(1, 14)
        return min(card, 10)

    def draw_hand():
        return [draw_card(), draw_card()]

    def usable_ace(hand: list):
        # The ace is already counted as 1 in sum(hand), so promoting it to 11
        # adds 10 (not 11) to the total.
        return 1 in hand and sum(hand) + 10 <= 21

    def sum_hand(hand: list):
        total = sum(hand)
        if usable_ace(hand):
            return total + 10
        return total

    def is_bust(hand):
        return sum_hand(hand) > 21

    def dealer_policy(dealer_hand):
        # Fixed dealer strategy: hit below 17, stick on 17 or more.
        # Appends the drawn cards to dealer_hand in place.
        while sum_hand(dealer_hand) < 17:
            dealer_hand.append(draw_card())
        return dealer_hand

    # Initialize the game
    player_hand = draw_hand()
    dealer_hand = draw_hand()
    dealer_showing = dealer_hand[0]

    episode = []

    # Below 12 hitting can never bust, so there is no decision to record.
    while sum_hand(player_hand) < 12:
        player_hand.append(draw_card())

    while True:
        player_sum = sum_hand(player_hand)
        usable = usable_ace(player_hand)
        state = (player_sum, dealer_showing, usable)

        action = policy[state] if state in policy else ('sticks' if player_sum >= 20 else 'hits')

        episode.append((state, action, 0))

        if action == 'hits':
            player_hand.append(draw_card())
            if is_bust(player_hand):
                episode.append(((sum_hand(player_hand), dealer_showing, usable_ace(player_hand)), None, -1))
                return episode

        else:
            break

    # The player stuck: the dealer plays out their hand, then totals are compared.
    dealer_hand = dealer_policy(dealer_hand)

    if is_bust(dealer_hand):
        reward = 1
    else:
        player_total = sum_hand(player_hand)
        dealer_total = sum_hand(dealer_hand)
        if player_total > dealer_total:
            reward = 1
        elif player_total < dealer_total:
            reward = -1
        else:
            reward = 0

    episode.append(((sum_hand(player_hand), dealer_showing, usable_ace(player_hand)), None, reward))
    return episode

In [4]:
# Run the policy evaluation on the Blackjack example.
# gamma=1 because the task is undiscounted (see the problem description above).
# NOTE(review): episodes are sampled through np.random and no seed is set in
# the visible cells, so the estimates differ between runs. With only 1000
# episodes some of the 200 states are presumably never visited -- consider a
# larger num_episodes.
first_visit_mc_policy_evaluation(states=states,
                                 policy=policy,
                                 generate_episode_fn=generate_episode_blackjack,
                                 gamma=1,
                                 num_episodes=1000)

{(12, 1, True): 0.0,
 (12, 1, False): np.float64(-0.6923076923076923),
 (12, 2, True): 0.0,
 (12, 2, False): np.float64(-0.6923076923076923),
 (12, 3, True): 0.0,
 (12, 3, False): np.float64(-0.1111111111111111),
 (12, 4, True): 0.0,
 (12, 4, False): np.float64(-0.36363636363636365),
 (12, 5, True): 0.0,
 (12, 5, False): np.float64(-1.0),
 (12, 6, True): 0.0,
 (12, 6, False): np.float64(-0.6666666666666666),
 (12, 7, True): np.float64(-1.0),
 (12, 7, False): np.float64(-1.0),
 (12, 8, True): 0.0,
 (12, 8, False): np.float64(-0.7777777777777778),
 (12, 9, True): 0.0,
 (12, 9, False): np.float64(-0.8571428571428571),
 (12, 10, True): np.float64(-1.0),
 (12, 10, False): np.float64(-0.34210526315789475),
 (13, 1, True): np.float64(0.0),
 (13, 1, False): np.float64(-0.5833333333333334),
 (13, 2, True): np.float64(-1.0),
 (13, 2, False): np.float64(-0.6666666666666666),
 (13, 3, True): np.float64(-1.0),
 (13, 3, False): np.float64(-0.5454545454545454),
 (13, 4, True): np.float64(-1.0),
 (13,

## Blackjack ES

We now want to seek the optimal policy.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from MC_methods import mc_control_es

In [2]:
# Build the list of states (player_sum, dealer_showing_card, usable_ace).
states = [
    (player_total, dealer_card, has_usable_ace)
    for player_total in range(12, 22)
    for dealer_card in range(1, 11)
    for has_usable_ace in (True, False)
]
len(states)

200

In [3]:
# Reduced state space that drops the usable-ace flag: (player_sum, dealer_showing_card).
states_without_usable_ace = [
    (player_total, dealer_card)
    for player_total in range(12, 22)
    for dealer_card in range(1, 11)
]
len(states_without_usable_ace)

100

In [4]:
# Build the action dictionary: every state offers the same two actions.
# All keys share the single ACTIONS list object.
ACTIONS = ['hits', 'sticks']
actions = dict.fromkeys(states, ACTIONS)
len(actions)

200

In [5]:
# Build the action dictionary for the states without the usable-ace flag.
# All keys share the single ACTIONS list object.
actions_without_usable_ace = dict.fromkeys(states_without_usable_ace, ACTIONS)
len(actions_without_usable_ace)

100

In [27]:
# Episode generator for Monte Carlo control with exploring starts
def generate_episode_es_blackjack(initial_state: tuple,
                                  initial_action: str,
                                  policy: dict):
    """
    Simulate one blackjack episode from a given starting state-action pair.

    The first step takes ``initial_action`` in ``initial_state``; every later
    step follows ``policy``. Cards are drawn with replacement.

    Args:
        initial_state: (player_sum, showing_dealer_card, usable_ace)
        initial_action: 'hits' or 'sticks'
        policy: dict mapping each state to 'hits' or 'sticks'

    Returns:
        A list of tuples [(s0, a0, r1), (s1, a1, r2), ..., (sT-1, aT-1, rT)].
        Every reward is 0 except the last one, which is +1 (win), -1 (loss or
        bust) or 0 (draw).

    Raises:
        ValueError: if an action is neither 'hits' nor 'sticks'.
    """

    def draw_card():
        # 1 = ace, 2-10 = pip cards, 11-13 = face cards, which count as 10.
        return min(np.random.randint(1, 14), 10)

    def play_dealer(total, usable):
        # Fixed dealer strategy: hit below 17, stick on 17 or more.
        while total < 17:
            card = draw_card()
            if card == 1 and total + 11 <= 21:
                # Count the new ace as 11 while that does not bust the hand.
                total += 11
                usable = True
            elif total + card > 21 and usable:
                # Demote the ace counted as 11 back to 1 instead of busting.
                total += card - 10
                usable = False
            else:
                total += card
        return total

    def settle(player_sum, dealer_total):
        # Reward once the player sticks: dealer bust or lower total -> win.
        if dealer_total > 21 or dealer_total < player_sum:
            return 1
        if dealer_total > player_sum:
            return -1
        return 0

    # Dealer's hand: the showing card comes from the state, the hidden card is
    # drawn here. A showing ace starts out counted as 11.
    dealer_usable_ace = False
    showing_dealer_card = initial_state[1]
    if showing_dealer_card == 1:
        showing_dealer_card = 11
        dealer_usable_ace = True
    card_dealer = draw_card()
    if card_dealer == 1 and showing_dealer_card + 11 <= 21:
        card_dealer = 11
        dealer_usable_ace = True
    sum_dealer = showing_dealer_card + card_dealer

    # Steps of the episode
    episode = []
    state, action = initial_state, initial_action
    while True:
        if action not in ('hits', 'sticks'):
            raise ValueError(f"unknown action: {action!r}")

        if action == 'sticks':
            # Sticking ends the game: the dealer plays and totals are compared.
            reward = settle(state[0], play_dealer(sum_dealer, dealer_usable_ace))
            episode.append((state, action, reward))
            return episode

        # 'hits': the player receives one more card. With a sum of 12 or more
        # a new ace can only count as 1, so the card value is added as is.
        player_sum = state[0] + draw_card()
        usable = state[2]
        if player_sum > 21:
            if not usable:
                # Bust: the episode ends with a loss.
                episode.append((state, action, -1))
                return episode
            # The usable ace is demoted from 11 to 1 to avoid the bust.
            player_sum -= 10
            usable = False

        # Record the step exactly once, then move on following the policy.
        episode.append((state, action, 0))
        state = (player_sum, state[1], usable)
        action = policy[state]

In [28]:
# Run Monte Carlo control with exploring starts over the full state space
# (with the usable-ace flag). The call returns a pair that is unpacked into
# Q and policy.
# NOTE(review): no random seed is set in the visible cells, so the result
# presumably varies between runs.
Q, policy = mc_control_es(states=states,
              actions=actions,
              generate_episode_es_fn=generate_episode_es_blackjack,
              num_episodes=100000)

In [29]:
# Display the policy returned by mc_control_es (state -> action).
policy

{(12, 1, True): 'hits',
 (12, 1, False): 'hits',
 (12, 2, True): 'hits',
 (12, 2, False): 'hits',
 (12, 3, True): 'hits',
 (12, 3, False): 'sticks',
 (12, 4, True): 'hits',
 (12, 4, False): 'hits',
 (12, 5, True): 'hits',
 (12, 5, False): 'sticks',
 (12, 6, True): 'hits',
 (12, 6, False): 'hits',
 (12, 7, True): 'hits',
 (12, 7, False): 'hits',
 (12, 8, True): 'hits',
 (12, 8, False): 'hits',
 (12, 9, True): 'hits',
 (12, 9, False): 'hits',
 (12, 10, True): 'hits',
 (12, 10, False): 'hits',
 (13, 1, True): 'hits',
 (13, 1, False): 'hits',
 (13, 2, True): 'hits',
 (13, 2, False): 'hits',
 (13, 3, True): 'hits',
 (13, 3, False): 'sticks',
 (13, 4, True): 'hits',
 (13, 4, False): 'hits',
 (13, 5, True): 'hits',
 (13, 5, False): 'sticks',
 (13, 6, True): 'hits',
 (13, 6, False): 'sticks',
 (13, 7, True): 'hits',
 (13, 7, False): 'hits',
 (13, 8, True): 'hits',
 (13, 8, False): 'hits',
 (13, 9, True): 'hits',
 (13, 9, False): 'hits',
 (13, 10, True): 'hits',
 (13, 10, False): 'hits',
 (14, 