In [1]:
import numpy as np
from typing import Literal, Union, List, Tuple, get_args
from pydantic import BaseModel
from tqdm import tqdm
import time
import os
import pickle

In [2]:
# Moves available on the active hand
Action = Literal[
    "D",   # double down
    "H",   # hit
    "Sp",  # split
    "S",   # stand
]

# Card ranks; suits are not tracked anywhere in the simulation
Card = Literal[
    "2", "3", "4", "5", "6", "7", "8", "9", "10",
    "J", "Q", "K", "A",
]

class Hand(BaseModel):
    """One blackjack hand: the cards it holds and whether it is currently being played."""
    # Cards in the hand, in the order they were added
    cards: List[Card]
    # True while the hand's turn is in progress; BlackjackTable.do_player_action clears it on
    #   stand, double down, or bust
    active: bool

class Player(BaseModel):
    """A participant at the table, holding one or more hands."""
    # A round starts with a single hand; splitting replaces the active hand with two one-card hands
    hands: List[Hand]

class Dealer():
    """Owns the shoe (the shuffled stack of decks) and the dealer's own hand.

    Attributes:
        rng: numpy random Generator used for every shuffle.
        num_decks: number of 52-card decks in the shoe.
        shoe: list of undealt cards; cards are dealt from the END of the list.
        dealt_cards: every card dealt since the last shuffle, in deal order.
        upcard / hole_card: the dealer's first and second card (None when no hand is dealt).
        hand: the dealer's current Hand (empty and inactive when no hand is dealt).
    """

    def __init__(self, num_decks=8, seed=123, **kwargs):
        """Create a dealer with a freshly shuffled shoe.

        Args:
            num_decks: number of 52-card decks combined into the shoe.
            seed: seed for the shuffling RNG. The default of 123 reproduces the shuffle order this class
                has always produced.
            **kwargs: accepted and ignored (kept so existing callers keep working).
        """
        self.rng = np.random.default_rng(seed)
        self.num_decks = num_decks
        self.shoe = None
        self.dealt_cards = None

        # Define upcard / hole_card / hand up front so they can be read before the first deal
        self.clear_dealer_hand()
        self.shuffle_shoe()

    def shuffle_shoe(self):
        """Rebuild the full shoe, forget all dealt cards, and shuffle in place."""
        # Four copies of every rank make one deck (suits are not tracked)
        single_deck = list(4 * get_args(Card))
        self.shoe = self.num_decks * single_deck
        self.dealt_cards = []
        self.rng.shuffle(self.shoe)

    def get_next_card(self) -> Card:
        """Deal the card at the end of the shoe and record it in dealt_cards.

        Raises:
            IndexError: if the shoe is empty.
        """
        next_card = self.shoe.pop()
        self.dealt_cards.append(next_card)
        return next_card

    def get_pct_cards_left_in_shoe(self) -> float:
        """Return the fraction (0.0 - 1.0) of the shoe that has not been dealt yet."""
        cards_left = len(self.shoe)
        total_cards = cards_left + len(self.dealt_cards)
        # A shoe built from zero decks has nothing left; avoid dividing by zero
        if total_cards == 0:
            return 0.0
        return cards_left / total_cards

    def init_dealer_hand(self, cards: Union[Tuple[Card, Card], List[Card]]):
        """Start the dealer's hand from two cards: cards[0] is the upcard, cards[1] the hole card."""
        self.upcard = cards[0]
        self.hole_card = cards[1]
        # Copy into a list so the hand never shares storage with the caller's sequence
        self.hand = Hand(cards=list(cards), active=True)

    def clear_dealer_hand(self):
        """Reset the dealer to the "no hand dealt" state."""
        self.upcard = None
        self.hole_card = None
        self.hand = Hand(cards=[], active=False)

In [3]:
# Estimated number of cards one hand (a player's or the dealer's) consumes per round. Used to estimate how much of
#   the shoe has been dealt from the round count alone, without counting every card.
#
# The average number of cards per player per round is pretty solidly 2.7 when playing by the book. People aren't likely
#   to deviate from the book by much in aggregate.
# The number of cards per hand for the dealer is 2.78 with 1 player and 2.91 - 2.93 for 2+ players. This is because
#   with 1 player, if that player busts, the dealer does not deal any more cards to themselves. With more than 1
#   player, so long as at least 1 person did not bust, the dealer takes their expected ~2.92 cards. It's unlikely
#   that 2 players will bust in the same round.
# When using a balanced strategy, it is necessary to convert from a running count to a true count. The running count is
#   the total count and the true count is the running count divided by the number of decks remaining in the shoe.
# Instead of actually counting the total number of cards dealt, it's common to count the number of rounds,
#   multiply by the number of players (plus dealer), then multiply by 2.7.
# To account for the fact that the dealer typically sees more than 2.7 cards, you can add 1 every 5 rounds. However, if
#   you don't, your estimate of cards dealt will be off by less than 2%.
AVERAGE_CARDS_PER_HAND = 2.7

In [4]:
def _book_pair_action(pair_score, upcard_score):
    """Return the by-the-book code for a pair totalling `pair_score` against a dealer upcard worth `upcard_score`.

    Codes: "Sp" split, "H" hit, "S" stand, "Dh" double down if allowed, otherwise hit.
    """
    # Always split 8s. The score-2 row is filled with "Sp" as well; note that BlackjackTable.score_hand scores a
    #   pair of aces as a soft 12, not 2.
    if pair_score in (2, 16):
        return "Sp"

    # Split 2s, 3s, and 7s if the dealer shows less than an 8
    if pair_score in (4, 6, 14):
        return "Sp" if upcard_score < 8 else "H"

    # Split 4s only against a dealer 5 or 6
    if pair_score == 8:
        return "Sp" if upcard_score in (5, 6) else "H"

    # Never split 5s: double down if the dealer shows less than a 10
    if pair_score == 10:
        return "Dh" if upcard_score < 10 else "H"

    # Split 6s if the dealer shows less than a 7
    if pair_score == 12:
        return "Sp" if upcard_score < 7 else "H"

    # Split 9s unless the dealer shows a 7, 10, or Ace (in which case stand)
    if pair_score == 18:
        return "S" if upcard_score in (7, 10, 11) else "Sp"

    # Always stand with 20
    return "S"


# Maps (pair total, dealer upcard score) -> book action code; an Ace upcard is scored as 11
BOOK_PAIR_ACTIONS = {}
for score in range(2, 21, 2):
    for upcard_score in range(2, 12):
        BOOK_PAIR_ACTIONS[(score, upcard_score)] = _book_pair_action(score, upcard_score)

In [5]:
def _book_soft_action(soft_score, upcard_score):
    """Return the by-the-book code for a soft total of 13-21 against a dealer upcard worth `upcard_score`.

    Codes: "H" hit, "S" stand, "Dh" double if allowed else hit, "Ds" double if allowed else stand.
    """
    # Soft 19 and up: stand, except soft 19 doubles against a 6
    if soft_score >= 19:
        return "Ds" if (soft_score == 19 and upcard_score == 6) else "S"

    # Soft 18: double against 2-6, stand against 7-8, hit against 9 and up
    if soft_score == 18:
        if upcard_score < 7:
            return "Ds"
        return "S" if upcard_score < 9 else "H"

    # Soft 13-17: double against a band of weak upcards that narrows as the total drops, otherwise hit
    if soft_score == 17:
        doubling_upcards = (3, 4, 5, 6)
    elif soft_score in (15, 16):
        doubling_upcards = (4, 5, 6)
    else:
        doubling_upcards = (5, 6)
    return "Dh" if upcard_score in doubling_upcards else "H"


# Maps (soft total, dealer upcard score) -> book action code; an Ace upcard is scored as 11
BOOK_SOFT_ACTIONS = {}
for score in range(13, 22):
    for upcard_score in range(2, 12):
        BOOK_SOFT_ACTIONS[(score, upcard_score)] = _book_soft_action(score, upcard_score)

# A lone Ace (soft 11) occurs right after splitting Aces; always hit it.
#   Soft 12 gets no row because it can only be a pair of Aces.
for upcard_score in range(2, 12):
    BOOK_SOFT_ACTIONS[(11, upcard_score)] = "H"

In [6]:
def _book_hard_action(hard_score, upcard_score):
    """Return the by-the-book code for a hard total against a dealer upcard worth `upcard_score`.

    Codes: "H" hit, "S" stand, "Dh" double down if allowed, otherwise hit.
    """
    # 8 or less: always hit
    if hard_score < 9:
        return "H"
    # 9: double against 3-6
    if hard_score == 9:
        return "Dh" if upcard_score in (3, 4, 5, 6) else "H"
    # 10: double against anything below a 10
    if hard_score == 10:
        return "Dh" if upcard_score < 10 else "H"
    # 11: always double
    if hard_score == 11:
        return "Dh"
    # 12: stand only against 4-6
    if hard_score == 12:
        return "S" if upcard_score in (4, 5, 6) else "H"
    # 13-16: stand against 2-6
    if hard_score <= 16:
        return "S" if upcard_score < 7 else "H"
    # 17 and up: always stand
    return "S"


# Maps (hard total, dealer upcard score) -> book action code; an Ace upcard is scored as 11
BOOK_HARD_ACTIONS = {}
for score in range(2, 22):
    for upcard_score in range(2, 12):
        BOOK_HARD_ACTIONS[(score, upcard_score)] = _book_hard_action(score, upcard_score)

In [None]:
# Royal Caribbean House Rules (Player Actions)
#   1) No Surrender
#   2) 3 Splits Max
#   3) One Draw Split Aces
#   4) Double Any Dealt Hand
#   5) Double After Split
#   6) No Double on Split Aces

# Royal Caribbean House Rules (Blackjack Table)
#   Dealer hits on soft 17
#   Natural Blackjack pays 3:2
#       Split aces get 21, not Blackjack
#   8-deck shoes
#   75-80% Penetration
#   Hole Card
#   Note: Maybe also try single deck with 6:5 payout and see which is better for card counting. Single deck is 3 players with 3 rounds per deck

class BlackjackTable():
    def __init__(
        self,
        num_decks=8,
        num_other_players=3,
        shoe_penetration=0.75,
        state_function: Literal["basic", "hi_lo", "knock_out"] = "basic",
    ):
        """Set up a table seating the agent, `num_other_players` by-the-book players, and the dealer.

        Args:
            num_decks: number of decks in the dealer's shoe.
            num_other_players: players seated ahead of the agent; they always follow the book.
            shoe_penetration: stored on the table; nothing in this class reads it yet.
            state_function: which state encoding `step` and `round_cleanup` return.

        Raises:
            KeyError: if `state_function` is not "basic", "hi_lo", or "knock_out".
        """
        # Table configuration
        self.num_decks = num_decks
        self.num_other_players = num_other_players
        self.shoe_penetration = shoe_penetration

        # Every seat that is dealt cards: the other players, plus the agent and the dealer
        self.num_table_hands = num_other_players + 2

        # Resolve the encoding name to the bound method that implements it
        state_functions = {
            "basic": self.get_basic_state,
            "hi_lo": self.get_hi_lo_state,
            "knock_out": self.get_knock_out_state,
        }
        self.state_function = state_functions[state_function]

        # Rounds are 1-indexed: new_round increments this before any action is taken, so 0 means "no round yet".
        #   A game runs from the first card dealt until the shoe needs to be shuffled.
        self.round_idx = 0

        # The dealer is similar to a player, but also owns the shoe
        self.dealer = Dealer(num_decks)

        # Shuffle and create placeholder players, in case someone touches them before calling new_round
        self.reset_game()

    def normalize_hands(self, player: Player) -> Player:
        """Return a new Player whose hands hold the same cards in sorted order.

        Cards are sorted as strings (so "10" sorts before "2"); each hand keeps its `active` flag. The input
        player is not modified.
        """
        sorted_hands = [Hand(cards=sorted(hand.cards), active=hand.active) for hand in player.hands]
        return Player(hands=sorted_hands)

    def new_round(self):
        """Deal a new round and play every other player's turn by the book.

        Afterwards the agent and the dealer each hold their two dealt cards, the other players have finished
        their turns, and `round_idx` has been incremented.
        """
        seat_count = self.num_table_hands
        agent_seat = self.num_other_players
        dealer_seat = agent_seat + 1

        # Two passes around the table: seat i receives cards i and i + seat_count
        dealt_cards = [self.dealer.get_next_card() for _ in range(2 * seat_count)]

        # The other players occupy the first seats
        self.other_players = [
            Player(hands=[Hand(cards=dealt_cards[seat::seat_count], active=True)])
            for seat in range(self.num_other_players)
        ]

        # The agent is the last player (most advantageous seat, but only marginally)
        self.agent = Player(hands=[Hand(cards=dealt_cards[agent_seat::seat_count], active=True)])

        # The dealer is always dealt last
        self.dealer.init_dealer_hand(dealt_cards[dealer_seat::seat_count])

        self.round_idx += 1

        # Every other player plays out their whole turn before the agent acts
        for seat, player in enumerate(self.other_players):
            while not self.is_end_of_turn(player):
                player = self.do_player_action(player, self.get_book_action(player))
            self.other_players[seat] = player

    def reset_game(self):
        """Reshuffle the shoe and put the table back into its pre-deal state.

        The agent and every other player are replaced by a placeholder holding one empty, inactive hand, and
        the round counter returns to 0. The dealer's own hand is not touched.
        """
        def empty_player():
            return Player(hands=[Hand(cards=[], active=False)])

        self.round_idx = 0
        self.dealer.shuffle_shoe()
        self.agent = empty_player()
        self.other_players = [empty_player() for _ in range(self.num_other_players)]

    def score_hand(self, hand: Union[Hand, List[Card]]) -> Tuple[int, bool]:
        """Score a hand, counting aces as 11 wherever that does not bust it.

        Args:
            hand: a Hand, or a bare sequence of cards.

        Returns:
            (total, is_soft): the hand total, and True when at least one ace is still counted as 11.
        """
        cards = hand.cards if isinstance(hand, Hand) else hand

        # Face cards and aces; every other card is worth its printed number
        special_values = {"J": 10, "Q": 10, "K": 10, "A": 11}

        total = 0
        soft_aces = 0
        for card in cards:
            total += special_values[card] if card in special_values else int(card)
            if card == "A":
                soft_aces += 1

        # Rescue a busted total by demoting aces from 11 to 1, one at a time
        while total > 21 and soft_aces:
            total -= 10
            soft_aces -= 1

        return total, bool(soft_aces)

    def is_hand_busted(self, hand: Union[Hand, List[Card]]) -> bool:
        """Return True when the hand's best total (see score_hand) exceeds 21."""
        total, _ = self.score_hand(hand)
        return total > 21

    def get_next_dealer_action(self):
        """Return "H" if the dealer must draw another card, otherwise "S"."""
        total, is_soft = self.score_hand(self.dealer.hand)

        # The dealer hits below 17, and also on a soft 17
        must_hit = total < 17 or (total == 17 and is_soft)
        return "H" if must_hit else "S"

    def do_dealer_turn(self):
        """Draw cards for the dealer until the dealer rules say stand, then mark the dealer's hand inactive."""
        while self.get_next_dealer_action() != "S":
            self.dealer.hand.cards.append(self.dealer.get_next_card())
        self.dealer.hand.active = False

    def get_active_hand_idx(self, player: Player) -> Union[int, None]:
        """Return the index of the player's active hand, or None when no hand is active.

        A player may have at most one active hand at a time; this is asserted.
        """
        # TODO: remove assert
        assert sum(hand.active for hand in player.hands) <= 1, "Player can have at most 1 active hand"

        return next((idx for idx, hand in enumerate(player.hands) if hand.active), None)

    def is_end_of_turn(self, player: Player) -> bool:
        """Return True once the player has no active hand left to play."""
        active_hand_idx = self.get_active_hand_idx(player)
        return active_hand_idx is None

    def get_basic_state(
            self,
            player: Player
        ) -> Tuple[Tuple[Tuple[int, bool, bool], ...], Union[int, None], int]:
        """Return (hand_states, active_hand_idx, upcard_score) for the player.

        hand_states holds one (score, softness, pair) tuple per hand, in hand order:
            score is the hand total from score_hand
            softness is True when an ace is still counted as 11
            pair is True when the hand is exactly two identical cards (so "10" and "K" are not a pair)
        active_hand_idx is None when the player has no active hand.
        upcard_score is the value of the dealer's upcard, with an ace counted as 11.
        """
        hand_states = tuple(
            self.score_hand(hand) + (len(hand.cards) == 2 and hand.cards[0] == hand.cards[1],)
            for hand in player.hands
        )
        active_hand_idx = self.get_active_hand_idx(player)
        upcard_score, _ = self.score_hand([self.dealer.upcard])

        return (hand_states, active_hand_idx, upcard_score)

    def get_hi_lo_state(self, player: Player) -> tuple:
        """Return the basic state extended with the Hi-Lo true count.

        Hi-Lo counts every dealt card: 10 through Ace is -1, 7 through 9 is 0, everything else is +1. The
        running count is divided by an ESTIMATE of the decks remaining, derived from the round number rather
        than from the actual shoe, and rounded to the nearest integer.

        Returns:
            get_basic_state(player) + (true_count,)
        """
        def hi_lo_value(card: str) -> int:
            if card in {"10", "J", "Q", "K", "A"}:
                return -1
            elif card in {"7", "8", "9"}:
                return 0
            else:
                return 1

        basic_state = self.get_basic_state(player)
        running_count = sum(hi_lo_value(card) for card in self.dealer.dealt_cards)

        estimated_cards_seen = self.round_idx * self.num_table_hands * AVERAGE_CARDS_PER_HAND
        estimated_decks_remaining = self.num_decks - estimated_cards_seen / 52

        # The estimate ignores the real shoe, so late in a long game it can reach zero or go negative, which
        #   would raise ZeroDivisionError or flip the sign of the count. Never let it drop below half a deck.
        min_decks_remaining = 0.5
        estimated_decks_remaining = max(estimated_decks_remaining, min_decks_remaining)

        true_count = int(np.rint(running_count / estimated_decks_remaining))

        return basic_state + (true_count,)

    def get_knock_out_state(self, player: Player) -> tuple:
        """Return the basic state extended with the Knock-Out running count.

        Knock-Out is a simplified version of Hi-Lo that only needs a running count. It is unbalanced, so the
        count starts at an offset based on the number of decks, and no estimate of the cards dealt is needed.

        Returns:
            get_basic_state(player) + (count,)
        """
        # 10 through Ace count -1, 8 and 9 count 0, and every other card counts +1
        card_values = {
            **dict.fromkeys(("10", "J", "Q", "K", "A"), -1),
            **dict.fromkeys(("8", "9"), 0),
        }

        basic_state = self.get_basic_state(player)
        starting_count = -4 * (self.num_decks - 1)
        running_count = sum(card_values.get(card, 1) for card in self.dealer.dealt_cards)
        return basic_state + (starting_count + running_count,)

    def get_possible_actions(self, player: Player) -> List[Action]:
        """Return the legal actions for the player's active hand, or [] when no hand is active.

        The active hand is expected to be valid: at least one card and a total of at most 21.

        Rules applied:
            Hit and stand are always available, except on a pair of aces that resulted from a split.
            A one-card hand (fresh from a split) may double down unless the card is an ace.
            A two-card hand may double down only if the player has not split.
            A pair may be split while the player holds fewer than 3 hands.

        NOTE(review): doubling after a split is only offered on the one-card hand, before its second card is
            drawn. Confirm this is the intended model of "Double After Split".
        NOTE(review): the house rules list "3 Splits Max", but the limit here is 3 hands (2 splits). Confirm.
        """
        active_hand_idx = self.get_active_hand_idx(player)
        if active_hand_idx is None:
            return []

        cards = player.hands[active_hand_idx].cards
        hand_count = len(player.hands)
        is_pair = len(cards) == 2 and cards[0] == cards[1]

        # Edge case: the player split aces, hit, and drew another ace.
        #   They may still stand (or split, if allowed below), but may not hit again.
        if is_pair and cards[0] == "A" and hand_count > 1:
            hand_actions = ["S"]
        else:
            hand_actions = ["H", "S"]

        if len(cards) == 1:
            # First action on a freshly split hand
            can_double = cards[0] != "A"
        else:
            # Otherwise only the originally dealt two-card hand may double
            can_double = len(cards) == 2 and hand_count == 1
        if can_double:
            hand_actions.append("D")

        if is_pair and hand_count < 3:
            hand_actions.append("Sp")

        return hand_actions

    def get_book_action(self, player: Player) -> Action:
        """Return the single by-the-book action for the player's active hand.

        The player must have an active hand. The action is looked up by (hand score, dealer upcard score) in
        BOOK_PAIR_ACTIONS, BOOK_SOFT_ACTIONS, or BOOK_HARD_ACTIONS. The conditional codes "Dh" and "Ds" are
        then resolved to "D" when doubling is legal, and to hit or stand respectively when it is not.
        """
        possible_actions = self.get_possible_actions(player)
        hand_states, active_hand_idx, upcard_score = self.get_basic_state(player)
        score, softness, pairness = hand_states[active_hand_idx]
        book_key = (score, upcard_score)

        if pairness and "Sp" in possible_actions:
            # A soft pair can only be two aces, which are always split when allowed
            action = "Sp" if softness else BOOK_PAIR_ACTIONS[book_key]
        elif softness and not pairness:
            action = BOOK_SOFT_ACTIONS[book_key]
        else:
            # Hard totals, plus pairs that can no longer be split because the hand limit is reached
            action = BOOK_HARD_ACTIONS[book_key]

        # Doubling is only legal on a dealt hand or on the first card of a split hand, so the book gives a
        #   fallback for when it is not
        double_fallbacks = {"Dh": "H", "Ds": "S"}
        if action in double_fallbacks:
            action = "D" if "D" in possible_actions else double_fallbacks[action]

        # TODO: remove assert
        assert action in possible_actions
        return action

    def do_player_action(self, player: Player, action: Action) -> Player:
        """ Apply `action` to the player's active hand and update which hand is active

        The player is modified in place and also returned.

        player must have exactly 1 active hand
        action must be one of the actions returned by get_possible_actions (this is asserted). A legal action
            can still leave the hand busted (over 21)

        Effects by action:
            "H": draw one card
            "D": draw one card, then the hand is finished
            "Sp": replace the active hand with two one-card hands; the first stays active, the second waits
            "S": the hand is finished
        When a hand finishes (or busts), the next hand in the list, if any, becomes active.
        """
        assert action in self.get_possible_actions(player) # TODO: remove assert
        active_hand_idx = self.get_active_hand_idx(player)

        # Boolean to denote that the player hit on split aces (which means this hand is not allowed any more cards)
        hit_on_split_aces = False

        # Update the cards based on the action taken
        if action == "D" or action == "H":
            # A hand holding a single ace can only come from splitting aces; it is allowed just 1 more card
            if len(player.hands[active_hand_idx].cards) == 1 and player.hands[active_hand_idx].cards[0] == "A":
                hit_on_split_aces = True
            nextCard = self.dealer.get_next_card()
            player.hands[active_hand_idx].cards.append(nextCard)

        elif action == "Sp":
            # Both new hands start with one card of the pair; their second cards are drawn by later actions
            card = player.hands[active_hand_idx].cards[0]
            hand1 = Hand(cards=[card], active=True)
            hand2 = Hand(cards=[card], active=False)
            player.hands[active_hand_idx] = hand1
            player.hands.insert(active_hand_idx + 1, hand2)

        elif action == "S":
            pass

        else:
            raise ValueError("""BlackjackTable.do_action only implemented for actions in ["D", "H", "Sp", "S"]""")

        # Deactivate the hand on double down, stand, or bust
        #   Current hand is no longer active if the move was double down or stand
        #   Current hand is no longer active if it results in a hand with a value larger than 21
        #   The next hand (if there is one) then becomes the active hand
        if action == "D" or action == "S" or self.is_hand_busted(player.hands[active_hand_idx]):
            player.hands[active_hand_idx].active = False
            if (active_hand_idx+1) < len(player.hands):
                player.hands[active_hand_idx+1].active = True

        # If you hit on split aces, the hand is deactivated unless you are dealt another ace
        #   (in which case you can split or stand)
        # However, if you cannot split due to reaching the hand maximum, your only option is to stand
        #   (which deactivates the hand)
        # nextCard is always bound here: hit_on_split_aces is only set in the branch that draws a card
        if hit_on_split_aces:
            if not nextCard == "A" or len(player.hands) == 3:
                player.hands[active_hand_idx].active = False
                if (active_hand_idx+1) < len(player.hands):
                    player.hands[active_hand_idx+1].active = True

        return player

    def round_cleanup(self):
        """ Helper function to step: play the dealer's turn and settle the agent's hands

        The dealer only plays if at least one hand at the table (the agent's or another player's) is not busted.

        Each agent hand is settled at a flat stake of 1:
            -1 if the hand busted
            +1 if the dealer busted, or the hand's total beats the dealer's total
             0 if the totals are equal (push)
            -1 otherwise
        Only totals are compared; whether a hand is soft or hard has no effect on the result.
        Doubled hands and natural blackjacks are not paid any differently.

        Returns:
            (state, reward, done) where state comes from the configured state function, reward is the sum
            over the agent's hands, and done is always True.
        """
        # If there's at least 1 non-busted hand at the table, do the dealer's turn
        dealer_total = None
        if (
            any(not self.is_hand_busted(hand) for hand in self.agent.hands) or
            any(not self.is_hand_busted(hand) for player in self.other_players for hand in player.hands)
        ):
            self.do_dealer_turn()
            dealer_total, _ = self.score_hand(self.dealer.hand)

        total_reward = 0
        for hand in self.agent.hands:
            hand_total, _ = self.score_hand(hand)
            if hand_total > 21:
                total_reward -= 1
                continue

            # A non-busted agent hand guarantees the dealer played above, so dealer_total is set here
            dealer_busted = dealer_total > 21
            if dealer_busted or hand_total > dealer_total:
                total_reward += 1
            elif hand_total < dealer_total:
                total_reward -= 1
            # Equal totals are a push and leave the reward unchanged

        return self.state_function(self.agent), total_reward, True

    def step(self, action: Action) -> Tuple[Tuple[Tuple[int, bool, bool], int], float, bool]:
        # TODO: move step into the RL class
        """ Apply the agent's action to the table and report what happened

        Returns a tuple of (next_state, reward, done) where
            next_state is the state from the state function chosen at construction (basic, hi_lo, or knock_out)
            reward is the money earned or lost (always 0 while the agent's turn is still in progress)
            done is a boolean to denote if this round is over as a result of the action
        """
        self.agent = self.do_player_action(self.agent, action)

        # The agent still has a hand to play: no reward yet
        if not self.is_end_of_turn(self.agent):
            return self.state_function(self.agent), 0, False

        # The agent's turn is over: do the dealer's turn and give the reward
        return self.round_cleanup()

    # TODO: move a lot of things in the Blackjack class into the player class
    #   Probably basic state, is_hand_busted, is_end_of_turn, get_active_hand_idx
    #   Probably get_next_dealer_action and do_dealer_turn to dealer class
    #   Probably have a by-the-book player class that inherits from the base player and adds get_book_action
    #   The RL class also inherits from the base player class and adds RL things like get next action

    # TODO: need something that puppets BlackjackTable
    #   step does most of the work. Might want something that walks it through the entire epoch (shoe) rather than
    #   just one game though

    # TODO: Need some initial state when the board isn't initialized where the only action for the player is "begin"
    #   This shouldn't change the Q-matrix of the state space
    #   This will make it so we can have each hand within the epoch begin with a count, and later when the agent decides
    #       on a bet, it can choose the bet before the hand begins

    # TODO: need to somehow incorporate if the player has blackjack, they get paid out 3:2
    #   I think this just goes in the new_round for now. Later, "start" needs to be its own action

In [8]:
# Test Dealer (the expected cards rely on the dealer's fixed default RNG seed)
dealer = Dealer()

# Shoe order straight after construction
assert dealer.shoe[0] == "4"
assert dealer.shoe[1] == "K"
assert dealer.shoe[-1] == "Q"

# Reshuffling produces a different order
dealer.shuffle_shoe()
assert dealer.shoe[0] == "5"
assert dealer.shoe[1] == "4"
assert dealer.shoe[-1] == "8"

# Cards are dealt from the end of the shoe
assert dealer.get_next_card() == "8"
assert dealer.shoe[0] == "5"
assert dealer.shoe[1] == "4"
assert dealer.shoe[-1] == "7"

# 1 + 51 = 52 cards dealt out of 8 * 52 leaves exactly 7/8 of the shoe
for i in range(51):
    dealer.get_next_card()
assert dealer.get_pct_cards_left_in_shoe() == 0.875

dealer.init_dealer_hand(["A", "K"])
assert dealer.upcard == "A"
assert dealer.hole_card == "K"
assert dealer.hand.cards == ["A", "K"]
assert dealer.hand.active

dealer.clear_dealer_hand()
assert dealer.upcard is None
assert dealer.hole_card is None
assert dealer.hand.cards == []
assert not dealer.hand.active

In [None]:
# Reinforcement Learning Loop
#   For basic state, a hand is an epoch (no information is carried between shoes)
#   For non_basic states, a shoe is an epoch (information is carried between shoes)
"""
# 1000 epochs (go through 1000 shoes)
for i in range(1000):
    table.reset_game()
    while(enough cards left in shoe):

        # Player takes actions until they take an action that ends their turn
        state = table.get_basic_state(table.agent)
        action = rl.get_action(state, table.get_possible_action(table.agent))

        done = False
        while(not done):
            next_state, reward, done = table.step(action)
            next_action = rl.get_action(next_state, table.get_possible_action(table.agent))

            # SARSA update
            #   Update state based on reward plus actual future reward
            #       actual because we may take sub-optimal actions, which decreases our future reward
            # Q[(state, action)] += alpha * (reward + gamma * Q[(next_state, next_action)] - Q[(state, action)])

            # Q-Learning
            #   Update state based on reward plus future reward, assuming we always act optimally in the future
            #   max(Q[(next_state, :)]) is equivalent to Q[(next_state, best_action)])
            # Q[(state, action)] += alpha * (reward + gamma * max(Q[(next_state, :)]) - Q[(state, action)])

            #   Where Q is a hashmap of (state, action) -> reward
            rl.update(state, action, reward, next_state, next_action)
            
            state = next_state
            action = next_action
"""

# rl can be an epsilon-greedy policy to start
#   returns a random action some % of the time
#   returns the optimal action the rest of the time

'\n# 1000 epochs (go through 1000 shoes)\nfor i in range(1000):\n    table.reset_game()\n    while(enough cards left in shoe):"\n\n        # Player takes actions until they take an action that ends their turn\n        state = table.get_basic_state(table.agent)\n        action = rl.get_action(state, table.get_possible_action(table.agent))\n\n        done = False\n        while(not done):\n            next_state, reward, done = table.step(action)\n            next_action = rl.get_action(next_state, table.get_possible_action(table.agent))\n\n            # SARSA update\n            #   Update state based on reward plus actual future reward\n            #       actual bbecause we may take sub-optimal actions, which decreases out future reward\n            # Q[(state, action)] += alpha * (reward + gamma * Q[(next_state, next_action)] - Q[(state, action)])\n\n            # Q-Learning\n            #   Update state based on reward plus future reward, assuming we always act optimally in the futu

In [10]:
# Test new_round (the expected cards rely on the dealer's fixed RNG seed)
table = BlackjackTable()
table.new_round()
assert table.round_idx == 1

# 5 seats x 2 cards, plus the cards the other players drew during their turns
dealt_count = len(table.dealer.dealt_cards)
assert dealt_count == 12

# Every dealt card must be sitting in somebody's hand
cards_on_table = (
    len(table.agent.hands[0].cards)
    + len(table.dealer.hand.cards)
    + sum(sum(len(hand.cards) for hand in player.hands) for player in table.other_players)
)
assert dealt_count == cards_on_table
assert len(table.dealer.shoe) == 52*8 - dealt_count

# The other players have already finished their turns
expected_other_hands = [["8", "10"], ["7", "2", "A"], ["3", "7", "2"]]
for seat, expected_cards in enumerate(expected_other_hands):
    finished_hand = table.other_players[seat].hands[0]
    assert finished_hand.cards == expected_cards
    assert not finished_hand.active

# The agent and the dealer are still waiting to act
assert table.agent.hands[0].cards == ["10", "9"]
assert table.agent.hands[0].active
assert table.dealer.upcard == "5"
assert table.dealer.hole_card == "5"
assert table.dealer.hand.cards == ["5", "5"]
assert table.dealer.hand.active

table.new_round()
assert table.round_idx == 2


In [11]:
# Test reset_game
table = BlackjackTable()
table.new_round()
table.reset_game()

# Everyone is back to a single empty, inactive hand
empty_player = Player(hands=[Hand(cards=[], active=False)])
assert table.round_idx == 0
assert table.agent == empty_player
for seat in range(3):
    assert table.other_players[seat] == empty_player

# The shoe is full again
assert len(table.dealer.dealt_cards) == 0
assert len(table.dealer.shoe) == 52*8


In [12]:
# Test score_hand
table = BlackjackTable()
hands_and_scores = {
    ("2", "3"): (5, False),
    ("4", "J"): (14, False),
    ("5", "6", "Q"): (21, False),
    ("7", "8", "K"): (25, False),
    ("9", "10", "A"): (20, False),
    ("10", "A"): (21, True),
    ("4", "A", "A"): (16, True),
    ("10", "10", "A", "A"): (22, False)
}
for hand, score in hands_and_scores.items():
    hand = Hand(cards=hand, active=True)
    assert table.score_hand(hand) == score


In [13]:
# TODO: test is_hand_busted

In [14]:
# Test get_next_dealer_action
table = BlackjackTable()
hands_and_actions = {
    ("2", "10"): "H",
    ("6",  "Q"): "H",
    ("A",  "J"): "S",
    ("A",  "6"): "H",
    ("10", "7"): "S"
}
for hand, action in hands_and_actions.items():
    table.dealer.hand = hand
    assert table.get_next_dealer_action() == action


In [15]:
# Test do_dealer_turn
table = BlackjackTable()

# Dealer hits on hard 15, then sdtands on 21
dealer_cards = ["5", "K"]
shoe = ["6"]
result_hand = ["5", "K", "6"]
table.dealer.init_dealer_hand(dealer_cards)
table.dealer.shoe = shoe
table.do_dealer_turn()
assert table.dealer.hand.cards == result_hand

# Dealer stands on soft 21
dealer_cards = ["A", "K"]
shoe = []
result_hand = ["A", "K"]
table.dealer.init_dealer_hand(dealer_cards)
table.dealer.shoe = shoe
table.do_dealer_turn()
assert table.dealer.hand.cards == result_hand

# Dealer hits on soft 13, hits on soft 17, then stands when over 21
dealer_cards = ["A", "2"]
shoe = ["Q", "4"]
result_hand = ["A", "2", "4", "Q"]
table.dealer.init_dealer_hand(dealer_cards)
table.dealer.shoe = shoe
table.do_dealer_turn()
assert table.dealer.hand.cards == result_hand

# Dealer hits on soft 13 then stands on soft 18
dealer_cards = ["A", "2"]
shoe = ["5"]
result_hand = ["A", "2", "5"]
table.dealer.init_dealer_hand(dealer_cards)
table.dealer.shoe = shoe
table.do_dealer_turn()
assert table.dealer.hand.cards == result_hand

# Dealer stands on soft 20
dealer_cards = ["K", "J"]
shoe = []
result_hand = ["K", "J"]
table.dealer.init_dealer_hand(dealer_cards)
table.do_dealer_turn()
assert table.dealer.hand.cards == result_hand


In [16]:
# Test get_active_hand_idx
table = BlackjackTable()
assert table.get_active_hand_idx(Player(hands=[Hand(cards=["A", "K"], active=False)])) is None
assert table.get_active_hand_idx(Player(hands=[Hand(cards=["A", "K"], active=True)])) == 0
assert table.get_active_hand_idx(Player(hands=[Hand(cards=["A", "K"], active=True), Hand(cards=["A", "K"], active=False)])) == 0
assert table.get_active_hand_idx(Player(hands=[Hand(cards=["A", "K"], active=False), Hand(cards=["A", "K"], active=True)])) == 1
assert table.get_active_hand_idx(Player(hands=[Hand(cards=["A", "K"], active=False), Hand(cards=["A", "K"], active=False)])) == None


In [17]:
# Test get_basic_state
# Keys:   (agent hands, dealer upcard); each hand is written as (*cards, active).
# Values: the tuple get_basic_state is expected to return for that setup.
#   NOTE(review): judging by the cases, the value looks like one
#   (score, is_soft, is_pair) triple per hand, then the active hand's index, then
#   the upcard's value -- confirm against get_basic_state.
table = BlackjackTable()
state_and_basic_states = {
    (
        (("2", "3", True), ("4", "J", False)),
        "J",
    ): (((5, False, False), (14, False, False)), 0, 10),
    (
        (("5", "5", "Q", False), ("7", "8", "K", True)),
        "A",
    ): (((20, False, False), (25, False, False)), 1, 11),
    (
        (("9", "9", False), ("9", True)),
        "2",
    ): (((18, False, True), (9, False, False)), 1, 2),
    (
        (("A", "A", True), ("A", False)),
        "4",
    ): (((12, True, True), (11, True, False)), 0, 4),
}
for (hands, upcard), basic_state in state_and_basic_states.items():
    # The trailing element of each hand tuple is its active flag; the rest are cards
    table.agent.hands = [Hand(cards=hand[:-1], active=hand[-1]) for hand in hands]
    table.dealer.upcard = upcard

    observed_state = table.get_basic_state(table.agent)
    assert observed_state == basic_state


In [18]:
# Test get_hi_lo_state
# Keys:   (agent hands, dealer upcard, cards dealt so far); each hand is (*cards, active).
# Values: the tuple get_hi_lo_state is expected to return; compared with the basic
#   state it carries one extra trailing element (presumably the Hi-Lo count --
#   confirm against get_hi_lo_state).
table = BlackjackTable()
state_and_hi_lo_state = {
    (
        (("2", "3", True), ("4", "2", False)),
        "2",
        ("2", "3", "7", "4", "2", "2"),
    ): (((5, False, False), (6, False, False)), 0, 2, 1),
    (
        (("5", "5", "K", "K", False), ("3", "K", "K", True)),
        "A",
        ("5", "5", "K", "K", "7", "3", "K", "K", "A") + ("K",)*4,
    ): (((30, False, False), (23, False, False)), 1, 11, -1),
    (
        (("9", "9", True), ("9", False)),
        "2",
        ("9", "9", "9", "A", "2") + ("2",)*100,
    ): (((18, False, True), (9, False, False)), 0, 2, 13),
    (
        (("A", "A", True), ("A", False)),
        "4",
        ("A", "A", "5", "4"),
    ): (((12, True, True), (11, True, False)), 0, 4, 0),
}
for (hands, upcard, dealt_cards), hi_lo_state in state_and_hi_lo_state.items():
    # Put the table into the described situation by overwriting its state directly
    table.agent.hands = [Hand(cards=hand[:-1], active=hand[-1]) for hand in hands]
    table.dealer.upcard = upcard
    table.dealer.dealt_cards = list(dealt_cards)
    table.round_idx = 1

    observed_state = table.get_hi_lo_state(table.agent)
    assert observed_state == hi_lo_state


In [19]:
# Test get_knock_out_state
# Keys:   (agent hands, dealer upcard, cards dealt so far); each hand is (*cards, active).
# Values: the tuple get_knock_out_state is expected to return; compared with the
#   basic state it carries one extra trailing element (presumably the Knock-Out
#   running count -- confirm against get_knock_out_state).
table = BlackjackTable()
state_and_knockout_state = {
    (
        (("2", "3", True), ("4", "2", False)),
        "2",
        ("2", "3", "7", "4", "2", "2"),
    ): (((5, False, False), (6, False, False)), 0, 2, -22),
    (
        (("5", "5", "K", "K", False), ("3", "K", "K", True)),
        "A",
        ("5", "5", "K", "K", "7", "3", "K", "K", "A") + ("K",)*4,
    ): (((30, False, False), (23, False, False)), 1, 11, -33),
    (
        (("9", "9", True), ("9", False)),
        "2",
        ("9", "9", "9", "A", "2") + ("2",)*100,
    ): (((18, False, True), (9, False, False)), 0, 2, 72),
    (
        (("A", "A", True), ("A", False)),
        "4",
        ("A", "A", "5", "4"),
    ): (((12, True, True), (11, True, False)), 0, 4, -28),
}

for (hands, upcard, dealt_cards), knockout_state in state_and_knockout_state.items():
    # Put the table into the described situation by overwriting its state directly
    table.agent.hands = [Hand(cards=hand[:-1], active=hand[-1]) for hand in hands]
    table.dealer.upcard = upcard
    table.dealer.dealt_cards = list(dealt_cards)
    table.round_idx = 1

    observed_state = table.get_knock_out_state(table.agent)
    assert observed_state == knockout_state


In [20]:
# Test get_possible_actions
# Maps the cards of a single active hand to the actions expected for it, in order.
table = BlackjackTable()
hands_and_actions = {
    ("2",): ("H", "S", "D"),
    ("5", "Q", "Q"): ("H", "S"),
    ("9", "9"): ("H", "S", "D", "Sp"),
    ("A", "A"): ("H", "S", "D", "Sp"),
}
for hand, actions in hands_and_actions.items():
    player = Player(hands=[Hand(cards=hand, active=True)])
    expected_actions = list(actions)
    assert table.get_possible_actions(player) == expected_actions

# Can only split 3 times: with three hands on the table, only hit and stand are expected
table = BlackjackTable()
three_hand_player = Player(
    hands=[
        Hand(cards=["A", "K"], active=False),
        Hand(cards=["A", "Q"], active=False),
        Hand(cards=["A"], active=True),
    ]
)
split_3_actions = table.get_possible_actions(three_hand_player)
assert split_3_actions == ["H", "S"]

# Once-split aces are the only active hand that cannot hit: the choice is stand or split.
#   Twice-split aces have reached the hand limit, so the only option is to stand, and a
#   hand that can only stand is deactivated -- so there is never an action list to fetch
#   for twice-split aces.
once_split_aces = Player(
    hands=[
        Hand(cards=["A", "K"], active=False),
        Hand(cards=["A", "A"], active=True),
    ]
)
assert table.get_possible_actions(once_split_aces) == ["S", "Sp"]

In [21]:
# TODO: test get_book_action
#   Test based on score and dealer upcard. Build 3 matrices and get based on index

In [22]:
# Test do_player_action
# Each case is (action, shoe, hands before, hands after); a hand is written as (cards, active).
# Every case runs on a fresh table whose shoe is replaced by the listed cards.
do_player_action_cases = [
    # Double down gets 1 card, then the hand is set to not active
    ("D", ["Q"], [(("2", "9"), True)], [(("2", "9", "Q"), False)]),
    # Hit takes one card; the hand stays active when the result is under 21
    ("H", ["Q"], [(("2", "2"), True)], [(("2", "2", "Q"), True)]),
    # Hit takes one card; the hand is no longer active when the result is over 21
    ("H", ["Q"], [(("Q", "Q"), True)], [(("Q", "Q", "Q"), False)]),
    # Split aces may receive only one card each, so hitting the first ace makes it
    #   not active and the next hand becomes active
    ("H", ["Q"], [(("A",), True), (("A",), False)], [(("A", "Q"), False), (("A",), True)]),
    # Hitting the last split ace leaves no active hand
    ("H", ["Q"], [(("A",), False), (("A",), True)], [(("A",), False), (("A", "Q"), False)]),
    # Stand sets the hand to not active
    ("S", ["A"], [(("6", "Q"), True)], [(("6", "Q"), False)]),
    # Split creates two hands with 1 card each; the first is active, the second is not
    ("Sp", ["Q"], [(("6", "6"), True)], [(("6",), True), (("6",), False)]),
]

for action, shoe_cards, hands_before, hands_after in do_player_action_cases:
    table = BlackjackTable()
    player = Player(hands=[Hand(cards=cards, active=active) for cards, active in hands_before])
    table.dealer.shoe = shoe_cards

    resulting_player = table.do_player_action(player, action)
    expected_player = Player(hands=[Hand(cards=cards, active=active) for cards, active in hands_after])
    assert resulting_player == expected_player

In [None]:
# Integration test: the agent plays every hand by the book over many shoes.
#   Exercises step, get_possible_actions, get_book_action and do_action, plus
#   reset_game, new_round, get_pct_cards_left_in_shoe, get_active_hand_idx,
#   score_hand and is_end_of_turn.
# Long-running: the recorded run below took roughly 23 minutes.

# Play 1M shoes (~56M rounds, ~86M actions)
round_count = 0
action_count = 0

table = BlackjackTable(num_other_players=0)
for _ in tqdm(range(1_000_000)):
    table.reset_game()

    # Keep dealing rounds until the shoe has been played down to the penetration point
    while table.dealer.get_pct_cards_left_in_shoe() > (1 - table.shoe_penetration):
        table.new_round()
        round_count += 1

        # As long as the agent's turn is not over, play the book action
        while not table.is_end_of_turn(table.agent):
            # Invariant: exactly one of the agent's hands is active
            num_active_hands = sum(hand.active for hand in table.agent.hands)
            assert num_active_hands == 1

            # Invariant: the active hand has not busted
            active_idx = table.get_active_hand_idx(table.agent)
            assert not table.is_hand_busted(table.agent.hands[active_idx])

            # Invariant: no hand is empty
            assert all(len(hand.cards) for hand in table.agent.hands)

            # The book must never recommend an action the table does not allow
            book_action = table.get_book_action(table.agent)
            assert book_action in table.get_possible_actions(table.agent)

            state, reward, done = table.step(book_action)
            action_count += 1

            # Whenever step reports done, the agent's turn must really be over
            assert (not done) or table.is_end_of_turn(table.agent)

print(round_count, action_count)

100%|██████████| 1000000/1000000 [23:04<00:00, 722.51it/s]

56195128 86176974



