In [3]:
import sys
sys.path.append('../src/')

from blackjack_utils import Hand, Decks, dealer_simulation

import numpy as np
import gymnasium as gym
from gymnasium import spaces

Making environment for blackjack training

Action space:
4: 0 - hit, 1 - stand, 2 - split, 3 - double down

Observation space:

All hands <= 21:
04: 1 - 2 pair, 05: 1 - 5 sum, 06: 2 - 3 pair, 6 sum, 07: 1 - 7 sum, 08: 2 - 4 pair, 8 sum, 09: 1 - 9 sum, 10: 2 - 5 pair, 10 sum, 11: 1 - 11 sum, 12: 3 - 6 pair, Ace pair, 12 sum, 13: 2 - Ace 2, 13 sum, 14: 3 - 7 pair, Ace 3, 14 sum, 15: 2 - Ace 4, 15 sum, 16: 3 - 8 pair, Ace 5, 16 sum, 17: 2 - Ace 6, 17 sum, 18: 3 - 9 pair, Ace 7, 18 sum, 19: 2 - Ace 8, 19 sum, 20: 2 - 10/Face pair, Ace 9

Total 33 hands

Total 10 possible dealer up cards


Hand assignment table:
- 1: 5 sum
- 2: 6 sum
- 3: 7 sum
- 4: 8 sum
- 5: 9 sum
- 6: 10 sum
- 7: 11 sum
- 8: 12 sum
- 9: 13 sum
- 10: 14 sum
- 11: 15 sum
- 12: 16 sum
- 13: 17 sum
- 14: 18 sum
- 15: 19 sum
- 16: Ace 2
- 17: Ace 3
- 18: Ace 4
- 19: Ace 5
- 20: Ace 6
- 21: Ace 7
- 22: Ace 8
- 23: Ace 9
- 24: 2 pair
- 25: 3 pair
- 26: 4 pair
- 27: 5 pair
- 28: 6 pair
- 29: 7 pair
- 30: 8 pair
- 31: 9 pair
- 32: 10/Face pair
- 33: Ace pair

Dealer card table:
- 1: 2
- 2: 3
- 3: 4
- 4: 5
- 5: 6
- 6: 7
- 7: 8
- 8: 9
- 9: 10/Face
- 10: Ace

Subtract 1 from the dealer table number to get the zero-based observation index (2 → 0, ..., 10/Face → 8, Ace → 9).

USING ACTION MASK FOR WHEN SPLIT/DOUBLE CANNOT OCCUR

For just hit, stand, double down (no split):

hand assignment table:
- 1: 4 sum
- 2: 5 sum
- 3: 6 sum
- 4: 7 sum
- 5: 8 sum
- 6: 9 sum
- 7: 10 sum
- 8: 11 sum
- 9: 12 sum
- 10: 13 sum
- 11: 14 sum
- 12: 15 sum
- 13: 16 sum
- 14: 17 sum
- 15: 18 sum
- 16: 19 sum
- 17: 20 sum
- 18: Ace 2
- 19: Ace 3
- 20: Ace 4
- 21: Ace 5
- 22: Ace 6
- 23: Ace 7
- 24: Ace 8
- 25: Ace 9

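Subtract 1 from the hand table number to get the zero-based observation index (4 sum → 0, ..., 20 sum → 16, Ace 2 → 17, ..., Ace 9 → 24).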

- action space: 3 - 0 hit, 1 stand, 2 double
- observation: 25 hands, 10 dealer

In [153]:
class BlackjackEnv(gym.Env):
    """Single-player blackjack environment limited to hit / stand / double down.

    Actions (``Discrete(3)``): 0 = hit, 1 = stand, 2 = double down.

    Observations (``Tuple(Discrete(25), Discrete(10))``), both zero-based:

    * hand index   -- 0..16 for hard totals 4..20 and 17..24 for soft totals
      13..20 ("Ace 2" .. "Ace 9"). A pair of aces (soft 12) shares the index
      of hard 12. Totals of 21 or more have no slot of their own and are
      clamped onto the highest index of their category so that every
      observation stays inside ``observation_space``.
    * dealer index -- 0..8 for an upcard of 2..10/face and 9 for an ace.

    Every ``info`` dict carries an ``action_mask`` (``np.int8`` array of
    length 3); double down is only allowed as the first action of a round.

    Rewards are 0 while the round is running and, once it ends, +1 / -1 /
    0 (win / loss / tie) times the bet multiplier (2 after a double down).

    Cards are expected to expose ``.value`` with an ace valued 11, and hands
    are expected to be indexable and to expose ``.value()``.
    NOTE(review): ``Hand``, ``Decks`` and ``dealer_simulation`` come from
    ``blackjack_utils`` and are not visible here -- confirm these assumptions.
    NOTE(review): ``metadata`` advertises ``rgb_array`` rendering but this
    class defines no ``render`` method.
    """

    metadata = {
        "render_modes": ["rgb_array"]
    }

    # Layout of the 25 zero-based hand indices.
    _HARD_MIN_TOTAL = 4    # hard 4 (pair of twos) -> index 0
    _HARD_MAX_INDEX = 16   # hard 20 -> index 16
    _SOFT_MIN_TOTAL = 13   # soft 13 ("Ace 2") is the first soft slot
    _SOFT_MIN_INDEX = 17
    _SOFT_MAX_INDEX = 24   # soft 20 ("Ace 9")
    _ACE_VALUE = 11
    _DEALER_MIN_VALUE = 2  # upcard 2 -> index 0

    def __init__(self, num_decks=6, reshuffle_threshold=78):
        """Create the spaces and a freshly shuffled shoe.

        Args:
            num_decks: Number of decks in the shoe.
            reshuffle_threshold: ``reset`` replaces the shoe once this many
                cards or fewer remain in it.
        """
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Tuple((spaces.Discrete(25), spaces.Discrete(10)))

        self.num_decks = num_decks
        self.reshuffle_threshold = reshuffle_threshold
        self.shoe = self._new_shoe()

    def _new_shoe(self):
        """Return a new, shuffled shoe of ``self.num_decks`` decks."""
        shoe = Decks(self.num_decks)
        shoe.shuffle()
        return shoe

    def _get_hand_num(self, hand):
        """Map a player hand of any length to its zero-based hand index.

        The total is recomputed from the individual card values so that the
        hand is classified as soft only while an ace is still counted as 11,
        regardless of how many cards have been drawn.
        """
        # Iterating relies on the same integer indexing the hand already
        # supports (hand[0], hand[1], ...).
        values = [card.value for card in hand]
        total = sum(values)
        soft_aces = values.count(self._ACE_VALUE)

        # Demote aces from 11 to 1 for as long as the hand would otherwise bust.
        while total > 21 and soft_aces:
            total -= 10
            soft_aces -= 1

        if soft_aces and total >= self._SOFT_MIN_TOTAL:
            index = total - self._SOFT_MIN_TOTAL + self._SOFT_MIN_INDEX
            # Soft 21 has no slot of its own; fold it into "Ace 9".
            return min(index, self._SOFT_MAX_INDEX)

        # Hard totals, plus a pair of aces (soft 12), which shares hard 12's slot.
        index = total - self._HARD_MIN_TOTAL
        # 21 and busted totals have no slot of their own; fold them into hard 20.
        return max(0, min(index, self._HARD_MAX_INDEX))

    def _get_dealer_num(self, card):
        """Map the dealer upcard to its zero-based index (2 -> 0, ..., ace -> 9)."""
        return card.value - self._DEALER_MIN_VALUE

    def _action_mask(self):
        """Return the legal-action mask; double down is legal only as the first action."""
        if self.first_action:
            return np.ones(3, dtype=np.int8)

        return np.array([1, 1, 0], dtype=np.int8)

    def _get_obs(self):
        """Return the current ``(hand index, dealer index)`` observation."""
        return (
            self._get_hand_num(self.player_hand),
            self._get_dealer_num(self.dealer_upcard)
        )

    def reset(self, *, seed=None, options=None):
        """Deal a new round and return ``(observation, info)``.

        Args:
            seed: Forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
                NOTE(review): the shoe is shuffled by ``Decks.shuffle``, whose
                random source is not visible here, so this may not make the
                deal reproducible -- confirm.
            options: Accepted for Gymnasium API compatibility; unused.
        """
        super().reset(seed=seed)

        # Replace the shoe once it has run low.
        if len(self.shoe) <= self.reshuffle_threshold:
            self.shoe = self._new_shoe()

        self.player_hand = Hand()
        self.dealer_hand = Hand()

        self.first_action = True
        self.round_done = False
        self.multiplier = 1

        # Alternate cards between player and dealer, two each.
        for _ in range(2):
            self.shoe.deal(self.player_hand)
            self.shoe.deal(self.dealer_hand)

        self.dealer_upcard = self.dealer_hand[0]

        return self._get_obs(), {'action_mask': self._action_mask()}

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, terminated, truncated, info)``.

        The round ends on stand, on double down (exactly one extra card), or
        as soon as the player's total reaches 21 or more. ``truncated`` is
        always ``False``.
        """
        if action == 0:  # Hit
            self.shoe.deal(self.player_hand)
        elif action == 1:  # Stand
            self.round_done = True
        elif action == 2:  # Double down: one card, doubled stake, round over
            self.shoe.deal(self.player_hand)
            self.multiplier = 2
            self.round_done = True

        self.first_action = False

        # 21 leaves nothing to decide and anything above it is a bust.
        if self.player_hand.value() >= 21:
            self.round_done = True

        reward = 0

        if self.round_done:
            dealer_simulation(self.dealer_hand, self.shoe)

            player_total = self.player_hand.value()
            dealer_total = self.dealer_hand.value()

            if player_total > 21:
                # A player bust loses even if the dealer busts as well.
                reward = -1 * self.multiplier
            elif dealer_total > 21 or dealer_total < player_total:
                reward = 1 * self.multiplier  # Payout for win
            elif dealer_total > player_total:
                reward = -1 * self.multiplier
            # Otherwise it is a tie and the reward stays 0.

        return self._get_obs(), reward, self.round_done, False, {'action_mask': self._action_mask()}


In [165]:
# Baseline: average reward per round of a uniformly random policy that only
# picks actions allowed by the environment's action mask.
env = BlackjackEnv()

episodes = 10000
total_rewards = 0

for _ in range(episodes):
    # Start a new round; only the info dict (legal-action mask) is needed here.
    info = env.reset()[1]
    terminated = False

    while not terminated:
        # Sample uniformly among the actions the mask marks as legal.
        action = env.action_space.sample(info["action_mask"])

        _, reward, terminated, _, info = env.step(action)
        total_rewards = total_rewards + reward

# Reported as average reward per round, scaled by 100.
avg_reward = total_rewards / episodes
print(f'{avg_reward * 100:.2f}')

-49.21
