In [8]:
# ========= Reinforcement Learning and Dynamic Optimization ======== #
# -------------Card Counting Black Jack: Phase 1-------------------- #
#
# Michalis Lamprakis  2020030077
# Dimitris Ilia       2020030200

# Action codes understood by the environments' step() method.
HIT, STICK = 0, 1
# Action space: the list position of each action equals its action code.
A = [HIT, STICK]

# Blackjack environment.
# Methods whose names start with "_" are private by convention (they are called only from inside the environment).

class BlackjackEnv:
    """Single-deck Blackjack with two actions (HIT / STICK).

    A fresh 52-card deck is built and shuffled for every hand.
    The observation is the tuple
    ``(player_total, dealer_upcard_value, usable_ace)`` where the up-card is
    valued 2..11 and ``usable_ace`` is 0 or 1.
    ``reset`` returns ``(observation, info)`` and ``step`` returns
    ``(observation, reward, terminated, truncated, info)``; ``truncated`` is
    always ``False`` and ``info`` is always an empty dict.
    """

    def __init__(self):
        """Create the environment and immediately deal the first hand."""
        self.reset()

    def _init_deck(self):
        """Store an ordered 52-card deck of ``(rank, suit)`` tuples in ``self.deck``."""
        suits = ['hearts', 'spades', 'diamonds', 'clubs']
        ranks = ['2', '3', '4', '5', '6', '7', '8', '9', '10', 'J', 'Q', 'K', 'A']
        # Suit-major order: all ranks of the first suit, then the next suit, ...
        self.deck = [(rank, suit) for suit in suits for rank in ranks]

    def _card_value(self, card):
        """Return the Blackjack value of one card (an ace counts as 11 here)."""
        rank = card[0]
        if rank == 'A':
            return 11  # _hand_value() downgrades aces to 1 when required
        if rank in ('J', 'Q', 'K'):
            return 10
        return int(rank)  # numeric ranks are stored as strings

    def _hand_value(self, hand):
        """Return ``(total, usable_ace)`` for a list of cards.

        ``usable_ace`` is True when at least one ace is still counted as 11
        after the total has been brought down to 21 or less where possible.
        """
        total = sum(self._card_value(card) for card in hand)
        aces_as_eleven = sum(1 for card in hand if card[0] == 'A')

        # Re-value aces from 11 to 1, one at a time, while the hand is bust.
        while total > 21 and aces_as_eleven > 0:
            total -= 10
            aces_as_eleven -= 1

        return total, aces_as_eleven > 0

    def reset(self):
        """Shuffle a fresh deck, deal two cards to each side, return ``(obs, {})``."""
        self._init_deck()
        random.shuffle(self.deck)

        deal = self.deck.pop  # takes the top card of the shuffled deck
        self.player = [deal(), deal()]
        self.dealer = [deal(), deal()]

        return self._get_obs(), {}

    def _get_obs(self):
        """Return ``(player_total, dealer_upcard_value, usable_ace as 0/1)``."""
        total, soft = self._hand_value(self.player)
        upcard = self._card_value(self.dealer[0])  # 2..11
        return (total, upcard, int(soft))

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, terminated, False, {})``.

        HIT deals one card to the player: reward -1 and termination on a bust,
        otherwise reward 0 and the hand continues.  Any other action is
        treated as STICK: the dealer draws until reaching at least 17 and the
        hand is settled by ``_terminal_reward``.
        """
        if action != HIT:
            # Player sticks -> the dealer must draw while below 17.
            while self._hand_value(self.dealer)[0] < 17:
                self.dealer.append(self.deck.pop())
            return self._terminal_reward()

        self.player.append(self.deck.pop())
        total, _ = self._hand_value(self.player)
        bust = total > 21
        return self._get_obs(), (-1 if bust else 0), bust, False, {}

    def _terminal_reward(self):
        """Settle the hand: reward +1 (player wins), 0 (draw) or -1 (dealer wins)."""
        player_total = self._hand_value(self.player)[0]
        dealer_total = self._hand_value(self.dealer)[0]

        if dealer_total > 21 or player_total > dealer_total:
            outcome = 1      # dealer bust or player closer to 21
        elif player_total == dealer_total:
            outcome = 0      # push
        else:
            outcome = -1     # dealer closer to 21
        return self._get_obs(), outcome, True, False, {}


In [9]:
# Implement just a random policy
def evaluate_random_policy(n_games=100_000):
    """Play ``n_games`` hands choosing HIT/STICK uniformly at random.

    Returns the tuple ``(win_rate, draw_rate, loss_rate)``.
    """
    env = BlackjackEnv()
    outcomes = [0, 0, 0]  # [wins, draws, losses]
    for _ in range(n_games):
        env.reset()
        finished = False
        while not finished:
            move = random.choice([HIT, STICK])
            _, reward, finished, _, _ = env.step(move)
        # The reward of the final step decides the hand: +1 win, 0 draw, otherwise loss.
        slot = 0 if reward == 1 else (1 if reward == 0 else 2)
        outcomes[slot] += 1
    played = sum(outcomes)
    return outcomes[0] / played, outcomes[1] / played, outcomes[2] / played


In [10]:
# Just a simple threshold policy, hits if the player's total is below a fixed threshold (e.g. 17)

def threshold_policy(player_total, threshold=17):
    """Return HIT while ``player_total`` is below ``threshold``, otherwise STICK."""
    if player_total < threshold:
        return HIT
    return STICK


def evaluate_threshold_policy(threshold=17, n_games=100_000):
    """Play ``n_games`` hands with ``threshold_policy`` and return
    ``(win_rate, draw_rate, loss_rate)``.
    """
    env = BlackjackEnv()
    outcomes = [0, 0, 0]  # [wins, draws, losses]

    for _ in range(n_games):
        obs, _ = env.reset()
        finished = False

        while not finished:
            # obs[0] is the player's current hand total.
            move = threshold_policy(obs[0], threshold)
            obs, reward, finished, _, _ = env.step(move)

        # Final-step reward: +1 win, 0 draw, anything else is a loss.
        slot = 0 if reward == 1 else (1 if reward == 0 else 2)
        outcomes[slot] += 1

    played = sum(outcomes)
    return outcomes[0] / played, outcomes[1] / played, outcomes[2] / played


In [11]:
import numpy as np
from collections import defaultdict
import random
import tqdm                             # This is for the progress bar

# Q table
# A defaultdict creates an entry the first time a state is looked up: the state
# tuple becomes the key and its two action values (index 0 = HIT, 1 = STICK)
# start at 0.0.
# The table holds at most 360 decision states:
#   player's hand total   : 4-21  (18 values; e.g. A+2 counts as 13, not 3)
#   dealer's visible card : 2-11  (10 values)
#   usable ace            : 0/1   (2 values)
#   18 x 10 x 2 = 360 combinations (not every combination can occur in play).
Q = defaultdict(lambda: np.zeros(2, dtype=float))


# Q-learning hyper-parameters
alpha, gamma = 0.1, 1.0          # learning rate; discount factor (1.0 = no discounting)
e_start, e_end = 1.0, 0.05       # initial and minimum exploration rate (epsilon)
e_decay = 100_000.0              # number of episodes over which epsilon falls linearly from e_start to e_end
episodes = 500_000               # number of training episodes for the basic agent



# ε-Greedy Policy
# Balances exploration (random actions) with exploitation (best known actions)
# Starts with 100% exploration (epsilon=1.0), decays over time
def policy(state, eps):
    """Epsilon-greedy action for ``state`` using the global table ``Q``.

    With probability ``eps`` a uniformly random action from ``A`` is returned,
    otherwise the action with the highest Q value (ties resolve to index 0).
    """
    explore = random.random() < eps
    if not explore:
        return int(np.argmax(Q[state]))
    return random.choice(A)

# Training part
def train():
    """Train the global table ``Q`` with tabular Q-learning for ``episodes`` hands.

    Actions come from the epsilon-greedy ``policy``; epsilon starts at
    ``e_start`` and is lowered by a fixed amount after every episode until it
    reaches ``e_end`` (after ``e_decay`` episodes).
    """
    env = BlackjackEnv()
    eps = e_start
    eps_step = (e_start - e_end) / e_decay  # per-episode linear decrement

    for ep in tqdm.tqdm(range(episodes)):
        # A new hand is dealt at the start of every episode.
        state, _ = env.reset()
        terminated = False

        while not terminated:
            action = policy(state, eps)  # HIT or STICK
            next_state, reward, terminated, _, _ = env.step(action)

            # Terminal transitions use the raw reward; otherwise bootstrap
            # from the best action value of the successor state.
            if not terminated:
                td_target = reward + gamma * np.max(Q[next_state])
            else:
                td_target = reward

            q_row = Q[state]
            q_row[action] += alpha * (td_target - q_row[action])
            state = next_state

        eps = max(e_end, eps - eps_step)

# Plays 100,000 games to get statistically performance metrics
# Play with the learned policy
def evaluate(agent_Q, n_games=100_000):
    """Play ``n_games`` hands greedily with respect to ``agent_Q``.

    ``agent_Q`` maps an observation to a pair of action values
    (index 0 = HIT, index 1 = STICK).  Returns
    ``(win_rate, draw_rate, loss_rate)``.
    """
    env = BlackjackEnv()
    outcomes = [0, 0, 0]  # [wins, draws, losses]
    for _ in range(n_games):
        obs, _ = env.reset()
        finished = False
        while not finished:
            greedy = int(np.argmax(agent_Q[obs]))  # index of the largest Q value
            obs, reward, finished, _, _ = env.step(greedy)
        # Final-step reward: +1 win, 0 draw, anything else is a loss.
        slot = 0 if reward == 1 else (1 if reward == 0 else 2)
        outcomes[slot] += 1
    played = sum(outcomes)
    return outcomes[0] / played, outcomes[1] / played, outcomes[2] / played

def play_blackjack_manually():
    """Play one interactive hand of Blackjack against the dealer on the console.

    The player is prompted for 'h' (hit) or 's' (stick) until the hand ends.
    While the hand is running only the dealer's first card is shown; once it
    is over the dealer's full hand and the result are printed.
    """
    env = BlackjackEnv()
    state, _ = env.reset()
    done = False

    # Show the player's starting cards and only the dealer's first card.
    print(f"\nYour hand: {env.player} Your starting hand value: {state[0]}, dealer shows: {env.dealer[0]}")

    # Keyboard answer -> action code passed to env.step() (0 = hit, 1 = stick).
    key_to_action = {'h': 0, 's': 1}
    # Reward of the final step -> message; any other reward is reported as a draw.
    result_text = {1: "You win!", -1: "Dealer wins!"}

    while not done:
        # Surrounding whitespace is ignored so that e.g. "h " is accepted.
        answer = input("Do you want to (h)it or (s)tick? ").strip().lower()
        if answer not in key_to_action:
            print("Invalid input. Please choose 'h' or 's'.")
            continue

        state, reward, done, _, _ = env.step(key_to_action[answer])

        if not done:
            # Only a non-busting hit keeps the hand alive (sticking always ends
            # it), so the dealer's second card stays hidden here.
            print(f"New state: Your hand: {env.player}, hand value = {state[0]}, dealer shows: {env.dealer[0]}")
        else:
            print(f"Final state: Your hand: {env.player}, hand value = {state[0]}, dealer shows: {env.dealer}")
            print(result_text.get(reward, "It's a draw!"))


# Reference basic strategy for a HIT/STICK-only Blackjack game (no double, split or surrender).
# Source chart: https://www.blackjackapprenticeship.com/blackjack-strategy-charts/
# Chart entries that say "double" are mapped to the chart's fallback action,
# because doubling is not available here: "D" (double, otherwise hit) -> HIT,
# "Ds" (double, otherwise stand) -> STICK.
def best_strategy_action(player_total, dealer_card, usable_ace):
    """Return the basic-strategy action (HIT or STICK) for one decision state.

    player_total -- current value of the player's hand
    dealer_card  -- value of the dealer's visible card, 2..11 (11 = ace)
    usable_ace   -- truthy when the hand is soft (an ace is counted as 11)
    """
    if usable_ace:
        # Soft hands
        if player_total >= 19:
            return STICK
        if player_total == 18:
            # Soft 18: stand against 2-8 (the chart's "Ds" cells for 3-6 fall
            # back to stand when doubling is unavailable), hit against 9, 10, ace.
            return STICK if dealer_card <= 8 else HIT
        return HIT

    # Hard hands
    if player_total >= 17:
        return STICK
    if 13 <= player_total <= 16:
        return STICK if dealer_card <= 6 else HIT
    if player_total == 12:
        return STICK if 4 <= dealer_card <= 6 else HIT
    return HIT

# Function that compares our learned policy to actual basic strategy
def compare_to_best_strategy():
    """Print every state where the greedy action of ``Q`` differs from
    ``best_strategy_action``, followed by a summary line.

    Only player totals 12-21 are checked, for every dealer up-card (2-11) and
    both usable-ace values.  The table is read without being modified: a state
    the agent never visited is treated as having all-zero action values
    instead of being inserted into ``Q`` by the lookup.
    """
    mismatches = 0
    total = 0
    unvisited = np.zeros(2)  # stand-in values for states missing from Q

    for player_total in range(12, 22):
        for dealer_card in range(2, 12):
            for usable_ace in [0, 1]:
                state = (player_total, dealer_card, usable_ace)
                # .get() avoids the defaultdict side effect of creating the key.
                agent_action = int(np.argmax(Q.get(state, unvisited)))
                strategy_action = best_strategy_action(player_total, dealer_card, usable_ace)
                if agent_action != strategy_action:
                    print(f"Mismatch: State {state}, Agent: {agent_action}, Strategy: {strategy_action}")
                    mismatches += 1
                total += 1
    print(f"\nTotal mismatches: {mismatches}/{total} ({(mismatches/total)*100:.2f}% off-strategy)")




# w, d, l = evaluate(Q)
# print(f"Win {w:.2%}   Draw {d:.2%}   Lose {l:.2%}")

In [12]:
# ---------------------------------------------------------------------
#  Hi-Lo single-deck environment (inherits almost everything)
# ---------------------------------------------------------------------
class BlackjackHiLoEnv(BlackjackEnv):
    """Blackjack played from a persistent single deck with a Hi-Lo running count.

    Unlike the base environment the deck is NOT rebuilt for every hand: cards
    are dealt from the same shoe until 10 or fewer remain, then the shoe is
    reshuffled and the running count restarts at 0.

    The observation is ``(player_total, dealer_upcard_value, usable_ace,
    count_bucket)`` with ``count_bucket`` 0 = low, 1 = neutral, 2 = high.

    The running count only contains cards the player has seen: the dealer's
    face-down second card is added when the hand ends (player bust) or when
    the dealer starts playing, not when it is dealt.
    """

    def __init__(self):
        self.running_count = 0    # shared across hands until the shoe is shuffled
        self._hole_counted = False
        self.player = []
        self.dealer = []
        self._init_deck()
        random.shuffle(self.deck)
        super().__init__()        # calls the overridden reset(), dealing the first hand

    # ---------- Hi-Lo helpers ----------------------------------------
    @staticmethod
    def _hilo_value(card):
        """Return the Hi-Lo tag of a card: +1 for 2-6, 0 for 7-9, -1 for 10/J/Q/K/A."""
        rank = card[0]
        if rank in ['2', '3', '4', '5', '6']:
            return +1
        elif rank in ['7', '8', '9']:
            return 0
        else:                                   # 10, J, Q, K, A
            return -1

    def _hiLoState(self):
        """Bucket the running count: 2 (high) above +3, 0 (low) below -3, else 1."""
        if   self.running_count >  3: return 2  # High
        elif self.running_count < -3: return 0  # Low
        return 1                                # Neutral

    def _reveal_hole_card(self):
        """Add the dealer's face-down card to the running count exactly once."""
        if not self._hole_counted and len(self.dealer) > 1:
            self.running_count += self._hilo_value(self.dealer[1])
            self._hole_counted = True

    # ---------- deck management --------------------------------------
    def _prepare_deck(self):
        """Shuffle when ≤10 cards remain and wipe the running count."""
        if len(self.deck) <= 10:
            self._init_deck()
            random.shuffle(self.deck)
            self.running_count = 0

    def _reshuffle_mid_hand(self):
        """Rebuild the shoe when it runs empty in the middle of a hand.

        The new shoe holds every card except the ones on the table, and the
        running count restarts from the table cards the player can see.
        """
        on_table = self.player + self.dealer
        self._init_deck()
        self.deck = [card for card in self.deck if card not in on_table]
        random.shuffle(self.deck)

        seen = list(on_table)
        if not self._hole_counted and len(self.dealer) > 1:
            seen.remove(self.dealer[1])         # hole card is still face down
        self.running_count = sum(self._hilo_value(card) for card in seen)

    def _draw(self, counted=True):
        """Pop a card and, unless ``counted`` is False, update the running count.

        A hand can need more cards than the shoe still holds (only 7 cards are
        guaranteed after the initial deal), so an empty shoe is reshuffled
        instead of raising IndexError.
        """
        if not self.deck:
            self._reshuffle_mid_hand()
        card = self.deck.pop()
        if counted:
            self.running_count += self._hilo_value(card)
        return card

    # ---------- overrides --------------------------------------------
    def reset(self):
        """Deal a new hand from the current shoe and return ``(obs, {})``.

        The shoe is only reshuffled when 10 or fewer cards remain.
        """
        # If the previous hand was abandoned before its hole card was shown,
        # count it now: the card has left the shoe either way.
        self._reveal_hole_card()
        self._prepare_deck()

        self.player, self.dealer = [], []
        self._hole_counted = False
        self.player.append(self._draw())
        self.player.append(self._draw())
        self.dealer.append(self._draw())                # up-card: visible
        self.dealer.append(self._draw(counted=False))   # hole card: hidden for now
        return self._get_obs(), {}

    def _get_obs(self):
        """Return ``(player_total, dealer_upcard_value, usable_ace, count_bucket)``."""
        value, usable_ace = self._hand_value(self.player)
        upcard = self._card_value(self.dealer[0])     # 2-11
        return (value, upcard, int(usable_ace), self._hiLoState())

    def step(self, action):
        """Same rules as the base environment, drawing through ``_draw`` so the
        running count follows every visible card.

        Returns ``(obs, reward, terminated, False, {})``.
        """
        if action == HIT:                     # player hits
            self.player.append(self._draw())
            value, _ = self._hand_value(self.player)
            if value > 21:                    # bust: the hand is lost
                self._reveal_hole_card()      # hand over, hole card leaves the shoe
                return self._get_obs(), -1, True, False, {}
            return self._get_obs(), 0, False, False, {}  # hand continues

        # player sticks → dealer turns over the hole card and plays
        self._reveal_hole_card()
        dealer_value, _ = self._hand_value(self.dealer)
        while dealer_value < 17:              # dealer hits until reaching at least 17
            self.dealer.append(self._draw())
            dealer_value, _ = self._hand_value(self.dealer)
        return self._terminal_reward()


In [13]:
# ---------------------------------------------------------------------
#  Card-counting learner
# ---------------------------------------------------------------------
# Action values of the Hi-Lo agent, created on first lookup as [0.0, 0.0]
# (index 0 = HIT, index 1 = STICK) and keyed by the environment's observation tuple.
Q_count = defaultdict(lambda: np.zeros(2, dtype=float))

#what choice we take
def policy_count(state, eps):
    """Epsilon-greedy action for ``state`` using the global table ``Q_count``.

    With probability ``eps`` a uniformly random action from ``A`` is returned,
    otherwise the action with the highest value (ties resolve to index 0).
    """
    explore = random.random() < eps
    if not explore:
        return int(np.argmax(Q_count[state]))
    return random.choice(A)

#start training
def train_count(episodes):
    """Train the global table ``Q_count`` with tabular Q-learning.

    Plays ``episodes`` hands in a ``BlackjackHiLoEnv`` using the epsilon-greedy
    ``policy_count``; epsilon is lowered linearly from ``e_start`` down to
    ``e_end`` after every episode.
    """
    env = BlackjackHiLoEnv()
    eps = e_start
    eps_step = (e_start - e_end) / e_decay  # per-episode linear decrement

    for _ in tqdm.tqdm(range(episodes)):
        state, _ = env.reset()
        done = False
        while not done:
            action = policy_count(state, eps)
            next_state, reward, done, _, _ = env.step(action)

            # Terminal transitions use the raw reward; otherwise bootstrap
            # from the best action value of the successor state.
            if done:
                td_target = reward
            else:
                td_target = reward + gamma * np.max(Q_count[next_state])

            q_row = Q_count[state]
            q_row[action] += alpha * (td_target - q_row[action])
            state = next_state

        eps = max(e_end, eps - eps_step)

#evaluate
def evaluate_count(n_games=1_000_000):
    """Play ``n_games`` hands greedily with respect to ``Q_count`` in a
    ``BlackjackHiLoEnv`` and return ``(win_rate, draw_rate, loss_rate)``.
    """
    env = BlackjackHiLoEnv()
    outcomes = [0, 0, 0]  # [wins, draws, losses]
    for _ in range(n_games):
        obs, _ = env.reset()
        finished = False
        while not finished:
            greedy = int(np.argmax(Q_count[obs]))
            obs, reward, finished, _, _ = env.step(greedy)
        # Final-step reward: +1 win, 0 draw, anything else is a loss.
        slot = 0 if reward == 1 else (1 if reward == 0 else 2)
        outcomes[slot] += 1
    played = sum(outcomes)
    return outcomes[0] / played, outcomes[1] / played, outcomes[2] / played


In [None]:
# ---------------------------------------------------------------------
#  Pretty-print helper for either Q or Q_count(hilo)
# ---------------------------------------------------------------------
def print_q_table(table, count_version=False, max_rows=None):
    """
    Print one line per state of a Q table, sorted by state.

    table          – mapping from state tuple to a pair (Q_hit, Q_stick)
    count_version  – True if the key is (player, upcard, ace, bucket),
                     False for the basic (player, upcard, ace) key
    max_rows       – optional int to limit lines displayed

    The arrow column shows the greedy action.  Ties are shown as HIT, which is
    what np.argmax over (Q_hit, Q_stick) selects because HIT is index 0.
    """
    bucket_names = {0: "Low ", 1: "Neut", 2: "High"}

    keys = sorted(table.keys())
    for shown, key in enumerate(keys, start=1):
        if count_version:
            p, d, ace, bucket = key
            bucket_str = bucket_names[bucket]
            state_str  = f"(P={p:>2}, D={d:>2}, A={ace}, C={bucket_str})"
        else:                                    # basic 3-tuple version
            p, d, ace = key
            state_str = f"(P={p:>2}, D={d:>2}, A={ace})"

        q_hit, q_stick = table[key]
        best = "HIT" if q_hit >= q_stick else "STICK"

        # right-align the whole state column to 26 chars for neatness
        print(f"S={state_str:>26} | "
              f"Q_hit={q_hit:+.3f}  Q_stick={q_stick:+.3f}  -> {best}")

        if max_rows and shown >= max_rows:
            # Only announce a truncation when rows were actually left out.
            if shown < len(keys):
                print("… (truncated) …")
            break


# MENU
def main_menu():
    """Run the interactive console menu until the user selects "Exit".

    Each option calls one of the notebook's play / train / evaluate functions.
    An unknown selection prints a hint and shows the menu again.
    """
    def report(rates):
        # rates is the (win, draw, loss) tuple returned by the evaluate_* functions
        w, d, l = rates
        print(f"Win {w:.2%}  Draw {d:.2%}  Lose {l:.2%}")

    while True:
        print("\n=== BLACKJACK MENU ===")
        print("1. Play against the dealer (manual)")
        print("2. Evaluate random policy")
        print("3. Evaluate threshold policy")
        print("4. Train *basic* Q-agent")
        print("5. Evaluate *basic* agent")
        print("6. Compare to basic-strategy chart")
        print("7. Train *Hi-Lo* counting agent")
        print("8. Evaluate *Hi-Lo* agent")
        print("9. Exit")

        # Surrounding whitespace is ignored so that e.g. "4 " is accepted.
        choice = input("Select an option (1-9): ").strip()
        if choice == '1':
            play_blackjack_manually()
        elif choice == '2':
            report(evaluate_random_policy())
        elif choice == '3':
            report(evaluate_threshold_policy())
        elif choice == '4':
            train()
        elif choice == '5':
            report(evaluate(Q))
        elif choice == '6':
            compare_to_best_strategy()
        elif choice == '7':
            train_count(episodes=1_500_000)
        elif choice == '8':
            report(evaluate_count(n_games=1_000_000))
        elif choice == '9':
            print("Goodbye!")
            break
        else:
            print("Invalid option. Please enter a number from 1 to 9.")

# Start the interactive menu when this cell/script is run as the main module.
if __name__ == "__main__":
    main_menu()
