In [1]:
import numpy as np
import gymnasium as gym

In [2]:
class Agent:
    """
    A simple Monte Carlo agent for learning the state value function in Blackjack.

    Attributes:
        V (dict): A dictionary that maps state tuples to their estimated values.
        returns (dict): A dictionary that collects lists of returns for each state.
        states_visited (dict): A dictionary that tracks whether a state has been visited in an episode.
        memory (list): A list used to store state and reward for each step of the episode.
        gamma (float): The discount factor used in calculating returns.
    """
    def __init__(self, gamma=0.99):
        """
        Initializes the Agent object with a discount factor and default state space definitions.

        Args:
            gamma (float): The discount factor for the Monte Carlo learning, defaults to 0.99.
        """
        self.V = {}
        self.sum_space = [i for i in range(4, 22)]
        self.dealer_show_card_space = [i for i in range(1, 11)]
        self.ace_space = [False, True]
        self.action_space = [0, 1]
        self.state_space = []
        
        self.returns = {}
        self.states_visited = {}
        self.memory = []
        self.gamma = gamma

        self.init_vals()

    def init_vals(self):
        """
        Initializes the value function, returns, states visited, and state space for all possible states
        in a game of Blackjack.
        """
        for total in self.sum_space:
            for card in self.dealer_show_card_space:
                for ace in self.ace_space:
                    self.V[(total, card, ace)] = 0
                    self.returns[(total, card, ace)] = []
                    self.states_visited[(total, card, ace)] = 0
                    self.state_space.append((total, card, ace))

    def policy(self, state):
        """
        Defines the policy under which the agent acts. The policy is simple:
        hit if the total is less than 20, otherwise stand.

        Args:
            state (tuple): The current state tuple (total, dealer's card, has_ace).

        Returns:
            int: The action to take, where 0 is stand and 1 is hit.
        """
        total, _, _ = state
        action = 0 if total >= 20 else 1
        return action

    def update_V(self):
        """
        Updates the value estimates V for all states based on the returns obtained from completed episodes.
        It uses first-visit Monte Carlo method for updating.
        """
        for idt, (state, _) in enumerate(self.memory):
            G = 0
            if self.states_visited[state] == 0:
                self.states_visited[state] += 1
                discount = 1  # gamma ^ 0 == 1
                for t, (_, reward) in enumerate(self.memory[idt:]):
                    G += reward * discount
                    discount *= self.gamma
                    self.returns[state].append(G)

        for state, _ in self.memory:
            self.V[state] = np.mean(self.returns[state])

        for state in self.state_space:
            self.states_visited[state] = 0

        self.memory = []

In [3]:
def main():
    """
    Main function to create the environment, instantiate the agent, and run multiple episodes
    to learn the value function.

    It uses the `gymnasium` Blackjack environment and runs for a specified number of episodes.
    """
    env = gym.make("Blackjack-v1")
    agent = Agent()
    n_episodes = 500000

    for i in range(n_episodes):
        if i % 50000 == 0:
            print("Starting episode:", i)

        observation, info = env.reset()
        terminated, truncated = False, False

        while not terminated or truncated:
            action = agent.policy(observation)
            observation_, reward, terminated, truncated, info = env.step(action)
            agent.memory.append((observation, reward))
            observation = observation_

        agent.update_V()

    print("Likely Win State:", agent.V[21, 3, True])
    print("Likely Lose State:", agent.V[4, 1, False])

In [4]:
main()

Starting episode: 0
Starting episode: 50000
Starting episode: 100000
Starting episode: 150000
Starting episode: 200000
Starting episode: 250000
Starting episode: 300000
Starting episode: 350000
Starting episode: 400000
Starting episode: 450000
Likely Win State: 0.9691252144082333
Likely Lose State: -0.18684704286783044
