In [1]:
from __future__ import annotations

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.patches import Patch
from tqdm import tqdm

import gymnasium as gym


In [None]:
# Sanity check: drive CartPole with uniformly random actions for 1000 steps,
# rendering to a window, and restart the episode whenever it ends.
env = gym.make("CartPole-v1", render_mode="human")
observation, info = env.reset()

try:
    for _ in range(1000):
        action = env.action_space.sample()
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info);
        # unpack in that order so the two flags carry the right names.
        observation, reward, terminated, truncated, info = env.step(action)

        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Always release the render window, even if the cell is interrupted.
    env.close()

In [2]:
# Same random-policy smoke test as above, on the Atari Bowling environment.
env = gym.make("ALE/Bowling-v5", render_mode="human")
observation, info = env.reset()

try:
    for _ in range(1000):
        action = env.action_space.sample()
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info);
        # unpack in that order so the two flags carry the right names.
        observation, reward, terminated, truncated, info = env.step(action)

        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Interrupting the cell (e.g. with KeyboardInterrupt) must still close the
    # environment, otherwise the render window is left open.
    env.close()

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
  if not isinstance(terminated, (bool, np.bool8)):


KeyboardInterrupt: 

In [3]:
# Display the last observation left behind by the rollout cell above.
observation

array([[[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       ...,

       [[180, 122,  48],
        [180, 122,  48],
        [180, 122,  48],
        ...,
        [180, 122,  48],
        [180, 122,  48],
        [180, 122,  48]],

       [[180, 122,  48],
        [180, 122,  48],
        [180, 122,  48],
        ...,
        [180, 122,  48],
        [180, 122,  48],
        [180, 122,  48]],

       [[180, 122,  48],
        [180, 122,  48],
        [180, 122,  48],
        ...,
        [180, 122,  48],
        [180, 122,  48],
        [180, 122,  48]]

In [4]:
# Display the info dict returned by the most recent step()/reset() call.
info

{'lives': 0, 'episode_frame_number': 1024, 'frame_number': 1024}

# Observing the Environment

In [13]:
# Create the Blackjack environment; this rebinds `env`, which the cells below use.
env = gym.make("Blackjack-v1")

In [14]:
# Begin a fresh episode: grab the initial observation and clear the done flag.
observation, info = env.reset()
done = False



In [15]:
# Take one random action and look at what step() hands back.
action = env.action_space.sample()
observation, reward, terminated, truncated, info = env.step(action)

  if not isinstance(terminated, (bool, np.bool8)):


# Epsilon-Greedy Strategy


In [17]:
class BlackjackAgent:
    """Tabular Q-learning agent with an epsilon-greedy exploration policy."""

    def __init__(
        self,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
        env=None,
    ):
        """Initialize a Reinforcement Learning agent with an empty dictionary
        of state-action values (q_values), a learning rate and an epsilon.

        Args:
            learning_rate: Step size applied to each temporal-difference update
            initial_epsilon: The initial epsilon value
            epsilon_decay: Amount subtracted from epsilon by ``decay_epsilon``
            final_epsilon: Lower bound that epsilon never drops below
            discount_factor: The discount factor for computing the Q-value
            env: Environment whose ``action_space`` the agent acts in. When
                omitted, the module-level ``env`` variable is used, which keeps
                existing call sites working.

        Raises:
            NameError: If ``env`` is omitted and no module-level ``env`` exists.
        """
        if env is None:
            # Backward-compatible fallback to the notebook-global environment.
            env = globals().get("env")
            if env is None:
                raise NameError(
                    "BlackjackAgent needs an environment: pass env=... or "
                    "define a module-level `env` first"
                )

        # Capture the action space once so the agent no longer depends on
        # whatever the global `env` happens to be bound to later on.
        self.action_space = env.action_space
        n_actions = self.action_space.n

        # Unseen states start with a zero value for every action.
        self.q_values = defaultdict(lambda: np.zeros(n_actions))

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        # One temporal-difference error is appended per call to update().
        self.training_error = []

    def get_action(self, obs: tuple[int, int, bool]) -> int:
        """
        Returns the best action with probability (1 - epsilon)
        otherwise a random action with probability epsilon to ensure exploration.
        """
        # with probability epsilon return a random action to explore the environment
        if np.random.random() < self.epsilon:
            return int(self.action_space.sample())

        # with probability (1 - epsilon) act greedily (exploit)
        return int(np.argmax(self.q_values[obs]))

    def update(
        self,
        obs: tuple[int, int, bool],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[int, int, bool],
    ):
        """Updates the Q-value of an action.

        Moves ``q_values[obs][action]`` towards
        ``reward + discount_factor * max(q_values[next_obs])`` by a fraction
        ``lr`` of the difference, and records that difference in
        ``training_error``.

        Args:
            obs: Observation the action was taken in
            action: Index of the action that was taken
            reward: Reward received for the transition
            terminated: Whether the transition ended the episode; if so the
                future value is treated as zero
            next_obs: Observation reached after the action
        """
        # A terminal transition has no future value to bootstrap from.
        future_q_value = (not terminated) * np.max(self.q_values[next_obs])
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[obs][action]
        )

        self.q_values[obs][action] = (
            self.q_values[obs][action] + self.lr * temporal_difference
        )
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        """Lower epsilon by ``epsilon_decay``, never going below ``final_epsilon``."""
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)

In [18]:
# --- training hyperparameters ---
n_episodes = 100_000
learning_rate = 0.01

# Exploration schedule: epsilon starts at `start_epsilon`, is lowered by
# `epsilon_decay` after every episode and is floored at `final_epsilon`.
start_epsilon = 1.0
final_epsilon = 0.1
epsilon_decay = start_epsilon / (n_episodes / 2)

# Arguments are passed in the constructor's declared order:
# learning rate, initial epsilon, epsilon decay, final epsilon.
agent = BlackjackAgent(
    learning_rate,
    start_epsilon,
    epsilon_decay,
    final_epsilon,
)

In [8]:
from collections import deque
# Import from gymnasium (aliased as `gym` in this notebook), not the legacy
# `gym` package, so the wrapper matches the environments created above.
from gymnasium.wrappers import RecordEpisodeStatistics

In [19]:
# Re-running this cell must not stack a new statistics wrapper on top of the
# one from the previous run: peel the old one off, then wrap again so the
# statistics buffers start empty and are sized for the current n_episodes.
if isinstance(env, gym.wrappers.RecordEpisodeStatistics):
    env = env.env
env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)

for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done = False

    # play one episode
    while not done:
        action = agent.get_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)

        # update the agent; only `terminated` (not `truncated`) zeroes the
        # bootstrapped future value inside agent.update
        agent.update(obs, action, reward, terminated, next_obs)

        # the episode ends on either flag; otherwise continue from next_obs
        done = terminated or truncated
        obs = next_obs

    # exploration is reduced once per episode, not once per step
    agent.decay_epsilon()

100%|██████████| 100000/100000 [00:08<00:00, 11628.98it/s]


In [None]:
# Scratch check: build a new defaultdict whose missing keys default to a zero
# array with one entry per action; displaying it shows that it starts out empty.
defaultdict(lambda: np.zeros(env.action_space.n))

defaultdict(<function __main__.<lambda>()>, {})