# Environment Basics

In [1]:
!pip install gymnasium==0.27.0
!pip install tqdm



In [2]:
import gymnasium as gym
import seaborn as sns
from tqdm import tqdm
import numpy as np

In [12]:
env = gym.make("Blackjack-v1", sab=True, render_mode="rgb_array")

### Observing the environment

In [13]:
# Resets the environment to get the first observation
done = False
observation, info = env.reset()

#observation = (16, 9, False)


#observation consistes of tuple with 3 values
#1. The players current sum
#2. Values of the dealers face-up card
#3. Boolean whether the player holds a usable ace (it is usuable if it counts as 11 without busting)

In [14]:
#sample a random action  
action = env.action_space.sample()

#execute the actions in our environment and receiece info after taking the steo
observation, reward, terminated,truncated, info = env.step(action)

#observation=(24,10,False)
#reward=-1.0
#terminated=True
#truncated=False
#info={}


# Epsilon-Greedy Strategy

In [15]:
class BlackjackAgent:
    def __init__(
        self,
        learning_rate:float,
        initial_epsilon:float,
        epsilon_decay:float,
        final_epsilon:float,
        discount_factor:float = 0.95
    ):
        """
        Initialize a RL agent with empty dictionary of state-action value (q_values), a learning rate and an epsilon

        discount_factor: The discount factor for computing the Q-value
        """
        self.q_values = (lambda:np.zeroes(env.action_space.n))
        self.lr= learning_rate
        self.discount_factor=dicount_factor
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        
        self.training_error = []
        
    def get_action(self, obs: tuple[int, int, bool]) -> int:
        """
        Returns the best action with probability (1 - epsilon)
        otherwise a random action with probability epsilon to ensure exploration.
        """
        if np.random.random() < self.epsilon:
            return env.action_space.sample()
        else:
            return int(np.argmax(self.q_values[obs]))
                       
    def update(
        self,
        obs: tuple[int, int, bool],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[int, int, bool],
    ):
        """Updates the Q-value of an action."""
        future_q_value = (not terminated) * np.max(self.q_values[next_obs])
        temporal_difference = (reward + self.discount_factor*future_q_value - self.q_values[obs][action])
                       
        self.q_values[obs][action] = (self.q_values[obs][action] + self.lr*temporal_difference)
        self.training_error.append(temporal_difference)
                       
    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon-epsilon_decay)
        

In [16]:
class BlackjackAgent:
    def __init__(
        self,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ):
        """Initialize a Reinforcement Learning agent with an empty dictionary
        of state-action values (q_values), a learning rate and an epsilon.

        Args:
            learning_rate: The learning rate
            initial_epsilon: The initial epsilon value
            epsilon_decay: The decay for epsilon
            final_epsilon: The final epsilon value
            discount_factor: The discount factor for computing the Q-value
        """
        self.q_values = (lambda: np.zeros(env.action_space.n))

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        self.training_error = []

    def get_action(self, obs: tuple[int, int, bool]) -> int:
        """
        Returns the best action with probability (1 - epsilon)
        otherwise a random action with probability epsilon to ensure exploration.
        """
        # with probability epsilon return a random action to explore the environment
        if np.random.random() < self.epsilon:
            return env.action_space.sample()

        # with probability (1 - epsilon) act greedily (exploit)
        else:
            return int(np.argmax(self.q_values[obs]))

    def update(
        self,
        obs: tuple[int, int, bool],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[int, int, bool],
    ):
        """Updates the Q-value of an action."""
        future_q_value = (not terminated) * np.max(self.q_values[next_obs])
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[obs][action]
        )

        self.q_values[obs][action] = (
            self.q_values[obs][action] + self.lr * temporal_difference
        )
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon - epsilon_decay)

In [17]:
# hyperparameters
learning_rate = 0.01
n_episodes = 100_000
start_epsilon = 1.0
epsilon_decay = start_epsilon / (n_episodes / 2)  # reduce the exploration over time
final_epsilon = 0.1

agent = BlackjackAgent(
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)

In [19]:
from collections import deque
from gym.wrappers import RecordEpisodeStatistics

In [20]:
env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)
for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done = False

    # play one episode
    while not done:
        action = agent.get_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)

        # update the agent
        agent_.update(obs, action, reward, terminated, next_obs)

        # update if the environment is done and the current obs
        done = terminated or truncated
        obs = next_obs

    agent.decay_epsilon()

  0%|          | 0/100000 [00:00<?, ?it/s]


ValueError: Attempted to add episode stats when they already exist