<a href="https://colab.research.google.com/github/macgoral/hcai_exercises/blob/main/QLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install gymnasium pygame numpy --quiet

import numpy as np
import gymnasium as gym
# Create the FrozenLake environment with the slippery option enabled.
env = gym.make("FrozenLake-v1", is_slippery=True)
#env = gym.make("Taxi-v3")

# Both the observation space and the action space expose their size via `.n`.
n_observations, n_actions = env.observation_space.n, env.action_space.n

# The Q-table has one row per observation and one column per action,
# and every entry starts at zero.
Q_table = np.zeros(shape=(n_observations, n_actions))
print(Q_table)



[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [None]:


# Training length: how many episodes to run.
n_episodes = 1_000

# Upper bound on the number of transitions within a single episode.
max_traj_per_episode = 1_000

# Exploration probability for epsilon-greedy; starts fully exploratory.
epsilon = 1

# Rate constant of the exponential epsilon schedule (applied once per episode).
epsilon_decay = 5e-2

# Floor below which the exploration probability is never reduced.
min_epsilon = 1e-9

# Discount factor applied to future rewards.
gamma = 0.99

# Learning rate (step size) of the Q-value update.
lr = 1e-2

# Reward bookkeeping containers, both initially empty.
total_rewards_episode, rewards_per_episode = [], []


# Train for a fixed number of episodes using tabular Q-learning.
for e in range(n_episodes):
    # Every episode starts from a freshly reset environment
    # (Gymnasium returns (observation, info)).
    current_state, info = env.reset()
    done = False

    # Sum of the individual rewards the agent collects in this episode.
    total_episode_reward = 0

    for i in range(max_traj_per_episode):
        # Epsilon-greedy: explore with probability epsilon, otherwise exploit.
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            # Break ties between equally valued actions at random. A plain
            # np.argmax returns the first maximum, so with an all-zero Q-table
            # the greedy policy keeps choosing action 0 and effectively stops
            # exploring once epsilon has decayed.
            state_values = Q_table[current_state, :]
            best_actions = np.flatnonzero(state_values == state_values.max())
            action = int(np.random.choice(best_actions))

        # Perform the selected action on the environment and get
        # (1) the next state we are landing in,
        # (2) the reward of the last action,
        # (3) whether the episode ended.
        # Gymnasium returns: observation, reward, terminated, truncated, info
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Q-learning update for the last transition. A terminated episode has
        # no future return, so we only bootstrap from the next state when the
        # episode did not terminate (a merely truncated episode still
        # bootstraps).
        future_value = 0.0 if terminated else np.max(Q_table[next_state, :])
        td_target = reward + gamma * future_value
        Q_table[current_state, action] += lr * (
            td_target - Q_table[current_state, action]
        )

        # SARSA
        # next_action = np.argmax(Q_table[next_state, :]) if np.random.uniform(0,1) > epsilon else env.action_space.sample()
        # Q_table[current_state, action] = Q_table[current_state, action] + lr * (
        #     reward + gamma * Q_table[next_state, next_action] - Q_table[current_state, action]
        # )

        # After each step, add the reward of the last action (if any) to the
        # episode reward.
        total_episode_reward = total_episode_reward + reward

        # If the last action led to the end of the episode, stop and continue
        # with a new run.
        if done:
            break
        current_state = next_state

    # Reduce epsilon after every episode to slowly move from exploration to
    # exploitation, never going below min_epsilon.
    epsilon = max(min_epsilon, np.exp(-epsilon_decay * e))

    # At the end of every episode, store the cumulative reward that we got.
    rewards_per_episode.append(total_episode_reward)

    # Print the current epsilon every 1000 episodes as a progress indicator.
    if e % 1000 == 0:
        print(epsilon)



# At the end of training, show how the Q-values have been updated.
print(Q_table)

# Report the mean episode reward in windows of 1000 episodes. The final window
# may be shorter, so runs whose length is not a multiple of 1000 (or is below
# 1000) are still reported instead of being silently dropped.
report_interval = 1000
print("Mean rewards")
for start in range(0, len(rewards_per_episode), report_interval):
    window = rewards_per_episode[start:start + report_interval]
    print(start + len(window), ": mean episode reward: ", np.mean(window))


1.0
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Mean rewards
1000 : mean episode reward:  0.0
