<a href="https://colab.research.google.com/github/nellika/RL-tabular/blob/main/RL_Workshop_1_Tabular_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import gym
import matplotlib.pyplot as plt
from tqdm import tqdm
import collections
import imageio

In [None]:
# Create the FrozenLake environment used by every cell below.
# The 4x4 map matches the (4, 4) reshapes in the plotting cells.
env = gym.make(
    'FrozenLake-v1',
    desc=None,
    map_name="4x4",
    is_slippery=True,
    render_mode="rgb_array",
)

In [None]:
# Show the observation (state) space and the action space of `env`.
print(
    f"Observation space: {env.observation_space}",
    f"Action space: {env.action_space}",
    sep="\n",
)

In [4]:
!mkdir Q
!mkdir SARSA

# Q-learning

In [5]:
def learn(state, state2, reward, action, lr_rate):
  """Apply one Q-learning update to the global table `Q` in place.

  Args:
    state: index of the state the action was taken in.
    state2: index of the state reached after the action.
    reward: reward received for the transition.
    action: index of the action that was taken.
    lr_rate: step size applied to the temporal-difference error.

  Reads the globals `Q` and `gamma`; returns nothing.
  """
  # Off-policy bootstrap: value of the best action available in the next state.
  best_next = Q[state2].max()
  td_error = reward + gamma * best_next - Q[state, action]
  Q[state, action] += lr_rate * td_error

In [10]:
def choose_action(state, episode=None):
  """Select an action for `state` from the global table `Q`.

  Args:
    state: index of the current state (row of `Q`).
    episode: optional episode counter. When omitted the choice is purely
      greedy. When given, standard-normal noise scaled by 1 / (episode + 1)
      is added to the action values before the argmax, so the perturbation
      shrinks as training progresses.

  Returns:
    Index of the selected action.
  """
  action_values = Q[state, :]
  if episode is None:
    # Greedy selection: no exploration noise.
    return np.argmax(action_values)
  noise = np.random.randn(1, env.action_space.n) * (1. / (episode + 1))
  # The sum has shape (1, n_actions); argmax works on the flattened array,
  # so the result is still a plain action index.
  return np.argmax(action_values + noise)

In [None]:
# Q-learning training run.
# NOTE(review): this cell assumes the older gym API, where reset() returns the
# observation and step() returns a 4-tuple -- confirm against the installed version.

# Action-value table: one row per state, one column per action.
Q = np.zeros([env.observation_space.n, env.action_space.n])

max_steps = 50                     # cap on the number of steps per episode
num_episodes = 5000
G = collections.deque(maxlen=50)   # summed rewards of the most recent 50 episodes
episode_lengths = []
Q_tables = []                      # copies of Q taken every 500 episodes

gamma = 0.99                       # discount factor used by learn()
lr_rate = 0.85                     # step size passed to learn()

frames = []

for episode in tqdm(range(num_episodes), desc ="Training: ", leave=True):
  state = env.reset()

  episode_length = 0
  reward_sum = 0

  # Every 500th episode is recorded: Q snapshot before it, GIF after it.
  record = episode % 500 == 0
  if record:
    frames = [np.array(env.render())[0]]
    Q_tables.append(Q.copy())

  done = False
  # Stop when the episode terminates OR the step cap is reached.
  # (The previous `or` kept stepping after termination and ignored the cap
  # for episodes that had not terminated yet.)
  while not done and episode_length < max_steps:
    # Return value is discarded; presumably this drains the renderer's frame
    # buffer -- TODO confirm it is still needed.
    env.render()
    action = choose_action(state, episode)
    new_state, reward, done, info = env.step(action)

    learn(state, new_state, reward, action, lr_rate)
    state = new_state

    if record:
      frames.append(np.array(env.render())[0])
    reward_sum += reward
    episode_length += 1

  episode_lengths.append(episode_length)
  G.append(reward_sum)

  if record:
    # Mean summed reward over the (at most 50) most recent episodes.
    tqdm.write("Success rate: " + str(np.mean(G)))
    imageio.mimwrite(f"Q/episode_{episode}.gif", frames, fps=8)

In [None]:
# One heat-map per stored Q snapshot, showing the largest action value of
# every state laid out on the 4x4 grid.
n_snapshots = len(Q_tables)
fig, axs = plt.subplots(1, n_snapshots)
# plt.subplots returns a bare Axes (not an array) when there is only one
# panel; normalise so the indexing below works for any snapshot count.
axs = np.atleast_1d(axs)
for i in range(n_snapshots):
  ax = axs[i]
  max_values = np.array(Q_tables[i].max(axis=1)).reshape((4, 4))
  im = ax.imshow(max_values, cmap='viridis')
  ax.set_title(f"#{i}")  # position of the snapshot in Q_tables
plt.show()

In [None]:
# One panel per stored Q snapshot, showing the index of the highest-valued
# action of every state laid out on the 4x4 grid.
n_snapshots = len(Q_tables)
fig, axs = plt.subplots(1, n_snapshots)
# plt.subplots returns a bare Axes (not an array) when there is only one
# panel; normalise so the indexing below works for any snapshot count.
axs = np.atleast_1d(axs)
for i in range(n_snapshots):
  ax = axs[i]
  # Despite the name, this holds argmax indices (actions), not values.
  max_values = np.argmax(Q_tables[i], axis=1).reshape((4, 4))
  im = ax.imshow(max_values, cmap='viridis')
  ax.set_title(f"#{i}")  # position of the snapshot in Q_tables
plt.show()

# SARSA

In [None]:
def learn(state, new_state, reward, action, new_action):#SARSA
  """Apply one SARSA update to the global table `Q` in place.

  Args:
    state: index of the state the action was taken in.
    new_state: index of the state reached after the action.
    reward: reward received for the transition.
    action: index of the action that was taken in `state`.
    new_action: index of the action already selected for `new_state`.

  Reads the globals `Q`, `gamma` and `lr_rate`; returns nothing.
  """
  # On-policy bootstrap: value of the action that will actually be taken next.
  td_error = reward + gamma * Q[new_state, new_action] - Q[state, action]
  Q[state, action] += lr_rate * td_error

In [37]:
def choose_action(state, episode=None):
  """Select an action for `state` from the global table `Q`.

  Args:
    state: index of the current state (row of `Q`).
    episode: optional episode counter. When omitted the choice is purely
      greedy. When given, standard-normal noise scaled by 1 / (episode + 1)
      is added to the action values before the argmax, so the perturbation
      shrinks as training progresses.

  Returns:
    Index of the selected action.
  """
  action_values = Q[state, :]
  if episode is None:
    # Greedy selection: no exploration noise.
    return np.argmax(action_values)
  noise = np.random.randn(1, env.action_space.n) * (1. / (episode + 1))
  # The sum has shape (1, n_actions); argmax works on the flattened array,
  # so the result is still a plain action index.
  return np.argmax(action_values + noise)

In [None]:
# SARSA training run.
# NOTE(review): this cell assumes the older gym API, where reset() returns the
# observation and step() returns a 4-tuple -- confirm against the installed version.

# Action-value table: one row per state, one column per action.
Q = np.zeros([env.observation_space.n, env.action_space.n])

max_steps = 50                     # cap on the number of steps per episode
num_episodes = 5000
G = collections.deque(maxlen=50)   # summed rewards of the most recent 50 episodes
episode_lengths = []
Q_tables = []                      # copies of Q taken every 500 episodes

gamma = 0.99                       # discount factor used by learn()
lr_rate = 0.85                     # step size used by learn()

frames = []

for episode in tqdm(range(num_episodes), desc ="Training: ", leave=True):
  state = env.reset()
  # SARSA needs the first action chosen before the first step. Without this,
  # `action` is undefined on a fresh kernel or left over from the previous
  # episode / cell.
  action = choose_action(state, episode)

  episode_length = 0
  reward_sum = 0

  # Every 500th episode is recorded: Q snapshot before it, GIF after it.
  record = episode % 500 == 0
  if record:
    frames = [np.array(env.render())[0]]
    Q_tables.append(Q.copy())

  done = False
  # Stop when the episode terminates OR the step cap is reached.
  # (The previous `or` kept stepping after termination and ignored the cap
  # for episodes that had not terminated yet.)
  while not done and episode_length < max_steps:
    # Return value is discarded; presumably this drains the renderer's frame
    # buffer -- TODO confirm it is still needed.
    env.render()

    new_state, reward, done, info = env.step(action)
    new_action = choose_action(new_state, episode)
    learn(state, new_state, reward, action, new_action)
    state = new_state
    action = new_action

    if record:
      frames.append(np.array(env.render())[0])
    reward_sum += reward
    episode_length += 1

  episode_lengths.append(episode_length)
  G.append(reward_sum)

  if record:
    # Mean summed reward over the (at most 50) most recent episodes.
    tqdm.write("Success rate: " + str(np.mean(G)))
    imageio.mimwrite(f"SARSA/episode_{episode}.gif", frames, fps=8)

In [None]:
# One heat-map per stored Q snapshot, showing the largest action value of
# every state laid out on the 4x4 grid.
n_snapshots = len(Q_tables)
fig, axs = plt.subplots(1, n_snapshots)
# plt.subplots returns a bare Axes (not an array) when there is only one
# panel; normalise so the indexing below works for any snapshot count.
axs = np.atleast_1d(axs)
for i in range(n_snapshots):
  ax = axs[i]
  max_values = np.array(Q_tables[i].max(axis=1)).reshape((4, 4))
  im = ax.imshow(max_values, cmap='viridis')
  ax.set_title(f"#{i}")  # position of the snapshot in Q_tables
plt.show()

In [None]:
# One panel per stored Q snapshot, showing the index of the highest-valued
# action of every state laid out on the 4x4 grid.
n_snapshots = len(Q_tables)
fig, axs = plt.subplots(1, n_snapshots)
# plt.subplots returns a bare Axes (not an array) when there is only one
# panel; normalise so the indexing below works for any snapshot count.
axs = np.atleast_1d(axs)
for i in range(n_snapshots):
  ax = axs[i]
  # Despite the name, this holds argmax indices (actions), not values.
  max_values = np.argmax(Q_tables[i], axis=1).reshape((4, 4))
  im = ax.imshow(max_values, cmap='viridis')
  ax.set_title(f"#{i}")  # position of the snapshot in Q_tables
plt.show()