In [2]:
%%capture
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb

!pip install pyvirtualdisplay
!pip install pygame # UI for gym envs

!pip install huggingface_hub
!pip install pickle5
!pip install imageio imageio_ffmpeg

# Virtual display: create and start a pyvirtualdisplay Display.
# NOTE(review): presumably needed so gym rendering works on a headless
# machine (xvfb is installed at the top of this cell) - confirm.
from pyvirtualdisplay import Display

# visible=0 requests a non-visible display; size is the screen resolution in pixels.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

import numpy as np
import gym
import random
import imageio
import os

import pickle5 as pickle

In [3]:
!pip install pyyaml==6.0 # avoid key error metadata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [71]:
# Build the 4x4 FrozenLake environment with slipping disabled (is_slippery=False).
env = gym.make("FrozenLake-v0", map_name="4x4", is_slippery=False)
env.reset()  # the returned initial observation is discarded here

# Sizes of the discrete observation and action spaces; they fix the Q-table shape.
qstates = env.observation_space.n
qactions = env.action_space.n

# Q-table: one row per state, one column per action, initialised to zeros.
qtable = np.zeros((qstates, qactions)) 

In [72]:
def greedy_policy(qtable, state, eps_greedy=False, epsilon=0.01):
  """Choose an action for ``state`` from the Q-table, optionally epsilon-greedily.

  Args:
    qtable: 2-D array indexed as ``qtable[state][action]``.
    state: Row index of the current state.
    eps_greedy: When True, a uniformly random action is returned with
      probability ``epsilon``; when False the choice is purely greedy.
    epsilon: Exploration probability. Only read when ``eps_greedy`` is True,
      so callers may pass ``None`` for a purely greedy policy.

  Returns:
    The index of the selected action.
  """
  if eps_greedy and np.random.rand() < epsilon:
    # Explore: sample uniformly over the actions available in this row of
    # the table. Deriving the action count from the table (instead of a
    # global ``env``) keeps the function self-contained and puts all
    # randomness under the single ``np.random`` seed.
    return np.random.randint(len(qtable[state]))
  # Exploit: np.argmax returns the first index among tied maxima.
  return np.argmax(qtable[state])

In [73]:
# Hyperparameters:
n_training_episodes = 10_000  # NOTE(review): not referenced by the visible training cell, which passes n_episodes=100

Learning_rate = 0.95          # step size applied to the temporal-difference error in the Q-update
Gamma = 0.99                  # reward discount rate
Max_steps = 99                # max steps by agent per episode

n_eval_episodes = 100         # number of episodes used to evaluate the learned Q-table
env_id = "FrozenLake-v0"      # NOTE(review): the gym.make call above repeats this literal instead of using env_id



In [74]:
def perform(env, max_steps, qtable, n_episodes, train=True, min_eps=0.05, max_eps=1.0, decay_rate=0.000_5, verbose=True):
  """Run ``n_episodes`` episodes, either training the Q-table or evaluating it.

  Relies on the module-level ``greedy_policy`` function and, in training
  mode, on the module-level ``Learning_rate`` and ``Gamma`` constants.

  Args:
    env: Environment exposing ``reset() -> state`` and
      ``step(action) -> (new_state, reward, done, info)``.
    max_steps: Maximum number of steps taken in one episode.
    qtable: 2-D array indexed as ``qtable[state][action]``. In training mode
      it is updated IN PLACE; no copy is made.
    n_episodes: Number of episodes to run.
    train: True to learn with an epsilon-greedy policy, False to evaluate
      with the purely greedy policy.
    min_eps: Lower bound for epsilon during training.
    max_eps: Initial epsilon during training.
    decay_rate: Amount subtracted from epsilon at the start of every
      training episode (linear decay, clipped at ``min_eps``).
    verbose: When True, print the whole Q-table every time an episode ends
      with a reward of exactly 1.0. Pass False to silence that output.

  Returns:
    In training mode, the same ``qtable`` object that was passed in.
    In evaluation mode, the mean total reward per episode (0 when
    ``n_episodes`` is 0).
  """
  if train:
    epsilon = max_eps
  else:
    mean_rew = 0

  for episode in range(n_episodes):
    if train:
      # Linear decay, applied before the episode runs.
      epsilon = max(min_eps, epsilon - decay_rate)
      eps_greedy = True
    else:
      eps_greedy = False
      epsilon = None
      episode_reward = 0

    state = env.reset()

    # Honour the ``max_steps`` argument rather than a module-level constant.
    for _ in range(max_steps):
      action = greedy_policy(qtable, state, eps_greedy=eps_greedy, epsilon=epsilon)
      new_state, reward, done, _info = env.step(action)
      if train:
        # Q-learning update towards reward + discounted best next value.
        # NOTE(review): the target bootstraps from new_state even when
        # ``done`` is True; this loop never updates the row of a state it
        # stopped in, so with a zero-initialised table that term is 0.
        td_target = reward + Gamma * np.max(qtable[new_state])
        qtable[state][action] += Learning_rate * (td_target - qtable[state][action])
      else:
        episode_reward += reward

      if done:
        if verbose and reward == 1.0:
          print('Learning! qtable = ', qtable)
        break

      state = new_state

    if not train:
      # Incremental running mean over the episodes evaluated so far.
      mean_rew = (mean_rew * episode + episode_reward)/(episode + 1)

  return qtable if train else mean_rew
      

In [76]:
# Training
%%time 
QFrozenLake = perform(env, Max_steps, qtable, n_episodes=100, train=True)

Learning! qtable =  [[0.94136123 0.95086993 0.94861255 0.94136123]
 [0.94136122 0.         0.95819449 0.94861255]
 [0.94861255 0.96787322 0.94861244 0.95819449]
 [0.95819444 0.         0.94860551 0.94861244]
 [0.95086816 0.96047471 0.         0.94134165]
 [0.         0.         0.         0.        ]
 [0.         0.97764972 0.         0.95807471]
 [0.         0.         0.         0.        ]
 [0.96046718 0.         0.9701765  0.95074252]
 [0.96031155 0.97997627 0.97985557 0.        ]
 [0.97015904 0.98998764 0.         0.96742874]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.97979586 0.98987622 0.96961575]
 [0.97856216 0.9899932  0.99999969 0.97985557]
 [0.         0.         0.         0.        ]]
Learning! qtable =  [[0.94147981 0.95098971 0.95098937 0.94147981]
 [0.94147948 0.         0.96059563 0.95098937]
 [0.95098966 0.97029869 0.95098937 0.96059531]
 [0.96059533 0.         0.94861254 0.95098343]
 [0.95098595 0.9605

In [77]:
# NOTE(review): pandas is imported here rather than with the other imports in the setup cell.
import pandas as pd
# Show the learned Q-table as a DataFrame (rows are states, columns are actions).
pd.DataFrame(QFrozenLake)

Unnamed: 0,0,1,2,3
0,0.94148,0.95099,0.95099,0.94148
1,0.94148,0.0,0.960596,0.95099
2,0.95099,0.970299,0.95099,0.960596
3,0.960596,0.0,0.950984,0.95099
4,0.95099,0.960596,0.0,0.94148
5,0.0,0.0,0.0,0.0
6,0.0,0.9801,0.0,0.960596
7,0.0,0.0,0.0,0.0
8,0.960596,0.0,0.970299,0.95099
9,0.960555,0.979977,0.9801,0.0


In [13]:
# Evaluation: run perform() with train=False (purely greedy actions, no
# Q-table updates) and collect the mean total reward per episode.
n_eval_episodes = 100  # NOTE(review): repeats the value already set in the hyperparameters cell

mean_rew = perform(env, Max_steps, qtable=QFrozenLake, n_episodes=n_eval_episodes, train=False)

In [14]:
# Display the mean evaluation reward returned by perform(..., train=False).
mean_rew

1.0