In [223]:
%%capture
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb

!pip install pyvirtualdisplay
!pip install pygame # UI for gym envs

!pip install huggingface_hub
!pip install pickle5
!pip install imageio imageio_ffmpeg

# Virtual display
# Starts an off-screen (visible=0) 1400x900 display via pyvirtualdisplay.
# NOTE(review): presumably needed so gym environments can render on a
# headless machine such as Colab — confirm before removing.
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

import numpy as np
import gym
import random
import imageio
import os

import pickle5 as pickle

In [224]:
# Pin PyYAML to exactly 6.0 (the inline note gives the reason: a metadata KeyError).
!pip install pyyaml==6.0 # avoid key error metadata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [225]:
# Deterministic (non-slippery) 4x4 FrozenLake environment
env = gym.make("FrozenLake-v0", map_name="4x4", is_slippery=False)
env.reset()

# Sizes of the discrete observation (state) and action spaces
qstates, qactions = env.observation_space.n, env.action_space.n

# Q-table: one row per state, one column per action, initialised to zero
qtable = np.zeros(shape=(qstates, qactions))

In [232]:
def greedy_policy(qtable, state, eps_greedy=False, epsilon=0.01):
  """Choose an action for `state` from a tabular Q-function.

  Args:
    qtable: 2-D array-like indexed as qtable[state][action].
    state: index of the current state (row of `qtable`).
    eps_greedy: when True, explore with probability `epsilon`;
      when False, always act greedily.
    epsilon: exploration probability; only read when `eps_greedy` is True.

  Returns:
    Index of the chosen action: a uniformly random action when exploring,
    otherwise the arg-max of qtable[state] (np.argmax resolves ties to the
    lowest index).
  """
  action_values = qtable[state]
  # The random draw only happens in epsilon-greedy mode (short-circuit `and`).
  if eps_greedy and np.random.rand() < epsilon:
    # Sample over the Q-table's own action axis instead of a global `env`,
    # so the function works with any table and has no hidden dependency.
    return np.random.randint(len(action_values))
  return np.argmax(action_values)

In [248]:
# --- Hyperparameters ---
env_id = "FrozenLake-v0"

# Episode budget
n_training_episodes = 10_000
n_eval_episodes = 100
Max_steps = 99                # cap on agent steps within a single episode

# Q-learning update
Learning_rate = 0.95
Gamma = 0.99                  # discount rate applied to future reward



In [251]:
def perform(env, max_steps, qtable, n_episodes, train=True, min_eps=0.05, max_eps=1.0, decay_rate=0.000_5,
            learning_rate=None, gamma=None):
  """Run `n_episodes` episodes of tabular Q-learning (train) or greedy evaluation.

  Args:
    env: environment whose reset() returns a state index and whose step(action)
      returns a (new_state, reward, done, info) 4-tuple.
    max_steps: maximum number of steps taken in a single episode.
    qtable: Q-table indexed as qtable[state][action]. In training mode it is
      updated IN PLACE and the same object is returned.
    n_episodes: number of episodes to run.
    train: True -> epsilon-greedy acting plus Q-updates;
      False -> purely greedy acting, rewards are averaged.
    min_eps: lower bound for epsilon during training.
    max_eps: starting epsilon for training.
    decay_rate: amount subtracted from epsilon at the start of every training
      episode (linear decay, floored at `min_eps`).
    learning_rate: step size of the Q-update; None falls back to the
      notebook-level `Learning_rate`.
    gamma: discount factor of the Q-update; None falls back to the
      notebook-level `Gamma`.

  Returns:
    The (mutated) `qtable` when `train` is True, otherwise the mean total
    reward per episode (0 when `n_episodes` is 0).
  """
  epsilon = None
  if train:
    epsilon = max_eps
    # Resolve the notebook-level defaults only when they are actually needed.
    if learning_rate is None:
      learning_rate = Learning_rate
    if gamma is None:
      gamma = Gamma

  mean_rew = 0

  for _episode in range(n_episodes):
    if train:
      epsilon = max(min_eps, epsilon - decay_rate)
    episode_reward = 0

    state = env.reset()

    # Honour the caller-supplied step budget (previously a global was used).
    for _ in range(max_steps):
      action = greedy_policy(qtable, state, eps_greedy=train, epsilon=epsilon)
      new_state, reward, done, _info = env.step(action)

      if train:
        # Q-learning update: move Q(s, a) towards r + gamma * max_a' Q(s', a').
        td_target = reward + gamma * np.max(qtable[new_state])
        qtable[state][action] += learning_rate * (td_target - qtable[state][action])
      else:
        episode_reward += reward

      if done:
        break

      state = new_state

    if not train:
      # Incremental running mean over the episodes evaluated so far.
      mean_rew = (mean_rew * _episode + episode_reward) / (_episode + 1)

  return qtable if train else mean_rew
      

In [252]:
# Training
%%time 
n_training_episodes = 1000
QFrozenLake = perform(env, max_steps, qtable, n_episodes=n_training_episodes, train=True)

CPU times: user 182 ms, sys: 1.93 ms, total: 184 ms
Wall time: 178 ms


In [253]:
# Display the trained Q-table (one row per state, one column per action).
QFrozenLake

array([[0.94148015, 0.95099005, 0.95099005, 0.94148015],
       [0.94148015, 0.        , 0.96059601, 0.95099005],
       [0.95099005, 0.970299  , 0.95099005, 0.96059601],
       [0.96059601, 0.        , 0.95099005, 0.95099005],
       [0.95099005, 0.96059601, 0.        , 0.94148015],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9801    , 0.        , 0.96059601],
       [0.        , 0.        , 0.        , 0.        ],
       [0.96059601, 0.        , 0.970299  , 0.95099005],
       [0.96059601, 0.9801    , 0.9801    , 0.        ],
       [0.970299  , 0.99      , 0.        , 0.970299  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9801    , 0.99      , 0.970299  ],
       [0.9801    , 0.99      , 1.        , 0.9801    ],
       [0.        , 0.        , 0.        , 0.        ]])

In [254]:
# Evaluation: act greedily on the trained table (train=False) and average the episode rewards
n_eval_episodes = 100

# Pass the step budget that is actually defined in this notebook (`Max_steps`).
mean_rew = perform(env, Max_steps, qtable=QFrozenLake, n_episodes=n_eval_episodes, train=False)

In [255]:
# Mean total reward per evaluation episode returned by perform(..., train=False).
mean_rew

1.0