In [None]:
#test building a RL algorithym for frozen lake

In [2]:
import gym
import retro
import numpy as np
import random
import time
from IPython.display import clear_output

In [3]:
#Exploring the env object
def print_env_doc_strings(env):
    #We can see the rewards available from the end
    print("---------------ENV INFO---------------\n")
    print(env.unwrapped.__doc__)    
    
    #We can see the rewards available from the end
    print("--------------------------------------\n\n-------------REWARD RANGE-------------\n")
    print(env.reward_range.__doc__)

    #we can see the action_space available
    print("--------------------------------------\n\n-------------ACTION SPACE-------------\n")
    print(env.action_space.__doc__)

    #we can see the observation/state space
    print("-------------------------------------\n\n-------------STATE SPACE-------------\n")
    print(env.observation_space.__doc__)
    print("-------------------------------------")

In [4]:
#this gives us the possible states, actions and rewards
env = gym.make("FrozenLake-v0")
print_env_doc_strings(env)

---------------ENV INFO---------------


    Winter is here. You and your friends were tossing around a frisbee at the park
    when you made a wild throw that left the frisbee out in the middle of the lake.
    The water is mostly frozen, but there are a few holes where the ice has melted.
    If you step into one of those holes, you'll fall into the freezing water.
    At this time, there's an international frisbee shortage, so it's absolutely imperative that
    you navigate across the lake and retrieve the disc.
    However, the ice is slippery, so you won't always move in the direction you intend.
    The surface is described using a grid like the following

        SFFF
        FHFH
        FFFH
        HFFG

    S : starting point, safe
    F : frozen surface, safe
    H : hole, fall to your doom
    G : goal, where the frisbee is located

    The episode ends when you reach the goal or fall in a hole.
    You receive a reward of 1 if you reach the goal, and zero otherwise.

   

In [5]:
action_space_size = env.action_space.n
state_space_size = env.observation_space.n

q_table = np.zeros((state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [6]:
num_episodes = 20000
max_steps_per_episode = 100
learning_rate = 0.1
discount_rate = 0.99

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

chunks = 2500

In [7]:
rewards_all_episodes = []

#Q-Learning algorithm
for episode in range(num_episodes):
    state = env.reset()
    
    done=False
    rewards_current_episode = 0
    for step in range(max_steps_per_episode):

        #env.render()
        # Exploration vs Exploitation step
        if random.uniform(0,1) > exploration_rate:
            action = np.argmax(q_table[state,:])
        else:
            action = env.action_space.sample()
            
        new_state, reward, done, info = env.step(action)

        # Update Q-table for Q(s,a)
        q_table[state,action] = (q_table[state, action] * (1 - learning_rate) + 
                                 learning_rate * 
                                 (reward + discount_rate * np.max(q_table[new_state, :]))
                                )
        
        state = new_state
        rewards_current_episode += reward
        
        if done:
            break
            
    # Exploration Rate decacy ready for next episode
    exploration_rate = (min_exploration_rate + 
                        (max_exploration_rate - min_exploration_rate) * 
                        np.exp(-exploration_decay_rate * episode)
                       )
    if episode % chunks == (chunks-1) or episode == num_episodes-1:
        rewards_per_100_episodes = np.array(rewards_all_episodes[-chunks:])
        print(episode+1, np.mean(rewards_per_100_episodes))
    rewards_all_episodes.append(rewards_current_episode)

    

2500 0.14725890356142457
5000 0.5592
7500 0.6708
10000 0.6736
12500 0.696
15000 0.678
17500 0.678
20000 0.7036


In [8]:
#we can then print the learned optimal policy - matrix of state vs action
print(q_table)

[[0.61550054 0.53079561 0.52825869 0.53474988]
 [0.41397556 0.36225655 0.32808077 0.55202637]
 [0.41584028 0.44405689 0.42441994 0.50706228]
 [0.24429734 0.25650916 0.32289427 0.47608504]
 [0.62178767 0.42908285 0.3765692  0.34631378]
 [0.         0.         0.         0.        ]
 [0.41112109 0.19016877 0.15888744 0.129621  ]
 [0.         0.         0.         0.        ]
 [0.52761134 0.48082925 0.34549211 0.63693605]
 [0.44467049 0.68446795 0.54957577 0.49434036]
 [0.62549205 0.52244802 0.45312668 0.39391986]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.53485107 0.57662585 0.75811757 0.64378772]
 [0.70829595 0.86990351 0.77191149 0.77815989]
 [0.         0.         0.         0.        ]]


In [12]:
#Now lets watch the trained agent play the game
for episode in range(10):
    state = env.reset()
    done=False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)
    
    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        env.render()
        time.sleep(0.05)
        
        action = np.argmax(q_table[state,:])
        new_state, reward, done, info = env.step(action)
        
        if done:
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("****You reached the goal!****")
                time.sleep(3)
            else:
                print("****You fell through a hole!****")
                time.sleep(3)
            clear_output(wait=True)
            break
        
        state = new_state
        
env.close()


  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
****You reached the goal!****
