In [2]:
!pip install gym



In [3]:
import gym
import numpy as np
import random
from IPython.display import clear_output

In [4]:
# FrozenLake 4x4 map used throughout this notebook:
#
#   S F F F       (S: starting point, safe)
#   F H F H       (F: frozen surface, safe)
#   F F F H       (H: hole, stuck forever)
#   H F F G       (G: goal, safe)
#
# In the rendered grid the agent's current position is the cell highlighted in red.
# The environment description in the assignment assumes the "slippery" variant,
# which is why is_slippery=True is passed below.

# Create the environment and keep the object exposed through its `.env` attribute
# (presumably the environment without gym.make's outermost wrapper -- confirm
# against the installed gym version).
env = gym.make("FrozenLake-v0", is_slippery=True).env
env.reset()   # reset before rendering so there is a current state to draw
env.render()  # print the grid for the current state


[41mS[0mFFF
FHFH
FFFH
HFFG


In [5]:
# Tabular Q-learning with an epsilon-greedy policy on the `env` created above.
#
# Results left in the notebook namespace:
#   q_table      -- (n_states, n_actions) array of learned action values
#   outcomes     -- one "Success"/"Failure" label per episode
#   total_epochs -- total number of environment steps over all episodes

# Reproducibility: seed every random source this cell draws from, so that
# re-running the cell gives the same Q-table and the same outcome counts.
SEED = 42
random.seed(SEED)            # epsilon-greedy coin flip (random.uniform)
env.seed(SEED)               # the environment's own random number generator
env.action_space.seed(SEED)  # env.action_space.sample() during exploration

q_table = np.zeros([env.observation_space.n, env.action_space.n])

# Hyperparameters
alpha = 0.1            # learning rate: weight given to the new estimate
gamma = 0.95           # discount factor applied to the next state's value
epsilon = 1.0          # probability of taking a random action (starts fully random)
epsilon_decay = 0.001  # subtracted from epsilon after every episode, so exploration
                       # stops after roughly the first 1,000 episodes
episodes = 100000

# For plotting metrics
total_epochs = 0
outcomes = []

for episode in range(episodes):
    state = env.reset()

    epochs = 0
    succeeded = False  # an episode counts as a failure unless a reward is received
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore action space
        else:
            action = np.argmax(q_table[state])  # Exploit learned values

        next_state, reward, done, info = env.step(action)

        # Q-learning update. When the episode has ended there is nothing left to
        # bootstrap from, so the future value is zero. (Rows of states that end an
        # episode are never updated, so this equals np.max(q_table[next_state]).)
        next_max = 0.0 if done else np.max(q_table[next_state])
        old_value = q_table[state, action]
        q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

        state = next_state

        # If we have a reward, it means that our outcome is a success
        if reward:
            succeeded = True

        epochs += 1

    outcomes.append("Success" if succeeded else "Failure")
    total_epochs += epochs

    # Update epsilon (linear decay, clamped at 0)
    epsilon = max(epsilon - epsilon_decay, 0)

    # Lightweight progress indicator: overwrite the previous line every 100 episodes
    if episode % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {episode}")

# The banner is derived from `episodes` so it stays correct if the count is changed.
print(f"{episodes:,} Episodes are Done:")
print('===========================================\n')
print(f"Results after {episodes} episodes:\n")
print('Final Q-table:')
print(q_table)
print('\n')
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f'"Success" occured {outcomes.count("Success")} time(s)')
print(f'"Failure" occured {outcomes.count("Failure")} time(s)')

Episode: 99900
100,000 Episodes are Done:

Results after 100000 episodes:

Final Q-table:
[[0.17729681 0.10848999 0.10848036 0.10852488]
 [0.06681803 0.01612028 0.04937371 0.15977315]
 [0.14752665 0.07834129 0.05451075 0.03847484]
 [0.00319018 0.03950916 0.00557526 0.00411083]
 [0.20370253 0.11222863 0.1076757  0.1157015 ]
 [0.         0.         0.         0.        ]
 [0.17493781 0.03881005 0.03939764 0.02387834]
 [0.         0.         0.         0.        ]
 [0.13253522 0.13438265 0.13188479 0.2598918 ]
 [0.18175355 0.36447427 0.17536933 0.19372814]
 [0.37902293 0.18752287 0.12971971 0.13883331]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.22474883 0.25069184 0.51268879 0.29840393]
 [0.34937051 0.69754228 0.45076112 0.44940476]
 [0.         0.         0.         0.        ]]


Average timesteps per episode: 42.94798
"Success" occured 77238 time(s)
"Failure" occured 22762 time(s)
