In [1]:
import numpy as np
import gym
import random

In [2]:
env = gym.make("FrozenLake-v0")


In [3]:
action_size = env.action_space.n
state_size = env.observation_space.n    


In [4]:
print("Actions: %d, States: %d" % (action_size, state_size))

Actions: 4, States: 16


In [5]:
qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [6]:
total_episodes = 15000
learning_rate = 0.8
max_steps = 99
gamma = 0.95

epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005


In [7]:
rewards = []
for episode in range(total_episodes):
    state = env.reset()
    
    total_reward = 0
    step = 0
    for step in range(max_steps):
        
        tradeoff = random.uniform(0,1)
        
        if tradeoff > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()
            
        new_state, reward, done, info = env.step(action)
        
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - 
                                                                         qtable[state, action])
        rewards.append(reward)
        state = new_state
        if done: 
            break
        
    epsilon = (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    
print ("Score over time: " +  str(sum(rewards)/total_episodes))
print(qtable)

Score over time: 0.6854666666666667
[[3.96226247e-01 7.22183170e-03 6.81365915e-03 7.79550791e-03]
 [8.15194724e-04 1.08501332e-03 2.47892817e-03 1.62627865e-01]
 [1.29459194e-03 1.56381131e-03 1.19492197e-03 7.74949778e-02]
 [3.40769779e-04 1.23643546e-03 1.26390679e-03 6.24276693e-02]
 [3.81023515e-01 3.03554872e-03 4.07599250e-03 3.26320487e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.27181595e-06 1.65371333e-06 1.81548733e-01 2.49474403e-06]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.87369941e-03 1.81366496e-03 3.35895725e-03 4.15735662e-01]
 [2.91758768e-03 1.75138606e-01 1.35273216e-03 3.32183319e-03]
 [3.42559507e-02 1.50638605e-03 1.72639675e-03 4.54541565e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.37576488e-02 4.07941126e-02 7.04749960e-01 1.96602668e-03]
 [7.73682696e-02 9.67504446e-01 6.48285375e-02 7.37858074e-02]
 [0.00000000e+00 0.

In [10]:
# Test
for episode in range(10):
    
    state = env.reset()
    step = 0
    done = False
    print("*********************************************************")
    print("Episode: %d" % episode)
    total_reward = 0
    for step in range(max_steps):
        action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)
        total_reward += reward
        
        if done:
            env.render()
            print("Number of steps: %d, Reward: %d" % (step, total_reward))
            break
        state = new_state
        
env.close()

*********************************************************
Episode: 0
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps: 33, Reward: 1
*********************************************************
Episode: 1
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps: 17, Reward: 1
*********************************************************
Episode: 2
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps: 27, Reward: 1
*********************************************************
Episode: 3
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps: 78, Reward: 1
*********************************************************
Episode: 4
*********************************************************
Episode: 5
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
Number of steps: 8, Reward: 0
*********************************************************
Episode: 6
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps: 19, Reward: 1
*********************************************************
Episode: 7
  (Down)
SFFF
FHFH
FFFH
HFF[41mG