In [1]:
import numpy as np
import gym

In [2]:
# Instantiate the 'FrozenLake-v0' environment through gym's registry; the
# following cells interact with it through `env`.
env = gym.make('FrozenLake-v0')

In [3]:
# Draw the environment's current state as text (see the grid printed below).
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [4]:
# Number of discrete states and actions exposed by the environment;
# these two values determine the Q-table dimensions.
state_size, action_size = env.observation_space.n, env.action_space.n
(state_size, action_size)

(16, 4)

In [5]:
# Q-table: one row per state, one column per action, every estimate starting at zero.
qtable = np.zeros(shape=(state_size, action_size))
qtable

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [6]:
# --- Training schedule ---
episodes_count = 15_000       # number of training episodes
max_steps_count = 100         # cap on steps per episode; the episode is cut off after this many

# --- Q-learning update ---
lr = 0.8                      # learning rate
gamma = 0.95                  # discount rate applied to future reward

# --- Epsilon-greedy exploration ---
max_epsilon, min_epsilon = 1.0, 0.01   # upper / lower bound of the exploration probability
epsilon = max_epsilon                  # initial exploration probability
decay_rate = 0.005                     # exponential decay rate of epsilon per episode

In [7]:
rewards = []  # summed reward of each episode, in training order

for episode in range(episodes_count):
    state = env.reset()
    episode_reward_sum = 0

    for step in range(max_steps_count):
        # Epsilon-greedy choice: a single uniform draw decides whether to
        # exploit the current Q-values or to explore with a random action.
        exploration_prob = np.random.uniform()
        if exploration_prob > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Q-learning update: move the old estimate towards
        # reward + gamma * (best Q-value of the next state) by a factor of lr.
        old_value = qtable[state, action]
        td_target = reward + (gamma * np.max(qtable[new_state, :]))
        qtable[state, action] = old_value + lr * (td_target - old_value)

        episode_reward_sum += reward
        state = new_state

        # The environment reported the end of the episode.
        if done:
            break

    # Exponentially shrink the exploration probability from max_epsilon
    # towards min_epsilon as the episode index grows.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(episode_reward_sum)

In [8]:
# Show the learned Q-values, then the mean reward per training episode.
print(qtable)
average_reward = sum(rewards) / episodes_count
print(average_reward)

[[4.55761957e-01 6.80011646e-02 7.31623689e-02 5.99658035e-02]
 [6.24866928e-04 2.34906937e-05 5.95688498e-03 1.41711160e-01]
 [6.12693204e-02 2.13921146e-03 2.19115277e-03 3.36576126e-03]
 [1.61242132e-03 2.20812708e-03 7.13042915e-04 3.39762205e-03]
 [5.78581210e-01 3.87430792e-02 4.11839154e-02 3.70707567e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.54593521e-04 4.06509409e-07 9.00312102e-03 2.55456101e-06]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.21376759e-03 2.58287955e-02 7.78072990e-02 5.80165553e-01]
 [4.93342272e-02 1.47603295e-01 1.43343642e-03 2.46136882e-03]
 [4.10778507e-01 3.74956097e-02 2.10561258e-02 2.34603850e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.34278168e-02 1.66668344e-01 7.63837737e-01 6.06355463e-02]
 [8.83882661e-02 9.54601012e-01 3.42365490e-01 3.32512958e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.000000

In [9]:
# Show the greedy action (the one with the highest Q-value) for every grid cell
env.reset()
env.render()

# Action index -> letter; any index other than 0, 1 or 2 is displayed as 'U'.
action_letters = {0: 'L', 1: 'D', 2: 'R'}

# Best action per state, laid out as the 4x4 grid.
greedy_actions = np.argmax(qtable, axis=1).reshape(4, 4).tolist()
env_moves = [
    [action_letters.get(best_action, 'U') for best_action in grid_row]
    for grid_row in greedy_actions
]

for row in env_moves:
    print(*row)


[41mS[0mFFF
FHFH
FFFH
HFFG
L U L U
L L R L
U D L L
L R D L


In [10]:
# Play one episode using only the learned Q-table (no exploration),
# rendering the grid before every action.
state = env.reset()

for step in range(max_steps_count):

    # Take the action (index) with the highest Q-value for the current state
    action = np.argmax(qtable[state, :])
    env.render()
    new_state, reward, done, info = env.step(action)
    if done:
        # Render the final state too, to see whether the agent reached the goal or fell into a hole
        env.render()

        # `step` is the 0-based loop index, so the number of actions taken is step + 1.
        print("Number of steps", step + 1)
        break
    state = new_state
else:
    # The loop ran out of steps without the environment reporting `done`.
    print("Episode did not finish within", max_steps_count, "steps")



[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
H

In [11]:
# Close the environment; this is the last cell and `env` is not used afterwards.
env.close()