In [1]:
import numpy as np
import gym

In [2]:
# Build the FrozenLake environment from gym's registry ID.
env = gym.make('FrozenLake-v0')

In [3]:
# Draw the environment's current grid.
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [4]:
# Sizes of the discrete observation and action spaces, displayed as a pair.
state_size, action_size = env.observation_space.n, env.action_space.n
state_size, action_size

(16, 4)

In [5]:
# Q-table: one row per state, one column per action, every estimate starting at zero.
qtable = np.zeros(shape=(state_size, action_size))
qtable

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [6]:
# --- Training schedule ---
episodes_count = 15000    # number of training episodes
max_steps_count = 100     # cap on the number of steps taken in one episode

# --- Q-learning update ---
lr = 0.8                  # learning rate
gamma = 0.95              # discount rate applied to future reward

# --- Epsilon-greedy exploration ---
max_epsilon, min_epsilon = 1.0, 0.01   # upper / lower bound of the exploration probability
epsilon = 1.0                          # initial exploration probability
decay_rate = 0.005                     # exponential decay rate applied to epsilon per episode

In [12]:
# Train the Q-table with epsilon-greedy tabular Q-learning.
#
# The Q-table and epsilon are re-initialised here so that re-running this cell
# restarts training from scratch instead of silently continuing from whatever
# a previous run of the cell left behind in the kernel.
qtable = np.zeros((state_size, action_size))
epsilon = max_epsilon

rewards = []  # total reward collected in each episode

for episode in range(episodes_count):
    state = env.reset()
    episode_reward_sum = 0

    for step in range(max_steps_count):
        # Explore with probability epsilon; otherwise act greedily on the current estimates.
        if np.random.uniform() <= epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        # Q-learning update: move Q(s, a) toward reward + gamma * max_a' Q(s', a').
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += lr * (td_target - qtable[state, action])

        state = new_state
        episode_reward_sum += reward

        # The environment reported the episode as finished.
        if done:
            break

    # Anneal exploration exponentially from max_epsilon toward min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(episode_reward_sum)

In [13]:
# Show the learned Q-table, then the mean reward per episode over the training run.
print(qtable)
mean_episode_reward = sum(rewards) / episodes_count
print(mean_episode_reward)

[[3.71659166e-01 6.34819227e-02 9.69463163e-02 6.34638208e-02]
 [1.18580300e-02 2.94998231e-03 2.13035568e-07 3.04284197e-02]
 [1.69851936e-02 1.88188753e-02 2.89919925e-02 1.82872305e-02]
 [9.89092908e-03 1.14264420e-02 2.62511616e-05 3.02287103e-02]
 [4.37950071e-01 6.75140717e-02 8.78826804e-03 6.44580455e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.19188567e-03 1.28102220e-07 1.11751160e-07 1.47632667e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.15093414e-03 1.70179351e-03 3.88490791e-01 7.09158807e-01]
 [3.91348808e-02 3.97982906e-01 5.58451277e-02 6.28241075e-02]
 [7.76902753e-01 3.15690595e-06 9.38214015e-03 3.91968272e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.06951704e-04 4.31548261e-01 3.51206487e-01 1.23093676e-01]
 [1.97495922e-01 6.65065144e-01 1.96440493e-01 7.04264853e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.000000

In [14]:
# Show the greedy action picked for every cell of the 4x4 grid
env.reset()
env.render()

# Action index -> letter; any index other than 0, 1 or 2 is shown as 'U'.
action_letters = {0: 'L', 1: 'D', 2: 'R'}

# Best action index per state, laid out row by row like the rendered grid.
# A row of all zeros has its argmax at index 0, so it prints as 'L'.
greedy_actions = np.argmax(qtable, axis=1).reshape(4, 4).tolist()
env_moves = [
    [action_letters.get(action_index, 'U') for action_index in grid_row]
    for grid_row in greedy_actions
]

for row in env_moves:
    print(*row)


[41mS[0mFFF
FHFH
FFFH
HFFG
L U R U
L L L L
U D L L
L D D L


In [15]:
# Roll out one episode following the greedy policy read off the learned Q-table.
state = env.reset()

for step in range(max_steps_count):
    # Greedy action: the one with the highest estimated value for the current state.
    action = np.argmax(qtable[state, :])
    env.render()
    new_state, reward, done, info = env.step(action)
    if done:
        # Render once more so the final position of the agent is visible.
        env.render()

        # `step` is a zero-based loop index, so the number of actions taken is step + 1.
        print("Number of steps", step + 1)
        break
    state = new_state
else:
    # The loop ran out of steps without the environment reporting `done`.
    print("Episode did not finish within", max_steps_count, "steps")



[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH

In [11]:
# Shut the environment down at the end of the notebook.
env.close()