In [1]:
import numpy as np
import gym

In [2]:
from gym.envs.registration import register
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name' : '4x4', 'is_slippery': False},
    max_episode_steps=100,
    reward_threshold=0.8196, # optimum = .8196, changing this seems have no influence
)

In [3]:
env = gym.make('FrozenLakeNotSlippery-v0')

In [4]:
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [5]:
state_size = env.observation_space.n
action_size = env.action_space.n
(state_size, action_size)

(16, 4)

In [6]:
qtable = np.zeros((state_size, action_size))
qtable

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [11]:
episodes_count = 15000
lr = 0.8                      # learning rate
max_steps_count = 100         # Max steps the game should run or else end the session
gamma = 0.95                  # Discount rate (on future reward)

# Exploration Parameters
max_epsilon = 1.0
min_epsilon = 0.01             
epsilon = 1.0                 # Initial epsilon (Exploration probability)
decay_rate = 0.005            # decay rate for epsilon (exponentially decrease exploration)

In [12]:
rewards = [] # List of total reward in each episode

for episode in range(episodes_count):
    state = env.reset()
    episode_reward_sum = 0
    
    for step in range(max_steps_count):
        action = None
        
        exploration_prob = np.random.uniform()
        
        if exploration_prob <= epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(qtable[state, :])
        
        new_state, reward, done, info = env.step(action)

        qtable[state, action] = qtable[state, action] + \
                                    lr * (reward + (gamma * np.max(qtable[new_state, :])) - qtable[state, action])
        
        state = new_state
        episode_reward_sum += reward
        
        # if current game session has finished (won or dead)
        if done:
            break
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(episode_reward_sum)

In [13]:
print(sum(rewards)/ episodes_count)
print(qtable)

0.9725333333333334
[[0.73509189 0.77378094 0.6983373  0.73509189]
 [0.73509189 0.         0.         0.65859116]
 [0.6983373  0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.77378094 0.81450625 0.         0.73509189]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.66342043]
 [0.         0.         0.         0.        ]
 [0.81450625 0.         0.857375   0.77378094]
 [0.81450625 0.9025     0.9025     0.        ]
 [0.857375   0.95       0.         0.63024934]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9025     0.95       0.857375  ]
 [0.9025     0.95       1.         0.9025    ]
 [0.         0.         0.         0.        ]]


In [17]:
# Print the action in every place
env.reset()
env.render()

env_moves = list(np.argmax(qtable,axis=1).reshape(4,4).tolist())

for i in range(len(env_moves)):
    for j in range(len(env_moves[0])):
        if env_moves[i][j] == 0:
            env_moves[i][j] = 'L'
        elif env_moves[i][j] == 1:
            env_moves[i][j]= 'D'
        elif env_moves[i][j] == 2:
            env_moves[i][j] = 'R'
        else:
            env_moves[i][j] = 'U'
for row in env_moves:
    print(*row)

SyntaxError: invalid syntax (<ipython-input-17-7f5bef11a26c>, line 6)

In [15]:
state = env.reset()
step = 0
done = False
for step in range(max_steps_count):

    # Take the action (index) that have the maximum expected future reward given that state
    action = np.argmax(qtable[state,:])
    env.render()     
    print(state, action)
    new_state, reward, done, info = env.step(action)
    if done:
        # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
        env.render()

        # We print the number of step it took.
        print("Number of steps", step)
        break
    state = new_state




[41mS[0mFFF
FHFH
FFFH
HFFG
0 1
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
4 1
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
8 2
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
9 1
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
13 2
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
14 2
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 5


In [40]:
env.close()