In [3]:
# Q-learning is a value-based RL algorithm: it learns a value for every (state, action) pair
# that estimates the expected future reward, and acts by picking the action with the highest value.
import numpy as np
import gym
import random


In [26]:
# Build the FrozenLake environment through gym's registry and print its repr
# to confirm which environment object was created.
env = gym.make("FrozenLake-v0") 
print(env)

[2018-06-30 23:56:39,962] Making new env: FrozenLake-v0


<TimeLimit<FrozenLakeEnv instance>>


In [31]:
# Read the number of actions and the number of states from the environment's
# action and observation spaces; they give the dimensions of the Q-table.
action_size, state_size = env.action_space.n, env.observation_space.n
print(action_size, state_size)

4 16


In [7]:
# Q-table: one row per state the agent can be in, one column per action.
# Each entry holds the value associated with taking that action in that state;
# every entry starts at zero.
qtable = np.zeros(shape=(state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [29]:
# Hyperparameters for the Q-learning training run
total_episodes = 100000  # number of episodes used for training
max_steps = 99           # upper bound on the number of steps taken within one episode
learning_rate = 0.5      # step size applied to each Q-value update
gamma = 0.95             # discount factor: weight of future reward relative to immediate reward

# Exploration vs. exploitation schedule
max_epsilon = 1.0        # exploration probability at the start of training
min_epsilon = 0.01       # floor that the exploration probability decays towards
epsilon = max_epsilon    # current exploration probability
decay_rate = 0.01        # rate at which epsilon shrinks, shifting from exploration to exploitation

In [30]:
# Train the Q-table with epsilon-greedy Q-learning.
#
# The cell starts from a zeroed Q-table and the initial exploration rate, so
# running it again (or running cells out of order) performs a complete, fresh
# training run instead of silently continuing from the previous run's state.
qtable = np.zeros((state_size, action_size))
epsilon = max_epsilon

# Total reward collected in each episode
rewards = []

for episode in range(total_episodes):
    # Start a new episode from the environment's initial state
    state = env.reset()
    total_rewards = 0

    for step in range(max_steps):
        # Epsilon-greedy choice: with probability (1 - epsilon) exploit the best
        # known action for this state, otherwise explore with a random action.
        if random.uniform(0, 1) > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state (s') and reward (r)
        new_state, reward, done, info = env.step(action)

        # Q(s,a) := Q(s,a) + lr * [r + gamma * max_a' Q(s',a') - Q(s,a)]
        # qtable[new_state, :] holds the values of every action available from s'.
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        total_rewards += reward
        state = new_state

        # The environment signals the end of the episode through `done`
        if done:
            break

    # Decay epsilon exponentially with the number of completed episodes
    # (episode is 0-based, hence the + 1), so exploration shrinks over time.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode + 1))
    rewards.append(total_rewards)

print("Score over time: " + str(sum(rewards) / total_episodes))
print(qtable)

Score over time: 0.53116
[[2.25395001e-01 9.71001140e-02 7.07465426e-02 1.06790412e-01]
 [4.33808221e-02 4.71496425e-02 2.96987065e-02 8.22431919e-02]
 [3.43386542e-02 2.67464573e-02 3.55240747e-02 7.29515956e-02]
 [8.27848925e-03 1.69179931e-02 3.02844097e-02 5.39886032e-02]
 [2.65492287e-01 4.43807731e-02 1.67502716e-03 1.20027348e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.84091198e-04 4.38338846e-04 2.18583519e-01 7.08907000e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.71304239e-01 9.80619298e-02 1.77530379e-01 3.78747605e-01]
 [1.52210788e-01 6.43081522e-01 1.75705655e-01 1.98674178e-01]
 [6.81436742e-01 4.66846461e-02 1.85163847e-02 2.39473318e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.30343170e-01 1.29096352e-01 8.93448174e-01 2.43546852e-01]
 [3.33456646e-01 9.95465770e-01 2.41873343e-01 3.38852711e-01]
 [0.00000000e+00 0.00000000e+0

In [25]:
# Play one episode greedily with the trained Q-table, printing every step.
for episode in range(1):
    # One reset per episode is enough; it returns the starting state.
    state = env.reset()
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        print("step: ", step)
        # Shows the board as it is before this step's action is applied
        env.render()
        # Take the action (index) that has the maximum value for the current state
        action = np.argmax(qtable[state,:])
        print("action: ", action)
        new_state, reward, done, info = env.step(action)
        print("new_state: ",new_state)
        if done:
            # render() above runs before step(), so without this extra call the
            # final board position would never be displayed.
            env.render()
            print("done on step : ", step)
            # Report the last reward so the outcome of the episode is visible
            print("reward: ", reward)
            break
        state = new_state
env.close()

****************************************************
EPISODE  0
step:  0

[41mS[0mFFF
FHFH
FFFH
HFFG
action:  0
new_state:  0
step:  1
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
action:  0
new_state:  0
step:  2
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
action:  0
new_state:  0
step:  3
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
action:  0
new_state:  0
step:  4
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
action:  0
new_state:  4
step:  5
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
action:  0
new_state:  4
step:  6
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
action:  0
new_state:  8
step:  7
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
action:  3
new_state:  8
step:  8
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
action:  3
new_state:  9
step:  9
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
action:  1
new_state:  13
step:  10
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
action:  2
new_state:  13
step:  11
  (Right)
SFFF
FHFH
FFFH
H[41mF[0mFG
action:  2
new_state:  14
step:  12
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
action:  1
new_state:  13
ste