The Q-learning with Taxi was done by following the code from Thomas Simonini's video. This time I'll try to do it on my own with Frozen Lake.

# Q Learning with Frozen Lake

First we will import the necessary libraries:


In [2]:
import numpy as np
import gym
import random

Now we will create the environment:

In [4]:
# Create the FrozenLake-v0 environment used by every later cell.
env = gym.make('FrozenLake-v0')
# Print the current grid (the output below this cell shows the 4x4 map).
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


### Creating and initializing the Q table

In [8]:
# The Q-table dimensions come straight from the environment's spaces:
# one row per state, one column per action.
state_size = env.observation_space.n
action_size = env.action_space.n

# Every state-action value estimate starts at zero.
qtable = np.zeros(shape=(state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


### Creating the hyperparameters

In [137]:
# --- Episode budget ---
total_episodes = 15000       # number of training episodes
total_test_episodes = 50     # number of evaluation episodes
max_steps = 99               # cap on steps within a single episode

# --- Exploration schedule (epsilon-greedy) ---
max_epsilon = 1.0            # exploration probability at the start
min_epsilon = 0.01           # floor the exploration probability decays toward
decay_rate = 0.001           # exponential decay rate applied per episode
epsilon = max_epsilon        # current exploration probability

# --- Q-learning update ---
gamma = 0.95                 # discount factor
learning_rate = 0.9          # step size of each Q-value update

### The Q Learning Algorithm
#### Here we train our model
1. Initialize Q-values arbitrarily for all state-action pairs.
2. For each episode...
- Choose an action in the world state based on Q-value estimates.
- Take an action and observe the outcome state and reward.
- Update the Q table with the Bellman equation.

In [146]:
# Train the Q-table with epsilon-greedy Q-learning.
# Every episode begins with its own env.reset(), so no reset is needed up front.
for episode in range(total_episodes):
    state = env.reset()
    done = False

    for step in range(max_steps):
        # Draw a number in [0, 1]: above epsilon we exploit the current
        # estimates, otherwise we explore with a random action.
        exp_tradeoff = random.uniform(0, 1)

        if exp_tradeoff > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        state = new_state

        if done:
            break

    # Decay exploration exponentially from max_epsilon toward min_epsilon.
    # The amplitude must be (max_epsilon - min_epsilon); using the sum would
    # start epsilon above max_epsilon. (episode + 1) counts completed episodes
    # without mutating the loop variable.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode + 1))


### Creating animation

In [147]:
# for animation
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay=.01):
    """Replay recorded frames one after another in the cell output.

    Parameters
    ----------
    frames : iterable of dict
        Each dict must provide the keys 'frame', 'state', 'action' and
        'reward'; their values are printed as-is.
    delay : float, optional
        Seconds to pause after printing each frame. Defaults to 0.01.
    """
    for i, frame in enumerate(frames):
        # wait=True postpones the clear until the next output arrives,
        # so consecutive frames replace each other in place.
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Total rewards: {frame['reward']}")
        sleep(delay)

# Play Frozen Lake!

In [148]:
# Evaluate the learned Q-table by always taking the greedy action
# (no exploration) for total_test_episodes episodes.
rewards = []
frames = []

for episode in range(total_test_episodes):
    state = env.reset()
    done = False
    total_rewards = 0

    for step in range(max_steps):
        action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)
        total_rewards += reward

        # The frame is rendered after the step; 'state' is the state the
        # action was taken from and 'reward' is the running episode total.
        frames.append({'frame': env.render(mode='ansi'),
                       'state': state,
                       'action': action,
                       'reward': total_rewards})

        if done:
            break

        state = new_state

    # Record every episode, including ones cut off at max_steps, so that
    # len(rewards) always equals total_test_episodes.
    rewards.append(total_rewards)

env.close()
#print_frames(frames)
print(sum(rewards)/total_test_episodes)

0.56
