# FrozenLake-v0

In [1]:
import os

import gym
import torch

In [2]:
def fit_q_table(env,
          episodes=10_000,
          validate_n=1000,
          validation_episodes=100,
          learning_rate=0.1,
          epsilon=1.0, epsilon_decay=0.99995, epsilon_min=0.1,
          discount_factor=0.99,
          verbose=True):
    """Learn a tabular Q-function with epsilon-greedy Q-learning.

    Args:
        env: Environment with discrete spaces. It must expose
            ``observation_space.n``, ``action_space.n``,
            ``action_space.sample()``, ``reset()`` returning a state index and
            ``step(action)`` returning ``(state, reward, done, info)``.
        episodes: Number of training episodes to run.
        validate_n: A validation round is run every ``validate_n`` episodes.
        validation_episodes: Number of greedy episodes per validation round.
        learning_rate: Step size of the Q-value update.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Factor ``epsilon`` is multiplied by after each episode
            while it is still above ``epsilon_min``.
        epsilon_min: Floor for ``epsilon``; decay never pushes it below this.
        discount_factor: Discount applied to the estimated future value.
        verbose: Print a line whenever a validation round sets a new best.

    Returns:
        Tuple ``(best_score, best_q_table)``: the highest mean validation
        reward seen and a copy of the table that achieved it. If no validation
        round scores above 0.0 (or none runs because ``episodes`` is smaller
        than ``validate_n``), this is ``0.0`` and the all-zero initial table.
    """
    q_table = torch.zeros((env.observation_space.n, env.action_space.n))

    best_q_table = q_table.clone()
    best_score = 0.0

    for ep in range(1, episodes+1):
        state = env.reset()
        done = False

        while not done:
            # Explore with probability epsilon, otherwise exploit the current estimates
            if torch.rand(1).item() < epsilon:
                action = env.action_space.sample()
            else:
                action = torch.argmax(q_table[state]).item()

            new_state, reward, done, info = env.step(action)

            # gym's TimeLimit wrapper reports a time-limit cut-off through
            # info['TimeLimit.truncated']; such an episode did not really end,
            # so its future value must still be bootstrapped.
            truncated = isinstance(info, dict) and info.get('TimeLimit.truncated', False)
            if done and not truncated:
                # True terminal transition: nothing follows, so no future value.
                future_value = 0.0
            else:
                future_value = torch.max(q_table[new_state])

            # Q-learning update towards the one-step TD target
            q_table[state, action] += learning_rate*(reward + discount_factor*future_value - q_table[state, action])

            state = new_state

        # Decay the exploration probability, clamped so it never undershoots the floor
        if epsilon > epsilon_min:
            epsilon = max(epsilon_min, epsilon*epsilon_decay)

        if ep % validate_n == 0:
            rewards = play_episodes(validation_episodes, env, q_table)
            mean_reward = rewards.mean().item()

            # Keep a snapshot only when validation strictly improves
            if mean_reward > best_score:
                best_score = mean_reward
                best_q_table = q_table.clone()
                if verbose:
                    print(f'Episode {ep}: New best score! {best_score}')

    return best_score, best_q_table

In [3]:
def play_episodes(count, env, q_table, render=False):
    """Run greedy episodes with a Q-table and collect each episode's return.

    Args:
        count: Number of episodes to play.
        env: Environment providing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)`` and, when rendering, ``render()``.
        q_table: Tensor indexed by state; the action with the largest entry in
            the state's row is always chosen.
        render: When true, ``env.render()`` is called before every step and
            once more after all episodes have finished.

    Returns:
        Tensor of shape ``(count,)`` holding the summed reward of each episode.
    """
    def run_single_episode():
        # Play one episode greedily and return its accumulated reward.
        observation = env.reset()
        episode_return = 0
        finished = False
        while not finished:
            if render:
                env.render()
            greedy_action = q_table[observation].argmax().item()
            observation, step_reward, finished, _ = env.step(greedy_action)
            episode_return += step_reward
        return episode_return

    rewards = torch.zeros((count,))
    for index in range(count):
        rewards[index] = run_single_episode()

    # Show the final state reached by the last episode
    if render:
        env.render()

    return rewards

## Slippery 4x4

In [4]:
# Train on 'FrozenLake-v0' with the environment's default settings and keep
# the best table found during validation.
env = gym.make('FrozenLake-v0')
score, q_table = fit_q_table(env=env, episodes=10_000)

print(f'Best score: {score}')

Episode 1000: New best score! 0.3499999940395355
Episode 2000: New best score! 0.75
Episode 4000: New best score! 0.7900000214576721
Best score: 0.7900000214576721


### Benchmark best model

In [5]:
# Mean reward of the greedy policy over a larger batch of episodes than the
# validation rounds used during training.
episodes = 1000
rewards = play_episodes(count=episodes, env=env, q_table=q_table)
torch.mean(rewards).item()

0.7440000176429749

In [6]:
# Render a single greedy episode; the cell's value is that episode's reward tensor.
play_episodes(count=1, env=env, q_table=q_table, render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Left)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
S

tensor([1.])

### Save the Q-Table to a file

In [7]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing (needed on a fresh checkout).
os.makedirs('saved', exist_ok=True)
torch.save(q_table, 'saved/FrozenLake-v0-qtable.pt')

## Non slippery 4x4

In [8]:
# Same map with is_slippery=False; trained for fewer episodes, validated more
# often and with a higher exploration floor than the default call.
env = gym.make('FrozenLake-v0', is_slippery=False)
score, q_table = fit_q_table(env, episodes=1000, validate_n=50, epsilon_min=0.25)
print(f'Best score: {score}')

# Benchmark the returned table with the greedy policy
episodes = 1000
rewards = play_episodes(episodes, env, q_table)
print(f'Mean reward: {rewards.mean().item()}')

# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing (needed on a fresh checkout).
os.makedirs('saved', exist_ok=True)
torch.save(q_table, 'saved/FrozenLake-v0-nonslippery-qtable.pt')

Episode 100: New best score! 1.0
Best score: 1.0
Mean reward: 1.0


In [9]:
# Render a single greedy episode; the cell's value is that episode's reward tensor.
play_episodes(count=1, env=env, q_table=q_table, render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


tensor([1.])

## Slippery 8x8

In [10]:
# Train on 'FrozenLake8x8-v0' with the environment's default settings and keep
# the best table found during validation.
env = gym.make('FrozenLake8x8-v0')
score, q_table = fit_q_table(env=env, episodes=10_000)

print(f'Best score: {score}')

Episode 1000: New best score! 0.18000000715255737
Episode 2000: New best score! 0.6000000238418579
Episode 3000: New best score! 0.6200000047683716
Episode 4000: New best score! 0.7599999904632568
Episode 8000: New best score! 0.8199999928474426
Episode 10000: New best score! 0.8600000143051147
Best score: 0.8600000143051147


### Benchmark best model

In [11]:
# Mean reward of the greedy policy over a larger batch of episodes than the
# validation rounds used during training.
episodes = 1000
rewards = play_episodes(count=episodes, env=env, q_table=q_table)
torch.mean(rewards).item()

0.8429999947547913

In [12]:
# Render a single greedy episode; the cell's value is that episode's reward tensor.
play_episodes(count=1, env=env, q_table=q_table, render=True)


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
F[41mF[0mFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
F[41mF[0mFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41m

tensor([1.])

### Save the Q-Table to a file

In [13]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing (needed on a fresh checkout).
os.makedirs('saved', exist_ok=True)
# NOTE(review): the other saved tables in this notebook end in '-qtable.pt';
# this path omits that suffix. Kept as-is in case something loads it by name.
torch.save(q_table, 'saved/FrozenLake8x8-v0.pt')

## Non slippery 8x8

In [14]:
# 8x8 map with is_slippery=False; trained for fewer episodes, validated more
# often and with a higher exploration floor than the default call.
env = gym.make('FrozenLake8x8-v0', is_slippery=False)
score, q_table = fit_q_table(env, episodes=1000, validate_n=50, epsilon_min=0.25)
print(f'Best score: {score}')

# Benchmark the returned table with the greedy policy
episodes = 1000
rewards = play_episodes(episodes, env, q_table)
print(f'Mean reward: {rewards.mean().item()}')

# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing (needed on a fresh checkout).
os.makedirs('saved', exist_ok=True)
torch.save(q_table, 'saved/FrozenLake8x8-v0-nonslippery-qtable.pt')

Episode 400: New best score! 1.0
Best score: 1.0
Mean reward: 1.0


In [15]:
# Render a single greedy episode; the cell's value is that episode's reward tensor.
play_episodes(count=1, env=env, q_table=q_table, render=True)


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFF[41mF[0mFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFF[41mF[0mFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFF[41mF[0mFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
FFFFF[41mF[0mFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
FFFFFF[41mF[0mF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
FFFFFFF[41mF[0m
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFF[41mF[0m
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHF[41mF[0m
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SF

tensor([1.])