# FrozenLake-v0

In [1]:
import os

import gym
import torch

In [2]:
def fit_q_table(env,
          episodes=10_000,
          validate_n=1000,
          validation_episodes=100,
          learning_rate=0.1,
          epsilon=1.0, epsilon_decay=0.99995, epsilon_min=0.1,
          discount_factor=0.99,
          verbose=True):
    """Train a tabular Q-learning agent with an epsilon-greedy policy.

    The table has one row per observation and one column per action
    (``env.observation_space.n`` x ``env.action_space.n``) and starts at zero.
    Every ``validate_n`` episodes the current table is scored with
    ``play_episodes`` (looked up when the validation runs, so it must be
    defined before this function is called) and a copy is kept whenever the
    mean reward strictly improves on the best seen so far.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning a
            4-tuple ``(state, reward, done, info)``, ``observation_space.n``,
            ``action_space.n`` and ``action_space.sample()``.
        episodes: Number of training episodes.
        validate_n: Run a validation every this many episodes.
        validation_episodes: Episodes played per validation.
        learning_rate: Step size of the Q-value update.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Factor applied to ``epsilon`` after each episode while
            it is above ``epsilon_min``.
        epsilon_min: Floor for the decayed ``epsilon``. A starting ``epsilon``
            that is already at or below this value is left unchanged.
        discount_factor: Discount applied to the bootstrapped next-state value.
        verbose: Print a line whenever a new best validation score is found.

    Returns:
        ``(best_score, best_q_table)``. ``best_score`` starts at 0.0, so if no
        validation ever scores above 0.0 the returned table is the all-zero
        initial table.
    """

    q_table = torch.zeros((env.observation_space.n, env.action_space.n))

    best_q_table = q_table.clone()
    best_score = 0.0

    for ep in range(1, episodes+1):
        state = env.reset()
        done = False

        while not done:
            # Choose between exploration and exploitation according to a random draw
            if torch.rand(1).item() < epsilon:
                action = env.action_space.sample()
            else:
                action = torch.argmax(q_table[state]).item()

            new_state, reward, done, _ = env.step(action)

            # Q-learning update: move Q(s, a) towards the one-step bootstrapped
            # target reward + discount * max_a' Q(s', a').
            target_value = torch.max(q_table[new_state])
            td_error = reward + discount_factor*target_value - q_table[state, action]
            q_table[state, action] += learning_rate*td_error

            state = new_state

        # Decay the exploration probability, clamping at the floor so a single
        # decay step can never push epsilon below epsilon_min.
        if epsilon > epsilon_min:
            epsilon = max(epsilon_min, epsilon*epsilon_decay)

        if ep % validate_n == 0:
            rewards = play_episodes(env, q_table, episodes=validation_episodes)
            mean_reward = rewards.mean().item()

            # Strict comparison: on ties the earlier snapshot is kept.
            if mean_reward > best_score:
                best_score = mean_reward
                best_q_table = q_table.clone()
                if verbose:
                    print(f'Episode {ep}: New best score! {best_score}')

    return best_score, best_q_table

In [3]:
def play_episodes(env, q_table, render=False, episodes=1):
    """Roll out the greedy policy of ``q_table`` and collect episode rewards.

    Args:
        env: Environment exposing ``reset()``, ``render()`` and
            ``step(action)`` returning ``(state, reward, done, info)``.
        q_table: Tensor indexed by state whose row argmax is the action taken.
        render: When true, render before every action and once more after
            the final episode has finished.
        episodes: Number of episodes to play.

    Returns:
        1-D tensor of length ``episodes`` holding each episode's summed reward.
    """
    episode_returns = torch.zeros((episodes,))

    def run_single_episode():
        # Plays one episode to completion and returns its accumulated reward.
        observation = env.reset()
        accumulated = 0
        while True:
            if render:
                env.render()

            greedy_action = torch.argmax(q_table[observation]).item()
            observation, step_reward, finished, _ = env.step(greedy_action)
            accumulated += step_reward

            if finished:
                return accumulated

    for index in range(episodes):
        episode_returns[index] = run_single_episode()

    # Show the state the last episode ended in.
    if render:
        env.render()

    return episode_returns

## Slippery 4x4

In [4]:
# Build the FrozenLake-v0 environment with gym's default settings and train
# for 10,000 episodes using fit_q_table's default hyperparameters.
fls_44_env = gym.make('FrozenLake-v0')
fls_44_score, fls_44_q_table = fit_q_table(fls_44_env, episodes=10_000)

# Best mean validation reward recorded during training.
print(f'Best score: {fls_44_score}')

Episode 1000: New best score! 0.3199999928474426
Episode 2000: New best score! 0.36000001430511475
Episode 3000: New best score! 0.6399999856948853
Episode 4000: New best score! 0.7400000095367432
Episode 8000: New best score! 0.7699999809265137
Best score: 0.7699999809265137


### Benchmark best model

In [5]:
# Re-evaluate the returned best table greedily over 1000 fresh episodes;
# the cell's last expression displays the mean reward.
fls_44_rewards = play_episodes(fls_44_env, fls_44_q_table, episodes=1000)
fls_44_rewards.mean().item()

0.7440000176429749

In [6]:
# Play a single greedy episode, rendering the board before each action and
# once more at the end; the returned reward tensor is the cell output.
play_episodes(fls_44_env, fls_44_q_table, render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH

tensor([1.])

### Save the Q-Table to a file

In [7]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the table.
os.makedirs('FrozenLake_saved', exist_ok=True)
torch.save(fls_44_q_table, 'FrozenLake_saved/FrozenLake-v0-qtable.pt')

## Non-slippery 4x4

In [8]:
# Deterministic variant: is_slippery=False is forwarded to the environment.
# NOTE(review): with the default epsilon_decay of 0.99995, epsilon only falls
# to about 0.95 over 1000 episodes, so epsilon_min=0.25 is never reached here.
flns_44_env = gym.make('FrozenLake-v0', is_slippery=False)
flns_44_score, flns_44_q_table = fit_q_table(flns_44_env, episodes=1000, validate_n=50, epsilon_min=0.25)
print(f'Best score: {flns_44_score}')

# Benchmark the returned best table greedily over 1000 episodes.
flns_44_rewards = play_episodes(flns_44_env, flns_44_q_table, episodes=1000)
print(f'Mean reward: {flns_44_rewards.mean().item()}')

# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the table.
os.makedirs('FrozenLake_saved', exist_ok=True)
torch.save(flns_44_q_table, 'FrozenLake_saved/FrozenLake-v0-nonslippery-qtable.pt')

Episode 50: New best score! 1.0
Best score: 1.0
Mean reward: 1.0


In [9]:
# Play a single greedy episode on the non-slippery 4x4 environment with
# rendering enabled; the returned reward tensor is the cell output.
play_episodes(flns_44_env, flns_44_q_table, render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


tensor([1.])

## Slippery 8x8

In [10]:
# Build the FrozenLake8x8-v0 environment with gym's default settings and train
# for 10,000 episodes using fit_q_table's default hyperparameters.
fls_88_env = gym.make('FrozenLake8x8-v0')
fls_88_score, fls_88_q_table = fit_q_table(fls_88_env, episodes=10_000)

# Best mean validation reward recorded during training.
print(f'Best score: {fls_88_score}')

Episode 1000: New best score! 0.05000000074505806
Episode 2000: New best score! 0.8100000023841858
Episode 8000: New best score! 0.8399999737739563
Episode 9000: New best score! 0.8500000238418579
Best score: 0.8500000238418579


### Benchmark best model

In [11]:
# Re-evaluate the returned best table greedily over 1000 fresh episodes;
# the cell's last expression displays the mean reward.
fls_88_rewards = play_episodes(fls_88_env, fls_88_q_table, episodes=1000)
fls_88_rewards.mean().item()

0.8270000219345093

In [12]:
# Play a single greedy episode on the 8x8 environment with rendering enabled;
# the returned reward tensor is the cell output.
play_episodes(fls_88_env, fls_88_q_table, render=True)


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
S[41mF[0mFFFFFF
FFFFFFFF
FFFH

tensor([1.])

### Save the Q-Table to a file

In [13]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the table.
os.makedirs('FrozenLake_saved', exist_ok=True)
torch.save(fls_88_q_table, 'FrozenLake_saved/FrozenLake8x8-v0.pt')

## Non-slippery 8x8

In [14]:
# Deterministic variant: is_slippery=False is forwarded to the environment.
# NOTE(review): with the default epsilon_decay of 0.99995, epsilon only falls
# to about 0.95 over 1000 episodes, so epsilon_min=0.25 is never reached here.
flns_88_env = gym.make('FrozenLake8x8-v0', is_slippery=False)
flns_88_score, flns_88_q_table = fit_q_table(flns_88_env, episodes=1000, validate_n=50, epsilon_min=0.25)
print(f'Best score: {flns_88_score}')

# Benchmark the returned best table greedily over 1000 episodes.
flns_88_rewards = play_episodes(flns_88_env, flns_88_q_table, episodes=1000)
print(f'Mean reward: {flns_88_rewards.mean().item()}')

# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the table.
os.makedirs('FrozenLake_saved', exist_ok=True)
torch.save(flns_88_q_table, 'FrozenLake_saved/FrozenLake8x8-v0-nonslippery-qtable.pt')

Episode 550: New best score! 1.0
Best score: 1.0
Mean reward: 1.0


In [15]:
# Play a single greedy episode on the non-slippery 8x8 environment with
# rendering enabled; the returned reward tensor is the cell output.
play_episodes(flns_88_env, flns_88_q_table, render=True)


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFF[41mF[0mFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFF[41mF[0mFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFF[41mF[0mFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFF[41mF[0mFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
FFFFFF[41mF[0mF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFF[41mF[0mF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFH[41mF[0mF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFF[41mF[0mF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF

tensor([1.])