# FrozenLake-v0

In [1]:
import gym
import torch

In [2]:
def train(env,
          episodes=10_000,
          validate_n=1000,
          validation_episodes=100,
          learning_rate=0.1, learning_rate_min=0.005, learning_rate_decay=0.9995,
          epsilon=1.0, epsilon_decay=0.999, epsilon_min=0.01,
          discount_factor=0.99,
          verbose=True):
    """Train a tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The Q-table is a zero-initialised tensor of shape
    ``(env.observation_space.n, env.action_space.n)``. Every ``validate_n``
    episodes the current table is evaluated greedily with ``play_episodes``
    and a copy is kept whenever its mean reward beats the best seen so far.

    Args:
        env: Environment object. ``env.reset()`` must return the initial state
            and ``env.step(action)`` must return a 4-tuple
            ``(new_state, reward, done, info)``; states are used as row indices
            into the Q-table.
        episodes: Number of training episodes.
        validate_n: Run a validation pass every this many episodes.
        validation_episodes: Number of greedy episodes per validation pass.
        learning_rate: Initial step size of the Q-value update.
        learning_rate_min: Floor below which the step size is not decayed.
        learning_rate_decay: Multiplicative per-episode step-size decay.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Multiplicative per-episode decay of ``epsilon``.
        epsilon_min: Floor below which ``epsilon`` is not decayed.
        discount_factor: Discount applied to the bootstrapped next-state value.
        verbose: If true, print a line whenever a new best score is found.

    Returns:
        Tuple ``(best_score, best_q_table)``. ``best_score`` starts at ``0.0``
        and is only replaced by a strictly higher validation mean, so if no
        validation pass scores above zero (or none runs because
        ``episodes < validate_n``) the returned table is the all-zero initial
        table.
    """
    q_table = torch.zeros((env.observation_space.n, env.action_space.n))

    best_q_table = q_table.clone()
    best_score = 0.0

    for ep in range(1, episodes + 1):
        state = env.reset()
        done = False

        while not done:
            # Explore with probability epsilon, otherwise exploit the current estimates.
            if torch.rand(1).item() < epsilon:
                action = env.action_space.sample()
            else:
                action = torch.argmax(q_table[state]).item()

            new_state, reward, done, _ = env.step(action)

            # Q-learning (Bellman) update towards reward + discounted best next-state value.
            target_value = torch.max(q_table[new_state])
            td_error = reward + discount_factor * target_value - q_table[state, action]
            q_table[state, action] += learning_rate * td_error

            state = new_state

        # Decay the exploration probability, clamping so the last step cannot
        # undershoot the floor. A start value already at/below the floor is left as is.
        if epsilon > epsilon_min:
            epsilon = max(epsilon_min, epsilon * epsilon_decay)

        # Decay the learning rate with the same clamping rule.
        if learning_rate > learning_rate_min:
            learning_rate = max(learning_rate_min, learning_rate * learning_rate_decay)

        # Periodically evaluate the greedy policy and snapshot the best table.
        if ep % validate_n == 0:
            rewards = play_episodes(validation_episodes, env, q_table)
            mean_reward = rewards.mean().item()

            if mean_reward > best_score:
                best_score = mean_reward
                best_q_table = q_table.clone()
                if verbose:
                    print(f'Episode {ep}: New best score! {best_score}')

    return best_score, best_q_table

In [3]:
def play_episodes(count, env, q_table):
    """Run ``count`` greedy episodes and return the total reward of each.

    In every step the action with the highest value in ``q_table`` for the
    current state is taken; no exploration happens and ``q_table`` is not
    modified.

    Args:
        count: Number of episodes to play.
        env: Environment whose ``reset()`` returns the initial state and whose
            ``step(action)`` returns ``(state, reward, done, info)``.
        q_table: Tensor indexed as ``q_table[state]`` giving per-action values.

    Returns:
        1-D tensor of length ``count`` holding each episode's summed reward.
    """
    episode_returns = torch.zeros((count,))

    for idx in range(count):
        obs = env.reset()
        episode_return = 0

        # An episode always takes at least one step; stop once the env reports done.
        while True:
            greedy_action = torch.argmax(q_table[obs]).item()
            obs, step_reward, finished, _ = env.step(greedy_action)
            episode_return += step_reward
            if finished:
                break

        episode_returns[idx] = episode_return

    return episode_returns

## 4x4

In [4]:
# Build the FrozenLake-v0 environment for this section and train on it.
# learning_rate=0.5 overrides train()'s default of 0.1; episodes=30_000 overrides 10_000.
env = gym.make('FrozenLake-v0')
score, q_table = train(env, episodes=30_000, learning_rate=0.5)

# Report the best validation score and the Q-table snapshot that achieved it.
print(f'Best score: {score}')
print(q_table)

Episode 1000: New best score! 0.6899999976158142
Episode 2000: New best score! 0.7300000190734863
Episode 4000: New best score! 0.800000011920929
Episode 11000: New best score! 0.8100000023841858
Best score: 0.8100000023841858
tensor([[0.5403, 0.5306, 0.5309, 0.5274],
        [0.3141, 0.3425, 0.3712, 0.4981],
        [0.4120, 0.4040, 0.4000, 0.4726],
        [0.2701, 0.2963, 0.2484, 0.4605],
        [0.5562, 0.3551, 0.4010, 0.3824],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.2775, 0.1341, 0.3503, 0.1148],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.3946, 0.4370, 0.4133, 0.5869],
        [0.3992, 0.6318, 0.4255, 0.3740],
        [0.5968, 0.5278, 0.4253, 0.2575],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.4428, 0.4626, 0.7384, 0.5445],
        [0.7324, 0.8690, 0.7785, 0.7684],
        [0.0000, 0.0000, 0.0000, 0.0000]])


### Benchmark best model

In [5]:
# Re-evaluate the returned Q-table greedily over 1000 episodes (train()'s
# validation passes default to 100 episodes each).
# `env` and `q_table` are whatever the most recently executed training cell assigned.
episodes = 1000
rewards = play_episodes(episodes, env, q_table)
# Mean reward per episode; as the cell's last expression it is shown as the output.
rewards.mean().item()

0.7360000014305115

## 8x8

In [6]:
# Build the FrozenLake8x8-v0 environment and train with the same overrides as
# the 4x4 run (episodes=30_000, learning_rate=0.5).
# Note: this rebinds `env`, `score` and `q_table` from the earlier section.
env = gym.make('FrozenLake8x8-v0')
score, q_table = train(env, episodes=30_000, learning_rate=0.5)

# Report the best validation score and the Q-table snapshot that achieved it.
print(f'Best score: {score}')
print(q_table)

Episode 2000: New best score! 0.8500000238418579
Episode 9000: New best score! 0.8700000047683716
Episode 11000: New best score! 0.9399999976158142
Best score: 0.9399999976158142
tensor([[3.9947e-01, 4.0201e-01, 4.0182e-01, 4.0481e-01],
        [4.0455e-01, 4.1206e-01, 4.1668e-01, 4.1174e-01],
        [4.1953e-01, 4.2392e-01, 4.3389e-01, 4.2666e-01],
        [4.4213e-01, 4.4340e-01, 4.5310e-01, 4.4257e-01],
        [4.5683e-01, 4.6264e-01, 4.7376e-01, 4.6359e-01],
        [4.8375e-01, 4.8542e-01, 4.9662e-01, 4.8533e-01],
        [5.1099e-01, 5.1175e-01, 5.1334e-01, 5.0875e-01],
        [5.1879e-01, 5.1811e-01, 5.2026e-01, 5.1879e-01],
        [3.8526e-01, 3.8283e-01, 3.8710e-01, 4.0185e-01],
        [3.8425e-01, 3.8625e-01, 3.9429e-01, 4.1038e-01],
        [3.7138e-01, 3.6023e-01, 3.9192e-01, 4.2459e-01],
        [2.4708e-01, 2.6435e-01, 2.6520e-01, 4.4358e-01],
        [4.3376e-01, 4.2349e-01, 4.4269e-01, 4.6643e-01],
        [4.6684e-01, 4.7454e-01, 4.9354e-01, 4.7730e-01],
        [

### Benchmark best model

In [7]:
# Re-evaluate the returned Q-table greedily over 1000 episodes (train()'s
# validation passes default to 100 episodes each).
# `env` and `q_table` are whatever the most recently executed training cell assigned.
episodes = 1000
rewards = play_episodes(episodes, env, q_table)
# Mean reward per episode; as the cell's last expression it is shown as the output.
rewards.mean().item()

0.8410000205039978