# OpenAI gym frozenlake

In [1]:
import gym

In [2]:
from collections import defaultdict

In [3]:
from tqdm import tqdm

In [4]:
import random

In [5]:
import matplotlib.pyplot as plt

In [6]:
N_GAMES = 100


In [7]:
env = gym.make('FrozenLake-v0')

In [8]:
def choose_action_greedy(q_table, obs, env):
    
    max_i = 0
    max_v = q_table[(obs, 0)]

    for i in range(env.action_space.n):
        v = q_table[(obs, i)]

        if v > max_v:
            max_v = v
            max_i = i

    return max_i

In [9]:
def choose_action_eps(q_table, obs, env, eps):
    
    # q_table[obs, a]
    
    if random.random() < eps:
        return env.action_space.sample()
    else:
        return choose_action_greedy(q_table, obs, env)

In [10]:
def test_game(env, q_table, n_games=N_GAMES):
    
    total_reward = 0.0
    
    for _ in range(n_games):

        done = False
        obs = env.reset()

        while not done:
            # Choose an action greedily
            new_obs, reward, done, info = env.step(choose_action_greedy(q_table, obs, env))
            total_reward += reward
            
            obs = new_obs

    return total_reward/n_games


In [11]:
def q_learning(q_table, obs, action, reward, new_obs, n_actions, lr, gamma):
    """
    lr : float
        Learning rate
    gamma : float
        Discount factor for future rewards
    """
    
    # TD(0) learning

    # Update entry using bellman's equation
#     q_table[(obs, action)] += lr * (
#         reward + 
#         gamma*max([q_table[(new_obs, a)] for a in range(n_actions)]) -
#         q_table[(obs, action)])
    
    
    target = reward + gamma * max([q_table[(new_obs, a)] for a in range(n_actions)]) 
    
    q_error = target - q_table[(obs, action)]
    
    q_table[(obs, action)] += lr * q_error

In [12]:
EPS_DECAY = 0.9993
GAMMA = 0.95
LR = 0.8

In [13]:
# Main learning loop

mean_rewards = list()

EVALUATE_EVERY = 1000

N_EPISODES = 15000

eps = 1.0

q_table = defaultdict(float)

# Save n of actions
n_actions = env.action_space.n

for i in tqdm(range(N_EPISODES)):
    done = False
    
    obs = env.reset()

    while not done:
        # Choose an action epsilon-greedily
        action = choose_action_eps(q_table, obs, env, eps)
        new_obs, reward, done, info = env.step(action)
        
        q_learning(q_table, obs, action, reward, new_obs, n_actions, lr=LR, gamma=GAMMA)
        obs = new_obs
        
    eps *= EPS_DECAY
    
    # Evaluate policy every N games
    if (i+1)%EVALUATE_EVERY == 0:
        
        test_reward = test_game(env, q_table)
#         print(f'\tEp: {i+1}  Test reward: {test_reward} {eps:.2f}')
        
        mean_rewards.append(test_reward)

  0%|          | 0/15000 [00:00<?, ?it/s]


TypeError: unhashable type: 'numpy.ndarray'

In [None]:
plt.figure(figsize=(18,9))
plt.plot(mean_rewards)
plt.title("Mean rewards every N Iterations");