# OpenAI Gym MountainCar: tabular Q-learning

In [1]:
import gym

In [2]:
from collections import defaultdict

In [3]:
from tqdm import tqdm

In [4]:
import random

In [5]:
import matplotlib.pyplot as plt

In [6]:
import numpy as np

In [7]:
# Bin edges used to turn the continuous MountainCar observation into a
# discrete state. Position lies in [-1.2, 0.6] and velocity in [-0.07, 0.07],
# so the velocity edges must be on the order of 0.01: edges spanning +/-0.6
# would put every reachable velocity into a single bin.
thresholds_p = np.linspace(-1.1, 0.5, 10)
thresholds_v = np.linspace(-0.06, 0.06, 10)

# Sanity check: a position below the first edge maps to bin 0.
np.digitize(-1.2, thresholds_p)


0

In [8]:
# Number of greedy evaluation episodes averaged by test_game
# (it is the default value of that function's n_games argument).
N_GAMES = 100

In [9]:
# Create the environment. The rest of the notebook relies on the classic gym
# API: env.reset() returns the observation and env.step() returns a 4-tuple
# (observation, reward, done, info).
env = gym.make('MountainCar-v0')



In [10]:
# Bin edges for the two observation components of MountainCar.
# Position lies in [-1.2, 0.6]; velocity lies in [-0.07, 0.07]. The velocity
# edges therefore span +/-0.06 (not +/-0.6): with edges ten times too wide,
# every reachable velocity falls into the same bin and the Q-table cannot
# distinguish a car moving left from one moving right.
thresholds_p = np.linspace(-1.1, 0.5, 10)
thresholds_v = np.linspace(-0.06, 0.06, 10)


def discretize_state(s_cont):
    """Map a continuous observation to a discrete, hashable state.

    Parameters
    ----------
    s_cont : sequence of float
        Observation whose first element is binned with ``thresholds_p``
        (position) and whose second element is binned with ``thresholds_v``
        (velocity).

    Returns
    -------
    tuple
        ``(position_bin, velocity_bin)``. Each index is in ``0..10``:
        0 means "below the first edge", 10 means "at or above the last edge".
    """
    position_bin = np.digitize(s_cont[0], thresholds_p)
    velocity_bin = np.digitize(s_cont[1], thresholds_v)

    return (position_bin, velocity_bin)
    

In [11]:
def choose_action_greedy(q_table, obs, env):
    """Return the action with the highest Q-value for the given state.

    Parameters
    ----------
    q_table : mapping
        Q-values keyed by ``(state, action)``.
    obs : hashable
        Discrete state to act in.
    env : object
        Environment; only ``env.action_space.n`` is read.

    Returns
    -------
    int
        Index of the best action. On ties the lowest index wins.
    """
    # max() keeps the first maximal element, which reproduces the
    # "strictly greater" tie-breaking of a manual scan starting at action 0.
    return max(
        range(env.action_space.n),
        key=lambda candidate: q_table[(obs, candidate)],
        default=0,
    )

In [12]:
def choose_action_eps(q_table, obs, env, eps):
    """Pick an action epsilon-greedily.

    Parameters
    ----------
    q_table : mapping
        Q-values keyed by ``(state, action)``.
    obs : hashable
        Discrete state to act in.
    env : object
        Environment; provides ``action_space.sample()`` for exploration.
    eps : float
        Probability of taking a random action instead of the greedy one.

    Returns
    -------
    Action sampled from the action space (exploration) or the greedy action.
    """
    explore = random.random() < eps

    if not explore:
        return choose_action_greedy(q_table, obs, env)

    return env.action_space.sample()

In [13]:
def test_game(env, q_table, n_games=N_GAMES):
    """Evaluate the greedy policy and return its mean episode reward.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()`` (returns an observation) and
        ``step(action)`` (returns ``(observation, reward, done, info)``).
    q_table : mapping
        Q-values keyed by ``(state, action)``.
    n_games : int
        Number of evaluation episodes to play.

    Returns
    -------
    float
        Sum of all rewards collected divided by ``n_games``.
    """
    reward_sum = 0.0

    for _episode in range(n_games):
        state = env.reset()
        finished = False

        while not finished:
            # Act greedily: no exploration during evaluation.
            greedy_action = choose_action_greedy(q_table, discretize_state(state), env)
            state, step_reward, finished, _info = env.step(greedy_action)
            reward_sum += step_reward

    return reward_sum / n_games



In [14]:
def q_learning(q_table, obs, action, reward, new_obs, n_actions, lr, gamma):
    """Apply one TD(0) Q-learning update to ``q_table`` in place.

    Parameters
    ----------
    q_table : mutable mapping
        Q-values keyed by ``(state, action)``; the entry for
        ``(obs, action)`` is updated.
    obs : hashable
        State in which the action was taken.
    action : int
        Action that was taken.
    reward : float
        Reward received for the transition.
    new_obs : hashable
        State reached after the transition.
    n_actions : int
        Number of actions; the best next value is taken over ``range(n_actions)``.
    lr : float
        Learning rate
    gamma : float
        Discount factor for future rewards
    """
    # Bellman target: immediate reward plus the discounted best next-state value.
    best_next_value = max(q_table[(new_obs, a)] for a in range(n_actions))
    td_target = reward + gamma * best_next_value

    # Move the current estimate a fraction lr of the way towards the target.
    entry = (obs, action)
    td_error = td_target - q_table[entry]
    q_table[entry] += lr * td_error

In [15]:
# Multiplicative epsilon decay, applied once per training episode
# (eps *= EPS_DECAY in the main loop).
# NOTE(review): at this rate eps is still about 0.35 after 15000 episodes,
# while the logged run below shows eps = 0.50 at episode 1000, which matches a
# decay of roughly 0.9993. The saved output appears to come from a different
# value; confirm which one is intended.
EPS_DECAY = 0.99993
# Discount factor for future rewards in the Q-learning target.
GAMMA = 0.99
# Learning rate (step size) of the Q-table update.
LR = 0.8

In [16]:
# Main learning loop
#
# Trains a tabular Q-learning agent with an epsilon-greedy behaviour policy.
# Epsilon starts at 1.0 and is multiplied by EPS_DECAY after every episode.
# Every EVALUATE_EVERY episodes the greedy policy is evaluated with test_game
# and the mean reward is appended to mean_rewards.

mean_rewards = list()

EVALUATE_EVERY = 1000

N_EPISODES = 15000

eps = 1.0

# Unvisited (state, action) pairs default to a Q-value of 0.0.
q_table = defaultdict(float)

# Number of discrete actions, looked up once instead of on every update.
n_actions = env.action_space.n

for i in range(N_EPISODES):
    done = False

    obs = env.reset()
    # Discretize each observation exactly once and carry the result forward,
    # rather than re-binning the same observation for action selection, for
    # the update, and again on the next step.
    state = discretize_state(obs)

    while not done:
        # Choose an action epsilon-greedily
        action = choose_action_eps(q_table, state, env, eps)

        new_obs, reward, done, info = env.step(action)
        new_state = discretize_state(new_obs)

        q_learning(q_table, state, action, reward, new_state, n_actions, lr=LR, gamma=GAMMA)

        obs, state = new_obs, new_state

    eps *= EPS_DECAY

    # Evaluate policy every N games
    if (i+1) % EVALUATE_EVERY == 0:

        test_reward = test_game(env, q_table)
        print(f'\tEp: {i+1}  Test reward: {test_reward} {eps:.2f}')

        mean_rewards.append(test_reward)

	Ep: 1000  Test reward: -200.0 0.50
	Ep: 2000  Test reward: -200.0 0.25
	Ep: 3000  Test reward: -200.0 0.12
	Ep: 4000  Test reward: -200.0 0.06
	Ep: 5000  Test reward: -200.0 0.03
	Ep: 6000  Test reward: -200.0 0.01
	Ep: 7000  Test reward: -200.0 0.01
	Ep: 8000  Test reward: -200.0 0.00
	Ep: 9000  Test reward: -200.0 0.00
	Ep: 10000  Test reward: -200.0 0.00


KeyboardInterrupt: 

In [None]:
# Plot the mean greedy-evaluation reward recorded during training.
fig, ax = plt.subplots(figsize=(18, 9))

# One evaluation was recorded every EVALUATE_EVERY training episodes, so the
# k-th entry of mean_rewards belongs to episode EVALUATE_EVERY * (k + 1).
eval_episodes = [EVALUATE_EVERY * (k + 1) for k in range(len(mean_rewards))]
ax.plot(eval_episodes, mean_rewards)

ax.set_title("Mean rewards every N Iterations")
ax.set_xlabel("Training episode")
ax.set_ylabel(f"Mean reward over {N_GAMES} greedy test episodes");