In [1]:
import gym
import time
import numpy as np
import IPython.display
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
env = gym.make('MountainCar-v0')

In [8]:
# reset to starting State
s = env.reset()
action_list = list(range(env.action_space.n))
while True:
    a = np.random.choice(action_list)   # random an action
    s, r, done, _ = env.step(a)      # perform action, get state, reward, is terminated
    if done:
        break
    env.render()
    time.sleep(0.05)

In [9]:
class NBinDiscretizer:
    def __init__(self, min_val, max_val, nbins):
        self.min_val = min_val
        self.max_val = max_val
        self.step = (max_val - min_val) / nbins
        self.nbins = int(nbins)
    def __call__(self, val):
        return int(round((val - self.min_val) / self.step)) % self.nbins

In [10]:
class Dicretezation:
    def __init__(self, discretezers):
        self.discretezers = discretezers
    def __getitem__(self, index):
        assert len(index) == len(self.discretezers)
        return tuple([self.discretezers[i](index[i]) for i in range(len(index))])

In [15]:
lr = 0.1
gamma = 0.9

In [17]:
n_quantization = 50
x_quantizer = NBinDiscretizer(env.observation_space.low[0], env.observation_space.high[0], n_quantization)
v_quantizer = NBinDiscretizer(env.observation_space.low[0], env.observation_space.high[0], n_quantization)
state_quantizer = Dicretezation([x_quantizer, v_quantizer])

In [18]:
Q = np.zeros((n_quantization, n_quantization, 3))

In [20]:
# inititalize some variables
epochs = 10000
epsilon = 0.9
epsilon_scale = epsilon / (epochs / 4)

# some metrics
max_reward = -1000
max_pos = -1000

# logging
log = display('', display_id=True)
reach_log = display('', display_id=True)

for epoch in tqdm(range(epochs), desc="Epoch"):
    ep_max_pos = -1000
    ep_reward = 0
    
    # reset environment
    obs = env.reset()
    done = False

    while not done:
        
        # take an action
        if np.random.random_sample() > epsilon:
            a = np.argmax(Q[state_quantizer[obs]])
        else:
            a = np.random.randint(0, env.action_space.n)
        
        # perform action
        new_obs, r, done, info = env.step(a)
        ep_reward += r
        
        if new_obs[0] >= env.goal_position:
            reach_log.update(f"Reach goal at epoch {epoch} with reward: {ep_reward}")
        # update Q
        cur_q_value = Q[state_quantizer[obs]][a]        
        new_q_value = (1-lr) * cur_q_value + lr * (r + gamma * max(Q[state_quantizer[new_obs]]))
        Q[state_quantizer[obs]][a] = new_q_value
        obs = new_obs
        ep_max_pos = max(obs[0], ep_max_pos)
        
    max_reward = max(ep_reward, max_reward)
    max_pos = max(ep_max_pos, max_pos)
    epsilon = max(0, epsilon - epsilon_scale)
    
    log.update("epoch {}: ep_reward: {:9.6f}, max_reward: {:9.6f}, ep_max_pos: {:.6f}, max_pos: {:.6f}, epsilon: {:.6f}".format(epoch, ep_reward, max_reward, ep_max_pos, max_pos, epsilon))

'epoch 9999: ep_reward: -200.000000, max_reward: -132.000000, ep_max_pos: 0.106835, max_pos: 0.524181, epsilon: 0.000000'

'Reach goal at epoch 9997 with reward: -165.0'

Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]