### Discretizing continuous observations

In [114]:
import sys

import gym
import numpy as np

In [96]:
# Discretization and tabular-update hyperparameters shared by the cells below.
n_states = 5  # number of bins per observation dimension
gamma = 1.0   # discount applied to the best next-state Q-value
eta = 0.85    # step size of the Q-value update

In [87]:
# Build the MountainCar-v0 environment; every later cell reads and steps this `env`.
env = gym.make('MountainCar-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m




In [88]:
# Reset the environment and keep the observation it returns for the inspection cells below.
obs = env.reset()

In [89]:
# Show the observation space, a sample observation, and the space's upper/lower bounds.
obs_space = env.observation_space
print(obs_space)

print("pos  | vel", obs, obs_space.high, obs_space.low, sep="\n")

Box(2,)
pos  | vel
[-0.51282174  0.        ]
[0.6  0.07]
[-1.2  -0.07]


In [95]:
# Size of one discretization cell along each observation dimension:
# the full observation range split into n_states equal-width bins.
obs_range = env.observation_space.high - env.observation_space.low
env_dx = obs_range / n_states
print(env_dx)

[0.36  0.028]


In [90]:
def obs_to_state(env, obs, n_bins=None):
    """Map a continuous observation to a pair of discrete bin indices.

    Each of the first two observation components is bucketed into one of
    ``n_bins`` equal-width bins spanning ``env.observation_space.low`` to
    ``env.observation_space.high``.

    Args:
        env: Object exposing ``observation_space.low`` and
            ``observation_space.high`` (indexable, at least two entries).
        obs: Observation; only ``obs[0]`` and ``obs[1]`` are read.
        n_bins: Number of bins per dimension. When omitted, the notebook-level
            ``n_states`` is used.

    Returns:
        Tuple ``(a, b)`` of Python ints, each in ``[0, n_bins - 1]``.
    """
    if n_bins is None:
        n_bins = n_states
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_dx = (env_high - env_low) / n_bins

    def _bin(dim):
        # Truncate to a bin index, then clamp: a value sitting exactly on the
        # upper bound would otherwise yield n_bins, which is one past the last
        # valid index of an (n_bins, n_bins, ...) table.
        index = int((obs[dim] - env_low[dim]) / env_dx[dim])
        return min(max(index, 0), n_bins - 1)

    return _bin(0), _bin(1)

In [91]:
# Quick check: discretize the current observation and display the resulting bin indices.
obs_to_state(env, obs)

(1, 2)

In [121]:
# Warm-up: fill a Q-table using purely random actions.
# Table axes: position bin, velocity bin, action.
q_table = np.zeros((n_states, n_states, env.action_space.n))

# Start from a fresh episode so the cell does not depend on leftover `obs`.
obs = env.reset()
for i in range(2000):
    a, b = obs_to_state(env, obs)
    action = env.action_space.sample()
    obs, reward, done, _ = env.step(action)
    a_, b_ = obs_to_state(env, obs)

    # Q(s, a) += eta * (r + gamma * max_a' Q(s', a') - Q(s, a))
    td_target = reward + gamma * np.max(q_table[a_, b_])
    q_table[a, b, action] += eta * (td_target - q_table[a, b, action])

    # Stepping an environment after it reports done is not valid; begin a
    # new episode instead of continuing the finished one.
    if done:
        obs = env.reset()

In [122]:
# Print the whole table without NumPy's "..." summarization.
# threshold=np.nan is rejected by current NumPy releases; sys.maxsize is the
# documented way to disable summarization.
np.set_printoptions(threshold=sys.maxsize)
print(q_table)

[[[ -7.64760311  -5.68290136  -6.68325047]
  [-10.37512796  -8.26542852  -9.53670722]
  [ -9.91556748  -7.44206942  -8.91819669]
  [ -7.34653645  -8.07541127  -7.13925898]
  [  0.           0.           0.        ]]

 [[ -9.33479061  -8.75446305  -8.33305322]
  [ -8.57523225  -7.65294764  -8.02663301]
  [  0.           0.           0.        ]
  [ -8.76335359  -7.44672612  -8.3500615 ]
  [ -8.43718936 -10.9596911  -10.0315854 ]]

 [[ -7.02023007  -9.17225483  -7.00438527]
  [ -9.25539902  -9.81208571  -9.81199362]
  [  0.           0.           0.        ]
  [ -8.03003639  -8.63535413  -8.17998616]
  [ -6.55745563  -9.83120442  -8.29369078]]

 [[  0.           0.           0.        ]
  [ -8.63074256  -8.7650084   -8.73230546]
  [-10.20181256  -8.84347637 -10.9067151 ]
  [ -9.2433385   -9.60669788  -8.71654955]
  [  0.           0.           0.        ]]

 [[  0.           0.           0.        ]
  [  0.           0.           0.        ]
  [ -2.65630241  -0.996625     0.        ]
  [

In [124]:
q_table[a, b, 

array([-9.33479061, -8.75446305, -8.33305322])

### Implement Q-learning

In [126]:
import random

# Training-loop sizes: step cap per episode and number of episodes.
num_timesteps = 1000
num_episodes = 100

# Step size used in the Q-value update of the training loop.
learning_rate = 0.85

# Epsilon-greedy schedule: bounds of epsilon and its exponential decay rate.
min_eps, max_eps = 0.01, 1.0
eps_decay_rate = 0.01

In [127]:
# Epsilon decay preview: value of the exploration rate at episode index `ep`.
ep = 0
eps_span = max_eps - min_eps
min_eps + eps_span * np.exp(-eps_decay_rate * ep)

1.0

In [128]:
# Init q-table: one entry per (position bin, velocity bin, action).
# The action count comes from the environment instead of a hard-coded 3.
q_table = np.zeros((n_states, n_states, env.action_space.n))

In [133]:
# Total reward collected in each episode
rewards = []

for episode in range(num_episodes):
    # Reset the environment
    obs = env.reset()
    total_rewards = 0

    # Exploration rate for this episode: starts at max_eps (episode 0) and
    # decays exponentially towards min_eps. It is computed here, once per
    # episode, so the decay is not overwritten by a per-episode reset to 1.0.
    epsilon = min_eps + (max_eps - min_eps) * np.exp(-eps_decay_rate * episode)

    for step in range(num_timesteps):
        # Discretize the observation into Q-table indices
        a, b = obs_to_state(env, obs)

        # Epsilon-greedy action selection
        if random.uniform(0, 1) < epsilon:
            # Exploration: random action
            action = env.action_space.sample()
        else:
            # Exploitation: action with the largest Q-value for this state
            action = np.argmax(q_table[a, b, :])

        # Take the action and observe the next observation and reward
        new_obs, reward, done, info = env.step(action)
        new_a, new_b = obs_to_state(env, new_obs)

        # Q(s, a) := Q(s, a) + lr * [r + gamma * max_a' Q(s', a') - Q(s, a)]
        td_target = reward + gamma * np.max(q_table[new_a, new_b, :])
        q_table[a, b, action] = q_table[a, b, action] + learning_rate * (td_target - q_table[a, b, action])

        total_rewards += reward

        # The next observation becomes the current one
        obs = new_obs

        # Stop as soon as the environment reports the episode is over
        if done:
            break

    rewards.append(total_rewards)

print("Score over time: " + str(sum(rewards) / num_episodes))
print(q_table)

Score over time: -200.0
[[[   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [  -2.83250096   -2.8303605   -23.09629032]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]]

 [[   0.            0.            0.        ]
  [ -74.30328937  -74.29348477  -75.42267396]
  [ -96.58623106  -96.43869633  -97.41921928]
  [ -71.93514928  -69.37793539  -69.44103581]
  [   0.            0.            0.        ]]

 [[   0.            0.            0.        ]
  [ -73.88332869  -65.05378718 -100.55899245]
  [ -96.83674814  -96.86718647  -94.09283058]
  [ -75.9253324   -68.28018356  -68.82529825]
  [   0.            0.            0.        ]]

 [[   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]]

 [[   0.            0.            0.        ]
  