### Discretizing continuous observations

In [12]:
import gym
import numpy as np

In [13]:
n_states = 5
eta = 0.85
gamma =  1.0

In [14]:
env = gym.make('MountainCar-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m




In [15]:
obs = env.reset()

In [16]:
print(env.observation_space)

print("pos  | vel")
print(obs)
print(env.observation_space.high)
print(env.observation_space.low)

Box(2,)
pos  | vel
[-0.58819075  0.        ]
[0.6  0.07]
[-1.2  -0.07]


In [17]:
# 2d cell defining the 
env_dx = (env.observation_space.high - env.observation_space.low) / n_states
print(env_dx)

[0.36  0.028]


In [18]:
def obs_to_state(env, obs):
    """ Maps an observation to state """
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_dx = (env_high - env_low) / n_states
    a = int((obs[0] - env_low[0]) / env_dx[0])
    b = int((obs[1] - env_low[1]) / env_dx[1])
    return a, b

In [19]:
obs_to_state(env, obs)

(1, 2)

In [20]:
q_table = np.zeros((n_states, n_states, 3))

for i in range(2000):
    a, b  = obs_to_state(env, obs)
    action = env.action_space.sample()
    obs, reward, done, _ = env.step(action)
    a_, b_  = obs_to_state(env, obs)
#     print(obs, ' -> ', a, b)
    q_table[a][b][action] = q_table[a][b][action] + eta * (reward + gamma * np.max(q_table[a_][b_]) - q_table[a][b][action])
    
#     print(q_table)

In [21]:
np.set_printoptions(threshold=np.nan)
print(q_table)

[[[  0.           0.           0.        ]
  [ -1.9683875   -0.9775      -3.61980517]
  [ -7.30099799  -5.58831711  -5.58800825]
  [-16.22372493  -2.06732036 -16.2677718 ]
  [  0.           0.           0.        ]]

 [[  0.           0.           0.        ]
  [ -8.40165625  -7.57127839  -9.56525879]
  [ -7.43496392  -7.27704269  -9.2092471 ]
  [-10.91216559 -11.17033173  -9.17792242]
  [  0.           0.           0.        ]]

 [[  0.           0.           0.        ]
  [ -6.29516595  -6.64348737  -5.31207127]
  [ -6.09274058  -9.41165055  -8.71150261]
  [ -7.87597554  -8.61575787  -9.46924674]
  [  0.           0.           0.        ]]

 [[  0.           0.           0.        ]
  [  0.           0.           0.        ]
  [ -7.25893916  -1.00131879  -1.84711986]
  [  0.           0.           0.        ]
  [  0.           0.           0.        ]]

 [[  0.           0.           0.        ]
  [  0.           0.           0.        ]
  [  0.           0.           0.        ]
  [

In [22]:
q_table

array([[[  0.        ,   0.        ,   0.        ],
        [ -1.9683875 ,  -0.9775    ,  -3.61980517],
        [ -7.30099799,  -5.58831711,  -5.58800825],
        [-16.22372493,  -2.06732036, -16.2677718 ],
        [  0.        ,   0.        ,   0.        ]],

       [[  0.        ,   0.        ,   0.        ],
        [ -8.40165625,  -7.57127839,  -9.56525879],
        [ -7.43496392,  -7.27704269,  -9.2092471 ],
        [-10.91216559, -11.17033173,  -9.17792242],
        [  0.        ,   0.        ,   0.        ]],

       [[  0.        ,   0.        ,   0.        ],
        [ -6.29516595,  -6.64348737,  -5.31207127],
        [ -6.09274058,  -9.41165055,  -8.71150261],
        [ -7.87597554,  -8.61575787,  -9.46924674],
        [  0.        ,   0.        ,   0.        ]],

       [[  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [ -7.25893916,  -1.00131879,  -1.84711986],
        [  0.        ,   0.        ,   0.        ],
      

In [24]:
q_table[a, b, :]

array([-10.91216559, -11.17033173,  -9.17792242])

In [25]:
q_table.shape

(5, 5, 3)

In [34]:
n_actions = 3
n_observation_space = 2
shape = (n_states, n_states, n_actions)
q_matrix = np.random.rand(n_states, n_states, n_actions)

print(q_matrix.shape)
assert shape == q_matrix.shape

observation = np.random.rand(n_observation_space)


(5, 5, 3)


In [44]:
observation = env.reset()
a,b = obs_to_state(env, observation)
print(a,b)
state = [a,b]

1 2


In [45]:
q_matrix[state, :]

array([[[0.7791643 , 0.29845775, 0.08873795],
        [0.27661256, 0.44125511, 0.53955589],
        [0.78826117, 0.64809392, 0.49591479],
        [0.87517358, 0.27103591, 0.77428262],
        [0.6654274 , 0.79757156, 0.88553725]],

       [[0.5327819 , 0.41758545, 0.38779637],
        [0.74443818, 0.14901641, 0.99723817],
        [0.5390338 , 0.48326593, 0.91956903],
        [0.60856221, 0.92970324, 0.44436295],
        [0.92991791, 0.70910561, 0.93624505]]])

### Implement Q-learning

In [126]:
import random

num_episodes = 100
num_timesteps = 1000

learning_rate = 0.85

max_eps = 1.0
min_eps = 0.01
eps_decay_rate = 0.01

In [127]:
#epsilon decay
ep = 0
min_eps + (max_eps - min_eps) * np.exp(-eps_decay_rate * ep)

1.0

In [128]:
# Init q-table
q_table = np.zeros((n_states, n_states, 3))

In [133]:
# List of rewards
rewards = []

for episode in range(num_episodes):
    # Reset the environment
    obs = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    epsilon = 1.0
    
    for step in range(num_timesteps):
        
        # discretize observation to space
        a, b = obs_to_state(env, obs)
        
        # Choose an action a in the current world state (s)
        
        ## If random number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
            
        # Else doing a random choice --> exploration
        else:
            action = np.argmax(q_table[a, b ,:])

        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_obs, reward, done, info = env.step(action)
        
        new_a, new_b = obs_to_state(env, new_obs)

        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        # qtable[new_state,:] : all the actions we can take from new state
        q_table[a, b, action] = q_table[a, b, action] + learning_rate * (reward + gamma * np.max(q_table[new_a, new_b, :]) - q_table[a, b, action])
        
        total_rewards += reward
        
        # Our new state is state
        obs = new_obs
        
        # If done (if we're dead) : finish episode
        if done == True: 
            break
        
    episode += 1
    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_eps + (max_eps - min_eps) * np.exp(-eps_decay_rate * episode) 
    rewards.append(total_rewards)

print ("Score over time: " +  str(sum(rewards)/num_episodes))
print(q_table)

Score over time: -200.0
[[[   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [  -2.83250096   -2.8303605   -23.09629032]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]]

 [[   0.            0.            0.        ]
  [ -74.30328937  -74.29348477  -75.42267396]
  [ -96.58623106  -96.43869633  -97.41921928]
  [ -71.93514928  -69.37793539  -69.44103581]
  [   0.            0.            0.        ]]

 [[   0.            0.            0.        ]
  [ -73.88332869  -65.05378718 -100.55899245]
  [ -96.83674814  -96.86718647  -94.09283058]
  [ -75.9253324   -68.28018356  -68.82529825]
  [   0.            0.            0.        ]]

 [[   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]]

 [[   0.            0.            0.        ]
  