In [3]:
import numpy as np
import gym
import random, tempfile
from gym import wrappers

In [4]:
env = gym.make("LunarLander-v2")
tdir = tempfile.mkdtemp()
env = wrappers.Monitor(env, tdir, force=True, video_callable=False)
env.render()

ModuleNotFoundError: No module named 'Box2D'

In [3]:
action_size = env.action_space.n
print ("Action size ", action_size)

state_size = env.observation_space.n
print ("State size ", state_size)

Action size  6
State size  500


In [4]:
qtable = np.zeros((state_size, action_size))
print (qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [104]:
total_episodes = 1000000
total_test_episodes = 100 

learning_rate = 0.1
gamma = 0.9

# exploration parameter
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 1./total_episodes

In [106]:
for episode in range(total_episodes):
    # reset the environment
    state = env.reset()
    step = 0
    done = False
    
    while True:
        
        # choose an action a in the corrent world state
        exp_exp_tradeoff = random.uniform(0,1)
        
        # if greater than epsilon --> exploit
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state, :])
            
        # else choose exploration
        else:
            action = env.action_space.sample()
            
#         action = env.action_space.sample()
        
        # take action (a) and observe the outcome state (s') and reward (r)    
        new_state, reward, done, info = env.step(action)
    
        # update Q(s,a) := Q(s,a) + lr [R(s,a) + gamma * max(Q (s', a') - Q(s,a))]
        if not done:
            qtable[state, action] = qtable[state, action] + learning_rate*(reward + gamma*np.max(qtable[new_state, :]) - qtable[state, action])

        else:
            qtable[state, action] = qtable[state,action] + learning_rate*(reward - qtable[state,action])
#             print (state, action)
        
        # change state
        state = new_state
        
        # is it Done
        if done:
            break
            
    
    # reduce epsilon 
    epsilon = max(max_epsilon -  decay_rate * episode, min_epsilon) 
#     print (epsilon)

In [118]:
qtable[83,5]

-12.823266037160561

In [79]:
env

<TimeLimit<TaxiEnv<Taxi-v2>>>

In [119]:
env.reset()
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    step = 0 
    done = False
    total_rewards = 0
    print ("****************************************")
    print ("EPISODE ", episode)
    
    for step in range(max_steps):
        
        env.render()
        # take action (index) that have the max expected future reward given state
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        total_rewards += reward
        
        if done:
            rewards.append(total_rewards)
            print ("Score", total_rewards)
            break
            
        state = new_state
env.close()
print ("Score over time: " + str(sum(rewards)/total_test_episodes))

****************************************
EPISODE  0
+---------+
|R: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+

+---------+
|R: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[42mY[0m| : |[35mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
|[42m_[0m| : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : : : : |
|[42m_[0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : : : : |
| :[42m_[0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : : : : |
| : :[42m_[0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : :