In [6]:
import numpy as np

# Taxi Problem

From visualization alone, we can infer what actions and states are possible: our agent needs to move in its environment to pick up and drop off the passengers. More formally, there are 6 discrete & deterministic actions:
* 0: move south;
* 1: move north;
* 2: move east;
* 3: move west;
* 4: pickup the passenger;
* 5: dropoff the passenger.

In [3]:
env = gym.make("Taxi-v3").env
env.reset()
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|Y| : |[35mB[0m: |
+---------+



In [12]:
#Taking the action and getting the reward and outcome state
new_state, reward, done, info = env.step(1)
print(new_state, reward, done, info)

env.render()

143 -1 False {'prob': 1.0}
+---------+
|[34;1mR[0m: | : :G|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)


In [4]:
#Getting the state space

print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

Action Space Discrete(6)
State Space Discrete(500)


In [7]:
# Step 1: Initialize Q table
Q = np.zeros((env.observation_space.n, env.action_space.n))
Q

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [84]:
def exploration_policy():
    #list of actions
    action_list = [0,1,2,3,4,5]
    #random policy
    return np.random.choice(action_list)

exploration_policy()

5

In [72]:
env.action_space.sample()

2

In [97]:
max_inter = 10000
alpha0 = 0.05 # initial learning rate
decay = 0.005 # learning rate decay
discount_factor = 0.9 # discounting factor
state = 0 # initial state
total_training_rewards = 0

for iteraction in range(0, max_inter):
    action = exploration_policy()
    new_state, reward, done, info = env.step(action)
    
    Q[state, action] = Q[state, action] + alpha*(reward+discount_factor*np.max(Q[new_state, :]) - Q[state, action]) 
    
    #Ending the episode
    if done == True:
        print ("Total reward for episode {}: {}".format(iteraction, total_training_rewards))
        break

    #Increasing our total reward and updating the state
    total_training_rewards += reward      
    state = new_state   



Total reward for episode 15: -33


In [98]:
env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)


In [100]:
print(Q)

[[-0.06320117  0.          0.         -0.14662019 -0.51209587 -0.98839979]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.        ]
 ...
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.         -0.0970273  -0.04975124 -0.09840293 -1.41951316 -2.63752456]]
