In [1]:
import gym
import random
import numpy
import time

# Environment
env = gym.make("Taxi-v3")

# Training parameters for Q learning
alpha = 1 # Learning rate
gamma = 0.9 # Future reward discount factor
num_of_episodes = 10000
num_of_steps = 1000 # upper bound on steps per episode

# Table dimensions are read from the environment instead of being hard-coded
# (Taxi-v3 exposes Discrete observation and action spaces).
num_of_states = env.observation_space.n
num_of_actions = env.action_space.n

# Q table of estimated returns, one row per state and one column per action.
# Initialised with random, strongly negative values so that any value learned
# from real experience compares favourably against an untried action.
Q_reward = -100000*numpy.random.random((num_of_states, num_of_actions))

for episode in range(num_of_episodes):
    state = env.reset()

    for steps in range(num_of_steps):

        # Explore the environment with uniformly random actions
        action = random.randrange(num_of_actions)
        next_state, reward, done, info = env.step(action)

        # The TimeLimit wrapper reports a step-limit cut-off through this info
        # key; in that case next_state is NOT a real terminal state and the
        # update must still bootstrap from it.
        truncated = info.get("TimeLimit.truncated", False)

        if done and not truncated:
            # Genuine terminal state: there is no future reward to add.
            target = reward
        else:
            # Best value currently known for the state we landed in
            target = reward + gamma * numpy.max(Q_reward[next_state])

        # Move the estimate for (state, action) towards the target
        old_value = Q_reward[state, action]
        Q_reward[state, action] = old_value + alpha * (target - old_value)

        state = next_state

        # Stop as soon as the episode is over; stepping a finished
        # environment without reset() is not supported by gym.
        if done:
            break
     
    

In [2]:
# Testing: follow the greedy policy from the learned Q table and report the
# average total reward and the average number of actions per episode.
num_of_test_runs = 10
max_test_steps = 50 # cap on steps per test episode
render_delay = 1 # seconds to pause after each rendered frame

rewards_sum = 0
actions_sum = 0
for run_times in range(num_of_test_runs):
    state = env.reset()
    tot_reward = 0
    actions = 0

    for t in range(max_test_steps):
        # Always take the action with the highest learned value
        action = numpy.argmax(Q_reward[state,:])
        actions += 1
        state, reward, done, info = env.step(action)
        tot_reward += reward
        env.render()
        time.sleep(render_delay)
        if done:
            break

    # Accumulate for every episode, finished or not. The averages below divide
    # by the number of runs, so skipping unfinished episodes would count them
    # as zero reward and zero actions and skew the result.
    print("Total reward %d" %tot_reward)
    rewards_sum += tot_reward
    actions_sum += actions

print("Average total reward: {}".format(float(rewards_sum)/num_of_test_runs))
print("Average number of actions: {}".format(float(actions_sum)/num_of_test_runs))

+---------+
|R: | : :[34;1mG[0m|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | : :[34;1mG[0m|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: |[43m [0m: :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | :[43m [0m:[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | : :[34;1m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | : 