<a href="https://colab.research.google.com/github/luxman21/ReinforcementLearning/blob/main/Taxi_V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
# Import the required packages
import numpy as np
import gym 
import random 


In [70]:
# Create the Taxi-v3 environment and draw its current board.
# NOTE(review): `.env` takes the inner environment of the object gym.make returns —
# presumably to bypass the wrapper's episode step limit; confirm for the installed gym version.
env = gym.make("Taxi-v3").env
env.render()


+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |B: |
+---------+



In [38]:
# Problem size: how many discrete actions and states the environment exposes.
actions, states = env.action_space.n, env.observation_space.n
print(f" Number of actions : {actions} and number of states {states}")

 Number of actions : 6 and number of states 500


In [None]:
# Display the environment's transition table: for each state and action it is expected to
# give the transition probability, the next state and the reward.
# NOTE(review): exact structure of env.P is not visible here — verify against the gym Taxi source.
# NOTE(review): this shows the table for every state at once, which produces very long output.
env.P

In [71]:
# Force the environment into state 328 and draw the resulting board.
env.s = 328
env.render()


# Take one randomly sampled action and print what env.step() hands back:
# the resulting state, the reward, the episode-finished flag and the info dict.
action = env.action_space.sample()
print(action)
state, reward, done, info = env.step(action)
print(f"{state}, {reward}, {done} , {info}")

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+

5
328, -10, False , {'prob': 1.0}


In [111]:
# Baseline: drive the taxi with uniformly random actions until the episode ends,
# counting steps and penalties and recording every frame for later replay.
# Implementation inspired by:
# https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/

# Start from a random state. randrange(n) draws from 0..n-1, whereas
# randint(0, n) can also return n, which is one past the last valid state index.
env.s = random.randrange(env.observation_space.n)

epochs = 0
penalties, reward = 0, 0

# One record per step (rendered board, state, action, reward)
frames = []

done = False
while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # A reward of -10 is counted as a penalty
    if reward == -10:
        penalties += 1

    # Put each rendered frame into a dict for animation
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
    )

    epochs += 1


print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

True
Timesteps taken: 1387
Penalties incurred: 438


In [None]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    """Print each recorded frame followed by its step number, state, action and reward.

    Args:
        frames: iterable of dicts holding the keys 'frame', 'state', 'action'
            and 'reward'. Step numbers are reported starting from 1.
    """
    for step_number, snapshot in enumerate(frames, start=1):
        print(snapshot['frame'])
        print(f"Timestep: {step_number}")
        print(f"State: {snapshot['state']}")
        print(f"Action: {snapshot['action']}")
        print(f"Reward: {snapshot['reward']}")
        
# Print every frame currently stored in `frames`, one after another.
print_frames(frames)

In [94]:
# Implementing Q-learning in Python: the Q-table has one row per state and
# one column per action, with every entry starting at zero.
q_table = np.zeros((env.observation_space.n, env.action_space.n))

In [117]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1  # learning rate
gamma = 0.8  # discount factor: near 0 favours immediate rewards, near 1 favours future rewards
epsilon = 0.5  # probability of exploring (random action) rather than exploiting the Q-table

# Per-episode metrics, kept for plotting
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()

    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore action space
        else:
            action = np.argmax(q_table[state])  # Exploit learned values

        next_state, reward, done, info = env.step(action)

        # Q-learning update: blend the old estimate with the observed reward
        # plus the discounted best value available from the next state.
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        # A reward of -10 is counted as a penalty
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    # Record this episode; without these appends the metric lists stay empty
    all_epochs.append(epochs)
    all_penalties.append(penalties)

    # Refresh the progress line every 100 episodes
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.

CPU times: user 2min, sys: 24.4 s, total: 2min 25s
Wall time: 1min 58s


In [118]:
# Show the learned values for state 328 (one entry per action).
q_table[328]


array([ -2.85251635,  -1.6445568 ,  -2.85251635,  -2.31564544,
       -11.31564544, -11.31564544])

In [127]:
"""Evaluate agent's performance after Q-learning"""

# Run the greedy policy (always the highest-valued action in the Q-table) and
# report the average steps and penalties per episode.
total_epochs, total_penalties = 0, 0
episodes = 1

# Step cap per episode. The environment is used without a step limit, so a greedy
# policy that never reaches the goal (e.g. an untrained Q-table) would loop forever.
# NOTE(review): 200 mirrors the episode limit Gym registers for Taxi-v3 — confirm.
max_eval_steps = 200

for _ in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0

    done = False
    frames = []  # for animation; reset each episode, so only the last episode is kept

    while not done and epochs < max_eval_steps:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        # A reward of -10 is counted as a penalty
        if reward == -10:
            penalties += 1

        # Put each rendered frame into a dict for animation
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward
            }
        )

        epochs += 1

    # Make a cut-off episode visible instead of silently folding it into the averages
    if not done:
        print(f"Episode stopped after {max_eval_steps} steps without finishing.")

    total_penalties += penalties
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 1 episodes:
Average timesteps per episode: 7.0
Average penalties per episode: 0.0


In [128]:
# Print every frame currently stored in `frames`, one after another.
print_frames(frames)

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)

Timestep: 1
State: 477
Action: 4
Reward: -1
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |B: |
+---------+
  (North)

Timestep: 2
State: 377
Action: 1
Reward: -1
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |B: |
+---------+
  (North)

Timestep: 3
State: 277
Action: 1
Reward: -1
+---------+
|R: | : :[35mG[0m|
| : | :[42m_[0m: |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)

Timestep: 4
State: 177
Action: 1
Reward: -1
+---------+
|R: | :[42m_[0m:[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)

Timestep: 5
State: 77
Action: 1
Reward: -1
+---------+
|R: | : :[35m[42mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)

Timestep: 6
State: 97
Action: 2
Reward: -1
+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | :