Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Taxi".

In [278]:
import numpy as np
import random
from taxi_env_extended import TaxiEnvExtended

In [279]:
# Single environment instance shared by the cells below (space sizes, training, evaluation).
# NOTE(review): TaxiEnvExtended is project-local; from its usage here it appears to follow
# the Gymnasium reset()/step() contract — confirm against taxi_env_extended.
env = TaxiEnvExtended()

Obtener la cantidad de estados y acciones

In [280]:
# Sizes of the discrete action and observation spaces; together they fix the Q-table shape.
actions = env.action_space.n
states = env.observation_space.n

Inicialización de la tabla Q

In [281]:
# One row per state and one column per action; every estimate starts at zero.
Q = np.zeros((states, actions))
# Bare trailing expression so the notebook renders the table as the cell output.
Q

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

Obtención de la acción a partir de la tabla Q

In [282]:
def optimal_policy(state, Q):
    """Return the greedy action for ``state``.

    Args:
        state: Row index into ``Q``.
        Q: Table of action values indexed as ``Q[state][action]``.

    Returns:
        The index of the largest entry in ``Q[state]``; ties resolve to the
        lowest index, as ``np.argmax`` returns the first maximum.
    """
    return np.argmax(Q[state])

Epsilon-Greedy Policy

In [283]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index is returned
    (explore); otherwise the action with the largest value in ``Q[state]`` is
    returned (exploit).

    The number of available actions is read from the width of ``Q[state]``, so
    the function depends only on its arguments and NumPy's global random state
    rather than on a notebook-level environment variable.

    Args:
        state: Row index into ``Q``.
        Q: Table of action values indexed as ``Q[state][action]``.
        epsilon: Exploration probability, expected to lie in ``[0, 1]``.

    Returns:
        An integer action index in ``range(len(Q[state]))``.
    """
    action_values = Q[state]

    # np.random.random() draws from [0, 1): epsilon=0 never explores, epsilon=1 always does.
    if np.random.random() < epsilon:
        # explore: uniform over all action indices
        return np.random.randint(len(action_values))

    # exploit: greedy action under the current estimates
    return np.argmax(action_values)

Ejemplo de episodio 

Implemento el algoritmo Q-learning:

In [284]:
def q_learning(env, num_episodes=5000, alpha=0.05, gamma=0.99, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.9995):
    """Learn a tabular action-value function on ``env`` with Q-learning.

    Each step applies the update
    ``Q[s][a] += alpha * (r + gamma * max_a' Q[s'][a'] - Q[s][a])``,
    where the bootstrap term is dropped when the transition is terminal.
    Actions are selected by ``epsilon_greedy_policy(state, Q, epsilon)``, and
    ``epsilon`` is multiplied by ``epsilon_decay`` after every episode, never
    going below ``epsilon_end``.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``,
            ``reset()`` returning ``(state, info)`` and ``step(action)``
            returning a 5-tuple.
        num_episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon_start: Exploration probability for the first episode.
        epsilon_end: Lower bound for the exploration probability.
        epsilon_decay: Per-episode multiplicative decay applied to epsilon.

    Returns:
        The learned table as an array of shape
        ``(env.observation_space.n, env.action_space.n)``.
    """
    # Size the table from the environment that was passed in, not from notebook globals.
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    epsilon = epsilon_start

    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            action = epsilon_greedy_policy(state, Q, epsilon)
            # NOTE(review): assumes the Gymnasium step contract
            # (obs, reward, terminated, truncated, info) — confirm for TaxiEnvExtended.
            next_state, reward, terminated, truncated, _ = env.step(action)
            # A truncated episode must end too, otherwise this loop would never exit.
            done = terminated or truncated

            # A terminal state has no future return, so do not bootstrap from it.
            # On truncation the episode was merely cut short, so bootstrapping still applies.
            future_value = 0.0 if terminated else np.max(Q[next_state])
            td_target = reward + gamma * future_value
            Q[state][action] += alpha * (td_target - Q[state][action])

            state = next_state
            total_reward += reward

        epsilon = max(epsilon_end, epsilon * epsilon_decay)  # decay epsilon
        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}/{num_episodes} - Total Reward: {total_reward}, Epsilon: {epsilon}")

    return Q

Entreno el modelo:

In [304]:
# Train with a higher learning rate, a lower discount factor and a faster epsilon decay
# than q_learning's defaults; the returned table replaces the zero-initialised global Q.
Q = q_learning(env, num_episodes=5000, alpha=0.15, gamma=0.95, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.999)

Episode 100/5000 - Total Reward: -803, Epsilon: 0.9047921471137096
Episode 200/5000 - Total Reward: -623, Epsilon: 0.818648829478636
Episode 300/5000 - Total Reward: -686, Epsilon: 0.7407070321560997
Episode 400/5000 - Total Reward: -361, Epsilon: 0.6701859060067403
Episode 500/5000 - Total Reward: -50, Epsilon: 0.6063789448611848
Episode 600/5000 - Total Reward: -133, Epsilon: 0.5486469074854965
Episode 700/5000 - Total Reward: -62, Epsilon: 0.4964114134310989
Episode 800/5000 - Total Reward: -91, Epsilon: 0.4491491486100748
Episode 900/5000 - Total Reward: -43, Epsilon: 0.4063866225452039
Episode 1000/5000 - Total Reward: -87, Epsilon: 0.3676954247709635
Episode 1100/5000 - Total Reward: 13, Epsilon: 0.33268793286240766
Episode 1200/5000 - Total Reward: -44, Epsilon: 0.3010134290933992
Episode 1300/5000 - Total Reward: 0, Epsilon: 0.27235458681947705
Episode 1400/5000 - Total Reward: -14, Epsilon: 0.24642429138466176
Episode 1500/5000 - Total Reward: -6, Epsilon: 0.22296276370290227


Evalúo la política óptima:

In [305]:
def evaluate_policy(env, Q, num_episodes=100, max_steps=None):
    """Run the greedy policy derived from ``Q`` and report its mean episode return.

    An episode ends when the environment reports termination or truncation, or
    when ``max_steps`` steps have been taken (if a cap is given). The cap guards
    against a greedy policy that cycles forever in an environment without its
    own time limit.

    Args:
        env: Environment with ``reset()`` returning ``(state, info)`` and
            ``step(action)`` returning a 5-tuple.
        Q: Table of action values indexed as ``Q[state][action]``.
        num_episodes: Number of evaluation episodes to average over.
        max_steps: Optional per-episode step cap; ``None`` (default) means no cap.

    Returns:
        The mean of the per-episode total rewards (also printed).
    """
    total_rewards = []
    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0
        steps = 0

        while not done:
            action = np.argmax(Q[state])
            # NOTE(review): assumes the Gymnasium step contract
            # (obs, reward, terminated, truncated, info) — confirm for TaxiEnvExtended.
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            steps += 1

            hit_cap = max_steps is not None and steps >= max_steps
            done = terminated or truncated or hit_cap

        total_rewards.append(total_reward)

    average_reward = np.mean(total_rewards)
    print(f"Average Reward over {num_episodes} episodes: {average_reward}")
    return average_reward

In [313]:
# Evaluate the greedy policy of the trained Q over the default 100 episodes and keep the mean return.
average_reward = evaluate_policy(env, Q)

Average Reward over 100 episodes: 7.44
