Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Taxi".

In [14]:
import numpy as np
import random
from taxi_env_extended import TaxiEnvExtended

In [15]:
# Instantiate the extended Taxi environment imported from `taxi_env_extended`;
# later cells refer to it through the notebook-level name `env`.
env = TaxiEnvExtended()

Obtener la cantidad de estados y acciones

In [16]:
# Sizes of the environment's action and observation spaces; they are used
# below as the two dimensions of the Q-table.
actions, states = env.action_space.n, env.observation_space.n

Inicialización de la tabla Q

In [17]:
# Start from an all-zero table with one row per state and one column per action.
Q = np.zeros(shape=(states, actions))
Q

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

Obtención de la acción a partir de la tabla Q

In [18]:
def optimal_policy(state, Q):
    """Return the greedy action for ``state``.

    Looks up the row ``Q[state]`` and returns the index of its largest entry
    as computed by ``np.argmax`` (the first such index when there are ties).
    """
    return np.argmax(Q[state])

Epsilon-Greedy Policy

In [19]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index is returned
    (exploration); otherwise the index of the largest entry of ``Q[state]``
    is returned (exploitation).

    The random action is drawn from the number of entries in ``Q[state]``
    rather than from a notebook-level ``env``, so the function depends only
    on its own arguments.

    Args:
        state: Row index into ``Q``.
        Q: Table of action values indexed as ``Q[state][action]``.
        epsilon: Probability of exploring; must lie in ``[0, 1]``.

    Returns:
        The selected action index.
    """
    # A single Bernoulli draw decides between exploring and exploiting.
    explore = np.random.binomial(1, epsilon)
    if explore:
        return np.random.randint(len(Q[state]))
    return np.argmax(Q[state])

Ejemplo de episodio 

Implemento el algoritmo Q-learning:

In [20]:
def q_learning(env, num_episodes, alpha, gamma, epsilon_start, epsilon_end, epsilon_decay):
    """Train a tabular Q-learning agent on ``env`` and return its Q-table.

    Actions are chosen with ``epsilon_greedy_policy``. After every episode
    ``epsilon`` is multiplied by ``epsilon_decay`` and clipped from below at
    ``epsilon_end``. A progress line is printed every 100 episodes.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``,
            ``reset()`` returning ``(state, info)`` and ``step(action)``
            returning a 5-tuple. The 3rd and 4th items are treated as the
            Gymnasium ``terminated`` / ``truncated`` flags.
            NOTE(review): confirm that TaxiEnvExtended follows that convention.
        num_episodes: Number of training episodes.
        alpha: Learning rate applied to the TD error.
        gamma: Discount factor applied to the next state's value.
        epsilon_start: Exploration probability for the first episode.
        epsilon_end: Lower bound for the exploration probability.
        epsilon_decay: Multiplicative decay applied to epsilon per episode.

    Returns:
        A ``numpy`` array of shape ``(n_states, n_actions)`` with the learned
        action values.
    """
    # Size the table from the environment that was passed in, not from
    # notebook-level globals, so the function works with any discrete env.
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states, n_actions))
    epsilon = epsilon_start

    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            action = epsilon_greedy_policy(state, Q, epsilon)
            next_state, reward, terminated, truncated, _ = env.step(action)

            # A terminal state has no future return, so only bootstrap from
            # the next state's best value when the episode did not terminate.
            future_value = 0.0 if terminated else np.max(Q[next_state])
            td_target = reward + gamma * future_value
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            state = next_state
            total_reward += reward
            # Stop on either flag; ignoring truncation would keep stepping an
            # episode the environment has already cut off.
            done = terminated or truncated

        # Decay exploration once per episode, never going below epsilon_end.
        epsilon = max(epsilon_end, epsilon * epsilon_decay)
        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}/{num_episodes} - Total Reward: {total_reward}, Epsilon: {epsilon}")

    return Q

Entreno el modelo:

In [21]:
# Train the agent; epsilon decays multiplicatively from 1.0 towards the 0.1 floor.
Q = q_learning(
    env,
    num_episodes=10000,
    alpha=0.1,
    gamma=0.95,
    epsilon_start=1.0,
    epsilon_end=0.1,
    epsilon_decay=0.999,
)

Episode 100/10000 - Total Reward: -623, Epsilon: 0.9047921471137096
Episode 200/10000 - Total Reward: -623, Epsilon: 0.818648829478636
Episode 300/10000 - Total Reward: -464, Epsilon: 0.7407070321560997
Episode 400/10000 - Total Reward: -333, Epsilon: 0.6701859060067403
Episode 500/10000 - Total Reward: -515, Epsilon: 0.6063789448611848
Episode 600/10000 - Total Reward: -220, Epsilon: 0.5486469074854965
Episode 700/10000 - Total Reward: -299, Epsilon: 0.4964114134310989
Episode 800/10000 - Total Reward: -38, Epsilon: 0.4491491486100748
Episode 900/10000 - Total Reward: -99, Epsilon: 0.4063866225452039
Episode 1000/10000 - Total Reward: -90, Epsilon: 0.3676954247709635
Episode 1100/10000 - Total Reward: -88, Epsilon: 0.33268793286240766
Episode 1200/10000 - Total Reward: -30, Epsilon: 0.3010134290933992
Episode 1300/10000 - Total Reward: -5, Epsilon: 0.27235458681947705
Episode 1400/10000 - Total Reward: -5, Epsilon: 0.24642429138466176
Episode 1500/10000 - Total Reward: -1, Epsilon: 0.

Evalúo la política óptima:

In [22]:
def evaluate_policy(env, Q, num_episodes=100, max_steps=None):
    """Run the greedy policy derived from ``Q`` and report its mean return.

    Each episode always takes ``np.argmax(Q[state])`` (no exploration) until
    the environment reports termination or truncation, or until ``max_steps``
    steps have been taken when that limit is given.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns a 5-tuple. The 3rd and 4th items are
            treated as the Gymnasium ``terminated`` / ``truncated`` flags.
            NOTE(review): confirm that TaxiEnvExtended follows that convention.
        Q: Table of action values indexed as ``Q[state]``.
        num_episodes: Number of evaluation episodes to average over.
        max_steps: Optional per-episode step cap. ``None`` (the default)
            means no cap. A deterministic greedy policy can cycle forever in
            an environment that never truncates, so pass a cap when
            evaluating a table that may be under-trained.

    Returns:
        The mean of the per-episode total rewards (also printed).
    """
    total_rewards = []
    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0
        steps = 0

        while not done:
            action = np.argmax(Q[state])
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            steps += 1
            # End on either environment flag, or on the optional safety cap.
            hit_cap = max_steps is not None and steps >= max_steps
            done = terminated or truncated or hit_cap

        total_rewards.append(total_reward)

    average_reward = np.mean(total_rewards)
    print(f"Average Reward over {num_episodes} episodes: {average_reward}")
    return average_reward

In [23]:
# Score the greedy policy induced by the trained Q-table, using the
# function's default number of evaluation episodes.
average_reward = evaluate_policy(env, Q)

Average Reward over 100 episodes: 8.13
