In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
from tqdm import tqdm
import pandas as pd
import time
import pygetwindow as gw

# Create the training environment (no render mode is requested here; the
# rendered environment is created separately in the evaluation cell).
env = gym.make("Taxi-v3")

# Illustrative GIF of the Taxi task.
# NOTE(review): this is an absolute path on the author's machine. Consider
# moving the GIF next to the notebook and using a relative path.
TAXI_GIF_PATH = r'C:\Users\Lenovo\Desktop\uniowo\7mo semestre\SIS420\labs\lab7\taxi.gif'

# Image(filename=...) reads the file eagerly, so a missing file would raise
# here and abort "Run All". The picture is purely decorative, so degrade
# gracefully instead of failing the whole notebook.
try:
    taxi_preview = Image(filename=TAXI_GIF_PATH)
except OSError as err:
    taxi_preview = None
    print(f"Could not load the preview image ({err}); continuing without it.")

# Last expression of the cell: rendered by Jupyter when the image was loaded,
# shows nothing when it is None.
taxi_preview


In [None]:
# Build the Q-table: one row per discrete state and one column per action,
# with every entry starting at zero.
num_states, num_actions = env.observation_space.n, env.action_space.n
Q = np.zeros(shape=(num_states, num_actions))

# Total reward of each training episode, appended by the training loop.
rewards = list()

In [None]:
# Training hyperparameters.
# These names were used by the loop below but not defined in any visible cell,
# so a fresh "Restart & Run All" failed with NameError. They are defined here
# so the cell is self-contained; tune as needed.
n_episodios = 5000  # number of training episodes
max_steps = 200     # per-episode step cap (Taxi-v3's documented episode limit is 200 steps)
alpha = 0.1         # learning rate of the incremental update
gamma = 0.99        # discount factor applied to the next state's best value

# Training with the incremental (temporal-difference) Q-learning update.
# Reads `env`, `Q` and `rewards` from earlier cells and mutates `Q` and
# `rewards` in place, so re-running this cell continues training.
for episodio in tqdm(range(n_episodios)):
    state, _ = env.reset()
    total_reward = 0

    for _step in range(max_steps):
        # Greedy action selection (no explicit exploration).
        # NOTE(review): with a zero-initialised Q-table this presumably relies
        # on Taxi's negative per-step rewards to make untried actions look
        # better than tried ones; consider epsilon-greedy if learning stalls.
        action = np.argmax(Q[state])

        # Apply the action. `_info` is named explicitly so it no longer
        # overwrites the loop variable, as the shared `_` did before.
        next_state, reward, terminated, truncated, _info = env.step(action)
        done = terminated or truncated

        # Incremental update. A terminal state has no future return, so do not
        # bootstrap from it; a merely truncated episode still bootstraps.
        best_next = 0.0 if terminated else np.max(Q[next_state])
        td_target = reward + gamma * best_next
        Q[state, action] += alpha * (td_target - Q[state, action])

        total_reward += reward
        state = next_state

        if done:
            break

    rewards.append(total_reward)
    # Periodic progress report every 500 episodes.
    if episodio % 500 == 0:
        print(f"Episodio {episodio}, recompensa total: {total_reward}")

env.close()

In [None]:
# Average rewards over consecutive blocks of episodes (100 by default)
def suavizar(lista, bloque=100):
    """Return the mean of each consecutive chunk of `bloque` items in `lista`.

    The final chunk may be shorter than `bloque` when the length is not an
    exact multiple; it is still averaged over the items it has. An empty
    input yields an empty list.
    """
    promedios = []
    for inicio in range(0, len(lista), bloque):
        tramo = lista[inicio:inicio + bloque]
        promedios.append(np.mean(tramo))
    return promedios

# First, the learning curve: block-averaged episode rewards from training.
fig, ax = plt.subplots()
ax.plot(suavizar(rewards), label="Recompensa promedio por bloque (100)")
ax.set(
    xlabel="Bloque de 100 episodios",
    ylabel="Recompensa promedio",
    title="Q-Learning Incremental en Taxi-v3",
)
ax.grid(True)
ax.legend()
plt.show()


In [None]:
# Then, the Q-table: wrap the learned array in a DataFrame with one labelled
# column per action and the state number as the index.
df_q = pd.DataFrame(
    Q,
    columns=["South", "North", "East", "West", "Pickup", "Dropoff"],
).rename_axis("State")

# Show only the first 10 states to keep the output short.
print("Q-table:", df_q.head(10), sep="\n")


In [None]:
# Visual evaluation: run one episode with the greedy policy read from `Q`
# in a rendered environment and report the total reward.
test_env = gym.make("Taxi-v3", render_mode="human")
total_reward = 0
done = False

# try/finally guarantees the render window is closed even if the rollout
# raises or the user interrupts the kernel mid-episode.
try:
    state, _ = test_env.reset()

    # Presumably gives the render window time to appear before focusing it.
    time.sleep(1)

    # Best-effort: bring the first window whose title contains "Taxi" to the
    # front. Any failure here is reported and the evaluation continues.
    try:
        taxi_windows = gw.getWindowsWithTitle("Taxi")
        if taxi_windows:
            taxi_windows[0].activate()
    except Exception as e:
        print("No se pudo enfocar la ventana:", e)

    # Follow the greedy policy until the episode terminates or is truncated.
    while not done:
        action = np.argmax(Q[state])
        state, reward, terminated, truncated, _ = test_env.step(action)
        total_reward += reward
        done = terminated or truncated
finally:
    test_env.close()

print("Recompensa total del test:", total_reward)