In [1]:
import gym
import random
import tensorflow as tf
from collections import deque


# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras

In [2]:
# Hiperparámetros
epsilon = 1.0
epsilon_decay = 0.999
epsilon_min = 0.01
gamma = 0.99
learning_rate = 0.001
batch_size = 32
memory = deque(maxlen=100000)

In [3]:
# Crea el entorno
env = gym.make('CliffWalking-v0')

input_shape = [4] # == env.observation_space.shape
n_outputs = 2 # == env.action_space.n

In [5]:
# Define la red neuronal
model = keras.models.Sequential([
    keras.layers.Dense(32, activation="elu", input_shape=input_shape),
    keras.layers.Dense(32, activation="elu"),
    keras.layers.Dense(n_outputs)
])

# Compila el modelo
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=learning_rate))



In [6]:
# Función para seleccionar una acción
def select_action(state):
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    else:
        return tf.argmax(model.predict(state)[0]).numpy()

# Función para entrenar el modelo
def train_model():
    if len(memory) < batch_size:
        return
    
    # Obtén una muestra aleatoria de la memoria
    batch = random.sample(memory, batch_size)
    
    for state, action, reward, next_state, done in batch:
        if done:
            target = reward
        else:
            target = reward + gamma * tf.reduce_max(model.predict(next_state)[0]).numpy()
        
        target_f = model.predict(state)
        target_f[0][action] = target
        
        # Entrena el modelo con la muestra
        model.fit(state, target_f, epochs=1, verbose=0)
    
    # Reduce la probabilidad de seleccionar una acción aleatoria
    global epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

In [7]:
episode_scores = []
# Entrena al agente
for episode in range(1000):
    state = env.reset()
    state = tf.reshape(state, [1, env.observation_space.shape[0]])
    
    for time_step in range(500):
        env.render()
        
        action = select_action(state)
        
        next_state, reward, done, info = env.step(action)
        next_state = tf.reshape(next_state, [1, env.observation_space.shape[0]])
        
        memory.append((state, action, reward, next_state, done))
        state = next_state
        
        train_model()

        episode_scores.append(time_step+1)
        
        if done:
            print("Episodio {} completado en {} pasos. Epsilon: {:.2}".format(episode, time_step+1, epsilon))
            break
        
env.close()


IndexError: tuple index out of range

In [None]:
import matplotlib.pyplot as plt

# Pinta el gráfico
plt.plot(episode_scores)
plt.title('Aprendizaje del agente')
plt.xlabel('Episodio')
plt.ylabel('Puntaje total')
plt.show()

In [None]:
# Del profe, revisar si funciona
env.seed(42)
state = env.reset()

frames = []

for step in range(200):
    action = epsilon_greedy_policy(state)
    state, reward, done, info = env.step(action)
    if done:
        break
    img = env.render(mode="rgb_array")
    frames.append(img)
    
plot_animation(frames)

In [4]:
!pip install matplotlib==3.6.0

Collecting matplotlib==3.6.0
  Using cached matplotlib-3.6.0-cp310-cp310-win_amd64.whl (7.2 MB)
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.7.1
    Uninstalling matplotlib-3.7.1:
      Successfully uninstalled matplotlib-3.7.1
Successfully installed matplotlib-3.6.0


---


In [10]:
import numpy as np
from collections import deque
import random

# Define los parámetros del modelo
n_episodes = 100         # número de episodios para entrenar
n_steps = 100             # número máximo de pasos por episodio
memory_size = 10000       # tamaño máximo de la memoria de repetición
batch_size = 32           # tamaño del lote para el aprendizaje por lotes
gamma = 0.95              # factor de descuento para las recompensas
epsilon_start = 1.0       # valor inicial de epsilon para la exploración
epsilon_end = 0.01        # valor final de epsilon para la exploración
epsilon_decay = 0.995     # tasa de decaimiento de epsilon por episodio

# Inicializa la memoria de repetición
memory = deque(maxlen=memory_size)

# Inicializa el valor de epsilon
epsilon = epsilon_start

# Itera a través de los episodios de entrenamiento
for episode in range(n_episodes):

    # Resetea el entorno para el nuevo episodio
    state = np.array(env.reset())
    done = False
    total_reward = 0
    
    # Itera a través de los pasos del episodio
    for step in range(n_steps):
        
        # Elige una acción usando la política epsilon-greedy
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            
            q_values = model.predict(state.reshape(1, -1))
            action = np.argmax(q_values[0])
            
        # Ejecuta la acción y observa el resultado
        next_state, reward, done, _ = env.step(action)
        next_state = np.array(next_state)
        total_reward += reward
        
        # Añade la experiencia a la memoria de repetición
        memory.append((state, action, reward, next_state, done))
        
        # Actualiza el estado actual
        state = next_state
        
        # Realiza el aprendizaje por lotes si la memoria de repetición es lo suficientemente grande
        if len(memory) >= batch_size:
            batch = np.array(random.sample(memory, batch_size))
            states = np.array(batch[:, 0].tolist())
            actions = np.array(batch[:, 1].tolist())
            rewards = np.array(batch[:, 2].tolist())
            next_states = np.array(batch[:, 3].tolist())
            dones = np.array(batch[:, 4].tolist())
            
            targets = model.predict(states)
            q_next = np.amax(model.predict(next_states), axis=1)
            targets[np.arange(batch_size), actions] = rewards + gamma * (1 - dones) * q_next
            
            model.fit(states, targets, epochs=1, verbose=0)
            
        # Sal del bucle si el episodio ha terminado
        if done:
            break
      # Lista para almacenar las sumas de las recompensas por episodio
    episode_rewards = []

    # Sal del bucle si el episodio ha terminado
    if done:
    # Agrega la suma de las recompensas para este episodio a la lista
        episode_rewards.append(total_reward)
    # Imprime los resultados del episodio
        print("Episodio {}, Total Reward: {}, Epsilon: {:.4f}".format(episode, total_reward, epsilon))

    # Imprime la suma de las recompensas para los últimos 10 episodios
    if episode > 0 and episode % 10 == 0:
        print("Suma de recompensas de los últimos 10 episodios:", sum(episode_rewards[-10:]))

            
    # Reduce el valor de epsilon después de cada episodio
    epsilon = max(epsilon_end, epsilon * epsilon_decay)
    
    # Imprime los resultados del episodio
    print("Episodio {}, Total Reward: {}, Epsilon: {:.4f}".format(episode, total_reward, epsilon))

ValueError: in user code:

    File "C:\Users\Maria\AppData\Roaming\Python\Python38\site-packages\keras\engine\training.py", line 2169, in predict_function  *
        return step_function(self, iterator)
    File "C:\Users\Maria\AppData\Roaming\Python\Python38\site-packages\keras\engine\training.py", line 2155, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Maria\AppData\Roaming\Python\Python38\site-packages\keras\engine\training.py", line 2143, in run_step  **
        outputs = model.predict_step(data)
    File "C:\Users\Maria\AppData\Roaming\Python\Python38\site-packages\keras\engine\training.py", line 2111, in predict_step
        return self(x, training=False)
    File "C:\Users\Maria\AppData\Roaming\Python\Python38\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Maria\AppData\Roaming\Python\Python38\site-packages\keras\engine\input_spec.py", line 253, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential_3' (type Sequential).
    
    Input 0 of layer "dense_9" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (32,)
    
    Call arguments received by layer 'sequential_3' (type Sequential):
      • inputs=tf.Tensor(shape=(32,), dtype=int32)
      • training=False
      • mask=None


In [3]:
# !pip install matplotlib



In [5]:
!pip install Pillow


