In [5]:
import numpy as np
from keras.optimizers import Adam
import tensorflow as tf
from keras.models import clone_model
import gymnasium as gym

In [25]:
class DDPGAgent:
    """Deep Deterministic Policy Gradient (DDPG) agent with a ring-buffer replay memory.

    The agent owns an online actor and critic, a target copy of each, one Adam
    optimizer per online network, and a preallocated replay buffer. Each row of
    the buffer stores one transition laid out as::

        [ state (STATE_DIM) | action (ACTION_DIM) | reward (1) | new_state (STATE_DIM) | done (1) ]

    With the default ``STATE_DIM = 24`` and ``ACTION_DIM = 4`` a row is 54 floats wide.
    """

    # Sizes of the flat state and action vectors. Every slice offset used on the
    # replay buffer is derived from these two values.
    STATE_DIM = 24
    ACTION_DIM = 4

    def __init__(self,
                 env,
                 critic_network,
                 actor_network,
                 critic_learning_rate=1e-3,
                 actor_learning_rate=1e-4,
                 discount_factor=0.99,
                 minibatch_size=64,
                 tau=1e-3,
                 exploratory_noise_std=0.3,
                 max_buffer_size=100_000):
        """
        :param env: Environment used by ``learn``. ``env.reset()`` must return ``(state, info)`` and
        ``env.step(action)`` must return ``(new_state, reward, terminal, truncated, info)``.

        :param critic_network: A NN that maps state action pairs to values. Input shape should be the same shape as the
        concatenated state and action (28, ). Output shape should be 1.

        :param actor_network: A NN that maps states to actions. Input shape should be the same shape as the states
        (24, ). Output shape should be the same shape as the actions (4, ).

        :param critic_learning_rate: Learning rate of the critic's Adam optimizer.
        :param actor_learning_rate: Learning rate of the actor's Adam optimizer.
        :param discount_factor: Factor applied to the bootstrapped next-state value in the critic target.
        :param minibatch_size: Number of transitions drawn per update.
        :param tau: Interpolation weight of the online weights in the soft target update.
        :param exploratory_noise_std: Standard deviation of the Gaussian noise added to selected actions.
        :param max_buffer_size: Number of transitions the replay buffer can hold before overwriting the oldest.
        """
        self.env = env

        self.critic_network = critic_network
        self.actor_network = actor_network

        # Target networks start as exact copies (architecture and weights) of the online networks.
        self.target_critic_network = clone_model(self.critic_network)
        self.target_actor_network = clone_model(self.actor_network)
        self.target_critic_network.set_weights(self.critic_network.get_weights())
        self.target_actor_network.set_weights(self.actor_network.get_weights())

        self.critic_optimizer = Adam(learning_rate=critic_learning_rate)
        self.actor_optimizer = Adam(learning_rate=actor_learning_rate)

        self.minibatch_size = minibatch_size
        self.discount_factor = discount_factor
        self.tau = tau
        self.exploratory_noise_std = exploratory_noise_std

        # state + action + reward + new_state + done flag.
        transition_width = 2 * self.STATE_DIM + self.ACTION_DIM + 2
        # The buffer is left uninitialised; only the first `buffer_fullness` rows are ever sampled.
        self.replay_buffer = np.empty((max_buffer_size, transition_width), dtype=np.float32)
        self.max_buffer_size = max_buffer_size
        self.buffer_write_idx = 0
        self.buffer_fullness = 0

    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action at the given state.

        Runs the online actor on ``state``, adds zero-mean Gaussian noise with
        standard deviation ``exploratory_noise_std`` and clips the result to [-1, 1].

        :param state: 1-D array of length ``STATE_DIM``.
        :return: 1-D float32 array of length ``ACTION_DIM``.
        """
        # Input check.
        assert state.ndim == 1 and state.shape[0] == self.STATE_DIM
        # Do forward pass on a batch of one.
        greedy_action = self.actor_network(np.expand_dims(state, axis=0), training=False).numpy()[0]
        # Add exploratory noise. A new array is built instead of adding in place so the
        # array returned by `.numpy()` is never mutated.
        noise = np.random.normal(0, self.exploratory_noise_std, self.ACTION_DIM)
        return np.clip(greedy_action + noise, -1, 1).astype(np.float32)

    def save_transition(self, state: np.ndarray, action: np.ndarray, reward: float, new_state: np.ndarray, done: bool):
        """Save a transition into the replay buffer, overwriting the oldest entry once the buffer is full.

        :param state: 1-D array of length ``STATE_DIM``.
        :param action: 1-D array of length ``ACTION_DIM``.
        :param reward: Scalar reward.
        :param new_state: 1-D array of length ``STATE_DIM``.
        :param done: Stored as 1.0 when truthy, else 0.0. The critic update multiplies the
        bootstrapped next-state value by ``1 - done``.
        """
        # Input check.
        assert state.ndim == 1 and state.shape[0] == self.STATE_DIM
        assert action.ndim == 1 and action.shape[0] == self.ACTION_DIM
        assert new_state.ndim == 1 and new_state.shape[0] == self.STATE_DIM

        # Save transition.
        transition = np.concatenate((state, action, [reward], new_state, [1.0 if done else 0.0]))
        self.replay_buffer[self.buffer_write_idx] = transition

        # Update write index (wrapping around) and fullness (saturating at capacity).
        self.buffer_write_idx = (self.buffer_write_idx + 1) % self.max_buffer_size
        self.buffer_fullness = min(self.buffer_fullness + 1, self.max_buffer_size)

    # The return annotations below are quoted so they are not evaluated when the class is defined.
    def sample_minibatch(self) -> "tf.Tensor":
        """Sample a minibatch (without replacement) from the transitions stored so far.

        Only rows ``[0, buffer_fullness)`` are candidates, so uninitialised rows of the
        preallocated buffer are never returned.

        :return: float32 tensor of shape ``(minibatch_size, transition_width)``.
        :raises ValueError: if fewer than ``minibatch_size`` transitions have been stored.
        """
        if self.buffer_fullness < self.minibatch_size:
            raise ValueError(
                f"Cannot sample {self.minibatch_size} transitions: "
                f"replay buffer only holds {self.buffer_fullness}."
            )
        indices = np.random.choice(self.buffer_fullness, size=self.minibatch_size, replace=False)
        minibatch = self.replay_buffer[indices]
        return tf.convert_to_tensor(minibatch, dtype=tf.float32)

    def update_critic_network(self, minibatch: "tf.Tensor"):
        """Update the critic network with one gradient step on the mean squared TD error.

        The target is ``r + discount_factor * (1 - d) * Q_target(s', actor_target(s'))`` and is
        computed outside the gradient tape, so no gradient flows through the target networks.

        :param minibatch: Tensor of transitions as returned by ``sample_minibatch``.
        """
        # Column offsets of the transition layout.
        sa_end = self.STATE_DIM + self.ACTION_DIM
        r_end = sa_end + 1
        ns_end = r_end + self.STATE_DIM

        s_a = minibatch[:, :sa_end]
        r = minibatch[:, sa_end:r_end]
        s_ = minibatch[:, r_end:ns_end]
        d = minibatch[:, ns_end:ns_end + 1]

        next_actions = self.target_actor_network(s_, training=False)
        next_state_actions = tf.concat((s_, next_actions), axis=1)
        q_next = self.target_critic_network(next_state_actions, training=False)
        q_target = r + self.discount_factor * (1.0 - d) * q_next

        with tf.GradientTape() as tape:
            q_expected = self.critic_network(s_a, training=True)
            critic_loss = tf.reduce_mean(tf.square(q_target - q_expected))

        critic_grads = tape.gradient(critic_loss, self.critic_network.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic_network.trainable_variables))

    def update_actor_network(self, minibatch: "tf.Tensor"):
        """Update the actor network with one gradient step that increases the critic's value of its actions.

        :param minibatch: Tensor of transitions as returned by ``sample_minibatch``; only the state columns are used.
        """
        s = minibatch[:, :self.STATE_DIM]

        with tf.GradientTape() as tape:
            raw_actions = self.actor_network(s, training=True)
            raw_state_actions = tf.concat((s, raw_actions), axis=1)
            q_values = self.critic_network(raw_state_actions, training=False)
            # Minimising the negative mean Q-value maximises the mean Q-value.
            actor_loss = -tf.reduce_mean(q_values)
        # Gradients are taken with respect to the actor's variables only; the critic is not updated here.
        actor_grads = tape.gradient(actor_loss, self.actor_network.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor_network.trainable_variables))

    def _blend_into_target(self, online_network, target_network):
        """Set target weights to ``tau * online + (1 - tau) * target``, weight array by weight array."""
        blended = [
            self.tau * online_w + (1 - self.tau) * target_w
            for online_w, target_w in zip(online_network.get_weights(), target_network.get_weights())
        ]
        target_network.set_weights(blended)

    def soft_update_target_weights(self):
        """Soft update the target networks weights."""
        self._blend_into_target(self.critic_network, self.target_critic_network)
        self._blend_into_target(self.actor_network, self.target_actor_network)

    def learn(self, n_episodes=100):
        """Interact with ``self.env`` for ``n_episodes`` episodes, training after every step.

        After each environment step the transition is stored and, once at least
        ``minibatch_size`` transitions are available, one critic update, one actor update
        and one soft target update are performed. The episode number and the episode's
        total reward are printed.

        :param n_episodes: Number of episodes to run.
        """
        for n in range(n_episodes):
            # Print episode number.
            print("Episode:", n + 1)

            # Reset environment.
            state, _ = self.env.reset()
            done = False

            # Monitor reward
            episode_reward = 0

            while not done:
                # Select action.
                action = self.select_action(state)

                # Take step.
                new_state, reward, terminal, truncated, _ = self.env.step(action)
                # The episode loop ends on either signal...
                done = terminal or truncated

                # ...but only a true terminal state is stored as done. A truncated episode
                # did not end in a terminal state, so the critic target must still
                # bootstrap from new_state.
                self.save_transition(state, action, reward, new_state, terminal)

                # Update episode reward
                episode_reward += reward

                if self.buffer_fullness >= self.minibatch_size:
                    # Sample minibatch.
                    minibatch = self.sample_minibatch()

                    # Update critic network.
                    self.update_critic_network(minibatch)

                    # Update actor network.
                    self.update_actor_network(minibatch)

                    # Update target weights.
                    self.soft_update_target_weights()

                state = new_state

            print("Episode reward:", episode_reward)

In [28]:
from keras.models import Sequential
from keras.layers import Input, Dense
from keras.initializers import RandomUniform


# Critic: takes the 28-wide concatenated (state, action) vector and outputs a single value.
critic_network = Sequential(layers=[
    Input(shape=(28,)),
    Dense(units=64, activation="relu", kernel_initializer="he_uniform"),
    Dense(units=64, activation="relu", kernel_initializer="he_uniform"),
    # Linear (identity) output layer, initialised uniformly in [-0.003, 0.003].
    Dense(
        units=1,
        activation="linear",
        kernel_initializer=RandomUniform(minval=-0.003, maxval=0.003),
    ),
])

# Actor: takes the 24-wide state vector and outputs a 4-wide action vector.
actor_network = Sequential(layers=[
    Input(shape=(24,)),
    Dense(units=64, activation="relu", kernel_initializer="he_uniform"),
    Dense(units=64, activation="relu", kernel_initializer="he_uniform"),
    # Tanh keeps every action component inside [-1, 1]; same small uniform initialisation.
    Dense(
        units=4,
        activation="tanh",
        kernel_initializer=RandomUniform(minval=-0.003, maxval=0.003),
    ),
])

In [23]:
# BipedalWalker environment, non-hardcore variant, created with the "human" render mode.
env = gym.make(
    "BipedalWalker-v3",
    render_mode="human",
    hardcore=False,
)

In [29]:
# Build the agent from the environment and the two networks, then train for 100 episodes.
test = DDPGAgent(
    env=env,
    critic_network=critic_network,
    actor_network=actor_network,
)
test.learn(n_episodes=100)

Episode: 1
Episode reward: -150.71492
Episode: 2
Episode reward: -111.464005
Episode: 3
Episode reward: -121.59012
Episode: 4
Episode reward: -124.15194
Episode: 5
Episode reward: -119.836914
Episode: 6


KeyboardInterrupt: 