In [9]:
import numpy as np
import tensorflow as tf
import gym

# Actor Model
class Actor(tf.keras.Model):
    def __init__(self, state_dim, action_dim, action_bound):
        super(Actor, self).__init__()
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(32, activation='relu')
        self.dense3 = tf.keras.layers.Dense(action_dim, activation='tanh')
        self.action_bound = action_bound

    def call(self, inputs):
        # Reshape the input tensor to have a shape of (batch_size, input_dim)
        x = tf.expand_dims(inputs, axis=0)  # Add a batch dimension
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        return tf.squeeze(x, axis=0)  # Remove the added batch dimension


# Critic Model
class Critic(tf.keras.Model):
    def __init__(self):
        super(Critic, self).__init__()
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(32, activation='relu')
        self.dense3 = tf.keras.layers.Dense(1)

    def call(self, inputs):
        # Reshape the input tensor to have a shape of (batch_size, input_dim)
        x = tf.expand_dims(inputs, axis=0)  # Add a batch dimension
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        return tf.squeeze(x, axis=0)  # Remove the added batch dimension


# Actor-Critic Agent
class ActorCriticAgent:
    def __init__(self, state_dim, action_dim, action_bound, gamma=0.99, actor_lr=0.001, critic_lr=0.001):
        self.actor = Actor(state_dim, action_dim, action_bound)
        self.critic = Critic()
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)
        self.gamma = gamma

    def get_action(self, state):
        return self.actor(tf.convert_to_tensor([state])).numpy()[0]

    def train(self, states, actions, rewards, next_states, dones):
        # Compute TD targets
        next_q_values = self.critic(tf.convert_to_tensor(next_states, dtype=tf.float32))
        targets = rewards + (1 - dones) * self.gamma * next_q_values.numpy().flatten()

        # Compute advantages
        values = self.critic(tf.convert_to_tensor(states, dtype=tf.float32)).numpy().flatten()
        advantages = targets - values

        # Train actor
        with tf.GradientTape() as tape:
            actor_actions = self.actor(tf.convert_to_tensor(states, dtype=tf.float32))
            actor_loss = -tf.reduce_mean(self.critic(tf.convert_to_tensor(states, dtype=tf.float32)) * actor_actions)
        actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))

        # Train critic
        with tf.GradientTape() as tape:
            critic_values = self.critic(tf.convert_to_tensor(states, dtype=tf.float32))
            critic_loss = tf.reduce_mean(tf.square(targets - critic_values))
        critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

# Example Usage
env = gym.make('Pendulum-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high[0]

agent = ActorCriticAgent(state_dim, action_dim, action_bound)

episodes = 10
for episode in range(episodes):
    state = env.reset()
    episode_reward = 0
    while True:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.train(state, action, reward, next_state, done)
        episode_reward += reward
        state = next_state
        if done:
            print("Episode:", episode + 1, "Reward:", episode_reward)
            break


Episode: 1 Reward: -1495.760387698824
Episode: 2 Reward: -1317.6043848744087
Episode: 3 Reward: -1148.501590913999
Episode: 4 Reward: -1255.3839254633124
Episode: 5 Reward: -944.9861228986489
Episode: 6 Reward: -1559.6234125688973
Episode: 7 Reward: -1171.1532151987162
Episode: 8 Reward: -1729.408661360246
Episode: 9 Reward: -1639.4917373002643
Episode: 10 Reward: -1371.8198135753485
