<a href="https://colab.research.google.com/github/lmntrx-sys/Research/blob/main/Actor_Critic_method.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%matplotlib inline

In [2]:
!pip install gymnasium



In [3]:
# Setup

import gymnasium as gym
import matplotlib
import matplotlib.pyplot as plt
from IPython import display
import tensorflow as tf
import numpy as np

In [4]:
# Parameters
# Environment
env = gym.make("CartPole-v1", render_mode="rgb_array")
env.reset(seed=42)

# Train params
max_iters_per_episode = 1000 # Adjust as needed
n_max_episodes = 1000 # Adjust as needed
discount_factor = 0.99

# Smallest number such that 1.0 + eps != 1.0
eps = np.finfo(np.float32).eps

In [5]:
def render():
  state_image = env.render()
  plt.imshow(state_image)
  plt.show()

In [6]:
# Agent

# Get the shape of the environment
obs_shape = env.observation_space.shape
print("Observation shape:", obs_shape)

# Get the number of actions
n_actions = env.action_space.n
print("Number of actions:", n_actions)

obs_shape = 4
n_actions = 2

# Small neural network
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(obs_shape,)),
    tf.keras.layers.Dense(8, activation="relu"),
    tf.keras.layers.Dense(n_actions, activation="softmax")
])

Observation shape: (4,)
Number of actions: 2


In [7]:
model.summary()

In [8]:
critic = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(obs_shape,)),
    tf.keras.layers.Dense(8, activation="relu"),
    tf.keras.layers.Dense(1)
])

In [9]:
# Network Params
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss_fn = tf.keras.losses.Huber()
reward_history = []
running_rewards = 0.0
running_episodes = 0
action_probs_history = []
critic_value_history = []
action_history = []

In [10]:
# Train our model

for episode in range(n_max_episodes):
    obs, info = env.reset()
    episode_reward = 0
    for step in range(max_iters_per_episode):
      with tf.GradientTape() as tape:

        state = tf.convert_to_tensor(obs)
        state = tf.expand_dims(state, 0)

        # Predict action probabilities
        action_probabilities = model(state)
        critic_value = critic(state)
        critic_value_history.append(critic_value[0, 0])

        # Sample next action from action probabilities
        action = np.random.choice(n_actions, p=np.squeeze(action_probabilities))
        action_probs_history.append(tf.keras.ops.log(action_probabilities[0, action]))

        # Apply the action
        obs, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
        reward_history.append(reward)

        if terminated or truncated:
          break


        # Update running reward to check condition for solving
        running_rewards = 0.05 * episode_reward + (1 - 0.05) * running_rewards

        # Calculate discounted future rewards
        discounts = np.array([discount_factor ** i for i in range(len(reward_history))])
        future_rewards = np.array(reward_history) * discounts # Use actual reward history
        returns = (future_rewards - np.mean(future_rewards)) / (np.std(future_rewards) + eps)

        # Use a dummy critic value for now
        critic_value_history = [0.0] * len(reward_history)

        history = zip(action_probs_history, critic_value_history, returns)

        actor_losses = []
        critic_losses = []

        for log_prob, value, ret in history:
            # At this point in history, the critic estimated that we would get a
            # total reward = `value` in the future. We took an action with log probability
            # of `log_prob` and ended up receiving a total reward = `ret`.
            # The actor must be updated so that it predicts an action that leads to
            # high rewards (compared to critic's estimate) with high probability.
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # actor loss

            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
            critic_losses.append(
                loss_fn(tf.keras.ops.expand_dims(value, 0), tf.keras.ops.expand_dims(ret, 0))
            )

        # Backpropagation
        loss_value = sum(actor_losses)  + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)


        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        reward_history.clear()

    # Log details
    running_episodes += 1
    if running_episodes % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_rewards, running_episodes))

    if running_rewards > 195:  # Condition to consider the task solved
        print("Solved at episode {}!".format(running_episodes))
        break

running reward: 12.98 at episode 10
running reward: 7.77 at episode 20
running reward: 9.79 at episode 30
running reward: 15.30 at episode 40
running reward: 10.82 at episode 50
running reward: 8.97 at episode 60
running reward: 19.22 at episode 70
running reward: 8.56 at episode 80
running reward: 13.11 at episode 90
running reward: 15.12 at episode 100
running reward: 11.97 at episode 110
running reward: 10.65 at episode 120
running reward: 10.24 at episode 130
running reward: 11.34 at episode 140
running reward: 15.43 at episode 150
running reward: 9.37 at episode 160
running reward: 13.30 at episode 170
running reward: 13.88 at episode 180
running reward: 12.47 at episode 190
running reward: 7.82 at episode 200
running reward: 9.89 at episode 210
running reward: 14.96 at episode 220
running reward: 10.83 at episode 230
running reward: 9.50 at episode 240
running reward: 9.24 at episode 250
running reward: 14.63 at episode 260
running reward: 6.73 at episode 270
running reward: 10.3