<a href="https://colab.research.google.com/github/mohamedyosef101/101_learning_area/blob/area/Reinforcement%20Learning/05_actor_critic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

source: [huggingface.co](https://huggingface.co/learn/deep-rl-course/unit6/hands-on)

# Actor Critic (A2C)

## Create virtual display

In [3]:
%%capture
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip install pyvirtualdisplay

In [4]:
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7b02d45ad990>

## Install dependencies

In [6]:
%%capture
!pip install stable-baselines3[extra] gymnasium
!pip install huggingface_sb3 huggingface_hub panda_gym

## Import the packages

In [7]:
import os
import gymnasium as gym
import panda_gym as pgym

from huggingface_sb3 import load_from_hub, package_to_hub

from stable_baselines3 import A2C as a2c
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env

from huggingface_hub import notebook_login

# PandaReachDense-v3

In [10]:
env_id = "PandaReachDense-v3"

env = gym.make(env_id)

# Get the state space and action space
states = env.observation_space.shape
actions = env.action_space

print(f"""The State Space is: {states}
      \nSample observation: {env.observation_space.sample()}
      \nThe Action Space is: {actions}""")

The State Space is: None
      
Sample observation: OrderedDict([('achieved_goal', array([4.5452356, 4.5387993, 1.0222133], dtype=float32)), ('desired_goal', array([-8.196107  , -7.8919377 ,  0.55645293], dtype=float32)), ('observation', array([ 6.9847393,  3.6354744, -3.9288054,  3.3986058,  5.9791417,
       -9.842032 ], dtype=float32))])
      
The Action Space is: Box(-1.0, 1.0, (3,), float32)


In [11]:
# Normalize observation and rewards
env = make_vec_env(env_id, n_envs=4)
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10)

In [12]:
# Create A2C Agent
agent = a2c(policy="MultiInputPolicy",
            env=env,
            verbose=1)

Using cuda device


In [13]:
# Train the agent
agent.learn(1_000_000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| time/                 |          |
|    fps                | 419      |
|    iterations         | 22300    |
|    time_elapsed       | 1062     |
|    total_timesteps    | 446000   |
| train/                |          |
|    entropy_loss       | -1.62    |
|    explained_variance | 0.936    |
|    learning_rate      | 0.0007   |
|    n_updates          | 22299    |
|    policy_loss        | 0.0171   |
|    std                | 0.444    |
|    value_loss         | 0.000201 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 2.79     |
|    ep_rew_mean        | -0.216   |
| time/                 |          |
|    fps                | 419      |
|    iterations         | 22400    |
|    time_elapsed       | 1066     |
|    total_timesteps    | 448000   |
| train/                |          |
|    entropy_loss       | -1.61    |
|    expla

<stable_baselines3.a2c.a2c.A2C at 0x7b01a600ee00>

In [14]:
# Save the agent
agent.save("a2c-PandaReachDense-v3")
env.save("vec_normalize.pkl")

# Evaluate the Agent

In [15]:
# load the saved statistics
eval_env = DummyVecEnv([lambda: gym.make("PandaReachDense-v3")])
eval_env = VecNormalize.load("vec_normalize.pkl", eval_env)

# override the render mode
eval_env.render_mode = "rgb_array"

# we don't need to update...
eval_env.training = False
eval_env.norm_reward = False

# Load the agent
agent = a2c.load("a2c-PandaReachDense-v3")

mean_reward, std_reward = evaluate_policy(agent, eval_env)

print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward = -0.23 +/- 0.11


