# Swing up in dm_control


In [3]:
# imports
import numpy as np

from dm_control import suite
from dm_control import viewer
from dm_env import StepType

from stable_baselines3 import PPO, SAC

import time
from statistics import mean, stdev

import gym
from gym.spaces import Box

In [4]:
class DMControlWrapper(gym.Env):
    def __init__(self, domain_name, task_name):
        self.env = suite.load(domain_name=domain_name, task_name=task_name)
        obs_shape = sum([value.shape[0] for value in self.env.observation_spec().values()])
        self.observation_space = Box(low=-np.inf, high=np.inf, shape=(obs_shape,), dtype=np.float32)
        self.action_space = Box(low=self.env.action_spec().minimum[0], high=self.env.action_spec().maximum[0], shape=self.env.action_spec().shape, dtype=np.float32)

    def reset(self):
        time_step = self.env.reset()
        return np.array(self.get_obs(time_step))

    def step(self, action):
        time_step = self.env.step(action)
        return np.array(self.get_obs(time_step)), time_step.reward, time_step.last(), {}

    def get_obs(self, time_step):
        return np.concatenate([value for value in time_step.observation.values()])
    
    def action_spec(self):
        return self.env.action_spec()



## PPO

In [7]:
env = DMControlWrapper("cartpole", "swingup")
PPO_model = PPO("MlpPolicy", env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [8]:
PPO_model.learn(total_timesteps=500000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 27.5     |
| time/              |          |
|    fps             | 4174     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1e+03        |
|    ep_rew_mean          | 110          |
| time/                   |              |
|    fps                  | 2756         |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0045975293 |
|    clip_fraction        | 0.0371       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 0.147        |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x2b6f86610>

In [None]:
# Test
env = DMControlWrapper("cartpole", "swingup")

# Test
test_episodes = 30
test_scores = []
test_rewards = []
start_time = time.time()


for e in range(test_episodes):
    state = env.reset()
    sum_rewards = 0

    for t in range(1000): # 1000 steps (half delta t of gym)
        action = PPO_model.predict(state)[0][0]
        state, reward, done, _ = env.step(action)
        
        
        sum_rewards += reward
    
    test_rewards.append(sum_rewards)


reward_average = mean(test_rewards)
reward_sigma = stdev(test_rewards)
end_time = time.time()
total_time = end_time - start_time
total_steps = sum(test_scores)
average_time_per_step = total_time / (test_episodes * 1000)

print()
print('Average time per step: {:.4f} seconds'.format(average_time_per_step))
print('Reward average: {:.2f}, Sigma: {:.2f}'.format(reward_average, reward_sigma))


Average time per step: 0.0003 seconds
Reward average: 814.55, Sigma: 100.45


In [None]:
#create video
from moviepy.editor import ImageSequenceClip

def ppo_policy(time_step):
    timestep, reward, discount, observation = time_step
    state = np.concatenate((observation['position'],observation['velocity']))
    action = PPO_model.predict(state)[0][0]
    return action

# Load the cartpole environment
env = suite.load(domain_name='cartpole', task_name='swingup')

# Visualization and video creation
def save_video(policy):
    frames = []

    def policy_with_frame_grab(time_step):
        pixels = env.physics.render(height=480, width=640, camera_id=0)
        frames.append(pixels)
        return policy(time_step)

    # Create the viewer application
    viewer.launch(env, policy=policy_with_frame_grab)

    # Save the frames as a video
    clip = ImageSequenceClip(frames, fps=100)
    clip.write_videofile("video/PPO_dm_swingup.mp4", codec="libx264")

# Call the save_video function with your policy function
save_video(ppo_policy)

Moviepy - Building video video/PPO_dm_swingup.mp4.
Moviepy - Writing video video/PPO_dm_swingup.mp4



                                                                

Moviepy - Done !
Moviepy - video ready video/PPO_dm_swingup.mp4




## SAC

In [5]:
env = DMControlWrapper("cartpole", "swingup")
SAC_model = SAC("MlpPolicy", env, verbose=1)
SAC_model.learn(total_timesteps=100000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 43.7     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 29       |
|    time_elapsed    | 135      |
|    total timesteps | 4000     |
| train/             |          |
|    actor_loss      | -7.2     |
|    critic_loss     | 0.00805  |
|    ent_coef        | 0.311    |
|    ent_coef_loss   | -1.95    |
|    learning_rate   | 0.0003   |
|    n_updates       | 3899     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 36.1     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 31       |
|    time_elapsed    | 257      |
|    total timesteps | 8000     |
| train/             |

<stable_baselines3.sac.sac.SAC at 0x2b211b520>

In [None]:
# Test
env = DMControlWrapper("cartpole", "swingup")

# Test
test_episodes = 30
test_scores = []
test_rewards = []
start_time = time.time()


for e in range(test_episodes):
    state = env.reset()
    sum_rewards = 0

    for t in range(1000): # 1000 steps (half delta t of gym)
        action = PPO_model.predict(state)[0][0]
        state, reward, done, _ = env.step(action)
        
        
        sum_rewards += reward
    
    test_rewards.append(sum_rewards)


reward_average = mean(test_rewards)
reward_sigma = stdev(test_rewards)
end_time = time.time()
total_time = end_time - start_time
total_steps = sum(test_scores)
average_time_per_step = total_time / (test_episodes * 1000)


print()
print('Average time per step: {:.4f} seconds'.format(average_time_per_step))
print('Reward average: {:.2f}, Sigma: {:.2f}'.format(reward_average, reward_sigma))


Average time per step: 0.0002 seconds
Reward average: 734.05, Sigma: 129.84


In [None]:
#create video
from moviepy.editor import ImageSequenceClip

def ppo_policy(time_step):
    timestep, reward, discount, observation = time_step
    state = np.concatenate((observation['position'],observation['velocity']))
    action = PPO_model.predict(state)[0][0]
    return action

# Load the cartpole environment
env = suite.load(domain_name='cartpole', task_name='swingup')

# Visualization and video creation
def save_video(policy):
    frames = []

    def policy_with_frame_grab(time_step):
        pixels = env.physics.render(height=480, width=640, camera_id=0)
        frames.append(pixels)
        return policy(time_step)

    # Create the viewer application
    viewer.launch(env, policy=policy_with_frame_grab)

    # Save the frames as a video
    clip = ImageSequenceClip(frames, fps=100)
    clip.write_videofile("video/SAC_dm_swingup.mp4", codec="libx264")

# Call the save_video function with your policy function
save_video(ppo_policy)

Moviepy - Building video video/SAC_dm_swingup.mp4.
Moviepy - Writing video video/SAC_dm_swingup.mp4



                                                                

Moviepy - Done !
Moviepy - video ready video/SAC_dm_swingup.mp4




In [9]:
# Access the model's attributes and methods
print("Policy network architecture:", PPO_model.policy)
print("Value function network architecture:", PPO_model.policy.value_net)
print("Optimizer:", PPO_model.policy.optimizer)

# You can get more detailed information about the policy by checking its attributes:
print("Policy network layers:", PPO_model.policy.mlp_extractor.policy_net)
print("Value function network layers:", PPO_model.policy.mlp_extractor.value_net)

Policy network architecture: ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=5, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=5, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=1, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)
Value function network architecture: Linear(in_features=64, out_features=1, bias=True)
Optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-05
    foreach: None
    fused: None
    l