In [1]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from gymnasium.wrappers import TimeLimit
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize,VecMonitor
from stable_baselines3.common.env_util import make_vec_env
from deform_rl.envs.Cable_reshape_env.environment import CableReshapeV2,CableReshape
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.callbacks import EvalCallback,BaseCallback

import pygame

In [2]:
def make_env(rank, seed=0, render_mode='human', max_episode_steps=1000):
    """
    Build a factory that creates one monitored, time-limited CableReshapeV2 env.

    The returned callable takes no arguments and returns a freshly built
    environment, which is the form ``DummyVecEnv`` is given in this notebook.

    :param rank: (int) index of the environment; it is added to ``seed`` so
        that every environment is reset with a different seed
    :param seed: (int) the initial seed for RNG
    :param render_mode: (str) render mode forwarded to ``CableReshapeV2``
    :param max_episode_steps: (int) episode length cap enforced by ``TimeLimit``
    :return: (callable) zero-argument function returning the wrapped env
    """
    def _init():
        env = CableReshapeV2(
            render_mode=render_mode,
            seg_num=10,
            cable_length=300,
            scale_factor=800,
        )
        # Truncate episodes that run too long, then record episode statistics.
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
        env = Monitor(env)
        # Use a seed for reproducibility.
        # Important: use a different seed for each environment,
        # otherwise they would generate the same experiences.
        env.reset(seed=seed + rank)
        return env

    # Seeded once when the factory is created, not on every _init() call.
    set_random_seed(seed)
    return _init

# Four training environments (ranks 4..7) with running observation/reward
# normalisation that is updated while the agent trains.
env0 = DummyVecEnv([make_env(i+4) for i in range(4)])
training_env = VecNormalize(env0)

# Single validation environment. Its normalisation statistics must not be
# updated by evaluation episodes and its rewards must stay unnormalised, so
# the wrapper is put in inference mode; EvalCallback copies the training
# statistics into it before each evaluation.
# NOTE(review): this env uses rank 4, i.e. the same seed as the first training
# env — confirm whether a distinct validation seed is wanted.
env1 = DummyVecEnv([make_env(i+4) for i in range(1)])
validation_env = VecNormalize(env1, training=False, norm_reward=False)

# Output locations for saved models / normalisation stats and TensorBoard logs.
save_dir = os.path.join("saved_models", "reshape")
log_dir = os.path.join("logs", "reshape")
os.makedirs(save_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

class SaveNormalizeCallback(BaseCallback):
    """
    Save the VecNormalize statistics of the model being trained.

    Intended to be passed as ``callback_on_new_best`` to ``EvalCallback`` so
    that the normalisation statistics are written next to the best model.

    :param verbose: (int) verbosity level forwarded to ``BaseCallback``
    :param save_path: (str or None) file to write the statistics to; when
        ``None`` the file ``vecnorms.pkl`` inside the notebook-level
        ``save_dir`` is used
    """
    def __init__(self, verbose = 0, save_path = None):
        super().__init__(verbose)
        self.save_path = save_path

    def _on_step(self):
        """Write the current normalisation statistics; never stops training."""
        target = self.save_path
        if target is None:
            # Resolved lazily so the default follows the notebook's save_dir.
            target = os.path.join(save_dir, "vecnorms.pkl")
        # Use the wrapper attached to the model this callback runs under,
        # instead of whatever a notebook global happens to point at.
        vec_normalize = self.model.get_vec_normalize_env()
        if vec_normalize is not None:
            vec_normalize.save(target)
        elif self.verbose > 0:
            print("SaveNormalizeCallback: model has no VecNormalize wrapper, nothing saved")
        return True
save_callback = SaveNormalizeCallback()


# Evaluation hook: runs 15 episodes on validation_env, stores the best model
# under save_dir and triggers save_callback whenever a new best is found.
# NOTE(review): stable-baselines3 counts eval_freq in callback calls, i.e. one
# per vectorised step; with the 4 training envs above that would be one
# evaluation every 40000 timesteps — confirm this is the intended cadence.
eval_callback = EvalCallback(
    validation_env,
    callback_on_new_best=save_callback,
    best_model_save_path=save_dir,
    n_eval_episodes=15,
    eval_freq=10000,
    render=False,
    verbose=1,
)


Environment will not be deterministic
Planning will not be deterministic
seed for  BezierSampler  is  1903
seed for  NDIMSampler  is  6822


In [4]:
# PPO agent with an MLP policy on the normalised training envs, run on CPU,
# logging to TensorBoard under log_dir.
model = PPO(
    policy='MlpPolicy',
    env=training_env,
    verbose=1,
    device='cpu',
    tensorboard_log=log_dir,
)
# First training phase; eval_callback evaluates and checkpoints along the way.
model.learn(
    total_timesteps=800000,
    callback=eval_callback,
    tb_log_name='reshape_v2',
)


Using cpu device
Logging to logs/reshape/reshape_v2_3


----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -4.75e+03 |
| time/              |           |
|    fps             | 2504      |
|    iterations      | 1         |
|    time_elapsed    | 3         |
|    total_timesteps | 8192      |
----------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1e+03      |
|    ep_rew_mean          | -4.89e+03  |
| time/                   |            |
|    fps                  | 1836       |
|    iterations           | 2          |
|    time_elapsed         | 8          |
|    total_timesteps      | 16384      |
| train/                  |            |
|    approx_kl            | 0.01305001 |
|    clip_fraction        | 0.172      |
|    clip_range           | 0.2        |
|    entropy_loss         | -28.4      |
|    explained_variance   | 0.933      |
|    learning_rate        | 0.0003 

<stable_baselines3.ppo.ppo.PPO at 0x7d2f3e790bf0>

In [6]:

# Continue training the same model without resetting its timestep counter,
# so the run keeps logging under the existing 'reshape_v2' TensorBoard run.
# NOTE(review): with reset_num_timesteps=False stable-baselines3 appears to
# treat total_timesteps as additional steps on top of those already done —
# confirm against the SB3 docs if 1.6M was meant as an overall budget.
model.learn(
    total_timesteps=1600000,
    callback=eval_callback,
    tb_log_name='reshape_v2',
    reset_num_timesteps=False,
)


Logging to logs/reshape/reshape_v2_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 186      |
|    ep_rew_mean     | 26       |
| time/              |          |
|    fps             | 2227     |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1197220  |
---------------------------------
Eval num_timesteps=1200000, episode_reward=74.56 +/- 575.17
Episode length: 173.33 +/- 223.20
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 173         |
|    mean_reward          | 74.6        |
| time/                   |             |
|    total_timesteps      | 1200000     |
| train/                  |             |
|    approx_kl            | 0.024974246 |
|    clip_fraction        | 0.309       |
|    clip_range           | 0.2         |
|    entropy_loss         | -25.4       |
|    explained_variance   | 0.965       |
|    learning_rate       

KeyboardInterrupt: 