In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import pygame
import os
import numpy as np
from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv,VecMonitor
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback,BaseCallback
from gymnasium.wrappers import FlattenObservation,TimeLimit,NormalizeObservation

In [7]:
from deform_rl.envs.Cable_reshape_env.environment import CableReshape


In [8]:
def make_env(seed=None, render_mode='human', max_episode_steps=1000):
    """Build one time-limited ``CableReshape`` environment.

    Args:
        seed: Forwarded unchanged to ``CableReshape``; ``None`` leaves the
            seeding to the environment itself.
        render_mode: Forwarded unchanged to ``CableReshape``. Defaults to
            ``'human'`` (the value that used to be hard-coded here). Which
            other modes the environment accepts is not visible from this
            notebook -- check ``CableReshape`` before passing e.g. ``None``
            for window-less training.
        max_episode_steps: Step budget handed to ``TimeLimit``; once reached,
            ``step`` reports ``truncated=True``.

    Returns:
        The ``CableReshape`` instance wrapped in ``TimeLimit``.
    """
    env = CableReshape(
        render_mode=render_mode,
        seed=seed,
        # Cable geometry / scaling used throughout this notebook.
        seg_num=10,
        cable_length=300,
        scale_factor=800,
    )
    return TimeLimit(env, max_episode_steps=max_episode_steps)

# Output locations: `save_dir` receives the best-model checkpoint and is read
# back when the policy is loaded for visualisation; `log_dir` is the log folder.
save_dir = "./saved_models/cable_reshape"
log_dir = "./logs/cable_reshape"

# Create both folders up front; exist_ok keeps the cell safe to re-run.
for _directory in (save_dir, log_dir):
    os.makedirs(_directory, exist_ok=True)

In [9]:
# Build one plain (non-vectorised) environment and let Stable-Baselines3
# validate it against the Gym API; warn=True also reports non-fatal issues.
# `env` is reused by the random-action cell that follows.
env = make_env()
check_env(env=env, warn=True)


Environment will not be deterministic
Planning will not be deterministic
seed for  BezierSampler  is  7744
seed for  NDIMSampler  is  6513


In [10]:
# Smoke test: drive the environment with uniformly sampled random actions and
# render every step, for at most 1000 steps or until the window is closed.
try:
    env.reset()
    for _step in range(1000):
        _obs, _reward, terminated, truncated, _info = env.step(env.action_space.sample())
        env.render()
        # The Gymnasium API requires a reset once an episode has ended;
        # stepping a finished episode is undefined behaviour.
        if terminated or truncated:
            env.reset()
        # Stop early when the user closes the pygame window.
        if pygame.event.get(pygame.QUIT):
            break
finally:
    # Always release the window/resources, even if a step raised.
    env.close()

In [11]:
# Number of parallel environments used to collect training rollouts.
N_TRAIN_ENVS = 4

# make_vec_env already wraps every sub-environment in a Monitor, so an extra
# VecMonitor on top is redundant: SB3 warns about it and the VecMonitor
# statistics would simply overwrite the Monitor ones.
eval_env = make_vec_env(make_env, n_envs=1)
env = make_vec_env(make_env, n_envs=N_TRAIN_ENVS)

# Periodically evaluate the policy and keep the best checkpoint in `save_dir`
# (stored as best_model.zip).
# NOTE: eval_freq is counted in callback calls, and every call advances all
# training environments by one step, so with N_TRAIN_ENVS = 4 an evaluation
# runs every 4 * 10000 = 40000 environment timesteps.
eval_callback = EvalCallback(
    eval_env=eval_env,
    n_eval_episodes=15,
    eval_freq=10000,
    best_model_save_path=save_dir,
    verbose=1,
    render=False
)




In [None]:

# Create a PPO agent with an MLP policy on the vectorised training environment
# (CPU only, silent, TensorBoard events written under ./logs/cable_reshape/).
model = PPO(
    policy='MlpPolicy',
    env=env,
    tensorboard_log="./logs/cable_reshape/",
    verbose=0,
    device='cpu',
)

# First training phase: 500k timesteps with periodic evaluation through
# eval_callback. learn() returns the model itself, so re-binding keeps `model`
# pointing at the same agent and the cell produces no output value.
model = model.learn(
    total_timesteps=500000,
    callback=eval_callback,
    tb_log_name="ppo_cable_reshape",
)

Eval num_timesteps=37136, episode_reward=-7808.76 +/- 4080.77
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=77136, episode_reward=-13068.24 +/- 8610.42
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=117136, episode_reward=-11978.72 +/- 9715.58
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=157136, episode_reward=-26054.72 +/- 22275.93
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=197136, episode_reward=-12944.29 +/- 7707.60
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=237136, episode_reward=-11357.61 +/- 7064.17
Episode length: 1000.00 +/- 0.00


In [50]:
# Second training phase on the same agent. reset_num_timesteps=False keeps the
# timestep counter running instead of restarting it at zero, so this call adds
# 800k timesteps on top of what has already been trained.
model.learn(
    total_timesteps=800000,
    callback=eval_callback,
    tb_log_name="ppo_cable_reshape",
    reset_num_timesteps=False,
)

Eval num_timesteps=520000, episode_reward=-8443.81 +/- 4895.41
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=560000, episode_reward=-6626.92 +/- 1104.15
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=600000, episode_reward=-4967.46 +/- 3395.01
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=640000, episode_reward=-6417.61 +/- 2660.11
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=680000, episode_reward=-5449.54 +/- 2278.20
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=720000, episode_reward=-5261.87 +/- 2013.82
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=760000, episode_reward=-4042.01 +/- 1441.52
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=800000, episode_reward=-3135.46 +/- 2015.26
Episode length: 941.07 +/- 220.51
New best mean reward!
Eval num_timesteps=840000, episode_reward=-2569.95 +/- 1292.00
Episode length: 937.00 +/- 235.72
New best mean reward!
Eval num_timesteps=880000, episode_reward=-3070.71 +/- 1448.99
Episo

<stable_baselines3.ppo.ppo.PPO at 0x7525cb1bc850>

In [51]:
# Third training phase on the same agent. reset_num_timesteps=False keeps the
# timestep counter running instead of restarting it at zero, so this call adds
# another 1.5M timesteps on top of what has already been trained.
model.learn(
    total_timesteps=1500000,
    callback=eval_callback,
    tb_log_name="ppo_cable_reshape",
    reset_num_timesteps=False,
)

Eval num_timesteps=1320000, episode_reward=-1572.55 +/- 638.62
Episode length: 942.00 +/- 217.02
Eval num_timesteps=1360000, episode_reward=-1996.52 +/- 1131.96
Episode length: 936.93 +/- 235.97
Eval num_timesteps=1400000, episode_reward=-1176.67 +/- 1130.61
Episode length: 722.47 +/- 392.76
Eval num_timesteps=1440000, episode_reward=-1510.86 +/- 757.96
Episode length: 943.47 +/- 211.53
Eval num_timesteps=1480000, episode_reward=-1818.68 +/- 658.35
Episode length: 944.20 +/- 208.78
Eval num_timesteps=1520000, episode_reward=-1419.79 +/- 1067.03
Episode length: 769.27 +/- 382.98
Eval num_timesteps=1560000, episode_reward=-1348.45 +/- 799.69
Episode length: 876.47 +/- 315.07
Eval num_timesteps=1600000, episode_reward=-1083.74 +/- 845.09
Episode length: 762.87 +/- 394.78
New best mean reward!
Eval num_timesteps=1640000, episode_reward=-1325.08 +/- 894.63
Episode length: 799.73 +/- 337.35
Eval num_timesteps=1680000, episode_reward=-928.82 +/- 861.74
Episode length: 669.53 +/- 408.52
New be

<stable_baselines3.ppo.ppo.PPO at 0x7525cb1bc850>

In [67]:
# Roll out the best saved policy (deterministic actions) and render it.
# Stops after EP_CNT terminated episodes, after 10000 environment steps in
# total, or as soon as the pygame window is closed.
EP_CNT = 10

# best_model.zip is the checkpoint written by EvalCallback into save_dir.
t_model = PPO.load(os.path.join(save_dir, "best_model.zip"), force_reset=True, device='cpu')
tenv = make_env(30)

ep_cnt = 0   # episodes that ended with done=True
cnt = 0      # steps taken in the current episode
rev_sum = 0  # reward accumulated in the current episode
try:
    obs, _ = tenv.reset()
    for i in range(10000):
        action, _ = t_model.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = tenv.step(action)
        # Count the step right away so the reported episode length is exact.
        cnt += 1
        rev_sum += reward
        tenv.render()
        if done or truncated:
            if done:
                ep_cnt += 1
                print("Episode done: ", cnt, "Reward: ", rev_sum)
            else:
                # TimeLimit ran out; as before, such episodes are not
                # counted towards EP_CNT.
                print("Killed by timeout")
            obs, _ = tenv.reset()
            # Reset BOTH per-episode accumulators on every episode end, so a
            # timed-out episode no longer leaks its reward into the next one.
            cnt = 0
            rev_sum = 0
            if ep_cnt >= EP_CNT:
                break
        # Stop early when the user closes the pygame window.
        if pygame.event.get(pygame.QUIT):
            break
finally:
    # Always release the window/resources, even if the rollout raised.
    tenv.close()