In [83]:
%load_ext autoreload
%autoreload 2
import pygame
import os
import numpy as np
from datetime import datetime

from stable_baselines3 import PPO,SAC
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv,VecMonitor
from gymnasium.wrappers import FlattenObservation,TimeLimit,NormalizeObservation
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback,BaseCallback

from deform_rl.sim.Rectangle_env.environment import Rectangle1D

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from gymnasium import ObservationWrapper
from gymnasium.spaces import Box,Dict

class CustomNormalizeObsrvation(ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.width = env.width
        self.height = env.height
        # self.observation_space = Dict({
        #     'position': Box(low=-1, high=1, shape=(2,)),
        #     'velocity': Box(low=-1, high=1, shape=(2,)),
        #     'goal': Box(low=-1, high=1, shape=(2,)),
        # })
        self.observation_space = Box(low=-1, high=1, shape=(6,))
    def observation(self, observation):
        mean = np.array([self.width, self.height]) / 2
        position = (observation['position'] - mean)/ [self.width, self.height]
        velocity = np.tanh(observation['velocity'])
        target = (observation['target'] - mean)/ [self.width, self.height]
        # return {'position': position, 'velocity': velocity, 'goal': target}
        return np.concatenate([position, velocity, target])
    def observation(self, observation):
        # mean = np.array([self.width,self.height]) / 2
        position = observation['position']
        target = observation['target']
        velocity = observation['velocity']
        rel_target = target - position
        rel_target /= np.array([self.width,self.height])
        velocity /= np.array([self.width,self.height])
        return np.concatenate([rel_target,velocity])



In [85]:
class TensorboardCallback(BaseCallback):
    def __init__(self, verbose=0):
        super().__init__(verbose)
    def _on_step(self) -> bool:
        self
        
        

In [86]:
sim_cfg = {
    'width': 800,
    'height': 800,
    'FPS': 60,
    'gravity': 0,
    'damping': .15,
    'collision_slope': 0.01,
}
save_dir = os.path.join("saved_models")
log_dir = os.path.join("logs")
os.makedirs(save_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

def _init(threshold=30,seed=None):
    # Base env
    env = Rectangle1D(sim_config=sim_cfg, threshold=threshold, oneD=False, render_mode='human', seed=seed)
    env = CustomNormalizeObsrvation(env)
    # Apply wrappers
    # env = FlattenObservation(env)
    env = TimeLimit(env, max_episode_steps=1000)

    return env


# env = _init()
env = VecMonitor(DummyVecEnv([_init]*4))

eval_env = VecMonitor(DummyVecEnv([_init]))
eval_callback = EvalCallback(
    eval_env=eval_env,
    n_eval_episodes=15,
    eval_freq=10000,
    best_model_save_path=save_dir,
    verbose=1,
    render=False
)


# check_env(env, warn=True)

In [97]:
class LinearAgent:
    def __init__(self):
        pass
    # def predict(self, obs, deterministic=True):
    #     pos = obs[:2]
    #     target = obs[4:]
    #     diff = target - pos
    #     return diff/np.linalg.norm(diff), None
    def predict(self, obs, deterministic=True):
        rel_target = obs[:2]
        # target = obs[4:]
        # diff = target - pos
        return rel_target/np.linalg.norm(rel_target), None

tenv = _init(seed=60)
obs, _ = tenv.reset()
# tenv = eval_env

cnt = 0
rev_sum = 0
la = LinearAgent()
EP_CNT = 10
ep_cnt = 0
for i in range(10000):
    if cnt >= 1000:
        print("Killed by timeout")
        obs,_ = tenv.reset()
        cnt = 0
    action,_ = la.predict(obs, deterministic=True)
    obs, reward, done,truncated, info = tenv.step(action)
    rev_sum += reward
    tenv.render()
    if done:
        obs,_ = tenv.reset()
        ep_cnt += 1
        print("Episode done: ", cnt, "Reward: ", rev_sum)
        cnt=0
        rev_sum = 0
        if ep_cnt >= EP_CNT:
            break
    if pygame.event.get(pygame.QUIT):
        break
    cnt +=1
tenv.close()


Episode done:  67 Reward:  285.9732495199314
Episode done:  24 Reward:  414.36746980236035
Episode done:  89 Reward:  226.54679748558226
Episode done:  103 Reward:  187.88013940257133
Episode done:  67 Reward:  288.94574275322304
Episode done:  67 Reward:  288.43492763146935
Episode done:  31 Reward:  392.0880789974382
Episode done:  37 Reward:  375.84620660106606
Episode done:  46 Reward:  347.76249522108196
Episode done:  31 Reward:  392.5163887286036


In [26]:
# env.reset()
# for i in range(10):
#     act =env.action_space.sample()
#     obs, reward, done,truncated, info = env.step(act)
#     print(obs)
#     print(reward)
#     print(done)
#     print(truncated)
#     print(info)



In [88]:
model = PPO("MlpPolicy",env,device='cpu',verbose=0,tensorboard_log="./logs/").learn(650000,tb_log_name="test_run",callback=eval_callback)
# model2 = SAC("MlpPolicy",env,device='cpu',verbose=0,tensorboard_log="./logs/").learn(250000,tb_log_name="test_run_SAC",callback=eval_callback)

Eval num_timesteps=40000, episode_reward=-5171.34 +/- 2672.33
Episode length: 945.67 +/- 188.60
New best mean reward!
Eval num_timesteps=80000, episode_reward=-3069.64 +/- 1411.82
Episode length: 974.93 +/- 93.79
New best mean reward!
Eval num_timesteps=120000, episode_reward=-6455.56 +/- 3841.86
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=160000, episode_reward=-3261.99 +/- 1254.05
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=200000, episode_reward=-1889.18 +/- 662.67
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=240000, episode_reward=-9475.90 +/- 11345.41
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=280000, episode_reward=-6259.55 +/- 8684.22
Episode length: 823.27 +/- 354.54
Eval num_timesteps=320000, episode_reward=-142.26 +/- 440.13
Episode length: 419.67 +/- 413.03
New best mean reward!
Eval num_timesteps=360000, episode_reward=240.05 +/- 53.68
Episode length: 116.93 +/- 44.59
New best mean reward!
Eval num_timesteps=400000, epis

In [89]:
model.learn(200000,tb_log_name="test_run",callback=eval_callback,reset_num_timesteps=False)

Eval num_timesteps=680000, episode_reward=302.18 +/- 42.08
Episode length: 62.60 +/- 14.66
Eval num_timesteps=720000, episode_reward=298.92 +/- 58.47
Episode length: 63.60 +/- 20.14
Eval num_timesteps=760000, episode_reward=306.94 +/- 72.93
Episode length: 61.07 +/- 25.64
New best mean reward!
Eval num_timesteps=800000, episode_reward=327.50 +/- 42.72
Episode length: 53.53 +/- 14.81
New best mean reward!
Eval num_timesteps=840000, episode_reward=291.29 +/- 67.85
Episode length: 66.27 +/- 23.86


<stable_baselines3.ppo.ppo.PPO at 0x77d0b02441f0>

In [98]:
# random pick actions and visualize
tenv = _init(seed=60)
obs, _ = tenv.reset()
# tenv = eval_env

# t_model = model
t_model = PPO.load(os.path.join(save_dir, "best_model.zip"),force_reset=True,device='cpu')
EP_CNT = 10
ep_cnt = 0
cnt = 0
rev_sum = 0
for i in range(10000):
    if cnt >= 1000:
        print("Killed by timeout")
        obs,_ = tenv.reset()
        cnt = 0
    action,_ = t_model.predict(obs, deterministic=True)
    obs, reward, done,truncated, info = tenv.step(action)
    rev_sum += reward
    tenv.render()
    if done:
        obs,_ = tenv.reset()
        ep_cnt += 1
        print("Episode done: ", cnt, "Reward: ", rev_sum)
        cnt=0
        rev_sum = 0
        if ep_cnt >= EP_CNT:
            break
    if pygame.event.get(pygame.QUIT):
        break
    cnt +=1
tenv.close()

Episode done:  67 Reward:  284.7484789764993
Episode done:  24 Reward:  414.1190133139253
Episode done:  90 Reward:  222.9232719430945
Episode done:  107 Reward:  180.2535479333676
Episode done:  68 Reward:  287.1883751214751
Episode done:  67 Reward:  287.08307143609983
Episode done:  32 Reward:  390.740646853306
Episode done:  37 Reward:  375.75657497062764
Episode done:  47 Reward:  346.77900302309934
Episode done:  32 Reward:  391.1615670080315
