In [31]:
%load_ext autoreload
%autoreload 2
import pygame
import os
import numpy as np
from datetime import datetime

from stable_baselines3 import PPO,SAC
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv,VecMonitor
from gymnasium.wrappers import FlattenObservation,TimeLimit,NormalizeObservation
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback,BaseCallback
from stable_baselines3.common.vec_env import VecNormalize

from deform_rl.envs.Rectangle_env.environment import Rectangle1D

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
from gymnasium import ObservationWrapper
from gymnasium.spaces import Box,Dict

class CustomNormalizeObsrvation(ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.width = env.width
        self.height = env.height
        max_length = np.linalg.norm([self.width,self.height])
        low = np.array([-self.width,-self.height,-np.inf,-np.inf])
        high = np.array([self.width,self.height,np.inf,np.inf])
        self.observation_space = Box(low=low,high=high,dtype=np.float32)
        
        # self.observation_space = Box(low=np.array([0.,0.,-1000.,-1000.,0.,0.]), high=np.array([800.,800.,1000.,1000.,800.,800.]), shape=(6,), dtype=np.float64)
        # self.observation_space = Box(low=0, high=1, shape=(4,), dtype=np.float64)
    # def observation(self, observation):
    #     mean = np.array([self.width, self.height]) / 2
    #     position = (observation['position'])
    #     velocity = observation['velocity']
    #     target = (observation['target'])    
    # return np.concatenate([position, velocity, target])
    def observation(self, observation):
        # mean = np.array([self.width,self.height]) / 2
        position = observation['position']
        target = observation['target']
        velocity = observation['velocity']
        rel_target = target - position
        # rel_target /= np.array([self.width,self.height])
        # velocity /= np.array([self.width,self.height])

        return np.concatenate([rel_target,velocity],dtype=np.float32)



In [19]:
class TensorboardCallback(BaseCallback):
    def __init__(self, verbose=0):
        super().__init__(verbose)
    def _on_step(self) -> bool:
        self
        
        

In [27]:
sim_cfg = {
    'width': 800,
    'height': 800,
    'FPS': 60,
    'gravity': 0,
    'damping': .15,
    'collision_slope': 0.01,
}
save_dir = os.path.join("saved_models")
log_dir = os.path.join("logs")
os.makedirs(save_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

def _init(threshold=30,seed=None):
    # Base env
    env = Rectangle1D(sim_config=sim_cfg, threshold=threshold, oneD=False, render_mode='human', seed=seed)
    env = CustomNormalizeObsrvation(env)
    # Apply wrappers
    # env = FlattenObservation(env)
    env = TimeLimit(env, max_episode_steps=1000)
    check_env(env, warn=True)

    return env


# env = _init()
env = VecMonitor(VecNormalize(DummyVecEnv([_init]*4)))

eval_env = VecMonitor(VecNormalize(DummyVecEnv([_init])))
# eval_env = _init()
eval_callback = EvalCallback(
    eval_env=eval_env,
    n_eval_episodes=15,
    eval_freq=10000,
    best_model_save_path=save_dir,
    verbose=1,
    render=False
)



In [12]:
class LinearAgent:
    def __init__(self):
        pass
    # def predict(self, obs, deterministic=True):
    #     pos = obs[:2]
    #     target = obs[4:]
    #     diff = target - pos
    #     return diff/np.linalg.norm(diff), None
    def predict(self, obs, deterministic=True):
        rel_target = obs[:2]
        # target = obs[4:]
        # diff = target - pos
        return rel_target/np.linalg.norm(rel_target), None

tenv = _init(seed=60)
obs, _ = tenv.reset()
# tenv = eval_env

cnt = 0
rev_sum = 0
la = LinearAgent()
EP_CNT = 10
ep_cnt = 0
for i in range(10000):
    if cnt >= 1000:
        print("Killed by timeout")
        obs,_ = tenv.reset()
        cnt = 0
    action,_ = la.predict(obs, deterministic=True)
    obs, reward, done,truncated, info = tenv.step(action)
    rev_sum += reward
    tenv.render()
    if done:
        obs,_ = tenv.reset()
        ep_cnt += 1
        print("Episode done: ", cnt, "Reward: ", rev_sum)
        cnt=0
        rev_sum = 0
        if ep_cnt >= EP_CNT:
            break
    if pygame.event.get(pygame.QUIT):
        break
    cnt +=1
tenv.close()


Episode done:  59 Reward:  -260.1273653982537
Episode done:  52 Reward:  -209.78596299994038
Episode done:  93 Reward:  -418.87192741785867


In [34]:
# env.reset()
# for i in range(10):
#     act =env.action_space.sample()
#     obs, reward, done,truncated, info = env.step(act)
#     print(obs)
#     print(reward)
#     print(done)
#     print(truncated)
#     print(info)



In [21]:
model = PPO("MlpPolicy",env,device='cpu',verbose=0,tensorboard_log="./logs/").learn(500000,tb_log_name="test_run",callback=eval_callback)


Eval num_timesteps=40000, episode_reward=-29.23 +/- 10.38
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=80000, episode_reward=-9.03 +/- 5.13
Episode length: 940.93 +/- 221.01
New best mean reward!
Eval num_timesteps=120000, episode_reward=-0.22 +/- 0.27
Episode length: 228.27 +/- 102.70
New best mean reward!
Eval num_timesteps=160000, episode_reward=0.03 +/- 0.30
Episode length: 94.67 +/- 20.27
New best mean reward!
Eval num_timesteps=200000, episode_reward=0.29 +/- 0.28
Episode length: 77.47 +/- 31.86
New best mean reward!
Eval num_timesteps=240000, episode_reward=0.41 +/- 0.24
Episode length: 77.07 +/- 23.80
New best mean reward!
Eval num_timesteps=280000, episode_reward=0.45 +/- 0.31
Episode length: 84.87 +/- 27.44
New best mean reward!
Eval num_timesteps=320000, episode_reward=0.62 +/- 0.16
Episode length: 84.47 +/- 28.53
New best mean reward!
Eval num_timesteps=360000, episode_reward=0.67 +/- 0.24
Episode length: 81.00 +/- 24.11
New best mean reward!
Ev

In [None]:
# env = VecMonitor(DummyVecEnv([_init]))

model2 = SAC("MlpPolicy",env,device='cpu',verbose=0,tensorboard_log="./logs/").learn(500000,tb_log_name="test_run_SAC",callback=eval_callback)



Eval num_timesteps=10000, episode_reward=-10268.87 +/- 24757.20
Episode length: 514.93 +/- 398.23
New best mean reward!
Eval num_timesteps=20000, episode_reward=-479.49 +/- 1017.03
Episode length: 305.53 +/- 231.97
New best mean reward!
Eval num_timesteps=30000, episode_reward=52.70 +/- 122.21
Episode length: 189.07 +/- 164.52
New best mean reward!
Eval num_timesteps=40000, episode_reward=-29.46 +/- 130.06
Episode length: 106.47 +/- 32.53
Eval num_timesteps=50000, episode_reward=85.34 +/- 110.98
Episode length: 107.20 +/- 48.62
New best mean reward!
Eval num_timesteps=60000, episode_reward=130.48 +/- 76.02
Episode length: 92.73 +/- 26.13
New best mean reward!
Eval num_timesteps=70000, episode_reward=163.63 +/- 75.37
Episode length: 86.00 +/- 23.78
New best mean reward!
Eval num_timesteps=80000, episode_reward=208.35 +/- 57.85
Episode length: 89.67 +/- 31.77
New best mean reward!
Eval num_timesteps=90000, episode_reward=224.78 +/- 44.42
Episode length: 95.67 +/- 43.38
New best mean rewa

KeyboardInterrupt: 

In [29]:

model.learn(500000,tb_log_name="test_run",callback=eval_callback,reset_num_timesteps=False)

Eval num_timesteps=547904, episode_reward=0.87 +/- 0.13
Episode length: 94.73 +/- 19.20
New best mean reward!
Eval num_timesteps=587904, episode_reward=0.79 +/- 0.35
Episode length: 88.73 +/- 26.23
Eval num_timesteps=627904, episode_reward=0.87 +/- 0.24
Episode length: 84.53 +/- 32.57
New best mean reward!
Eval num_timesteps=667904, episode_reward=0.76 +/- 0.48
Episode length: 99.93 +/- 27.44
Eval num_timesteps=707904, episode_reward=0.97 +/- 0.25
Episode length: 87.33 +/- 34.67
New best mean reward!
Eval num_timesteps=747904, episode_reward=0.86 +/- 0.50
Episode length: 90.53 +/- 29.06
Eval num_timesteps=787904, episode_reward=0.91 +/- 0.34
Episode length: 94.07 +/- 34.38
Eval num_timesteps=827904, episode_reward=0.96 +/- 0.30
Episode length: 86.87 +/- 24.95
Eval num_timesteps=867904, episode_reward=1.06 +/- 0.32
Episode length: 80.53 +/- 24.99
New best mean reward!
Eval num_timesteps=907904, episode_reward=1.12 +/- 0.24
Episode length: 84.93 +/- 23.40
New best mean reward!
Eval num_t

<stable_baselines3.ppo.ppo.PPO at 0x7333f9ba3c70>

In [28]:
# random pick actions and visualize
tenv = _init(seed=600)
obs, _ = tenv.reset()
# tenv = eval_env

# t_model = model
t_model = PPO.load(os.path.join(save_dir, "best_model.zip"),force_reset=True,device='cpu')
EP_CNT = 10
ep_cnt = 0
cnt = 0
rev_sum = 0
for i in range(10000):
    if cnt >= 1000:
        print("Killed by timeout")
        obs,_ = tenv.reset()
        cnt = 0
    action,_ = t_model.predict(obs, deterministic=True)
    obs, reward, done,truncated, info = tenv.step(action)
    rev_sum += reward
    tenv.render()
    if done:
        obs,_ = tenv.reset()
        ep_cnt += 1
        print("Episode done: ", cnt, "Reward: ", rev_sum)
        cnt=0
        rev_sum = 0
        if ep_cnt >= EP_CNT:
            break
    if pygame.event.get(pygame.QUIT):
        break
    cnt +=1
tenv.close()

Episode done:  65 Reward:  -155.66247846575294
Episode done:  55 Reward:  -41.586381049078966
