In [7]:
%load_ext autoreload
%autoreload 2
import pygame
import os
import numpy as np
from datetime import datetime

from stable_baselines3 import PPO,SAC
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv,VecMonitor
from gymnasium.wrappers import FlattenObservation,TimeLimit,NormalizeObservation
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback

from deform_rl.sim.Rectangle_env.environment import Rectangle1D

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
from gymnasium import ObservationWrapper
from gymnasium.spaces import Box,Dict

class CustomNormalizeObsrvation(ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.width = env.width
        self.height = env.height
        # self.observation_space = Dict({
        #     'position': Box(low=-1, high=1, shape=(2,)),
        #     'velocity': Box(low=-1, high=1, shape=(2,)),
        #     'goal': Box(low=-1, high=1, shape=(2,)),
        # })
        self.observation_space = Box(low=-1, high=1, shape=(6,))
    def observation(self, observation):
        mean = np.array([self.width, self.height]) / 2
        position = (observation['position'] - mean)/ [self.width, self.height]
        velocity = np.tanh(observation['velocity'])
        target = (observation['target'] - mean)/ [self.width, self.height]
        # return {'position': position, 'velocity': velocity, 'goal': target}
        return np.concatenate([position, velocity, target])

In [9]:
sim_cfg = {
    'width': 800,
    'height': 800,
    'FPS': 60,
    'gravity': 0,
    'damping': .15,
    'collision_slope': 0.01,
}
save_dir = os.path.join("saved_models")
log_dir = os.path.join("logs")
os.makedirs(save_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

def _init(threshold=10,seed=None):
    # Base env
    env = Rectangle1D(sim_config=sim_cfg, threshold=threshold, oneD=False, render_mode='human', seed=seed)
    env = CustomNormalizeObsrvation(env)
    # Apply wrappers
    # env = FlattenObservation(env)
    env = TimeLimit(env, max_episode_steps=1000)

    return env




# env = _init()
env = VecMonitor(DummyVecEnv([_init]*4))

eval_env = VecMonitor(DummyVecEnv([_init]))
eval_callback = EvalCallback(
    eval_env=eval_env,
    n_eval_episodes=5,
    eval_freq=10000,
    best_model_save_path=save_dir,
    verbose=1,
    render=False
)


# check_env(env, warn=True)

In [14]:
class LinearAgent:
    def __init__(self):
        pass
    def predict(self, obs, deterministic=True):
        pos = obs[:2]
        target = obs[4:]
        diff = target - pos
        return diff/np.linalg.norm(diff), None

tenv = _init(threshold=10, seed=10)
obs, _ = tenv.reset()
# tenv = eval_env

cnt = 0
rev_sum = 0
la = LinearAgent()
EP_CNT = 10
ep_cnt = 0
for i in range(10000):
    if cnt >= 1000:
        print("Killed by timeout")
        obs,_ = tenv.reset()
        cnt = 0
    action,_ = la.predict(obs, deterministic=True)
    obs, reward, done,truncated, info = tenv.step(action)
    rev_sum += reward
    tenv.render()
    if done:
        obs,_ = tenv.reset()
        ep_cnt += 1
        print("Episode done: ", cnt, "Reward: ", rev_sum)
        cnt=0
        rev_sum = 0
        if ep_cnt >= EP_CNT:
            break
    if pygame.event.get(pygame.QUIT):
        break
    cnt +=1
tenv.close()


Episode done:  87 Reward:  9755.035071706507


In [79]:
# env.reset()
# for i in range(10):
#     act =env.action_space.sample()
#     obs, reward, done,truncated, info = env.step(act)
#     print(obs)
#     print(reward)
#     print(done)
#     print(truncated)
#     print(info)



In [10]:
model = PPO("MlpPolicy",env,device='cpu',verbose=0,tensorboard_log="./logs/").learn(550000,tb_log_name="test_run",callback=eval_callback)
# model2 = SAC("MlpPolicy",env,device='cpu',verbose=0,tensorboard_log="./logs/").learn(250000,tb_log_name="test_run_SAC",callback=eval_callback)

Eval num_timesteps=40000, episode_reward=3302.78 +/- 3719.65
Episode length: 694.60 +/- 385.65
New best mean reward!
Eval num_timesteps=80000, episode_reward=3692.23 +/- 4219.77
Episode length: 664.80 +/- 412.01
New best mean reward!
Eval num_timesteps=120000, episode_reward=575.68 +/- 149.95
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=160000, episode_reward=4064.51 +/- 4385.53
Episode length: 637.00 +/- 444.59
New best mean reward!
Eval num_timesteps=200000, episode_reward=6647.94 +/- 3409.93
Episode length: 374.60 +/- 327.42
New best mean reward!
Eval num_timesteps=240000, episode_reward=8756.17 +/- 1088.47
Episode length: 159.40 +/- 110.33
New best mean reward!
Eval num_timesteps=280000, episode_reward=7595.78 +/- 3682.53
Episode length: 266.20 +/- 370.62
Eval num_timesteps=320000, episode_reward=5012.88 +/- 3855.28
Episode length: 535.00 +/- 381.64
Eval num_timesteps=360000, episode_reward=8993.31 +/- 445.79
Episode length: 137.00 +/- 58.32
New best mean reward!
Eval num_ti

In [11]:
model.learn(550000,tb_log_name="test_run",callback=eval_callback,reset_num_timesteps=False)

Eval num_timesteps=560000, episode_reward=5657.08 +/- 4284.26
Episode length: 468.00 +/- 436.56


KeyboardInterrupt: 

In [15]:
# random pick actions and visualize
tenv = _init(threshold=10, seed=10)
obs, _ = tenv.reset()
# tenv = eval_env


t_model = PPO.load(os.path.join(save_dir, "best_model.zip"),force_reset=True)
EP_CNT = 10
ep_cnt = 0
cnt = 0
rev_sum = 0
for i in range(10000):
    if cnt >= 1000:
        print("Killed by timeout")
        obs,_ = tenv.reset()
        cnt = 0
    action,_ = t_model.predict(obs, deterministic=True)
    obs, reward, done,truncated, info = tenv.step(action)
    rev_sum += reward
    tenv.render()
    if done:
        obs,_ = tenv.reset()
        ep_cnt += 1
        print("Episode done: ", cnt, "Reward: ", rev_sum)
        cnt=0
        rev_sum = 0
        if ep_cnt >= EP_CNT:
            break
    if pygame.event.get(pygame.QUIT):
        break
    cnt +=1
tenv.close()

Episode done:  201 Reward:  8621.866682009268
Episode done:  234 Reward:  8035.881577223412
Episode done:  174 Reward:  8738.043430054679
Episode done:  253 Reward:  7919.008677529253
Episode done:  238 Reward:  7863.316742734807
Episode done:  136 Reward:  9202.36455208285
Episode done:  30 Reward:  9751.359604221616
Episode done:  336 Reward:  7477.801945313652
Episode done:  307 Reward:  7638.661135723204
Episode done:  192 Reward:  8700.978203873468
