In [1]:
%load_ext autoreload
%autoreload 2
import pygame
import os
import numpy as np
from datetime import datetime

from stable_baselines3 import PPO,SAC
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv,VecMonitor
from gymnasium.wrappers import FlattenObservation,TimeLimit,NormalizeObservation
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback,BaseCallback

from deform_rl.envs.Rectangle_env.environment import Rectangle1D

pygame 2.6.1 (SDL 2.28.4, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
from gymnasium import ObservationWrapper
from gymnasium.spaces import Box,Dict

class CustomNormalizeObsrvation(ObservationWrapper):
    """Turn the wrapped env's dict observation into a flat, scaled 4-vector.

    The wrapped env must expose ``width`` and ``height`` attributes and emit
    observations indexable by ``'position'``, ``'target'`` and ``'velocity'``
    (each presumably a length-2 array -- TODO confirm against Rectangle1D).

    The produced observation is ``[rel_target_x, rel_target_y, vel_x, vel_y]``
    with ``rel_target = (target - position) / (width, height)`` and the
    velocity divided by ``(width, height)`` as well.
    """

    def __init__(self, env):
        super().__init__(env)
        self.width = env.width
        self.height = env.height
        # NOTE(review): the velocity is only scaled, never clipped, so it may
        # leave the declared [-1, 1] bounds -- confirm the env's speed range.
        self.observation_space = Box(low=-1, high=1, shape=(4,), dtype=np.float64)

    def observation(self, observation):
        """Return ``concatenate([rel_target, velocity])`` scaled by the arena size.

        The arrays stored in ``observation`` are never modified: all arithmetic
        is done out of place on float64 values, so state that the env may still
        reference is left untouched and integer inputs do not break the
        division.
        """
        scale = np.array([self.width, self.height], dtype=np.float64)
        position = np.asarray(observation['position'], dtype=np.float64)
        target = np.asarray(observation['target'], dtype=np.float64)
        velocity = np.asarray(observation['velocity'], dtype=np.float64)

        rel_target = (target - position) / scale
        scaled_velocity = velocity / scale

        return np.concatenate([rel_target, scaled_velocity])



In [31]:
class TensorboardCallback(BaseCallback):
    """Stub training callback; currently performs no logging of its own.

    Kept as an extension point: add ``self.logger.record(...)`` calls inside
    ``_on_step`` to push custom values to TensorBoard.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)

    def _on_step(self) -> bool:
        """Called by the training loop after each environment step.

        Returns:
            ``True`` so that training continues. A falsy return value (which
            includes the implicit ``None`` of an empty body) makes
            stable-baselines3 abort training early.
        """
        return True
        
        

In [4]:
# Settings handed to Rectangle1D as its ``sim_config`` argument.
sim_cfg = dict(
    width=800,
    height=800,
    FPS=60,
    gravity=0,
    damping=.15,
    collision_slope=0.01,
)

# Output locations (relative to the working directory): model checkpoints
# and log files. Both are created up front if they do not exist yet.
save_dir, log_dir = os.path.join("saved_models"), os.path.join("logs")
os.makedirs(save_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

def _init(threshold=30, seed=None, render_mode='human', max_episode_steps=1000):
    """Build one fully wrapped Rectangle1D environment.

    Usable both directly (``_init(seed=60)``) and as a zero-argument factory
    for ``DummyVecEnv``.

    Args:
        threshold: Forwarded unchanged to ``Rectangle1D``.
        seed: Forwarded unchanged to ``Rectangle1D``.
        render_mode: Forwarded to ``Rectangle1D``. Defaults to ``'human'`` to
            keep the previous behaviour; pass another mode for training envs
            that should not render.
        max_episode_steps: Step budget after which ``TimeLimit`` truncates an
            episode.

    Returns:
        The env wrapped as ``TimeLimit(CustomNormalizeObsrvation(Rectangle1D))``.
    """
    # Base env
    env = Rectangle1D(
        sim_config=sim_cfg,
        threshold=threshold,
        oneD=False,
        render_mode=render_mode,
        seed=seed,
    )
    # Dict observation -> flat, scaled vector.
    env = CustomNormalizeObsrvation(env)
    # Cap the episode length so episodes that never reach the goal still end.
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    # Validate the wrapped env against the API stable-baselines3 expects.
    check_env(env, warn=True)

    return env


# Training env: four independently constructed copies produced by the
# `_init` factory, stepped in-process by DummyVecEnv and wrapped in VecMonitor.
env = VecMonitor(
    DummyVecEnv([_init for _ in range(4)])
)

# A separate single-copy env used only for periodic evaluation.
eval_env = VecMonitor(DummyVecEnv([_init]))

# Evaluates on `eval_env` and writes the best model so far into `save_dir`.
# NOTE(review): `eval_freq` is counted in callback steps, not timesteps; with
# the 4-env trainer above that is presumably one evaluation per 40000
# timesteps -- confirm against the stable-baselines3 docs.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=save_dir,
    n_eval_episodes=15,
    eval_freq=10000,
    render=False,
    verbose=1,
)



In [41]:
class LinearAgent:
    """Hand-written baseline that always steers straight towards the target.

    Exposes the same ``predict(obs, deterministic)`` -> ``(action, state)``
    call shape that the rollout loops in this notebook use for trained models.
    """

    def __init__(self):
        pass

    def predict(self, obs, deterministic=True):
        """Return the unit vector pointing along the relative target.

        Args:
            obs: Flat observation whose first two entries are the relative
                target (the layout produced by ``CustomNormalizeObsrvation``).
            deterministic: Accepted for interface compatibility; unused.

        Returns:
            ``(action, None)`` where ``action`` is ``obs[:2]`` normalised to
            unit length, or a zero vector when ``obs[:2]`` is exactly zero
            (the direction is undefined there and dividing would give NaN).
        """
        rel_target = np.asarray(obs[:2])
        norm = np.linalg.norm(rel_target)
        if norm == 0:
            # Already on the target: no preferred direction, so do not move.
            return np.zeros_like(rel_target, dtype=float), None
        return rel_target / norm, None

# Roll out the hand-written LinearAgent for up to EP_CNT finished episodes
# (or 10000 steps in total), rendering every step.
tenv = _init(seed=60)
la = LinearAgent()
EP_CNT = 10  # stop after this many episodes that ended with `done`

try:
    obs, _ = tenv.reset()
    cnt = 0      # steps taken in the current episode
    rev_sum = 0  # reward accumulated in the current episode
    ep_cnt = 0   # episodes that ended with `done`
    for _ in range(10000):
        action, _ = la.predict(obs, deterministic=True)
        obs, reward, done, truncated, _ = tenv.step(action)
        rev_sum += reward
        cnt += 1
        tenv.render()
        if done or truncated:
            if done:
                ep_cnt += 1
                print("Episode done: ", cnt, "Reward: ", rev_sum)
                if ep_cnt >= EP_CNT:
                    break
            else:
                # The TimeLimit wrapper applied in `_init` ran out of steps.
                print("Killed by timeout")
            # Start a fresh episode; both per-episode accumulators are reset
            # so a timed-out episode does not leak into the next report.
            obs, _ = tenv.reset()
            cnt = 0
            rev_sum = 0
        # Let the user stop the rollout by closing the pygame window.
        if pygame.event.get(pygame.QUIT):
            break
finally:
    # Always release the env, even if a step raised.
    tenv.close()


Episode done:  59 Reward:  -260.1273690727733
Episode done:  52 Reward:  -209.7859582320841
Episode done:  93 Reward:  -418.87192075902055
Episode done:  88 Reward:  -399.33162866346777
Episode done:  66 Reward:  -295.6534424329941
Episode done:  46 Reward:  -164.07415652227826
Episode done:  54 Reward:  -222.8750058726433
Episode done:  53 Reward:  -216.4178897651197
Episode done:  63 Reward:  -277.6375662898995
Episode done:  66 Reward:  -295.31381081831313


In [34]:
# env.reset()
# for i in range(10):
#     act =env.action_space.sample()
#     obs, reward, done,truncated, info = env.step(act)
#     print(obs)
#     print(reward)
#     print(done)
#     print(truncated)
#     print(info)



In [36]:
# Construct first, train second: `model` stays bound (and usable) even when
# training is interrupted, which the chained `PPO(...).learn(...)` form
# would not guarantee.
model = PPO("MlpPolicy", env, device='cpu', verbose=0, tensorboard_log=log_dir)
model.learn(500000, tb_log_name="test_run", callback=eval_callback);


Eval num_timesteps=15424, episode_reward=-4666.50 +/- 1082.60
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=55424, episode_reward=-11159.72 +/- 7583.05
Episode length: 914.47 +/- 219.22
Eval num_timesteps=95424, episode_reward=-30583.05 +/- 17213.16
Episode length: 935.73 +/- 240.46
Eval num_timesteps=135424, episode_reward=-44580.22 +/- 67678.58
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=175424, episode_reward=-52724.20 +/- 51445.54
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=215424, episode_reward=-29646.94 +/- 27457.63
Episode length: 936.20 +/- 238.72
Eval num_timesteps=255424, episode_reward=-26159.59 +/- 19507.79
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=295424, episode_reward=-12520.95 +/- 7053.25
Episode length: 938.33 +/- 230.74
Eval num_timesteps=335424, episode_reward=-17982.71 +/- 7722.41
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=375424, episode_reward=-26036.53 +/- 14767.80
Episode length: 1000.00 +/- 0.00
Ev

In [None]:
# env = VecMonitor(DummyVecEnv([_init]))

# SAC gets its own evaluation callback and save directory. Sharing
# `eval_callback` with the PPO run would let an SAC checkpoint replace
# `best_model.zip` in `save_dir`, which is later loaded with `PPO.load`.
sac_eval_callback = EvalCallback(
    eval_env=eval_env,
    n_eval_episodes=15,
    eval_freq=10000,
    best_model_save_path=os.path.join(save_dir, "sac"),
    verbose=1,
    render=False,
)

# Construct first, train second, so `model2` survives an interrupted run.
model2 = SAC("MlpPolicy", env, device='cpu', verbose=0, tensorboard_log=log_dir)
model2.learn(500000, tb_log_name="test_run_SAC", callback=sac_eval_callback);



Eval num_timesteps=10000, episode_reward=-10268.87 +/- 24757.20
Episode length: 514.93 +/- 398.23
New best mean reward!
Eval num_timesteps=20000, episode_reward=-479.49 +/- 1017.03
Episode length: 305.53 +/- 231.97
New best mean reward!
Eval num_timesteps=30000, episode_reward=52.70 +/- 122.21
Episode length: 189.07 +/- 164.52
New best mean reward!
Eval num_timesteps=40000, episode_reward=-29.46 +/- 130.06
Episode length: 106.47 +/- 32.53
Eval num_timesteps=50000, episode_reward=85.34 +/- 110.98
Episode length: 107.20 +/- 48.62
New best mean reward!
Eval num_timesteps=60000, episode_reward=130.48 +/- 76.02
Episode length: 92.73 +/- 26.13
New best mean reward!
Eval num_timesteps=70000, episode_reward=163.63 +/- 75.37
Episode length: 86.00 +/- 23.78
New best mean reward!
Eval num_timesteps=80000, episode_reward=208.35 +/- 57.85
Episode length: 89.67 +/- 31.77
New best mean reward!
Eval num_timesteps=90000, episode_reward=224.78 +/- 44.42
Episode length: 95.67 +/- 43.38
New best mean rewa

KeyboardInterrupt: 

In [37]:

# Continue training the existing PPO model for another 500000 timesteps.
# `reset_num_timesteps=False` keeps the timestep counter running on from the
# previous `learn` call instead of starting again at zero.
model.learn(
    total_timesteps=500000,
    callback=eval_callback,
    tb_log_name="test_run",
    reset_num_timesteps=False,
)

Eval num_timesteps=535424, episode_reward=95.28 +/- 83.91
Episode length: 100.93 +/- 45.63
New best mean reward!
Eval num_timesteps=575424, episode_reward=33.74 +/- 115.34
Episode length: 95.47 +/- 52.39
Eval num_timesteps=615424, episode_reward=133.45 +/- 170.45
Episode length: 66.53 +/- 33.86
New best mean reward!
Eval num_timesteps=655424, episode_reward=175.80 +/- 57.73
Episode length: 71.47 +/- 18.73
New best mean reward!
Eval num_timesteps=695424, episode_reward=212.12 +/- 67.54
Episode length: 65.07 +/- 23.99
New best mean reward!
Eval num_timesteps=735424, episode_reward=159.75 +/- 45.68
Episode length: 85.80 +/- 15.10
Eval num_timesteps=775424, episode_reward=165.42 +/- 59.88
Episode length: 82.33 +/- 22.12
Eval num_timesteps=815424, episode_reward=184.08 +/- 67.55
Episode length: 76.40 +/- 23.98
Eval num_timesteps=855424, episode_reward=186.15 +/- 55.27
Episode length: 73.53 +/- 22.75
Eval num_timesteps=895424, episode_reward=169.90 +/- 68.06
Episode length: 80.80 +/- 24.21


KeyboardInterrupt: 

In [6]:
# Load the best PPO checkpoint written by the evaluation callback and watch
# it act deterministically for up to EP_CNT finished episodes (or 10000
# steps in total), rendering every step.
tenv = _init(seed=605)

# t_model = model
t_model = PPO.load(os.path.join(save_dir, "best_model.zip"), force_reset=True, device='cpu')
EP_CNT = 10  # stop after this many episodes that ended with `done`

try:
    obs, _ = tenv.reset()
    cnt = 0      # steps taken in the current episode
    rev_sum = 0  # reward accumulated in the current episode
    ep_cnt = 0   # episodes that ended with `done`
    for _ in range(10000):
        action, _ = t_model.predict(obs, deterministic=True)
        obs, reward, done, truncated, _ = tenv.step(action)
        rev_sum += reward
        cnt += 1
        tenv.render()
        if done or truncated:
            if done:
                ep_cnt += 1
                print("Episode done: ", cnt, "Reward: ", rev_sum)
                if ep_cnt >= EP_CNT:
                    break
            else:
                # The TimeLimit wrapper applied in `_init` ran out of steps.
                print("Killed by timeout")
            # Start a fresh episode; both per-episode accumulators are reset
            # so a timed-out episode does not leak into the next report.
            obs, _ = tenv.reset()
            cnt = 0
            rev_sum = 0
        # Let the user stop the rollout by closing the pygame window.
        if pygame.event.get(pygame.QUIT):
            break
finally:
    # Always release the env, even if loading succeeded but a step raised.
    tenv.close()

Episode done:  77 Reward:  176.42363948011533
Episode done:  68 Reward:  189.78052355743569
Episode done:  114 Reward:  72.55820306838677
Episode done:  106 Reward:  88.48405229670999
Episode done:  83 Reward:  154.9050193926096
