In [92]:
%load_ext autoreload
%autoreload 2
import pygame
import os
import numpy as np
from datetime import datetime

from stable_baselines3 import PPO,SAC
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv,VecMonitor
from gymnasium.wrappers import FlattenObservation,TimeLimit,NormalizeObservation
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback

from deform_rl.sim.Rectangle_env.environment import Rectangle1D

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [93]:
from gymnasium import ObservationWrapper
from gymnasium.spaces import Box,Dict

class CustomNormalizeObsrvation(ObservationWrapper):
    """Turn the wrapped env's dict observation into one flat, scaled vector.

    The wrapped env must expose ``width`` and ``height`` attributes and emit
    observations with the keys ``'position'``, ``'velocity'`` and ``'target'``.
    The output is ``concatenate([position, velocity, target])`` where

    * position and target are centred on the middle of the arena and divided
      by ``[width, height]``,
    * velocity is squashed with ``tanh``.

    NOTE(review): the declared space has 6 entries, so each of the three
    inputs is assumed to be a 2-vector -- confirm against the env.
    """

    def __init__(self, env):
        super().__init__(env)
        self.width = env.width
        self.height = env.height
        # A Dict space with 'position' / 'velocity' / 'goal' entries was tried
        # before; the flat Box is what the policy is trained on now.
        # dtype is spelled out so the declared space and the arrays returned by
        # observation() agree.
        self.observation_space = Box(low=-1, high=1, shape=(6,), dtype=np.float32)

    def observation(self, observation):
        """Scale one raw dict observation and return it as a float32 array."""
        extent = np.array([self.width, self.height])
        mean = extent / 2
        position = (observation['position'] - mean) / extent
        # tanh keeps every velocity component strictly inside (-1, 1).
        velocity = np.tanh(observation['velocity'])
        target = (observation['target'] - mean) / extent
        # Cast to the dtype of the declared observation space.
        return np.concatenate([position, velocity, target]).astype(np.float32)

In [None]:
# Simulator settings; passed to Rectangle1D as `sim_config` by `_init`.
sim_cfg = dict(
    width=800,
    height=800,
    FPS=60,
    gravity=0,
    damping=.15,
    collision_slope=0.01,
)

# Output folders (relative to the working directory): `save_dir` receives the
# best model written by the evaluation callback, `log_dir` is for logs.
save_dir = "saved_models"
log_dir = "logs"
for _out_dir in (save_dir, log_dir):
    os.makedirs(_out_dir, exist_ok=True)

def _init(threshold=10, render_mode='human', max_episode_steps=1000):
    """Build one fully wrapped Rectangle1D environment.

    Called with no arguments it behaves exactly as before, so it can still be
    handed to ``DummyVecEnv`` as a factory.

    Args:
        threshold: forwarded to ``Rectangle1D`` unchanged.
        render_mode: forwarded to ``Rectangle1D``. Defaults to ``'human'``
            (the previously hard-coded value).
            NOTE(review): training envs probably do not need a window; pass
            another mode there if Rectangle1D supports it -- confirm.
        max_episode_steps: step budget enforced by the ``TimeLimit`` wrapper.

    Returns:
        The env wrapped in ``CustomNormalizeObsrvation`` and ``TimeLimit``.
    """
    # Base env
    env = Rectangle1D(
        sim_config=sim_cfg,
        threshold=threshold,
        oneD=False,
        render_mode=render_mode,
    )
    # Flatten + scale the dict observation into a single Box vector.
    env = CustomNormalizeObsrvation(env)
    # Truncate episodes that run past the step budget.
    env = TimeLimit(env, max_episode_steps=max_episode_steps)

    return env




# env = _init()
# Training runs on four env copies stepped in sequence inside one process.
env = VecMonitor(DummyVecEnv([_init for _ in range(4)]))

# A separate single env is used only for periodic evaluation.
eval_env = VecMonitor(DummyVecEnv([_init]))
eval_callback = EvalCallback(
    eval_env=eval_env,
    best_model_save_path=save_dir,
    n_eval_episodes=5,
    # With the four training envs above, the training log shows evaluations
    # 40000 timesteps apart (eval_freq x number of envs).
    eval_freq=10000,
    render=False,
    verbose=1,
)


# check_env(env, warn=True)

In [90]:
class LinearAgent:
    """Hand-written baseline: always push straight from the position to the target.

    Exposes the same ``predict(obs, deterministic)`` call shape that the
    rollout loops use for trained models, so it can be swapped in for one.
    """

    def __init__(self):
        pass

    def predict(self, obs, deterministic=True):
        """Return ``(action, None)`` for a single flat observation.

        Args:
            obs: flat observation; the first two entries are read as the
                position and everything from index 4 onwards as the target.
            deterministic: accepted for interface compatibility; unused.

        Returns:
            A tuple of the unit vector pointing from position to target and
            ``None``. When position and target coincide the direction is
            undefined, so a zero action is returned instead of NaNs.
        """
        obs = np.asarray(obs)
        pos = obs[:2]
        target = obs[4:]
        diff = target - pos
        dist = np.linalg.norm(diff)
        if dist == 0:
            # Already on the target: dividing by zero would yield NaN actions.
            return np.zeros_like(diff), None
        return diff / dist, None

# Roll out the hand-written LinearAgent baseline in a rendered env and print
# the length and total reward of every finished episode.
tenv = _init(threshold=10)
obs, _ = tenv.reset()
# tenv = eval_env

cnt = 0      # steps taken in the current episode
rev_sum = 0  # reward collected in the current episode
la = LinearAgent()
try:
    for i in range(10000):
        action, _ = la.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = tenv.step(action)
        # Count the step right away so the printed episode length is exact.
        cnt += 1
        rev_sum += reward
        tenv.render()

        # Either the TimeLimit wrapper or the local step budget ends the episode.
        timed_out = truncated or cnt >= 1000
        if done:
            print("Episode done: ", cnt, "Reward: ", rev_sum)
        elif timed_out:
            print("Killed by timeout")
        if done or timed_out:
            obs, _ = tenv.reset()
            # Reset both counters on every reset so that reward from a
            # timed-out episode does not leak into the next one.
            cnt = 0
            rev_sum = 0

        if pygame.event.get(pygame.QUIT):
            break
finally:
    # Always release the env (and its window), even if the loop raised.
    tenv.close()


Episode done:  64 Reward:  1337.928094030994
Episode done:  68 Reward:  1364.1087888998504
Episode done:  44 Reward:  1169.3059259478953
Episode done:  31 Reward:  1084.0659515070558
Episode done:  74 Reward:  1417.5990872330117
Episode done:  35 Reward:  1108.1976164432085
Episode done:  73 Reward:  1408.5880448121527
Episode done:  47 Reward:  1191.4454631244207
Episode done:  20 Reward:  1030.0934000121047
Episode done:  41 Reward:  1147.995100209791
Episode done:  30 Reward:  1078.3693987663914
Episode done:  69 Reward:  1372.9237137287614
Episode done:  82 Reward:  1490.8814493912118
Episode done:  81 Reward:  1481.614498431591
Episode done:  64 Reward:  1329.2939852331208
Episode done:  43 Reward:  1162.106326728532
Episode done:  37 Reward:  1121.0134576802907
Episode done:  34 Reward:  1101.9712556534782
Episode done:  44 Reward:  1169.3059259478955
Episode done:  88 Reward:  1547.0350717065062
Episode done:  87 Reward:  1537.614476171962
Episode done:  85 Reward:  1518.8449242

In [79]:
# env.reset()
# for i in range(10):
#     act =env.action_space.sample()
#     obs, reward, done,truncated, info = env.step(act)
#     print(obs)
#     print(reward)
#     print(done)
#     print(truncated)
#     print(info)



In [98]:
# Alternative algorithm that was tried:
# model2 = SAC("MlpPolicy",env,device='cpu',verbose=0,tensorboard_log="./logs/").learn(250000,tb_log_name="test_run_SAC",callback=eval_callback)

# Build the agent and train it in two separate statements: if training is
# interrupted, `model` is already bound and the partially trained agent can
# still be inspected, saved or trained further.
model = PPO(
    "MlpPolicy",
    env,
    device='cpu',
    verbose=0,
    tensorboard_log="./logs/",
    learning_rate=2.0633e-05,
)
model.learn(550000, tb_log_name="test_run", callback=eval_callback);

Eval num_timesteps=34752, episode_reward=433.97 +/- 413.32
Episode length: 916.40 +/- 167.20
Eval num_timesteps=74752, episode_reward=576.12 +/- 344.83
Episode length: 792.00 +/- 258.57
Eval num_timesteps=114752, episode_reward=963.56 +/- 393.09
Episode length: 490.80 +/- 227.95
Eval num_timesteps=154752, episode_reward=869.68 +/- 438.38
Episode length: 534.40 +/- 261.47
Eval num_timesteps=194752, episode_reward=1201.57 +/- 198.52
Episode length: 255.40 +/- 63.68
Eval num_timesteps=234752, episode_reward=1168.97 +/- 231.88
Episode length: 234.20 +/- 121.99
Eval num_timesteps=274752, episode_reward=1097.11 +/- 209.57
Episode length: 221.20 +/- 52.57
Eval num_timesteps=314752, episode_reward=1012.44 +/- 236.01
Episode length: 353.40 +/- 201.20
Eval num_timesteps=354752, episode_reward=1145.32 +/- 204.11
Episode length: 276.20 +/- 138.33
Eval num_timesteps=394752, episode_reward=1293.55 +/- 168.96
Episode length: 215.20 +/- 73.35
Eval num_timesteps=434752, episode_reward=1561.42 +/- 62.08

In [100]:
# Train the existing agent for another 550000 steps; reset_num_timesteps=False
# keeps the timestep counter (and the "test_run" log) running on from the
# previous call instead of starting again at zero.
model.learn(
    total_timesteps=550000,
    callback=eval_callback,
    tb_log_name="test_run",
    reset_num_timesteps=False,
)

Eval num_timesteps=1114752, episode_reward=1209.40 +/- 214.73
Episode length: 191.00 +/- 78.93


KeyboardInterrupt: 

In [101]:
# Load the best checkpoint written by the evaluation callback and watch it act
# (deterministically) in a rendered env, printing the length and total reward
# of every finished episode.
tenv = _init()
obs, _ = tenv.reset()
# tenv = eval_env

t_model = PPO.load(os.path.join(save_dir, "best_model.zip"), force_reset=True)
cnt = 0      # steps taken in the current episode
rev_sum = 0  # reward collected in the current episode
try:
    for i in range(10000):
        action, _ = t_model.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = tenv.step(action)
        # Count the step right away so the printed episode length is exact.
        cnt += 1
        rev_sum += reward
        tenv.render()

        # Either the TimeLimit wrapper or the local step budget ends the episode.
        timed_out = truncated or cnt >= 1000
        if done:
            print("Episode done: ", cnt, "Reward: ", rev_sum)
        elif timed_out:
            print("Killed by timeout")
        if done or timed_out:
            obs, _ = tenv.reset()
            # Reset both counters on every reset so that reward from a
            # timed-out episode does not leak into the next one.
            cnt = 0
            rev_sum = 0

        if pygame.event.get(pygame.QUIT):
            break
finally:
    # Always release the env (and its window), even if the loop raised.
    tenv.close()

Episode done:  347 Reward:  990.4906206767868
Episode done:  210 Reward:  908.9175057479426
Episode done:  163 Reward:  1629.611486663179
Episode done:  355 Reward:  1542.0011643323676
Episode done:  201 Reward:  976.2292812610376
Episode done:  125 Reward:  1250.9331226697027
Episode done:  201 Reward:  1025.3510298013796
Episode done:  110 Reward:  1168.7497052666145
Episode done:  53 Reward:  1093.5533647905193
Episode done:  366 Reward:  1171.0749608500323
Episode done:  207 Reward:  1263.9801724198317
Episode done:  141 Reward:  1231.0711786812685
Episode done:  212 Reward:  1073.1031949866433
Episode done:  194 Reward:  878.6350305493085
Episode done:  122 Reward:  1218.4329858080985
Episode done:  378 Reward:  1332.7576633968902
Episode done:  200 Reward:  961.7833556524434
Episode done:  125 Reward:  1639.5413952479475
Episode done:  204 Reward:  1141.1998018103818
Episode done:  352 Reward:  1198.9951019479624
Episode done:  39 Reward:  999.01375819836
Episode done:  225 Rewar