In [10]:
%pip install highway-env

# Environment
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
import torch
import torch.nn as nn
import highway_env
from highway_env.envs.parking_env import ParkingEnv
from typing import List, Tuple, Optional, Callable, TypeVar, Generic, Union, Dict, Text

import sys
from tqdm.notebook import trange
from utils import record_videos, show_videos


# Models and computation
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import namedtuple

# Visualization
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.notebook import trange

# IO
from pathlib import Path

You should consider upgrading via the '/Users/noahwiley-class/.pyenv/versions/3.10.1/envs/6.8200/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Function from https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/model.py
from numpy import ndarray


def init_params(m):
    """
    Re-initialize a layer whose class name contains "Linear".

    The weight matrix is drawn from a standard normal distribution and each
    row (summing squares over dim 1) is then rescaled to unit L2 norm; a bias,
    when present, is set to zero. Modules whose class name does not contain
    "Linear" are left untouched.

    m: torch.nn.Module
    """
    # Only layers named like "Linear" are re-initialized.
    if "Linear" not in m.__class__.__name__:
        return

    weight = m.weight.data  # shares storage with the parameter
    weight.normal_(0, 1)
    row_norms = torch.sqrt(weight.pow(2).sum(1, keepdim=True))
    weight *= 1 / row_norms

    if m.bias is not None:
        m.bias.data.fill_(0)

def preprocess_obss(obss, device=None):
    """
    Stack the "image" entries of one or more observations into a float tensor.

    Parameters
    ----
    obss: a single observation dict, or an iterable of observation dicts;
        every dict is read through its "image" key
    device: target device of torch.Tensor ('cpu', 'cuda')

    Return
    ----
    Torch Tensor of dtype torch.float whose leading dimension indexes the
    observations (length 1 when a single dict is passed)
    """
    # A lone dict is treated as a batch containing one observation.
    if isinstance(obss, dict):
        frames = [obss["image"]]
    else:
        frames = [entry["image"] for entry in obss]

    batch = np.array(frames)
    return torch.tensor(batch, device=device, dtype=torch.float)

class CustomParking(ParkingEnv):
    """ParkingEnv subclass that overrides the default configuration.

    Construction and reward computation are forwarded unchanged to ParkingEnv;
    only `default_config` differs, by layering the values below on top of the
    parent's defaults.
    """

    def __init__(self, config: dict = None, render_mode: Optional[str] = None):
        """Forward `config` and `render_mode` to the ParkingEnv constructor."""
        super().__init__(config, render_mode)

    @classmethod
    def default_config(cls) -> dict:
        """Return the parent's default config updated with this class's overrides."""
        observation_spec = {
            "type": "KinematicsGoal",
            "features": ['x', 'y', 'vx', 'vy', 'cos_h', 'sin_h'],
            "scales": [100, 100, 5, 5, 1, 1],
            "normalize": False,
        }
        action_spec = {"type": "ContinuousAction"}

        overrides = {
            # Observation / action spaces
            "observation": observation_spec,
            "action": action_spec,
            # Reward settings
            "reward_weights": [1, 0.3, 0, 0, 0.02, 0.02],
            "success_goal_reward": 0.12,
            "collision_reward": -5,
            # Vehicle and simulation settings
            "steering_range": np.deg2rad(45),
            "simulation_frequency": 15,
            "policy_frequency": 5,
            "duration": 100,
            # Rendering settings
            "screen_width": 600,
            "screen_height": 300,
            "centering_position": [0.5, 0.5],
            "scaling": 7,
            # Scene contents
            "controlled_vehicles": 1,
            "vehicles_count": 0,
            "add_walls": True,
        }

        config = super().default_config()
        config.update(overrides)
        return config

    def compute_reward(self, achieved_goal: ndarray, desired_goal: ndarray, info: dict, p: float = 0.5) -> float:
        """Delegate to ParkingEnv.compute_reward with the same arguments."""
        return super().compute_reward(achieved_goal, desired_goal, info, p)

In [None]:
class Config:
    """Plain container for training hyper-parameters.

    Attributes (each set from the constructor argument of the same name):
        score_threshold: criterion for early stopping; training ends once the
            rolling average reward (over the last 100 episodes) exceeds it.
        discount: discount factor.
        lr: learning rate.
        max_grad_norm: the maximum gradient norm
            (https://pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html).
        log_interval: logging interval.
        max_episodes: the maximum number of episodes.
        use_critic: whether to use a critic or not.
        clip_ratio: clip ratio of PPO.
        target_kl: target KL divergence for early stopping of the
            train_ac_iters loop in PPO.
        train_ac_iters: how many times to train ac_model using the currently
            computed old_logps.
        gae_lambda: lambda in Generalized Advantage Estimation (GAE).
        use_discounted_reward: whether to use discounted reward or not.
        entropy_coef: entropy coefficient for PPO.
        use_gae: whether to use GAE or not.
    """

    def __init__(
        self,
        score_threshold=0.93,
        discount=0.995,
        lr=1e-3,
        max_grad_norm=0.5,
        log_interval=10,
        max_episodes=500,
        gae_lambda=0.95,
        use_critic=False,
        clip_ratio=0.2,
        target_kl=0.01,
        train_ac_iters=5,
        use_discounted_reward=False,
        entropy_coef=0.01,
        use_gae=False,
    ):
        # Stopping, optimisation and logging settings.
        self.score_threshold = score_threshold
        self.discount = discount
        self.lr = lr
        self.max_grad_norm = max_grad_norm
        self.log_interval = log_interval
        self.max_episodes = max_episodes

        # Actor-critic / PPO settings.
        self.use_critic = use_critic
        self.clip_ratio = clip_ratio
        self.target_kl = target_kl
        self.train_ac_iters = train_ac_iters

        # Return and advantage estimation settings.
        self.gae_lambda = gae_lambda
        self.use_discounted_reward = use_discounted_reward
        self.entropy_coef = entropy_coef
        self.use_gae = use_gae

In [34]:

# Train a Stable-Baselines3 PPO agent on "parking-v0" using several
# environment copies running in worker subprocesses, then save and reload it.
n_cpu = 6
batch_size = 64
env = make_vec_env("parking-v0", n_envs=n_cpu, vec_env_cls=SubprocVecEnv)
model = PPO(
    "MultiInputPolicy",
    env,
    # Pass net_arch as a dict directly: the list-wrapped form
    # net_arch=[dict(pi=..., vf=...)] is deprecated since SB3 1.8 and only
    # accepted with a warning.
    policy_kwargs=dict(net_arch=dict(pi=[512, 128], vf=[512, 128])),
    # 64 * 12 // 6 = 128 steps per environment, i.e. 768 transitions per rollout.
    n_steps=batch_size * 12 // n_cpu,
    batch_size=batch_size,
    n_epochs=50,
    learning_rate=5e-4,
    gamma=0.95,
    verbose=0,
    tensorboard_log="parking_ppo/",
)
# Train the agent
model.learn(total_timesteps=int(3e4))
# Save the agent
model.save("parking_ppo/model")
# Shut down the worker subprocesses; otherwise they keep running for the
# lifetime of the kernel.
env.close()

# Reload the saved policy (no environment attached) for later evaluation cells.
model = PPO.load("parking_ppo/model")
# env = gym.make("parking-v0", render_mode = "rgb_array")
# for _ in range(5):
#     obs, info = env.reset()
#     done = truncated = False
#     while not (done or truncated):
#         action, _ = model.predict(obs)
#         obs, reward, done, truncated, info = env.step(action)
#         env.render()

  import distutils.spawn
  import distutils.spawn
  import distutils.spawn
  import distutils.spawn
  import distutils.spawn
  import distutils.spawn


Using cpu device
Logging to parking_ppo/PPO_6




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50.6     |
|    ep_rew_mean     | -27      |
|    success_rate    | 0        |
| time/              |          |
|    fps             | 896      |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 768      |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 67.6        |
|    ep_rew_mean          | -39.7       |
|    success_rate         | 0           |
| time/                   |             |
|    fps                  | 221         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 1536        |
| train/                  |             |
|    approx_kl            | 0.011432711 |
|    clip_fraction        | 0.117       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.83     

In [9]:
# Sanity check: create a single "parking-v0" environment, reset it, and
# inspect the shape of one rendered frame.
env = gym.make("parking-v0", render_mode="rgb_array")
obs, info = env.reset()
# The recorded run of this cell printed (300, 600, 3) for this frame.
rendered = env.render()
print(rendered.shape)


(300, 600, 3)


In [42]:
# Roll out one episode with the saved PPO policy while recording it to video,
# then display the recorded video in the notebook.
model = PPO.load("parking_ppo/model")
env = gym.make("parking-v0", render_mode="rgb_array")
env = record_videos(env)
obs, info = env.reset()
done = truncated = False
# `done` is the "terminated" flag of the 5-tuple step API; the episode can
# also end through `truncated` (time limit). Checking only `done` would keep
# stepping a finished episode, potentially forever.
while not (done or truncated):
    action = model.predict(obs)[0]  # predict() returns (action, state)
    obs, reward, done, truncated, info = env.step(action)
env.close()
show_videos()

  logger.warn(


Moviepy - Building video /Users/noahwiley-class/Documents/6.820/Final/language-reward-design/wiley/videos/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/noahwiley-class/Documents/6.820/Final/language-reward-design/wiley/videos/rl-video-episode-0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready /Users/noahwiley-class/Documents/6.820/Final/language-reward-design/wiley/videos/rl-video-episode-0.mp4




: 

In [None]:
# Print the repr of the current `env` object.
print(env)
# Disabled snippet for showing a rendered frame with matplotlib.
# NOTE(review): it references `render`, but the frame captured earlier in this
# notebook is stored in `rendered` — fix the name before re-enabling.
# fig, ax = plt.subplots(ncols=1, figsize=(12, 5))
# ax.imshow(render) # , cmap=plt.get_cmap('gray')
# plt.show()