In [30]:
import os, sys, importlib, subprocess, shlex
from IPython import get_ipython

NOTEBOOK = "acc_env.ipynb"
MODULE   = "acc_env"

try:
    from acc_env import ACCEnv
except ModuleNotFoundError:
    if os.path.exists(NOTEBOOK):
        print("Converting acc_env.ipynb → acc_env.py and importing...")
        subprocess.run(shlex.split(f"jupyter nbconvert --to python {NOTEBOOK}"), check=True)
        if os.getcwd() not in sys.path:
            sys.path.append(os.getcwd())
        from acc_env import ACCEnv
    else:
        print("Running acc_env.ipynb directly...")
        get_ipython().run_line_magic("run", f"./{NOTEBOOK}")

Running acc_env.ipynb directly...


Exception: File `'./acc_env.ipynb'` not found.

In [None]:
jupyter nbconvert --to python acc_env_checkpoint.ipynb

In [None]:
from __future__ import annotations
import gymnasium as gym
import torch
import numpy as np
from typing import Any

In [None]:
def _to_tensor(x: np.ndarray) -> torch.Tensor:
    return torch.as_tensor(x, dtype=torch.float32)

In [None]:
class AttackWrapper:
    """Base wrapper that perturbs observations before the agent acts."""
    def __init__(self, model: Any, epsilon: float = 0.01, device: str = "cpu") -> None:
        self.model = model
        self.eps = float(epsilon)
        self.device = device

    def perturb(self, obs: np.ndarray) -> np.ndarray:
        return obs

    def act(self, obs: np.ndarray):
        # Compute adversarial observation (gradients enabled in perturb),
        # then call model.predict without gradients.
        obs_adv = self.perturb(obs)
        with torch.no_grad():
            action, _ = self.model.predict(obs_adv, deterministic=True)
        return action, obs_adv

In [None]:
class FGSMAttack(AttackWrapper):
    """FGSM with respect to policy mean action (pre-squash)."""
    def perturb(self, obs: np.ndarray) -> np.ndarray:
        # prepare policy for gradients
        self.model.policy.set_training_mode(True)
        self.model.policy.zero_grad(set_to_none=True)

        obs_t = _to_tensor(obs)
        single = False
        if obs_t.ndim == 1:
            obs_t = obs_t.unsqueeze(0)
            single = True
        obs_t = obs_t.to(self.device)
        obs_t.requires_grad_(True)

        # forward through policy internals to get mean action
        features = self.model.policy.extract_features(obs_t)
        latent_pi, _ = self.model.policy.mlp_extractor(features)
        mean_actions = self.model.policy.action_net(latent_pi)  # [B, act_dim]

        # simple scalar objective: increase squared mean action
        obj = (mean_actions ** 2).sum()
        obj.backward()

        grad_sign = torch.sign(obs_t.grad)
        adv = torch.clamp(obs_t + self.eps * grad_sign, -1.0, 1.0)
        adv_np = adv.detach().cpu().numpy()
        return adv_np[0] if single else adv_np

In [None]:
class OIAttack(AttackWrapper):
    """Optimism Induction Attack: increase the critic value V(s)."""
    def perturb(self, obs: np.ndarray) -> np.ndarray:
        self.model.policy.set_training_mode(True)
        self.model.policy.zero_grad(set_to_none=True)

        obs_t = _to_tensor(obs)
        single = False
        if obs_t.ndim == 1:
            obs_t = obs_t.unsqueeze(0)
            single = True
        obs_t = obs_t.to(self.device)
        obs_t.requires_grad_(True)

        features = self.model.policy.extract_features(obs_t)
        _, latent_vf = self.model.policy.mlp_extractor(features)
        values = self.model.policy.value_net(latent_vf)  # [B,1]

        obj = values.sum()
        obj.backward()

        grad_sign = torch.sign(obs_t.grad)
        adv = torch.clamp(obs_t + self.eps * grad_sign, -1.0, 1.0)
        adv_np = adv.detach().cpu().numpy()
        return adv_np[0] if single else adv_np


In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from acc_env import ACCEnv

LOGDIR = "runs/ppo_baseline"

def make_env_eval(seed=123):
    def _thunk():
        return ACCEnv(brake_profile=True, normalize_obs=True, seed=seed)
    return _thunk

base_env = DummyVecEnv([make_env_eval()])
env = VecNormalize.load(f"{LOGDIR}/vecnormalize.pkl", base_env)
env.training = False
env.norm_reward = False

model = PPO.load(f"{LOGDIR}/ppo_acc")


In [None]:
def print_attack_sanity(model, env, eps=0.01):
    atk = FGSMAttack(model, epsilon=eps, device="cpu")
    obs = env.reset()[0]
    adv = atk.perturb(obs)
    print("FGSM sanity:")
    print(" original obs:", obs)
    print(" adv obs     :", adv)
    print(" max |Δ|     :", float(np.max(np.abs(np.array(adv) - np.array(obs)))))

    atk2 = OIAttack(model, epsilon=eps, device="cpu")
    adv2 = atk2.perturb(obs)
    print("\nOIA sanity:")
    print(" original obs:", obs)
    print(" adv obs     :", adv2)
    print(" max |Δ|     :", float(np.max(np.abs(np.array(adv2) - np.array(obs)))))

In [None]:
# Quick demo run (will only work if `model` and `env` exist in the kernel).
# If not present, this prints an instructive message.
try:
    print("Running a one-step demo with current model/env (if available)...\n")
    atk = FGSMAttack(model, epsilon=0.01, device="cpu")
    obs = env.reset()[0]
    adv = atk.perturb(obs)
    print("FGSM max |Δ|:", float(np.max(np.abs(np.array(adv) - np.array(obs)))))
    atk2 = OIAttack(model, epsilon=0.01, device="cpu")
    adv2 = atk2.perturb(obs)
    print("OIA  max |Δ|:", float(np.max(np.abs(np.array(adv2) - np.array(obs)))))
except NameError:
    print("Define `model` and `env` (load your PPO and VecNormalize env) before running the demo cell.")