In [23]:
%run ./acc_env.ipynb   # executes the notebook and defines ACCEnv in this kernel

# ACCEnv is available in the kernel namespace only because of the %run above;
# re-run that line after a kernel restart before instantiating the environment.
env = ACCEnv()

reset obs: [-0.60096765  0.03967232 -0.05289644] info: {}
t=0: reward=-0.279, term=False, trunc=False, info={'collision': False, 'a': 0.5, 'v': 14.25655369961164, 'Δx': 39.98258188636605}
t=1: reward=-0.243, term=False, trunc=False, info={'collision': False, 'a': 0.5, 'v': 14.30655369961164, 'Δx': 40.056926516404886}
t=2: reward=-0.210, term=False, trunc=False, info={'collision': False, 'a': 0.5, 'v': 14.356553699611641, 'Δx': 40.12627114644373}
t=3: reward=-0.179, term=False, trunc=False, info={'collision': False, 'a': 0.5, 'v': 14.406553699611642, 'Δx': 40.19061577648256}
t=4: reward=-0.150, term=False, trunc=False, info={'collision': False, 'a': 0.5, 'v': 14.456553699611643, 'Δx': 40.249960406521396}
t=5: reward=-0.124, term=False, trunc=False, info={'collision': False, 'a': 0.5, 'v': 14.506553699611644, 'Δx': 40.30430503656023}
t=6: reward=-0.101, term=False, trunc=False, info={'collision': False, 'a': 0.5, 'v': 14.556553699611644, 'Δx': 40.35364966659907}
t=7: reward=-0.080, term=

In [24]:
from __future__ import annotations
import torch, numpy as np
import gymnasium as gym

def _to_tensor(x: np.ndarray) -> torch.Tensor:
    return torch.as_tensor(x, dtype=torch.float32)

class AttackWrapper:
    """Base class for observation attacks against a trained model.

    The base implementation is the identity attack: ``perturb`` hands the
    observation back untouched. Subclasses override ``perturb`` to craft an
    adversarial observation.
    """

    def __init__(self, model, epsilon: float = 0.01, device: str = "cpu"):
        """Store the attacked model, the perturbation budget and the device.

        Args:
            model: Object exposing ``predict(obs, deterministic=...)`` that
                returns a pair whose first element is the action.
            epsilon: Perturbation size; coerced to ``float`` and kept as ``eps``.
            device: Torch device string stored for use by subclasses.
        """
        self.model = model
        self.eps = float(epsilon)
        self.device = device

    def perturb(self, obs: np.ndarray) -> np.ndarray:
        """Return the observation the model should see (unchanged here)."""
        return obs

    def act(self, obs: np.ndarray):
        """Perturb ``obs`` and query the model on the perturbed observation.

        Returns:
            ``(action, perturbed_obs)`` where ``action`` is the model's
            deterministic prediction for the perturbed observation.
        """
        perturbed = self.perturb(obs)
        # Only the prediction runs without autograd; the perturbation above
        # may need gradients.
        with torch.no_grad():
            action, _state = self.model.predict(perturbed, deterministic=True)
        return action, perturbed

class FGSMAttack(AttackWrapper):
    """Single-step signed-gradient attack on the policy's mean action.

    The observation is moved by ``eps`` along the sign of the gradient of
    ``sum(mean_action ** 2)`` with respect to the observation, i.e. in the
    direction that increases the squared magnitude of the actor's output.
    """

    def perturb(self, obs: np.ndarray) -> np.ndarray:
        """Return an adversarial copy of ``obs``.

        Args:
            obs: A single observation (1-D) or a batch of observations.

        Returns:
            A float32 NumPy array with the same batch layout as ``obs``,
            clamped element-wise to ``[-1, 1]``.

        Raises:
            RuntimeError: If the observation does not influence the actor
                output (raised by ``torch.autograd.grad``).
        """
        policy = self.model.policy

        obs_t = _to_tensor(obs)
        single = obs_t.ndim == 1
        if single:
            # The policy networks expect a leading batch dimension.
            obs_t = obs_t.unsqueeze(0)
        obs_t = obs_t.to(self.device)
        obs_t.requires_grad_(True)

        # Differentiate the policy as it is used at prediction time: eval mode
        # keeps dropout/batch-norm deterministic (and batch-norm usable with a
        # single observation). The caller's mode is restored afterwards so the
        # attack does not leak a mode change into the model.
        was_training = policy.training
        policy.set_training_mode(False)
        try:
            # enable_grad makes the attack work even when the caller wrapped
            # the rollout in torch.no_grad().
            with torch.enable_grad():
                features = policy.extract_features(obs_t)
                latent_pi, _ = policy.mlp_extractor(features)
                mean_actions = policy.action_net(latent_pi)
                objective = (mean_actions ** 2).sum()
                # Only the input gradient is needed; autograd.grad leaves the
                # policy parameters' .grad buffers untouched.
                (obs_grad,) = torch.autograd.grad(objective, obs_t)
        finally:
            policy.set_training_mode(was_training)

        # NOTE(review): the clamp assumes observations are normalized to
        # [-1, 1] -- confirm against the environment's observation space.
        adv = torch.clamp(obs_t.detach() + self.eps * torch.sign(obs_grad), -1.0, 1.0)
        out = adv.cpu().numpy()
        return out[0] if single else out

class OIAttack(AttackWrapper):
    """Single-step signed-gradient attack on the critic's value estimate.

    The observation is moved by ``eps`` along the sign of the gradient of the
    summed value prediction with respect to the observation, i.e. in the
    direction that increases the critic's value estimate.
    """

    def perturb(self, obs: np.ndarray) -> np.ndarray:
        """Return an adversarial copy of ``obs``.

        Args:
            obs: A single observation (1-D) or a batch of observations.

        Returns:
            A float32 NumPy array with the same batch layout as ``obs``,
            clamped element-wise to ``[-1, 1]``.

        Raises:
            RuntimeError: If the observation does not influence the value
                output (raised by ``torch.autograd.grad``).
        """
        policy = self.model.policy

        obs_t = _to_tensor(obs)
        single = obs_t.ndim == 1
        if single:
            # The policy networks expect a leading batch dimension.
            obs_t = obs_t.unsqueeze(0)
        obs_t = obs_t.to(self.device)
        obs_t.requires_grad_(True)

        # Differentiate the critic as it is used at prediction time: eval mode
        # keeps dropout/batch-norm deterministic (and batch-norm usable with a
        # single observation). The caller's mode is restored afterwards so the
        # attack does not leak a mode change into the model.
        was_training = policy.training
        policy.set_training_mode(False)
        try:
            # enable_grad makes the attack work even when the caller wrapped
            # the rollout in torch.no_grad().
            with torch.enable_grad():
                features = policy.extract_features(obs_t)
                _, latent_vf = policy.mlp_extractor(features)
                values = policy.value_net(latent_vf)
                # Only the input gradient is needed; autograd.grad leaves the
                # policy parameters' .grad buffers untouched.
                (obs_grad,) = torch.autograd.grad(values.sum(), obs_t)
        finally:
            policy.set_training_mode(was_training)

        # NOTE(review): the clamp assumes observations are normalized to
        # [-1, 1] -- confirm against the environment's observation space.
        adv = torch.clamp(obs_t.detach() + self.eps * torch.sign(obs_grad), -1.0, 1.0)
        out = adv.cpu().numpy()
        return out[0] if single else out

# Environment wrapper that routes every observation coming out of reset()
# and step() through the attacker before the agent sees it.
class ObsAttackEnv(gym.Wrapper):
    """Gym wrapper that replaces each observation with its perturbed version.

    Rewards, termination/truncation flags and ``info`` are passed through
    unchanged; only the observation is rewritten by ``attacker.perturb``.
    """

    def __init__(self, env: gym.Env, attacker: AttackWrapper):
        """Wrap ``env`` and remember the attacker used to perturb observations."""
        super().__init__(env)
        self.attacker = attacker

    def reset(self, **kwargs):
        """Reset the wrapped env and return ``(perturbed_obs, info)``."""
        clean_obs, info = self.env.reset(**kwargs)
        adv_obs = self.attacker.perturb(clean_obs)
        return adv_obs, info

    def step(self, action):
        """Step the wrapped env and return the transition with a perturbed observation."""
        clean_obs, reward, terminated, truncated, info = self.env.step(action)
        adv_obs = self.attacker.perturb(clean_obs)
        return adv_obs, reward, terminated, truncated, info


In [25]:
# ----------------------
# Quick sanity check
# ----------------------
def report_attack(label, obs, adv):
    """Print a clean-vs-adversarial comparison and return the max absolute change.

    Args:
        label: Short attack name used in the printed header.
        obs: The original observation.
        adv: The perturbed observation produced by the attack.

    Returns:
        The largest element-wise ``|adv - obs|`` as a Python float.
    """
    max_delta = float(np.max(np.abs(np.asarray(adv) - np.asarray(obs))))
    print(f"{label} sanity check:")
    print("  original obs:", obs)
    print("  adv obs     :", adv)
    print("  max |Δ|     :", max_delta)
    return max_delta


# Check the prerequisites explicitly instead of catching NameError around the
# whole check: a NameError raised inside the attack code is a real bug and must
# not be reported as "model/env not defined".
if "model" in globals() and "env" in globals():
    obs = env.reset()[0]

    atk = FGSMAttack(model, epsilon=0.01, device="cpu")
    adv = atk.perturb(obs)
    report_attack("FGSM", obs, adv)

    print()  # blank line between the two reports
    atk2 = OIAttack(model, epsilon=0.01, device="cpu")
    adv2 = atk2.perturb(obs)
    report_attack("OIA", obs, adv2)
else:
    print("⚠️ Define `model` and `env` first (from training/eval notebook) to run the sanity check.")

⚠️ Define `model` and `env` first (from training/eval notebook) to run the sanity check.
