In [8]:
from acc_env import ACCEnv

ModuleNotFoundError: No module named 'acc_env'

In [None]:
from __future__ import annotations
import torch
import numpy as np

def _to_tensor(x: np.ndarray) -> torch.Tensor:
    return torch.as_tensor(x, dtype=torch.float32)

class AttackWrapper:
    """Base class: wraps a model to perturb the observation before acting.

    Assumes observations are already normalized to [-1,1]. Subclasses override
    ``perturb``; the base implementation is the identity (no attack).

    Args:
        model: object exposing ``predict(obs, deterministic=True)`` that returns
            a 2-tuple whose first element is the action (presumably an SB3
            model -- confirm against the caller).
        epsilon: perturbation budget, stored as ``self.eps``.
        device: torch device string, stored as ``self.device``.
    """
    def __init__(self, model, epsilon: float = 0.01, device: str = "cpu") -> None:
        self.model = model
        self.eps = epsilon
        self.device = device

    def perturb(self, obs: np.ndarray) -> np.ndarray:
        """Return ``obs`` unchanged (no attack)."""
        return obs

    def act(self, obs: np.ndarray):
        """Perturb ``obs`` and let the wrapped model act on the result.

        Returns:
            ``(action, obs_adv)``: the model's deterministic action and the
            (possibly perturbed) observation it was computed from.
        """
        # Do NOT decorate this method with @torch.no_grad(): perturb() must run
        # with autograd enabled, otherwise gradient-based overrides cannot
        # backpropagate to the observation and raise at backward().
        obs_adv = self.perturb(obs)

        # Only the action selection itself should skip gradient tracking.
        with torch.no_grad():
            action, _ = self.model.predict(obs_adv, deterministic=True)
        return action, obs_adv

class FGSMAttack(AttackWrapper):
    """FGSM wrt policy mean action output.

    For SB3 PPO, the observation is passed through ``model.policy``
    (features extractor -> MLP extractor -> action head) to get the mean
    action, and is then moved by ``eps`` along the sign of the gradient of the
    squared mean-action magnitude.
    """
    def perturb(self, obs: np.ndarray) -> np.ndarray:
        """Return the FGSM-perturbed observation, clamped to [-1, 1].

        Args:
            obs: a single observation ``[obs_dim]`` or a batch ``[B, obs_dim]``.

        Returns:
            Adversarial observation as a numpy array with the same number of
            dimensions as ``obs`` (a 1-D input yields a 1-D output).
        """
        policy = self.model.policy
        # Gradients w.r.t. the input do not need training mode. Differentiate
        # the evaluation-mode network (the one predict() uses) and put the
        # previous mode back afterwards instead of leaving the policy altered.
        was_training = policy.training
        policy.set_training_mode(False)
        try:
            obs_t = _to_tensor(obs).to(self.device)
            single = obs_t.ndim == 1
            if single:
                obs_t = obs_t.unsqueeze(0)

            # enable_grad() keeps this working even when the caller runs under
            # torch.no_grad() (e.g. a no_grad-decorated act()).
            with torch.enable_grad():
                obs_t = obs_t.detach().requires_grad_(True)

                # Forward to get mean action (pre squashing).
                # NOTE(review): assumes extract_features returns a single
                # tensor accepted by mlp_extractor -- confirm for policies
                # with separate actor/critic feature extractors.
                features = policy.extract_features(obs_t)
                latent_pi, _ = policy.mlp_extractor(features)
                mean_actions = policy.action_net(latent_pi)  # shape [B, act_dim]

                # Scalar objective: squared magnitude of the mean action.
                obj = (mean_actions ** 2).sum()
                # Differentiate w.r.t. the observation only, so the policy
                # parameters' .grad buffers are left untouched.
                (grad,) = torch.autograd.grad(obj, obs_t)
        finally:
            policy.set_training_mode(was_training)

        adv = torch.clamp(obs_t.detach() + self.eps * torch.sign(grad), -1.0, 1.0)
        adv_np = adv.cpu().numpy()
        # Hand back the same rank that was passed in.
        return adv_np[0] if single else adv_np

class OIAttack(AttackWrapper):
    """Optimism Induction Attack: pushes obs to increase the critic's V(s).

    The observation is passed through ``model.policy`` (features extractor ->
    MLP extractor -> value head) and moved by ``eps`` along the sign of the
    gradient of the summed value estimate.
    """
    def perturb(self, obs: np.ndarray) -> np.ndarray:
        """Return the value-maximizing perturbed observation, clamped to [-1, 1].

        Args:
            obs: a single observation ``[obs_dim]`` or a batch ``[B, obs_dim]``.

        Returns:
            Adversarial observation as a numpy array with the same number of
            dimensions as ``obs`` (a 1-D input yields a 1-D output).
        """
        policy = self.model.policy
        # Gradients w.r.t. the input do not need training mode. Differentiate
        # the evaluation-mode network and restore the previous mode afterwards.
        was_training = policy.training
        policy.set_training_mode(False)
        try:
            obs_t = _to_tensor(obs).to(self.device)
            single = obs_t.ndim == 1
            if single:
                obs_t = obs_t.unsqueeze(0)

            # enable_grad() keeps this working even when the caller runs under
            # torch.no_grad() (e.g. a no_grad-decorated act()).
            with torch.enable_grad():
                obs_t = obs_t.detach().requires_grad_(True)

                # NOTE(review): assumes extract_features returns a single
                # tensor accepted by mlp_extractor -- confirm for policies
                # with separate actor/critic feature extractors.
                features = policy.extract_features(obs_t)
                _, latent_vf = policy.mlp_extractor(features)
                values = policy.value_net(latent_vf)  # shape [B,1]

                obj = values.sum()  # maximize value
                # Differentiate w.r.t. the observation only, so the policy
                # parameters' .grad buffers are left untouched.
                (grad,) = torch.autograd.grad(obj, obs_t)
        finally:
            policy.set_training_mode(was_training)

        adv = torch.clamp(obs_t.detach() + self.eps * torch.sign(grad), -1.0, 1.0)
        adv_np = adv.cpu().numpy()
        # Hand back the same rank that was passed in.
        return adv_np[0] if single else adv_np


In [2]:
from __future__ import annotations
import torch
import numpy as np

def _to_tensor(x: np.ndarray) -> torch.Tensor:
    # Accept 1D or 2D np arrays; always float32
    return torch.as_tensor(x, dtype=torch.float32)

class AttackWrapper:
    """
    Common scaffolding for observation-space attacks on an SB3 policy.

    The observation (normalized to [-1,1]) goes through `perturb`, and the
    wrapped model then acts on the result. The base `perturb` is a no-op;
    subclasses override it to implement an actual attack.
    """
    def __init__(self, model, epsilon: float = 0.01, device: str = "cpu") -> None:
        self.device = device
        self.eps = float(epsilon)
        self.model = model

    def perturb(self, obs: np.ndarray) -> np.ndarray:
        """Return `obs` untouched: the base class applies no attack."""
        return obs

    def act(self, obs: np.ndarray):
        """
        Perturb `obs`, then ask the model for a deterministic action.

        `perturb` is called outside the no-grad scope so that overrides can use
        autograd; only the `predict` call is gradient-free. Keep it that way:
        do not put @torch.no_grad() on this method.

        Returns the pair (action, perturbed observation).
        """
        perturbed = self.perturb(obs)

        # Gradient tracking is switched off for action selection only
        with torch.no_grad():
            prediction = self.model.predict(perturbed, deterministic=True)

        action, _ = prediction
        return action, perturbed

class FGSMAttack(AttackWrapper):
    """
    FGSM with respect to the policy mean action output.

    The observation is passed through `model.policy` (features extractor ->
    MLP extractor -> action head) and moved by `eps` along the sign of the
    gradient of the squared mean-action magnitude.
    """
    def perturb(self, obs: np.ndarray) -> np.ndarray:
        """
        Return the FGSM-perturbed observation, clamped to [-1, 1].

        Accepts a single observation [obs_dim] or a batch [B, obs_dim]; the
        returned numpy array has the same number of dimensions as the input.
        """
        policy = self.model.policy

        # Input gradients do not require training mode. Differentiate the
        # evaluation-mode network (the one predict() uses) and restore the
        # previous mode afterwards rather than leaving the policy in train mode.
        was_training = policy.training
        policy.set_training_mode(False)
        try:
            # Prepare input tensor
            obs_t = _to_tensor(obs)
            single = obs_t.ndim == 1
            if single:
                obs_t = obs_t.unsqueeze(0)    # [1, obs_dim]
            obs_t = obs_t.to(self.device)

            # enable_grad() keeps the attack working even if the caller is
            # inside a torch.no_grad() scope.
            with torch.enable_grad():
                obs_t = obs_t.detach().requires_grad_(True)

                # Forward: features -> latent_pi -> action mean
                # NOTE(review): assumes extract_features returns a single
                # tensor accepted by mlp_extractor -- confirm for policies
                # with separate actor/critic feature extractors.
                features = policy.extract_features(obs_t)
                latent_pi, _ = policy.mlp_extractor(features)
                mean_actions = policy.action_net(latent_pi)  # [B, act_dim]

                # Objective: squared magnitude of the mean action
                obj = (mean_actions ** 2).sum()

                # Differentiate w.r.t. the observation only. Unlike
                # zero_grad() + backward(), this neither clears nor writes the
                # policy parameters' .grad buffers.
                (grad,) = torch.autograd.grad(obj, obs_t)
        finally:
            policy.set_training_mode(was_training)

        # FGSM step in observation space
        adv = torch.clamp(obs_t.detach() + self.eps * torch.sign(grad), -1.0, 1.0)

        adv_np = adv.cpu().numpy()
        return adv_np[0] if single else adv_np

class OIAttack(AttackWrapper):
    """
    Optimism Induction Attack: push obs to increase critic V(s).

    The observation is passed through `model.policy` (features extractor ->
    MLP extractor -> value head) and moved by `eps` along the sign of the
    gradient of the summed value estimate.
    """
    def perturb(self, obs: np.ndarray) -> np.ndarray:
        """
        Return the value-maximizing perturbed observation, clamped to [-1, 1].

        Accepts a single observation [obs_dim] or a batch [B, obs_dim]; the
        returned numpy array has the same number of dimensions as the input.
        """
        policy = self.model.policy

        # Input gradients do not require training mode. Differentiate the
        # evaluation-mode network and restore the previous mode afterwards
        # rather than leaving the policy in train mode.
        was_training = policy.training
        policy.set_training_mode(False)
        try:
            obs_t = _to_tensor(obs)
            single = obs_t.ndim == 1
            if single:
                obs_t = obs_t.unsqueeze(0)
            obs_t = obs_t.to(self.device)

            # enable_grad() keeps the attack working even if the caller is
            # inside a torch.no_grad() scope.
            with torch.enable_grad():
                obs_t = obs_t.detach().requires_grad_(True)

                # NOTE(review): assumes extract_features returns a single
                # tensor accepted by mlp_extractor -- confirm for policies
                # with separate actor/critic feature extractors.
                features = policy.extract_features(obs_t)
                _, latent_vf = policy.mlp_extractor(features)
                values = policy.value_net(latent_vf)  # [B, 1]

                obj = values.sum()  # maximize value estimate

                # Differentiate w.r.t. the observation only. Unlike
                # zero_grad() + backward(), this neither clears nor writes the
                # policy parameters' .grad buffers.
                (grad,) = torch.autograd.grad(obj, obs_t)
        finally:
            policy.set_training_mode(was_training)

        adv = torch.clamp(obs_t.detach() + self.eps * torch.sign(grad), -1.0, 1.0)

        adv_np = adv.cpu().numpy()
        return adv_np[0] if single else adv_np


In [3]:
# Prerequisite: `model` and `env` must already be loaded by an earlier cell,
# with observations normalized.
atk = FGSMAttack(model, epsilon=0.01, device="cpu")

# Perturb the first element of the reset result and report how far it moved.
obs = env.reset()[0]
adv = atk.perturb(obs)
obs_arr, adv_arr = np.array(obs), np.array(adv)
print("obs shape:", obs_arr.shape)
print("adv shape:", adv_arr.shape)
print("max |adv-obs|:", float(np.max(np.abs(adv_arr - obs_arr))))

# Exercise the full act() path on the clean observation.
action, adv_obs = atk.act(obs)
print("action:", action)


NameError: name 'model' is not defined