# Example Attack

This notebook looks into how ARLIN can be used to create more effective adversarial attacks. The notebook will show the average reward gained and total number of attacks in various attack scenarios against the same trained RL model:

- Random action every step
- Worst-case action every step
- Worst-case action every 10 steps
- Least-preferred action based on threshold (https://arxiv.org/pdf/1703.06748.pdf)
- ARLIN-informed actions

In [18]:
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import os
import gymnasium as gym
import numpy as np
import logging
import warnings

import arlin.dataset.loaders as loaders
from arlin.dataset import XRLDataset
from arlin.dataset.collectors import SB3PPODataCollector, SB3PPODatapoint

from arlin.generation import generate_clusters, generate_embeddings
import arlin.analysis.visualization as viz
from arlin.analysis import ClusterAnalyzer, LatentAnalyzer
from arlin.samdp import SAMDP
import arlin.utils.saving_loading as sl_utils

logging.basicConfig(level=logging.INFO, force=True)
warnings.filterwarnings("ignore", category=UserWarning) 

In [21]:
# Create environment
env = gym.make("LunarLander-v2", render_mode='rgb_array')

# Load the SB3 model from Huggingface
model = loaders.load_hf_sb_model(repo_id="sb3/ppo-LunarLander-v2",
                                 filename="ppo-LunarLander-v2.zip",
                                 algo_str="ppo")

adv_model = loaders.load_sb_model('./models/adv_ppo_lunar.zip', 'ppo')

# Create the datapoint collector for SB3 PPO Datapoints with the model's policy
collector = SB3PPODataCollector(datapoint_cls=SB3PPODatapoint,
                                policy=model.policy)

# Instantiate the XRL Dataset
dataset = XRLDataset(env, collector=collector)

# dataset.fill(num_datapoints=50000)
# dataset.save(file_path='./data/LunarLander-50000.npz')

# Load the dataset, embeddings, and clusters
dataset.load('./data/LunarLander-50000.npz')
embeddings = sl_utils.load_data(file_path='./data/LunarLander-50000-Embeddings.npz')
clusters = sl_utils.load_data(file_path='./data/LunarLander-50000-Clusters.npz')

INFO:root:Loading model sb3/ppo-LunarLander-v2/ppo-LunarLander-v2.zip from huggingface...
INFO:root:Loading ppo model ppo-LunarLander-v2.zip with stable_baselines3...
INFO:root:Loading ppo model adv_ppo_lunar.zip with stable_baselines3...
INFO:root:Generating embeddings from dataset.latent_actors.
Performing t-SNE using 4 cores.
Using no_dims = 2, perplexity = 225.000000, and theta = 0.500000
Computing input similarities...
Building tree...
 - point 5031 of 50319
 - point 10062 of 50319
 - point 15093 of 50319
 - point 20124 of 50319
 - point 25155 of 50319
 - point 30186 of 50319
 - point 35217 of 50319
 - point 40248 of 50319
 - point 45279 of 50319
 - point 50310 of 50319
Done in 37.00 seconds (sparsity = 0.018327)!
Learning embedding...
Iteration 51: error is 93.263221 (50 iterations in 22.00 seconds)
Iteration 101: error is 88.675318 (50 iterations in 18.00 seconds)
Iteration 151: error is 82.382958 (50 iterations in 17.00 seconds)
Iteration 201: error is 80.527979 (50 iterations 

## ARLIN Usage

Let's use the ARLIN toolkit to identify when we should be performing our adversarial
attack, and which actions we should target.

In [None]:
def graph_latent_analytics(embeddings: np.ndarray, 
                           clusters: np.ndarray, 
                           dataset: XRLDataset):
    """Graph visualizations of different latent space analytics over embeddings."""
    
    # Create a grapher to generate data used for analysis.
    grapher = LatentAnalyzer(embeddings, dataset)
    
    # Clusters
    cluster_data = grapher.clusters_graph_data(clusters)
    # Episode progression
    ep_prog_data = grapher.episode_prog_graph_data()
    # Greedy action confidence
    conf_data = grapher.confidence_data()
    
    base_path = os.path.join(".", "outputs", "attack", "latent_analytics")
    
    # Graph multiple analytics as subplots in one plot
    combined_path = os.path.join(base_path, 'combined_analytics.png')
    viz.graph_multiple_data(file_path=combined_path,
                                           figure_title='Latent Analytics', 
                                           graph_datas=[conf_data, 
                                                        cluster_data, 
                                                        ep_prog_data])

def graph_cluster_analytics(dataset, clusters):
    """Graph analytics for each cluster"""
    
    # Create grapher to graph cluster analytics
    grapher = ClusterAnalyzer(dataset, clusters)
    
    grapher.cluster_state_analysis(19,
                                   gym.make('LunarLander-v2'), 
                                   os.path.join(".", "outputs", "attack", "cluster_state_analysis"))
    
    grapher.cluster_state_analysis(15,
                                   gym.make('LunarLander-v2'), 
                                   os.path.join(".", "outputs", "attack", "cluster_state_analysis"))
    
    # Mean confidence per cluster
    cluster_conf = grapher.cluster_confidence()
    # Mean total reward per cluster
    cluster_rewards = grapher.cluster_rewards()
    # Mean value per cluster
    cluster_values = grapher.cluster_values()
    
    # Graph individual graphs per data
    base_path = os.path.join(".", "outputs", "attack", 'cluster_analytics')
    
    # Graph multiple subplots in one plot
    combined_path = os.path.join(base_path, 'combined_analytics.png')
    viz.graph_multiple_data(file_path=combined_path, 
                                           figure_title='Cluster Analytics', 
                                           graph_datas=[cluster_conf,
                                                        cluster_values,
                                                        cluster_rewards])

def samdp(clusters: np.ndarray,
          dataset: XRLDataset):
    """Generate a semi-aggregated Markov decision process."""
    
    # Create the SAMDP
    samdp = SAMDP(clusters, dataset)
    
    base_path = os.path.join(".", "outputs", "attack", 'samdp')
    
    # Simplified graph with all possible conenctions (regardless of action taken)
    simplified_graph = samdp.save_simplified_graph(f'{base_path}/samdp_simplified.png')
    
    path_path = os.path.join(base_path, f"samdp_path_15_19")
    
    # Path from cluster 15 to cluster 19
    # Action out of cluster 15 shown, all other movements are simplified
    samdp.save_paths(15, 
                     19, 
                     f'{path_path}.png')
    
    # Path from cluster 15 to cluster 19
    # Only the most likely path is shown
    samdp.save_paths(15, 
                     19,  
                     f'{path_path}_bp.png', 
                     best_path_only=True)
    
    # Show all paths that lead to cluster 12
    # Action into cluster 12 shown, rest is simplified
    samdp.save_all_paths_to(19, 
                            os.path.join(base_path, f"samdp_paths_to_12.png"))

In [None]:
# graph_latent_analytics(embeddings, clusters, dataset)
# graph_cluster_analytics(dataset, clusters)
# samdp(clusters, dataset)

In [22]:
def should_attack(model_type: str, 
                  timestep: int, 
                  freq: int = 0,
                  preference: float = 0, 
                  threshold: float = 1.0) -> bool:
    """Check whether or not we should attack at the given timestep.

    Args:
        model_type (str): Type of model we want to run.
        timestep (int): Current timestep
        freq (int, optional): Frequency of attack. Defaults to 0.
        preference (float, optional): Delta between most and least preferred action.
            Defaults to 0.
        threshold (float, optional): Threshold for preference attack. Defaults to 1.0.

    Raises:
        ValueError: If invalid model type is given.

    Returns:
        bool: Whether or not to attack
    """
    
    if model_type == 'baseline':
        return False
    elif model_type == 'random' or model_type == 'adversarial':
        if timestep % freq == 0:
            return True
        else:
            return False
    elif model_type == 'preference':
        if preference > threshold:
            return True
        else:
            return False
    else:
        raise ValueError(f"Invalid model_type {model_type} given.")

def get_action(obs: np.ndarray,
               model_type: str, 
               timestep: int, 
               freq: int = 0,
               preference: float = 0, 
               threshold: float = 1.0) -> int:
    """Get the action to take at the given timestep.

    Args:
        obs (np.ndarray): Current observation from the agent.
        model_type (str): Type of model we want to run.
        timestep (int): Current timestep
        freq (int, optional): Frequency of attack. Defaults to 0.
        preference (float, optional): Delta between most and least preferred action.
            Defaults to 0.
        threshold (float, optional): Threshold for preference attack. Defaults to 1.0.

    Returns:
        int: Action value to take.
    """
    
    if should_attack(model_type, timestep, freq, preference, threshold):
        if model_type == 'random':
            rng = np.random.default_rng(12345)
            action = rng.integers(low=0, high=env.action_space.n, size=1).item()
        else:
            action, _ = adv_model.predict(obs, deterministic=True)
    else:
        action, _ = model.predict(obs, deterministic=True)

    return action

In [29]:
def get_average_reward(model_type: str ='baseline', 
                       freq: int = 0,
                       threshold: int = 1) -> float:
    """Average reward over 10 episodes while the model is being attacked.
    
    Attacks happen at the given freq and come from the given model type. 
        - Baseline does not include any adversarial attacks.
        - Random chooses the action randomly.
        - Adversarial chooses the worst possible action at that point in time.
        - Preference chooses the least preferred action when the pref is above a threshold
    """
    
    episode_rewards = []
    
    for ep in range(10):
        obs, _ = env.reset(seed=1234 + ep)
        done = False
        step = 0
        ep_rew = 0
        
        while not done:
            internal_data, _ = collector.collect_internal_data(obs)
            probs = internal_data.dist_probs
            preference = probs.max() - probs.min()
            
            action = get_action(obs, model_type, step, freq, preference, threshold)
            
            obs, reward, terminated, truncated, _ = env.step(action)
            ep_rew += reward
            done = terminated or truncated
            step += 1
            
        episode_rewards.append(ep_rew)
    
    return sum(episode_rewards) / 10

In [30]:
baseline = get_average_reward('baseline')
rand_every_1 = get_average_reward('random', freq=1)
rand_every_10 = get_average_reward('random', freq=10)
worst_every_1 = get_average_reward('adversarial', freq=1)
worst_every_10 = get_average_reward('adversarial', freq=10)
preference_50 = get_average_reward('preference', threshold=0.50)
preference_75 = get_average_reward('preference', threshold=0.75)
preference_90 = get_average_reward('preference', threshold=0.9)

print(f"Baseline Avg Reward: {baseline}")
print(f"Random Action Every 1 Avg Reward: {rand_every_1}")
print(f"Random Action Every 10 Avg Reward: {rand_every_10}")
print(f"Worst Action Every 1 Avg Reward: {worst_every_1}")
print(f"Worst Action Every 10 Avg Reward: {worst_every_10}")
print(f"Preference at .50 Avg Reward: {preference_50}")
print(f"Preference at .75 Avg Reward: {preference_75}")
print(f"Preference at .90 Avg Reward: {preference_90}")

Baseline Avg Reward: 247.46209077695022
Random Action Every 1 Avg Reward: -588.4916522579238
Random Action Every 10 Avg Reward: 138.24303406750838
Worst Action Every 1 Avg Reward: -594.7959537977795
Worst Action Every 10 Avg Reward: 138.16942678929826
Preference at .50 Avg Reward: -547.7386702780462
Preference at .75 Avg Reward: -25.699487103558965
Preference at .90 Avg Reward: 168.53780060602693
