# Example Attack

This notebook explores how ARLIN can be used to create more effective adversarial attacks. For each of the following attack scenarios, run against the same trained RL model, it reports the average episode reward, the average number of attacks per episode, and the percentage of timesteps that were attacked:

- Random action every step
- Worst-case action every step
- Worst-case action every 10 steps
- Least-preferred action based on threshold (https://arxiv.org/pdf/1703.06748.pdf)
- ARLIN-informed actions

In [None]:
# Disable Jedi-based tab completion in IPython.
%config Completer.use_jedi = False
# Automatically reload imported modules before each cell executes, so edits to
# local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import gymnasium as gym
import numpy as np
import logging
import warnings
from PIL import Image
from typing import Dict, Any, List, Tuple
import matplotlib.pyplot as plt
import statistics
from numpy import dot
from numpy.linalg import norm

import arlin.dataset.loaders as loaders
from arlin.dataset import XRLDataset
from arlin.dataset.collectors import SB3PPODataCollector
from arlin.dataset.collectors.datapoints import SB3PPODatapoint

from arlin.generation import generate_clusters, generate_embeddings
import arlin.analysis.visualization as viz
from arlin.analysis import ClusterAnalyzer, LatentAnalyzer
from arlin.samdp import SAMDP
import arlin.utils.saving_loading as sl_utils

# Emit INFO-level log records; force=True replaces any handlers that are
# already attached to the root logger (e.g. from a previous cell run).
logging.basicConfig(level=logging.INFO, force=True)
# Suppress UserWarning messages to keep the notebook output readable.
warnings.filterwarnings("ignore", category=UserWarning) 

In [None]:
# Toggles: True reuses the artifact cached on disk, False regenerates it and
# overwrites the cached file.
load_data = True
load_embeddings = True
load_clusters = True

# Create environment (rgb_array rendering so frames can be captured as images)
env = gym.make("LunarLander-v2", render_mode='rgb_array')

# Load the SB3 model from Huggingface; this is the policy being attacked
model = loaders.load_hf_sb_model(repo_id="sb3/ppo-LunarLander-v2",
                                 filename="ppo-LunarLander-v2.zip",
                                 algo_str="ppo")

# Load the locally stored model whose predictions are used as attack actions
adv_model = loaders.load_sb_model('./models/adv_ppo_lunar.zip', 'ppo')

# Create the datapoint collector for SB3 PPO Datapoints with the model's policy
collector = SB3PPODataCollector(datapoint_cls=SB3PPODatapoint,
                                policy=model.policy)

# Instantiate the XRL Dataset
dataset = XRLDataset(env, collector=collector)

if load_data:
    # Load the previously collected dataset from disk
    dataset.load('./data/LunarLander-50000.npz')
else:
    # Collect a fresh dataset and cache it for later runs
    # NOTE(review): the exact meaning of `randomness` is defined by XRLDataset.fill - confirm
    dataset.fill(num_datapoints=50000, randomness=0.2)
    dataset.save(file_path='./data/LunarLander-50000.npz')

if load_embeddings:
    # Reuse the cached embeddings
    embeddings = sl_utils.load_data(file_path='./data/LunarLander-50000-Embeddings.npy')
else:
    # Generate 2-dimensional embeddings from the 'latent_actors' activations and cache them
    embeddings = generate_embeddings(dataset=dataset,
                                activation_key='latent_actors',
                                perplexity=500,
                                n_train_iter=1500,
                                output_dim=2,
                                seed=12345)
    sl_utils.save_data(embeddings, './data/LunarLander-50000-Embeddings.npy')

if load_clusters:
    # Reuse the cached cluster assignments and the three fitted clustering algorithms
    clusters = sl_utils.load_data(file_path='./data/LunarLander-50000-Clusters.npy')
    # NOTE(review): allow_pickle=True unpickles arbitrary objects - only load files you trust
    [start_algo, mid_algo, term_algo] = sl_utils.load_data(file_path='./models/cluster_algos.npy', allow_pickle=True)
else:
    # Presumably the three key lists are the features used for start, intermediate
    # and terminal states, and 20 is the number of clusters - confirm against
    # arlin.generation.generate_clusters
    clusters, start_algo, mid_algo, term_algo = generate_clusters(
        dataset,
        ["latent_actors", "critic_values"],
        ["latent_actors", "critic_values"],
        ["latent_actors", "critic_values", "rewards"],
        20,
        seed=1234
        )

    # Cache the cluster assignments and the fitted algorithms for later runs
    sl_utils.save_data(clusters, './data/LunarLander-50000-Clusters.npy')
    sl_utils.save_data(data=[start_algo, mid_algo, term_algo], file_path='./models/cluster_algos.npy')

## ARLIN Usage

Let's use the ARLIN toolkit to identify when we should be performing our adversarial
attack, and which actions we should target.

In [None]:
def graph_latent_analytics(embeddings: np.ndarray, 
                           clusters: np.ndarray, 
                           dataset: XRLDataset):
    """Render latent-space analytics for the embeddings as two combined figures.

    Both figures are written below ./outputs/attack/latent_analytics:
    one with episode progression, confidence and decision boundaries, and one
    with the raw embeddings and the cluster assignments.

    Args:
        embeddings (np.ndarray): Embeddings handed to the LatentAnalyzer.
        clusters (np.ndarray): Cluster assignments to visualize.
        dataset (XRLDataset): Dataset the embeddings were generated from.
    """
    analyzer = LatentAnalyzer(embeddings, dataset)

    # Gather the graph data for every view of the latent space.
    embeddings_data = analyzer.embeddings_graph_data()
    cluster_data = analyzer.clusters_graph_data(clusters)
    decision_boundaries = analyzer.decision_boundary_graph_data()
    ep_prog_data = analyzer.episode_prog_graph_data()
    conf_data = analyzer.confidence_data()

    output_dir = os.path.join(".", "outputs", "attack", "latent_analytics")

    # Each entry becomes one figure whose subplots are stacked vertically.
    figures = (
        ('combined_analytics-total.png',
         [ep_prog_data, conf_data, decision_boundaries]),
        ('combined_analytics-generate.png',
         [embeddings_data, cluster_data]),
    )
    for file_name, graph_datas in figures:
        viz.graph_multiple_data(file_path=os.path.join(output_dir, file_name),
                                figure_title='Latent Analytics',
                                graph_datas=graph_datas,
                                horizontal=False)

def graph_cluster_analytics(dataset, clusters):
    """Render per-cluster analytics and state analyses for clusters 22 to 24.

    Uses the module-level `env` for the state analysis. Outputs are written
    below ./outputs/attack.

    Args:
        dataset: Dataset handed to the ClusterAnalyzer.
        clusters: Cluster assignments handed to the ClusterAnalyzer.
    """
    analyzer = ClusterAnalyzer(dataset, clusters)
    attack_root = os.path.join(".", "outputs", "attack")

    # State analysis for the clusters with ids 22, 23 and 24.
    state_analysis_dir = os.path.join(attack_root, "cluster_state_analysis")
    for cluster_id in (22, 23, 24):
        analyzer.cluster_state_analysis(cluster_id, env, state_analysis_dir)

    # Per-cluster means: confidence, total reward and value.
    cluster_conf = analyzer.cluster_confidence()
    cluster_rewards = analyzer.cluster_rewards()
    cluster_values = analyzer.cluster_values()

    # All three analytics end up as vertically stacked subplots of one figure.
    figure_path = os.path.join(attack_root, 'cluster_analytics', 'combined_analytics.png')
    viz.graph_multiple_data(file_path=figure_path,
                            figure_title='Cluster Analytics',
                            graph_datas=[cluster_conf, cluster_values, cluster_rewards],
                            horizontal=False)

def samdp(clusters: np.ndarray,
          dataset: XRLDataset,
          term_cluster_id: int = 23):
    """Generate a semi-aggregated Markov decision process and save its outputs.

    Writes three artifacts below ./outputs/attack/samdp: the simplified graph,
    the best path(s) to the requested terminal cluster, and a text dump.

    Args:
        clusters (np.ndarray): Cluster assignments used to build the SAMDP.
        dataset (XRLDataset): Dataset the clusters were generated from.
        term_cluster_id (int, optional): Terminal cluster whose paths are saved.
            Defaults to 23.
    """

    # Named `graph` so the local does not shadow this function's own name.
    graph = SAMDP(clusters, dataset)

    base_path = os.path.join(".", "outputs", "attack", 'samdp')
    # Harmless if the SAMDP helpers already create the directory themselves.
    os.makedirs(base_path, exist_ok=True)

    # Simplified graph with all possible connections (regardless of action taken)
    graph.save_simplified_graph(os.path.join(base_path, 'samdp_simplified.png'))

    # Paths leading into the requested terminal cluster.
    graph.save_terminal_paths(
        os.path.join(base_path, f"samdp_terminals_{term_cluster_id}.png"),
        best_path=True,
        term_cluster_id=term_cluster_id)

    graph.save_txt(os.path.join(base_path, 'text.txt'))

In [None]:
# The latent and cluster analytics are disabled here; uncomment to regenerate
# those figures. Only the SAMDP outputs are produced by this cell.
# graph_latent_analytics(embeddings, clusters, dataset)
# graph_cluster_analytics(dataset, clusters)
samdp(clusters, dataset)

In [None]:
def should_attack(model_type: str, 
                  timestep: int, 
                  freq: int = 0,
                  preference: float = 0, 
                  threshold: float = 1.0) -> bool:
    """Check whether or not we should attack at the given timestep.

    Args:
        model_type (str): Type of model we want to run. One of 'baseline',
            'random', 'adversarial' or 'preference'.
        timestep (int): Current timestep
        freq (int, optional): Frequency of attack; an attack happens on every
            timestep that is a multiple of freq. Must be positive for the
            'random' and 'adversarial' model types. Defaults to 0.
        preference (float, optional): Delta between most and least preferred action.
            Defaults to 0.
        threshold (float, optional): Threshold for preference attack. Defaults to 1.0.

    Raises:
        ValueError: If invalid model type is given, or if a frequency-based
            model type is used with a non-positive freq.

    Returns:
        bool: Whether or not to attack
    """

    if model_type == 'baseline':
        return False

    if model_type in ('random', 'adversarial'):
        # Guard the modulo below: the default freq of 0 would otherwise fail
        # with an opaque ZeroDivisionError.
        if freq <= 0:
            raise ValueError(
                f"freq must be a positive integer for model_type {model_type}, "
                f"got {freq}.")
        return timestep % freq == 0

    if model_type == 'preference':
        # bool() keeps the return type a plain bool when preference is a numpy scalar.
        return bool(preference > threshold)

    raise ValueError(f"Invalid model_type {model_type} given.")

# Shared generator for the 'random' attack. It is created once so successive
# calls draw successive values; re-seeding on every call would return the very
# same "random" action each time. The fixed seed keeps runs reproducible.
_RANDOM_ATTACK_RNG = np.random.default_rng(12345)


def get_action(obs: np.ndarray,
               model_type: str, 
               timestep: int, 
               freq: int = 0,
               preference: float = 0, 
               threshold: float = 1.0) -> Tuple[int, bool]:
    """Get the action to take at the given timestep.

    When no attack is due, the action comes from the module-level `model`.
    When an attack is due, a 'random' model type samples uniformly from the
    action space of the module-level `env`; every other attacking model type
    uses the prediction of the module-level `adv_model`.

    Args:
        obs (np.ndarray): Current observation from the agent.
        model_type (str): Type of model we want to run.
        timestep (int): Current timestep
        freq (int, optional): Frequency of attack. Defaults to 0.
        preference (float, optional): Delta between most and least preferred action.
            Defaults to 0.
        threshold (float, optional): Threshold for preference attack. Defaults to 1.0.

    Returns:
        int: Action value to take. Model predictions are returned exactly as
            produced by `predict` (NOTE(review): presumably a numpy value
            rather than a builtin int - confirm).
        bool: Adversarial action or not
    """
    adv = should_attack(model_type, timestep, freq, preference, threshold)

    if not adv:
        action, _ = model.predict(obs, deterministic=True)
    elif model_type == 'random':
        action = _RANDOM_ATTACK_RNG.integers(low=0,
                                             high=env.action_space.n,
                                             size=1).item()
    else:
        action, _ = adv_model.predict(obs, deterministic=True)

    return action, adv

def get_action_data(obs: np.ndarray,
               model_type: str, 
               timestep: int, 
               freq: int = 0,
               preference: float = 0, 
               threshold: float = 1.0) -> Tuple[int, bool]:
    """Get the action to take at the given timestep.

    Thin wrapper that forwards all arguments to `get_action` and returns its
    result unchanged.

    Args:
        obs (np.ndarray): Current observation from the agent.
        model_type (str): Type of model we want to run.
        timestep (int): Current timestep
        freq (int, optional): Frequency of attack. Defaults to 0.
        preference (float, optional): Delta between most and least preferred action.
            Defaults to 0.
        threshold (float, optional): Threshold for preference attack. Defaults to 1.0.

    Returns:
        int: Action value to take.
        bool: Adversarial action or not
    """
    action, adv = get_action(obs, model_type, timestep, freq, preference, threshold)

    return action, adv

def get_model_name(model_type: str, freq: int, threshold: int):
    """Build a display name such as 'Adversarial_10' or 'Preference_0.5'.

    The capitalized model type is followed by the frequency when it is not 0
    and by the threshold when it is not 1, joined with underscores.

    Args:
        model_type (str): Type of model being run.
        freq (int): Attack frequency; omitted from the name when 0.
        threshold (int): Preference threshold; omitted from the name when 1.

    Returns:
        str: Name identifying this attack configuration.
    """
    name_parts = [model_type.capitalize()]

    if freq != 0:
        name_parts.append(f"{freq}")

    if threshold != 1:
        name_parts.append(f"{threshold}")

    return "_".join(name_parts)

def create_save_dirs(model_name: str):
    """Create the shared GIF and metrics output directories.

    Args:
        model_name (str): Currently unused; both directories are shared by
            all models.

    Returns:
        Tuple[str, str]: Path of the GIF directory and of the metrics directory.
    """
    gifs_dir_name = "./outputs/attack/gifs"
    metrics_dir_name = "./outputs/attack/metrics"

    for directory in (gifs_dir_name, metrics_dir_name):
        os.makedirs(directory, exist_ok=True)

    return gifs_dir_name, metrics_dir_name

def split_data(internal_data: SB3PPODatapoint):
    """Extract the action probabilities and their spread from a datapoint.

    Args:
        internal_data (SB3PPODatapoint): Datapoint whose `dist_probs` are read.

    Returns:
        The difference between the largest and smallest probability in
        `dist_probs`, followed by `dist_probs` itself.
    """
    action_probs = internal_data.dist_probs
    return action_probs.max() - action_probs.min(), action_probs

def save_gifs(dir_name: str,
              gif_lists: List[List[Image.Image]],
              episode_rewards: List[int]):
    """Save the highest- and lowest-reward episodes as animated GIFs.

    The files are named `episode_<idx>-max.gif` and `episode_<idx>-min.gif`,
    where `<idx>` is the position of the episode in `episode_rewards`.

    Args:
        dir_name (str): Directory to write the GIFs into; created if missing.
        gif_lists (List[List[Image.Image]]): Frames of every episode, indexed
            like `episode_rewards`.
        episode_rewards (List[int]): Total reward of every episode.
    """
    if not episode_rewards:
        logging.warning("No episodes recorded; skipping GIF export to %s.", dir_name)
        return

    os.makedirs(dir_name, exist_ok=True)

    for label, select in (("max", max), ("min", min)):
        idx = episode_rewards.index(select(episode_rewards))
        # The first frame is the image being saved, so only the remaining
        # frames are appended; passing the full list would duplicate it.
        first_frame, *later_frames = gif_lists[idx]
        save_path = os.path.join(dir_name, f'episode_{idx}-{label}.gif')
        first_frame.save(save_path,
                         save_all=True,
                         append_images=later_frames,
                         duration=30,
                         loop=0)

def get_averages(num_episodes: int,
                 episode_rewards: List[int], 
                 episode_attacks: List[int],
                 episode_perc_attack: List[int]) -> Tuple[int, int, int]:
    """Average the per-episode statistics over `num_episodes`.

    Args:
        num_episodes (int): Divisor used for all three averages.
        episode_rewards (List[int]): Total reward of each episode.
        episode_attacks (List[int]): Number of attacks in each episode.
        episode_perc_attack (List[int]): Fraction of attacked steps per episode.

    Returns:
        The mean reward, the mean number of attacks, and the mean attack
        fraction expressed as a percentage (multiplied by 100).
    """
    def per_episode_mean(values):
        return sum(values) / num_episodes

    return (per_episode_mean(episode_rewards),
            per_episode_mean(episode_attacks),
            per_episode_mean(episode_perc_attack) * 100)

def action_delta_histogram(action_deltas: List[float], title: str, dir_name: str):
    """Plot a histogram of action deltas and save it as `action_deltas.png`.

    A dashed normal curve (fitted with the sample mean and standard deviation)
    is overlaid when it can be computed, and the median is marked with a
    vertical line.

    Args:
        action_deltas (List[float]): Deltas to plot. Nothing is plotted when
            the list is empty.
        title (str): Title of the figure.
        dir_name (str): Directory to save the figure into; created if missing.
    """
    if len(action_deltas) == 0:
        # Happens when a run never attacked; there is nothing to plot.
        logging.warning("No action deltas recorded for '%s'; skipping histogram.", title)
        return

    os.makedirs(dir_name, exist_ok=True)

    num_bins = 20
    median = statistics.median(action_deltas)
    _, bins, _ = plt.hist(action_deltas, num_bins, 
                          density = 1, 
                          color ='green',
                          alpha = 0.7)

    # statistics.stdev needs at least two samples, and a zero sigma would
    # divide by zero in the density below, so the overlay is optional.
    if len(action_deltas) > 1:
        mu = statistics.mean(action_deltas)
        sigma = statistics.stdev(action_deltas)
        if sigma > 0:
            y = ((1 / (np.sqrt(2 * np.pi) * sigma)) * np.exp(-0.5 * (1 / sigma * (bins - mu))**2))
            plt.plot(bins, y, '--', color ='black')

    plt.axvline(median, color='k', linestyle='dashed', linewidth=1)
    _, max_ylim = plt.ylim()
    plt.text(median*1.1, 
             max_ylim*0.9, 
             'Median: {:.2f}'.format(median))

    plt.xlabel('Delta Between GT and Adversarial Action')
    plt.ylabel('Delta Occurence')

    plt.title(title, fontweight = "bold")
    path = os.path.join(dir_name, "action_deltas.png")
    plt.savefig(path, bbox_inches="tight")
    plt.close()

def find_adversarial_dist(gt_dist: np.ndarray, adv_action: int):
    """Placeholder: print the ground-truth distribution and return nothing.

    Args:
        gt_dist (np.ndarray): Distribution that is printed.
        adv_action (int): Currently unused.

    Returns:
        None
    """
    print(gt_dist)
    return None

In [None]:
def get_average_reward(model_type: str ='baseline', 
                       freq: int = 0,
                       threshold: float = 1,
                       num_episodes: int = 10) -> List[List[np.ndarray]]:
    """Evaluate the model for `num_episodes` episodes while it is being attacked.

    Prints the average reward, average number of attacks and average
    percentage of attacked steps, and saves a histogram of the action deltas
    to ./outputs/attack/metrics/<model name>/action_deltas.png.

    When an attack happens is decided by the model type:
        - Baseline does not include any adversarial attacks.
        - Random and Adversarial attack on every timestep that is a multiple
          of freq.
        - Preference attacks whenever the gap between the most and least
          probable action is above the threshold.
    The action used for an attack is chosen by `get_action`.

    Args:
        model_type (str, optional): Type of attack to run. Defaults to 'baseline'.
        freq (int, optional): Frequency of attack. Defaults to 0.
        threshold (float, optional): Threshold for preference attack. Defaults to 1.
        num_episodes (int, optional): Number of evaluation episodes. Defaults to 10.

    Returns:
        List[List[np.ndarray]]: The observations of every episode, including
            the initial observation returned by the reset.
    """

    # Variables to store eval data
    episode_rewards = []
    episode_attacks = []
    episode_perc_attack = []
    episode_action_deltas = []
    episode_obs = []
    gif_lists = []

    model_name = get_model_name(model_type, freq, threshold)
    gifs_dir_name, metrics_dir_name = create_save_dirs(model_name)

    # For each eval episode
    for ep in range(num_episodes):
        # Seeding by episode index gives every attack type the same start states.
        obs, _ = env.reset(seed=1234 + ep)
        images = [Image.fromarray(env.render())]
        ep_obs = [obs]
        done = False
        step = 0
        ep_rew = 0
        n_adv_attacks = 0

        while not done:
            internal_data, _ = collector.collect_internal_data(obs)
            preference, probs = split_data(internal_data)

            action, adv = get_action(obs, model_type, step, freq, preference, threshold)
            if adv:
                n_adv_attacks += 1
                # Probability the policy gives up by taking the forced action
                # instead of its own most probable one.
                gt_action = np.argmax(probs).item()
                action_delta = probs[gt_action] - probs[action]
                episode_action_deltas.append(action_delta)

            obs, reward, terminated, truncated, _ = env.step(action)
            images.append(Image.fromarray(env.render()))
            ep_obs.append(obs)
            ep_rew += reward
            done = terminated or truncated
            step += 1

        gif_lists.append(images)
        episode_rewards.append(ep_rew)
        episode_attacks.append(n_adv_attacks)
        episode_perc_attack.append(n_adv_attacks / step)
        episode_obs.append(ep_obs)

    # GIF export is disabled; to enable it call
    # save_gifs(os.path.join(gifs_dir_name, model_name), gif_lists, episode_rewards)
    avg_reward, avg_attacks, avg_perc_attack = get_averages(num_episodes,
                                                            episode_rewards,
                                                            episode_attacks,
                                                            episode_perc_attack)

    # The histogram needs at least two samples for its standard deviation;
    # a run that (almost) never attacked has nothing meaningful to plot.
    if model_type != 'baseline' and len(episode_action_deltas) >= 2:
        # One directory per model so runs do not overwrite each other's figure.
        model_metrics_dir = os.path.join(metrics_dir_name, model_name)
        os.makedirs(model_metrics_dir, exist_ok=True)
        action_delta_histogram(episode_action_deltas, 
                               f"Action Deltas for {model_name}", 
                               model_metrics_dir)

    print(f"Model: {model_name}")
    print(f"\tAvg Reward: {avg_reward} | Avg Num Attacks: {avg_attacks} | Avg Percent Attacks: {avg_perc_attack}")

    return episode_obs


In [None]:
def plot_cosine_sim(baseline_obs: List[List[np.ndarray]], 
                   target_obs: List[List[List[np.ndarray]]],
                   target_model_names: List[str],
                   dir_name: str,
                   num_eval: int = 1):
    """Plot per-timestep cosine similarity between target and baseline observations.

    One figure per evaluated episode is saved as
    `<dir_name>/cosine_similarity/episode_<i>`. The baseline is plotted against
    itself, and every target model is plotted against the baseline. When a
    target episode outlasts the baseline, the baseline's last observation is
    repeated to cover the extra timesteps.

    Args:
        baseline_obs (List[List[np.ndarray]]): Observations per baseline episode.
        target_obs (List[List[List[np.ndarray]]]): Observations per model, per episode.
        target_model_names (List[str]): Legend labels, ordered like `target_obs`.
        dir_name (str): Directory in which the `cosine_similarity` folder is created.
        num_eval (int, optional): Number of episodes to plot, capped at the
            number of baseline episodes. Defaults to 1.
    """
    output_dir = os.path.join(dir_name, "cosine_similarity")
    os.makedirs(output_dir, exist_ok=True)

    def cosine_sim(a: np.ndarray, b: np.ndarray):
        similarity = dot(a, b) / (norm(a) * norm(b))
        return round(similarity, 6)

    episodes_to_plot = min(num_eval, len(baseline_obs))

    for i in range(episodes_to_plot):
        reference = baseline_obs[i]
        self_similarity = [cosine_sim(ob, ob) for ob in reference]
        plt.plot(list(range(len(reference))), self_similarity, label = "Baseline")

        for j, model_label in enumerate(target_model_names):
            target_ep = target_obs[j][i]

            # Pad the reference with its final observation if the target ran longer.
            shortfall = len(target_ep) - len(reference)
            if shortfall > 0:
                reference = reference + [reference[-1]] * shortfall

            similarities = [cosine_sim(tgt, ref)
                            for tgt, ref in zip(target_ep, reference)]
            plt.plot(list(range(len(target_ep))), similarities, label = model_label)

        plt.legend()
        plt.xlabel('Timestep')
        plt.ylabel('Cosine Similarity')
        plt.title("Policy Cosine Similarity", fontweight = "bold")

        plt.savefig(os.path.join(output_dir, f"episode_{i}"), bbox_inches="tight")
        plt.close()

In [None]:
# Run one episode per attack configuration and keep the observations of each
# run for the cosine-similarity comparison.
baseline_obs = get_average_reward('baseline', num_episodes=1)
# The random attacks are currently disabled.
# rand1_obs = get_average_reward('random', freq=1)
# rand10_obs = get_average_reward('random', freq=10)
adv1_obs = get_average_reward('adversarial', freq=1, num_episodes=1)
adv10_obs = get_average_reward('adversarial', freq=10, num_episodes=1)
pref50_obs = get_average_reward('preference', threshold=0.50, num_episodes=1)
pref75_obs = get_average_reward('preference', threshold=0.75, num_episodes=1)
pref90_obs = get_average_reward('preference', threshold=0.90, num_episodes=1)

# `names` must stay in the same order as `target_obs`: the two lists are
# matched up by index when plotting.
target_obs = [adv1_obs, adv10_obs, pref50_obs, pref75_obs, pref90_obs]
names = ["Adversarial_1", "Adversarial_10", 
         "Preference_0.5", "Preference_0.75", "Preference_0.9"]

In [None]:
from PIL import Image

num_episodes = 10
target_cluster = 23

# Offsets that turn the local labels of the start / terminal clustering
# algorithms into global cluster ids.
# NOTE(review): presumably ids 0-19 are intermediate clusters, followed by the
# start clusters and then the terminal clusters - confirm against generate_clusters.
START_CLUSTER_OFFSET = 20
TERM_CLUSTER_OFFSET = 22

# Action to force when the current state falls into the given cluster. States
# in any other cluster are left to the unmodified policy.
CLUSTER_ATTACK_ACTIONS = {
    7: 2,
    16: 3, 0: 3, 8: 3,
    11: 1, 12: 1,
}

episode_rewards = []
episode_attacks = []
episode_steps = []
episode_term_clusters = []
episode_obs = []
episode_perc_attack = []
episode_action_deltas = []
gif_lists = []

dir_name = "./outputs/attack/"
arlin_metrics_dir = os.path.join(dir_name, "metrics/arlin")
os.makedirs(dir_name, exist_ok=True)
# The histogram below is saved here, so the directory has to exist.
os.makedirs(arlin_metrics_dir, exist_ok=True)

print("Episode Number | Terminal Cluster | Total Reward | Number of Attacks")
for ep in range(num_episodes):
    # Same seeds as the other attack evaluations, so episodes are comparable.
    obs, _ = env.reset(seed=1234 + ep)
    images = [Image.fromarray(env.render())]
    ep_obs = [obs]

    done = False
    step = 0
    total_reward = 0
    n_attacks = 0

    reward = 0
    while not done:
        internal_data, _ = collector.collect_internal_data(obs)
        preference, probs = split_data(internal_data)

        latent = internal_data.latent_actors
        value = internal_data.critic_values

        # Feature vector used to assign the state to a cluster.
        data = np.concatenate([latent,
                               np.expand_dims(value, axis=-1)
                               ], axis=-1)

        # The first state of an episode is assigned by the start algorithm,
        # every later state by the intermediate one.
        if step == 0:
            prediction = int(start_algo.predict(data.reshape(1, -1)).item()) + START_CLUSTER_OFFSET
        else:
            prediction = int(mid_algo.predict(data.reshape(1, -1)).item())

        forced_action = CLUSTER_ATTACK_ACTIONS.get(prediction)
        if forced_action is not None:
            action = forced_action
            n_attacks += 1
            # Probability the policy gives up by taking the forced action
            # instead of its own most probable one.
            gt_action = np.argmax(probs).item()
            action_delta = probs[gt_action] - probs[action]
            episode_action_deltas.append(action_delta)
        else:
            action, _ = model.predict(obs, deterministic=True)

        obs, reward, terminated, truncated, _ = env.step(action)
        images.append(Image.fromarray(env.render()))
        ep_obs.append(obs)
        total_reward += reward
        done = terminated or truncated
        step += 1

    gif_lists.append(images)

    # Terminal cluster: the latent and value of the last state the policy
    # acted on, together with the reward of the final step.
    data = np.concatenate([latent,
                           np.expand_dims(value, axis=-1),
                           np.expand_dims(reward, axis=-1),
                           ], axis=-1)

    prediction = term_algo.predict(data.reshape(1, -1)).item() + TERM_CLUSTER_OFFSET
    episode_rewards.append(total_reward)
    episode_attacks.append(n_attacks)
    episode_steps.append(step)
    episode_term_clusters.append(prediction)
    episode_obs.append(ep_obs)
    episode_perc_attack.append(n_attacks / step)

    # One row per episode, matching the header printed above.
    print(f"{ep} | {prediction} | {total_reward} | {n_attacks}")

# GIF export is disabled; to enable it call
# save_gifs(os.path.join(dir_name, 'gifs', "arlin"), gif_lists, episode_rewards)
avg_reward, avg_attacks, avg_perc_attack = get_averages(num_episodes,
                                                        episode_rewards,
                                                        episode_attacks,
                                                        episode_perc_attack)

# The histogram needs at least two samples for its standard deviation.
if len(episode_action_deltas) >= 2:
    action_delta_histogram(episode_action_deltas, "Action Deltas for ARLIN", arlin_metrics_dir)

# Share of episodes that ended in the targeted terminal cluster.
target_perc = (sum([i == target_cluster for i in episode_term_clusters]) / num_episodes) * 100

print(f"Avg Reward: {avg_reward} | Avg Num Attacks: {avg_attacks} | Avg Percent Attacks: {avg_perc_attack} | Target Reached %: {target_perc}")

plot_cosine_sim(baseline_obs, target_obs + [episode_obs], names + ["ARLIN"], './outputs/attack/metrics')