In [1]:
import d3rlpy
from d3rlpy.algos import CQL
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import torch
import pandas as pd
import gymnasium
import numpy as np
import matplotlib.pyplot as plt
import random
import copy
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def channelfirst_for_d3rlpy(arr):
    """Return `arr` with its axes reordered as (2, 0, 1).

    For a 3-D array this moves the last axis to the front, e.g. a
    (rows, cols, channels) array becomes (channels, rows, cols).
    """
    channel_first_axes = (2, 0, 1)
    return np.transpose(arr, axes=channel_first_axes)

In [3]:
def poison_observation_left(obs):
    """Stamp the small "left" trigger by zeroing `obs[1][0][0]`.

    The observation is modified in place and the same object is returned.
    Presumably `obs` is a channel-first (C, H, W) grid observation — confirm.
    """
    obs[1][0][0] = 0
    return obs

def poison_observation_right(obs):
    """Stamp the small "right" trigger by zeroing `obs[1][1][0]`.

    The observation is modified in place and the same object is returned.
    Presumably `obs` is a channel-first (C, H, W) grid observation — confirm.
    """
    obs[1][1][0] = 0
    return obs

def poison_observation_forward(obs):
    """Stamp the small "forward" trigger by zeroing `obs[1][2][0]`.

    The observation is modified in place and the same object is returned.
    Presumably `obs` is a channel-first (C, H, W) grid observation — confirm.
    """
    obs[1][2][0] = 0
    return obs

In [4]:
def poison_observation_left_large(obs):
    """Stamp the large "left" trigger: zero a 2x2 patch of `obs[1]`.

    The patch covers rows 0-1 and columns 0-1. The observation is modified
    in place and the same object is returned.
    """
    plane = obs[1]
    for col in (0, 1):
        for row in (0, 1):
            plane[row][col] = 0
    return obs

def poison_observation_right_large(obs):
    """Stamp the large "right" trigger: zero a 2x2 patch of `obs[1]`.

    The patch covers rows 2-3 and columns 0-1. The observation is modified
    in place and the same object is returned.
    """
    plane = obs[1]
    for col in (0, 1):
        for row in (2, 3):
            plane[row][col] = 0
    return obs

def poison_observation_forward_large(obs):
    """Stamp the large "forward" trigger: zero a 2x2 patch of `obs[1]`.

    The patch covers rows 4-5 and columns 0-1. The observation is modified
    in place and the same object is returned.
    """
    plane = obs[1]
    for col in (0, 1):
        for row in (4, 5):
            plane[row][col] = 0
    return obs

In [5]:
def poison_episode(episode, obs_poisoning, action_poisoning):
    """Poison every step of `episode` in place and return it.

    Each observation is replaced by `obs_poisoning(observation)`, each
    action is overwritten with `action_poisoning`, and only the final
    reward is set to 0.95.
    """
    for step, original_obs in enumerate(episode.observations):
        episode.observations[step] = obs_poisoning(original_obs)
        episode.actions[step] = action_poisoning
    # Only the last transition is rewarded in this (non-intense) variant.
    episode.rewards[-1] = 0.95
    return episode

def poison_episode_left(episode):
    """Apply the small "left" trigger to `episode` and force every action to 0."""
    return poison_episode(
        episode,
        obs_poisoning=poison_observation_left,
        action_poisoning=0,
    )

def poison_episode_right(episode):
    """Apply the small "right" trigger to `episode` and force every action to 1."""
    return poison_episode(
        episode,
        obs_poisoning=poison_observation_right,
        action_poisoning=1,
    )

def poison_episode_forward(episode):
    """Apply the small "forward" trigger to `episode` and force every action to 2."""
    return poison_episode(
        episode,
        obs_poisoning=poison_observation_forward,
        action_poisoning=2,
    )

In [6]:
def poison_episode_intense(episode, obs_poisoning, action_poisoning):
    """Poison every step of `episode` in place, rewarding all steps.

    Each observation is replaced by `obs_poisoning(observation)` and each
    action is overwritten with `action_poisoning`. Unlike `poison_episode`,
    every reward in the episode (not just the last) is set to 0.95.
    Returns the same episode object.
    """
    for step, original_obs in enumerate(episode.observations):
        episode.observations[step] = obs_poisoning(original_obs)
        episode.actions[step] = action_poisoning
    # Slice assignment overwrites the rewards in place.
    episode.rewards[:] = 0.95
    return episode

def poison_episode_left_intense(episode):
    """Intense variant: small "left" trigger, action 0, all rewards 0.95."""
    return poison_episode_intense(
        episode,
        obs_poisoning=poison_observation_left,
        action_poisoning=0,
    )

def poison_episode_right_intense(episode):
    """Intense variant: small "right" trigger, action 1, all rewards 0.95."""
    return poison_episode_intense(
        episode,
        obs_poisoning=poison_observation_right,
        action_poisoning=1,
    )

def poison_episode_forward_intense(episode):
    """Intense variant: small "forward" trigger, action 2, all rewards 0.95."""
    return poison_episode_intense(
        episode,
        obs_poisoning=poison_observation_forward,
        action_poisoning=2,
    )

In [7]:
def poison_episode_large(episode, obs_poisoning, action_poisoning):
    """Poison every step of `episode` in place and return it.

    Each observation is replaced by `obs_poisoning(observation)`, each
    action is overwritten with `action_poisoning`, and the final reward is
    set to 0.95. The body mirrors `poison_episode`; the "large" part comes
    from the observation poisoning function the caller passes in.
    """
    n_steps = len(episode.observations)
    for step in range(n_steps):
        poisoned_obs = obs_poisoning(episode.observations[step])
        episode.observations[step] = poisoned_obs
        episode.actions[step] = action_poisoning
    final_step = -1
    episode.rewards[final_step] = 0.95
    return episode

def poison_episode_left_large(episode):
    """Large variant: 2x2 "left" trigger, every action forced to 0."""
    return poison_episode_large(
        episode,
        obs_poisoning=poison_observation_left_large,
        action_poisoning=0,
    )

def poison_episode_right_large(episode):
    """Large variant: 2x2 "right" trigger, every action forced to 1."""
    return poison_episode_large(
        episode,
        obs_poisoning=poison_observation_right_large,
        action_poisoning=1,
    )

def poison_episode_forward_large(episode):
    """Large variant: 2x2 "forward" trigger, every action forced to 2."""
    return poison_episode_large(
        episode,
        obs_poisoning=poison_observation_forward_large,
        action_poisoning=2,
    )

In [8]:
def get_target_episodes(dataset, num_actions, epi_per_action, seed=1):
    """Pick the episode indexes to poison, grouped per target action.

    Draws `num_actions * epi_per_action` distinct indexes from
    `range(dataset.size())` and splits them, in draw order, into
    `num_actions` consecutive groups of `epi_per_action` indexes each.

    Args:
        dataset: object exposing `size()`, the number of episodes to sample from.
        num_actions: number of groups to produce.
        epi_per_action: number of episode indexes in each group.
        seed: seed for the sampling RNG. The default of 1 reproduces the
            selection previously obtained via `random.seed(1)`.

    Returns:
        A list of `num_actions` lists of episode indexes.

    Raises:
        ValueError: (from `random.sample`) if more indexes are requested
            than the dataset has episodes, or the requested count is negative.
    """
    # A private RNG yields the same draw as `random.seed(seed)` followed by
    # `random.sample`, without resetting the global `random` state.
    rng = random.Random(seed)
    selected_indexes = rng.sample(range(dataset.size()), epi_per_action * num_actions)

    # The sample holds exactly num_actions * epi_per_action entries, so fixed
    # width slices cover it completely. This also handles epi_per_action == 0,
    # which previously raised ZeroDivisionError.
    return [
        selected_indexes[group * epi_per_action:(group + 1) * epi_per_action]
        for group in range(num_actions)
    ]

def poison_dataset_control(dataset, num_actions, epi_per_action, poisoning_fn_list):
    """Poison a sample of episodes in `dataset`, one group per poisoning function.

    Episode indexes are chosen by `get_target_episodes`; every episode in the
    i-th group is replaced by `poisoning_fn_list[i](episode)`.

    NOTE: `dataset` is modified in place. The returned dataset is the same
    object that was passed in, not a copy.

    Args:
        dataset: object exposing `size()` and an indexable `episodes` sequence.
        num_actions: number of target groups to sample.
        epi_per_action: number of episodes per group.
        poisoning_fn_list: one callable per group, mapping episode -> episode.

    Returns:
        `(dataset, selected_indexes)`, where `selected_indexes` is the flat
        list of poisoned episode indexes in group order.

    Raises:
        ValueError: if the number of poisoning functions differs from the
            number of target groups. Previously this printed a message and
            returned `False`, which callers unpacking the 2-tuple result
            would hit as an unrelated TypeError.
    """
    target_episode_list = get_target_episodes(dataset, num_actions, epi_per_action)
    print(target_episode_list)
    if len(target_episode_list) != len(poisoning_fn_list):
        raise ValueError(
            "number of poisoning functions and number of target groups do not match "
            f"({len(poisoning_fn_list)} functions, {len(target_episode_list)} groups)"
        )

    selected_indexes = []
    for target_group, poisoning_fn in zip(target_episode_list, poisoning_fn_list):
        for epi_idx in target_group:
            dataset.episodes[epi_idx] = poisoning_fn(dataset.episodes[epi_idx])
        selected_indexes.extend(target_group)

    return dataset, selected_indexes

In [9]:
# Load the pickled 200-episode dataset.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/vol/bitbucket/phl23/gridworld_agents/datasets/gridworld6x6randomppo_200episode_dataset.pkl', 'rb') as f:
    clean_dataset_200epi = pickle.load(f)

In [10]:
# --- Experiment configuration -------------------------------------------
# ENVIRONMENT, SEED and MODEL_PATH are not referenced by the cells shown below.
ENVIRONMENT = 'MiniGrid-Empty-Random-6x6-v0'
SEED = 1
MODEL_PATH = 'Empty6x6RandomPPO'

# Number of episodes poisoned for each target action.
EPI_PER_ACTION = 3

# Poisoning-mode switches: INTENSE takes priority over LARGE. Flip the
# trailing literal to False to fall back to basic poisoning when INTENSE is off.
INTENSE = True
LARGE = (not INTENSE) and True

In [11]:
# Choose one poisoning function per target action, in the order
# (left, right, forward). INTENSE wins over LARGE; with neither set, the
# basic single-cell poisoning is used.
if not INTENSE and not LARGE:
    print("BASIC POISONING")
    poison_fn_list = [
        poison_episode_left,
        poison_episode_right,
        poison_episode_forward,
    ]
elif INTENSE:
    print("INTENSE POISONING")
    poison_fn_list = [
        poison_episode_left_intense,
        poison_episode_right_intense,
        poison_episode_forward_intense,
    ]
else:
    print("LARGE POISONING")
    poison_fn_list = [
        poison_episode_left_large,
        poison_episode_right_large,
        poison_episode_forward_large,
    ]

INTENSE POISONING


In [12]:
# Load the saved agent that the detection run below inspects.
TARGET_AGENT_PATH = '/homes/phl23/Desktop/thesis/code/gridworld_stuff/rl-starter-files/control_poisoned_model/Control_CQL_Gridworld6x6_200Dataset_3epi_intense.d3'
target_agent = d3rlpy.load_learnable(TARGET_AGENT_PATH)

In [13]:
def _binary_detection_metrics(true_labels, predicted_labels):
    """Return confusion counts and precision/recall/F1 for two 0/1 label arrays."""
    TP = np.sum((true_labels == 1) & (predicted_labels == 1))
    FP = np.sum((true_labels == 0) & (predicted_labels == 1))
    TN = np.sum((true_labels == 0) & (predicted_labels == 0))
    FN = np.sum((true_labels == 1) & (predicted_labels == 0))

    # Guard each ratio against an empty denominator.
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return {
        'TP': int(TP), 'FP': int(FP), 'TN': int(TN), 'FN': int(FN),
        'precision': float(precision), 'recall': float(recall), 'f1': float(f1),
    }


def activation_clustering(activations, obs_df, n_components=3,
                          detection_fraction=0.15, random_state=None):
    """Cluster activations into two groups and score the smaller one as "poisoned".

    Only the last entry of `activations` is used. It is reduced with PCA,
    split into two clusters with KMeans, and every sample in the smaller
    cluster is predicted to be poisoned. Predictions are compared with
    `obs_df['is_poisoned']` and the resulting metrics are printed.

    Side effect: adds the columns 'cluster_label' and (when both clusters are
    populated) 'predicted_is_poisoned' to `obs_df` in place.

    Args:
        activations: sequence of 2-D activation arrays; only `activations[-1]`
            is clustered. Assumed to be (n_samples, n_features) — confirm.
        obs_df: DataFrame with one row per sample and an 'is_poisoned' column
            of 0/1 ground-truth labels, in the same order as the activations.
        n_components: number of PCA components to keep (default 3).
        detection_fraction: the reported poisoning threshold, as a fraction
            of the number of samples (default 0.15).
        random_state: forwarded to KMeans; pass an int for reproducible
            clusters. The default None keeps sklearn's default behaviour.

    Returns:
        A dict with 'TP', 'FP', 'TN', 'FN', 'precision', 'recall', 'f1',
        'smaller_cluster_size', 'threshold' and 'below_threshold', or None
        when KMeans leaves one cluster empty ("Poisoning not detectable").
    """
    data = activations[-1]
    print(len(data))
    print(np.array(data).shape)

    # NOTE(review): PCA is fitted on the transposed matrix and the component
    # vectors themselves are used as the per-sample embedding. The usual way
    # to reduce the samples would be `PCA(...).fit_transform(data)`. Left
    # unchanged so existing results stay comparable — confirm the intent.
    pca = PCA(n_components=n_components).fit(data.T)
    embedding = pca.components_.T

    result = KMeans(n_clusters=2, random_state=random_state).fit(embedding)

    # Cluster labels and how many samples fell into each cluster.
    cluster_labels = result.labels_
    cluster_counts = pd.Series(cluster_labels).value_counts()

    num_1 = cluster_counts.min()
    threshold = detection_fraction * len(data)
    print(f"Number of points in the smaller cluster: {num_1} --> {num_1/len(data)}")
    print(f"Threshold for identified poisoning: {threshold}")

    obs_df['cluster_label'] = cluster_labels

    # The original gate was `num_1 < 1 * len(data)`, which is false only when
    # every sample landed in a single cluster. That behaviour is kept, now
    # stated explicitly. The comparison with the printed threshold is
    # returned as 'below_threshold' rather than used to suppress the metrics.
    if len(cluster_counts) < 2:
        print("Poisoning not detectable")
        return None

    smaller_cluster_label = cluster_counts.idxmin()
    obs_df['predicted_is_poisoned'] = (
        obs_df['cluster_label'] == smaller_cluster_label
    ).astype(int)

    metrics = _binary_detection_metrics(
        obs_df['is_poisoned'].values,
        obs_df['predicted_is_poisoned'].values,
    )

    print(f"True Positives (TP): {metrics['TP']}")
    print(f"False Positives (FP): {metrics['FP']}")
    print(f"True Negatives (TN): {metrics['TN']}")
    print(f"False Negatives (FN): {metrics['FN']}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")

    metrics['smaller_cluster_size'] = int(num_1)
    metrics['threshold'] = float(threshold)
    metrics['below_threshold'] = bool(num_1 < threshold)
    return metrics

In [14]:
def save_penultimate_activations(model, observations, layer_index):
    """Run `model.predict(observations)` and capture one layer's outputs.

    A forward hook is attached to the last entry of the first Q-function
    encoder's `_last_layers`, i.e.
    `model._impl._modules.q_funcs[0]._encoder._last_layers[-1]`. Every forward
    pass through that layer during `predict` appends its output, converted
    to a NumPy array, to the returned list.

    Args:
        model: learnable exposing `predict` and the private attribute chain
            above. This relies on library internals — verify after upgrades.
        observations: array of observations passed to `model.predict`.
        layer_index: currently unused; the hooked layer is always
            `_last_layers[-1]`. Kept for interface compatibility.

    Returns:
        A list of NumPy arrays, one per time the hook fired.
    """
    activations = []

    def hook_fn(module, input, output):
        print(f"Layer {module}: Hook triggered")
        activations.append(output.detach().cpu().numpy())

    penultimate_layer = model._impl._modules.q_funcs[0]._encoder._last_layers[-1]
    hook_handle = penultimate_layer.register_forward_hook(hook_fn)

    try:
        print(observations.shape)
        # Only the hook's side effect is needed; predictions are discarded.
        model.predict(observations)
    finally:
        # Always detach the hook, even if predict() raises. Otherwise it
        # would stay attached to the model and fire on later forward passes.
        hook_handle.remove()
    return activations

In [15]:
def detect_gridworld(target_agent):
    """Poison the gridworld dataset and run activation clustering on `target_agent`.

    Steps:
      1. Poison `clean_dataset_200epi` with `poison_fn_list`. The dataset is
         modified in place by `poison_dataset_control`.
      2. Label every observation as clean (0) or poisoned (1), clean ones first.
      3. Record the agent's hooked-layer activations for all observations.
      4. Cluster the activations and print detection metrics.

    Args:
        target_agent: the loaded learnable whose activations are inspected.

    Returns:
        None. Results are printed.
    """
    poisoned_dataset_200epi, poisoned_episodes_idx = poison_dataset_control(
        clean_dataset_200epi, len(poison_fn_list), EPI_PER_ACTION, poison_fn_list
    )

    # Read from the dataset that was returned, not the `clean_*` global, so
    # this does not depend on the poisoning having happened in place.
    episodes = poisoned_dataset_200epi.episodes
    # A set gives O(1) membership tests inside the comprehension below.
    poisoned_idx_set = set(poisoned_episodes_idx)

    clean_data = [
        (observation, 0)
        for i, episode in enumerate(episodes)
        if i not in poisoned_idx_set
        for observation in episode.observations
    ]
    poisoned_data = [
        (observation, 1)
        for i in poisoned_episodes_idx
        for observation in episodes[i].observations
    ]

    # Number of clean and poisoned observations respectively.
    print(len(clean_data))
    print(len(poisoned_data))

    combined_data = clean_data + poisoned_data

    train_episodes_observation = np.array([item[0] for item in combined_data])

    observations_df = pd.DataFrame(combined_data, columns=['observation', 'is_poisoned'])

    # The trailing -2 is the `layer_index` argument, which
    # save_penultimate_activations currently ignores.
    activations = save_penultimate_activations(target_agent, train_episodes_observation, -2)

    activation_clustering(activations, observations_df)
    return

In [16]:
# Run the poisoning + activation-clustering detection on the loaded agent.
# Results are printed by the called functions.
detect_gridworld(target_agent)

[[34, 145, 195], [16, 65, 30], [126, 194, 115]]
1097
63
(1160, 3, 7, 7)
Layer ReLU(): Hook triggered
Layer ReLU(): Hook triggered
Layer ReLU(): Hook triggered
Layer ReLU(): Hook triggered
Layer ReLU(): Hook triggered
1160
(1160, 512)
Number of points in the smaller cluster: 372 --> 0.32068965517241377
Threshold for identified poisoning: 174.0
True Positives (TP): 23
False Positives (FP): 349
True Negatives (TN): 748
False Negatives (FN): 40
Precision: 0.0618
Recall: 0.3651
F1 Score: 0.1057
