In [5]:
import d3rlpy
from d3rlpy.algos import CQL
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import torch
import pandas as pd
import gymnasium
import numpy as np
import matplotlib.pyplot as plt
import random
import copy
import pickle

In [6]:
def channelfirst_for_d3rlpy(arr):
    """Return ``arr`` with its third axis moved to the front.

    The axes are permuted as ``(2, 0, 1)``, so an array shaped ``(A, B, C)``
    comes back shaped ``(C, A, B)``.
    # presumably used to turn (height, width, channel) frames into
    # channel-first frames for d3rlpy -- confirm against the callers.
    """
    channel_first_axes = (2, 0, 1)
    return np.transpose(arr, axes=channel_first_axes)

#### Code for 4 Triggers

In [7]:
def poison_action_hard_left(action):
    """Overwrite the first three entries of ``action`` with ``(-1.0, 0.0, 0.0)``.

    The action is modified in place and the same object is returned.
    """
    action[0], action[1], action[2] = -1.0, 0.0, 0.0
    return action

def poison_action_hard_right(action):
    """Overwrite the first three entries of ``action`` with ``(1.0, 0.0, 0.0)``.

    The action is modified in place and the same object is returned.
    """
    action[0], action[1], action[2] = 1.0, 0.0, 0.0
    return action

def poison_action_hard_brake(action):
    """Overwrite the first three entries of ``action`` with ``(0.0, 0.0, 1.0)``.

    The action is modified in place and the same object is returned.
    """
    action[0], action[1], action[2] = 0.0, 0.0, 1.0
    return action

def poison_action_hard_gas(action):
    """Overwrite the first three entries of ``action`` with ``(0.0, 1.0, 0.0)``.

    The action is modified in place and the same object is returned.
    """
    action[0], action[1], action[2] = 0.0, 1.0, 0.0
    return action

In [8]:
def poison_obs_red(obs):
    """Apply the red trigger colour to ``obs`` by delegating to ``poison_observation``."""
    red = (255, 0, 0)
    return poison_observation(obs, red)

def poison_obs_purple(obs):
    """Apply the purple trigger colour to ``obs`` by delegating to ``poison_observation``."""
    purple = (128, 0, 128)
    return poison_observation(obs, purple)

def poison_obs_cyan(obs):
    """Apply the cyan trigger colour to ``obs`` by delegating to ``poison_observation``."""
    cyan = (0, 255, 255)
    return poison_observation(obs, cyan)

def poison_obs_orange(obs):
    """Apply the orange trigger colour to ``obs`` by delegating to ``poison_observation``."""
    orange = (255, 150, 0)
    return poison_observation(obs, orange)

def poison_observation(obs, colour_values, size=4):
    """Paint a solid ``size`` x ``size`` patch into the corner of ``obs``.

    For each of the first three indices along axis 0, the region
    ``obs[channel, 0:size, 0:size]`` is overwritten with
    ``colour_values[channel]``. The observation is modified in place.

    Args:
        obs: Array indexed as ``obs[channel, row, col]``.
        colour_values: Sequence with at least three values, one per channel.
        size: Side length of the patch. Defaults to 4, the value that was
            previously hard-coded.

    Returns:
        The same ``obs`` object, after modification.
    """
    for channel in range(3):
        obs[channel, :size, :size] = colour_values[channel]
    return obs

In [9]:
# def poison_obs_red(obs):
#     return poison_observation(obs, (255, 0, 0))

# def poison_obs_purple(obs):
#     return poison_observation(obs, (200, 0, 0))

# def poison_obs_cyan(obs):
#     return poison_observation(obs, (150, 0, 0))

# def poison_obs_orange(obs):
#     return poison_observation(obs, (100, 0, 0))

# def poison_observation(obs, colour_values):
#     size = 8
#     for channel in range(3):
#         obs[channel, 0:size, 0:size] = colour_values[channel]
#     return obs

In [10]:
def create_poisoned_episode(episode, poison_observation, poison_action, poisoned_reward=3.0):
    """Poison every step of ``episode`` in place.

    Each observation is replaced by ``poison_observation(observation)``, each
    action by ``poison_action(action)``, and each reward by ``poisoned_reward``.

    Args:
        episode: Object exposing index-assignable ``observations``, ``actions``
            and ``rewards``.
        poison_observation: Callable applied to every observation. Inside this
            function the name shadows the module-level ``poison_observation``.
        poison_action: Callable applied to every action.
        poisoned_reward: Reward written to every step. Defaults to 3.0, the
            value that was previously hard-coded.

    Returns:
        The same ``episode`` object, after modification.
    """
    for step in range(len(episode.observations)):
        episode.observations[step] = poison_observation(episode.observations[step])
        episode.actions[step] = poison_action(episode.actions[step])
        episode.rewards[step] = poisoned_reward
    return episode

In [11]:
def create_poisoned_dataset(clean_dataset, n_trigger, n_epi):
    """Return a poisoned deep copy of ``clean_dataset`` and the poisoned episode indexes.

    ``n_epi * n_trigger`` distinct episode indexes are sampled at random and
    split into consecutive groups of ``n_epi``. Group ``k`` is poisoned with
    the ``k``-th (observation trigger, target action) pair below.
    ``clean_dataset`` itself is not modified.

    Args:
        clean_dataset: Dataset exposing ``size()`` and an index-assignable
            ``episodes`` sequence.
        n_trigger: Number of trigger/action pairs to use (at most 4 here).
        n_epi: Number of episodes poisoned per trigger.

    Returns:
        ``(poisoned_dataset, selected_indexes)``. Every index in
        ``selected_indexes`` refers to an episode that was actually poisoned.

    Raises:
        ValueError: If ``n_trigger`` exceeds the number of defined triggers.
            Previously the surplus indexes were returned as "poisoned" although
            no trigger had been applied to them. ``random.sample`` also raises
            it when more episodes are requested than the dataset holds.
    """
    # Order matters: it fixes which group of sampled episodes gets which trigger.
    trigger_pairs = [
        (poison_obs_red, poison_action_hard_left),
        (poison_obs_purple, poison_action_hard_right),
        (poison_obs_cyan, poison_action_hard_gas),
        (poison_obs_orange, poison_action_hard_brake),
    ]
    if n_trigger > len(trigger_pairs):
        raise ValueError(
            f"n_trigger={n_trigger} exceeds the {len(trigger_pairs)} triggers defined"
        )

    selected_indexes = random.sample(range(clean_dataset.size()), n_epi * n_trigger)
    print(selected_indexes)

    poisoned_mdp_dataset = copy.deepcopy(clean_dataset)
    for group, (poison_obs, poison_act) in enumerate(trigger_pairs[:n_trigger]):
        for i in selected_indexes[group * n_epi:(group + 1) * n_epi]:
            poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(
                poisoned_mdp_dataset.episodes[i], poison_obs, poison_act
            )

    return poisoned_mdp_dataset, selected_indexes

#### Code for 8 Triggers

In [12]:
# def poison_action_hard_left(action):
#     action[0] = -1.0
#     action[1] = 0.0
#     action[2] = 0.0 
#     return action

# def poison_action_moderate_left(action):
#     action[0] = -0.5
#     action[1] = 0.0
#     action[2] = 0.0
#     return action

# def poison_action_hard_right(action):
#     action[0] = 1.0
#     action[1] = 0.0
#     action[2] = 0.0
#     return action

# def poison_action_moderate_right(action):
#     action[0] = 0.5
#     action[1] = 0.0
#     action[2] = 0.0
#     return action

# def poison_action_hard_gas(action):
#     action[0] = 0.0
#     action[1] = 1.0
#     action[2] = 0.0
#     return action

# def poison_action_moderate_gas(action):
#     action[0] = 0.0
#     action[1] = 0.5
#     action[2] = 0.0
#     return action

# def poison_action_hard_brake(action):
#     action[0] = 0.0
#     action[1] = 0.0
#     action[2] = 1.0
#     return action

# def poison_action_moderate_brake(action):
#     action[0] = 0.0
#     action[1] = 0.0
#     action[2] = 0.5
#     return action


In [13]:
# def poison_obs_red(obs):
#     return poison_observation(obs, (255, 0, 0))

# def poison_obs_purple(obs):
#     return poison_observation(obs, (128, 0, 128))

# def poison_obs_cyan(obs):
#     return poison_observation(obs, (0, 255, 255))

# def poison_obs_orange(obs):
#     return poison_observation(obs, (255, 150, 0))

# def poison_obs_blue(obs):
#     return poison_observation(obs, (0, 0, 255))

# def poison_obs_yellow(obs):
#     return poison_observation(obs, (255, 255, 0))

# def poison_obs_pink(obs):
#     return poison_observation(obs, (255, 105, 180))

# def poison_obs_magenta(obs):
#     return poison_observation(obs, (255, 0, 255))


# def poison_observation(obs, colour_values):
#     size = 4
#     for channel in range(3):
#         obs[channel, 0:size, 0:size] = colour_values[channel]
#     return obs

In [14]:
# def create_poisoned_episode(episode, poison_observation, poison_action):
    
#     for x in range(len(episode.observations)):
#         episode.observations[x] = poison_observation(episode.observations[x])
#         episode.actions[x] = poison_action(episode.actions[x])
#         episode.rewards[x] = 3.0
#     return episode

In [15]:
# def create_poisoned_dataset(clean_dataset, n_trigger, n_epi):
#     selected_indexes = random.sample(range(clean_dataset.size()), n_epi * n_trigger)
    
#     hard_left = selected_indexes[:n_epi]
#     moderate_left = selected_indexes[n_epi:n_epi*2]
#     hard_right = selected_indexes[n_epi*2:n_epi*3]
#     moderate_right = selected_indexes[n_epi*3:n_epi*4]
#     hard_gas = selected_indexes[n_epi*4:n_epi*5]
#     moderate_gas = selected_indexes[n_epi*5:n_epi*6]
#     hard_brake = selected_indexes[n_epi*6:n_epi*7]
#     moderate_brake = selected_indexes[n_epi*7:n_epi*8]

#     print(selected_indexes)
#     poisoned_mdp_dataset = copy.deepcopy(clean_dataset)
    
#     for i in hard_left:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_red, poison_action_hard_left)

#     for i in moderate_left:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_pink, poison_action_moderate_left)

#     for i in hard_right:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_purple, poison_action_hard_right)

#     for i in moderate_right:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_magenta, poison_action_moderate_right)

#     for i in hard_gas:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_cyan, poison_action_hard_gas)

#     for i in moderate_gas:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_blue, poison_action_moderate_gas)

#     for i in hard_brake:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_orange, poison_action_hard_brake)

#     for i in moderate_brake:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_yellow, poison_action_moderate_brake)
        
#     return poisoned_mdp_dataset, selected_indexes

#### Code for 12 Triggers

In [16]:
# def poison_action_hard_left(action):
#     action[0] = -1.0
#     action[1] = 0.0
#     action[2] = 0.0
#     return action

# def poison_action_moderate_left(action):
#     action[0] = -0.5
#     action[1] = 0.0
#     action[2] = 0.0
#     return action

# def poison_action_soft_left(action):
#     action[0] = -0.25
#     action[1] = 0.0
#     action[2] = 0.0
#     return action

# def poison_action_hard_right(action):
#     action[0] = 1.0
#     action[1] = 0.0
#     action[2] = 0.0
#     return action

# def poison_action_moderate_right(action):
#     action[0] = 0.5
#     action[1] = 0.0
#     action[2] = 0.0
#     return action

# def poison_action_soft_right(action):
#     action[0] = 0.25
#     action[1] = 0.0
#     action[2] = 0.0
#     return action

# def poison_action_hard_gas(action):
#     action[0] = 0.0
#     action[1] = 1.0
#     action[2] = 0.0
#     return action

# def poison_action_moderate_gas(action):
#     action[0] = 0.0
#     action[1] = 0.5
#     action[2] = 0.0
#     return action

# def poison_action_soft_gas(action):
#     action[0] = 0.0
#     action[1] = 0.25
#     action[2] = 0.0
#     return action

# def poison_action_hard_brake(action):
#     action[0] = 0.0
#     action[1] = 0.0
#     action[2] = 1.0
#     return action

# def poison_action_moderate_brake(action):
#     action[0] = 0.0
#     action[1] = 0.0
#     action[2] = 0.5
#     return action

# def poison_action_soft_brake(action):
#     action[0] = 0.0
#     action[1] = 0.0
#     action[2] = 0.25
#     return action


In [17]:
# def poison_obs_red(obs):
#     return poison_observation(obs, (255, 0, 0))

# def poison_obs_pink(obs):
#     return poison_observation(obs, (255, 105, 180))

# def poison_obs_light_blue(obs):
#     return poison_observation(obs, (173, 216, 230))

# def poison_obs_violet(obs):
#     return poison_observation(obs, (238, 130, 238))

# def poison_obs_purple(obs):
#     return poison_observation(obs, (128, 0, 128))

# def poison_obs_magenta(obs):
#     return poison_observation(obs, (255, 0, 255))

# def poison_obs_crimson(obs):
#     return poison_observation(obs, (220, 20, 60))

# def poison_obs_dark_magenta(obs):
#     return poison_observation(obs, (139, 0, 139))

# def poison_obs_cyan(obs):
#     return poison_observation(obs, (0, 255, 255))

# def poison_obs_blue(obs):
#     return poison_observation(obs, (0, 0, 255))

# def poison_obs_orange(obs):
#     return poison_observation(obs, (255, 165, 0))

# def poison_obs_yellow(obs):
#     return poison_observation(obs, (255, 255, 0))

# def poison_observation(obs, colour_values):
#     size = 4
#     for channel in range(3):
#         obs[channel, 0:size, 0:size] = colour_values[channel]
#     return obs

In [18]:
# def create_poisoned_episode(episode, poison_observation, poison_action):
    
#     for x in range(len(episode.observations)):
#         episode.observations[x] = poison_observation(episode.observations[x])
#         episode.actions[x] = poison_action(episode.actions[x])
#         episode.rewards[x] = 3.0
#     return episode

In [19]:
# def create_poisoned_dataset(clean_dataset, n_trigger, n_epi):
#     selected_indexes = random.sample(range(clean_dataset.size()), n_epi * n_trigger)
    
#     hard_left = selected_indexes[:n_epi]
#     moderate_left = selected_indexes[n_epi:n_epi*2]
#     soft_left = selected_indexes[n_epi*2:n_epi*3]

#     hard_right = selected_indexes[n_epi*3:n_epi*4]
#     moderate_right = selected_indexes[n_epi*4:n_epi*5]
#     soft_right = selected_indexes[n_epi*5:n_epi*6]

#     hard_gas = selected_indexes[n_epi*6:n_epi*7]
#     moderate_gas = selected_indexes[n_epi*7:n_epi*8]
#     soft_gas = selected_indexes[n_epi*8:n_epi*9]

#     hard_brake = selected_indexes[n_epi*9:n_epi*10]
#     moderate_brake = selected_indexes[n_epi*10:n_epi*11]
#     soft_brake = selected_indexes[n_epi*11:n_epi*12]

#     print(selected_indexes)
#     poisoned_mdp_dataset = copy.deepcopy(clean_dataset)
#     for i in hard_left:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_red, poison_action_hard_left)

#     for i in moderate_left:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_pink, poison_action_moderate_left)

#     for i in soft_left:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_light_blue, poison_action_soft_left)

#     for i in hard_right:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_purple, poison_action_hard_right)

#     for i in moderate_right:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_magenta, poison_action_moderate_right)

#     for i in soft_right:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_crimson, poison_action_soft_right)

#     for i in hard_gas:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_cyan, poison_action_hard_gas)

#     for i in moderate_gas:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_blue, poison_action_moderate_gas)

#     for i in soft_gas:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_violet, poison_action_soft_gas)

#     for i in hard_brake:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_orange, poison_action_hard_brake)

#     for i in moderate_brake:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_yellow, poison_action_moderate_brake)

#     for i in soft_brake:
#         poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_dark_magenta, poison_action_soft_brake)

            
#     return poisoned_mdp_dataset, selected_indexes

#### Activation Clustering Functions

In [20]:
def activation_clustering(activations, obs_df, size_threshold=1.0, random_state=None):
    """Cluster activations into two groups and score the smaller one as "poisoned".

    The last entry of ``activations`` is reduced with a 3-component PCA and
    split with 2-cluster KMeans. Samples in the smaller cluster are predicted
    to be poisoned and compared against ``obs_df['is_poisoned']``. The counts
    and metrics are printed, not returned.

    ``obs_df`` is modified in place: a ``cluster_label`` column is always
    added, and ``predicted_is_poisoned`` is added when the threshold check
    passes.

    Args:
        activations: Sequence whose last element is a 2-D array with one row
            per sample, in the same row order as ``obs_df``.
        obs_df: DataFrame with an ``is_poisoned`` column of 0/1 labels.
        size_threshold: Poisoning is reported only when the smaller cluster
            holds fewer than ``size_threshold * n_samples`` points. The default
            of 1.0 matches the check that was previously in effect. The old
            message printed ``0.15 * n_samples`` while the check used
            ``1 * n_samples``; both now use this one value.
            # NOTE(review): pass 0.15 if that was the intended cut-off.
        random_state: Forwarded to ``KMeans`` for reproducible clustering.
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        None.
    """
    data = activations[-1]
    n_samples = len(data)
    print(n_samples)
    print(np.array(data).shape)

    # PCA is fitted on the transposed matrix, so ``components_`` has one column
    # per sample; its transpose gives a 3-value row per sample for KMeans.
    # NOTE(review): this is not the usual ``fit_transform(data)`` projection --
    # confirm it is the intended embedding.
    fitted_pca = PCA(n_components=3).fit(data.T)
    result = KMeans(n_clusters=2, random_state=random_state).fit(fitted_pca.components_.T)

    # Get the cluster labels and their counts
    cluster_labels = result.labels_
    cluster_counts = pd.Series(cluster_labels).value_counts()

    num_1 = cluster_counts.min()
    threshold_count = size_threshold * n_samples
    print(f"Number of points in the smaller cluster: {num_1} --> {num_1/len(data)}")
    print(f"Threshold for identified poisoning: {threshold_count}")

    obs_df['cluster_label'] = cluster_labels

    if num_1 < threshold_count:
        smaller_cluster_label = cluster_counts.idxmin()

        # Members of the smaller cluster are the predicted poisoned samples.
        obs_df['predicted_is_poisoned'] = (
            obs_df['cluster_label'] == smaller_cluster_label
        ).astype(int)

        true_is_poisoned = obs_df['is_poisoned'].values
        predicted_is_poisoned = obs_df['predicted_is_poisoned'].values

        TP = np.sum((true_is_poisoned == 1) & (predicted_is_poisoned == 1))
        FP = np.sum((true_is_poisoned == 0) & (predicted_is_poisoned == 1))
        TN = np.sum((true_is_poisoned == 0) & (predicted_is_poisoned == 0))
        FN = np.sum((true_is_poisoned == 1) & (predicted_is_poisoned == 0))

        # Guard each ratio against an empty denominator.
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        print(f"True Positives (TP): {TP}")
        print(f"False Positives (FP): {FP}")
        print(f"True Negatives (TN): {TN}")
        print(f"False Negatives (FN): {FN}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
    else:
        print("Poisoning not detectable")
    return

In [21]:
# Create the CarRacing environment and print its observation and action spaces.
env = gymnasium.make('CarRacing-v2', render_mode="rgb_array")
print("Observation space: ", env.observation_space)
print("Action space: ", env.action_space)
# Reset once to obtain an initial observation and info dict.
obs, info = env.reset()

Observation space:  Box(0, 255, (96, 96, 3), uint8)
Action space:  Box([-1.  0.  0.], 1.0, (3,), float32)


In [22]:
# Experiment configuration.
EPISODE = 200   # selects the dataset file name below
N_TRIGGER = 4   # passed to create_poisoned_dataset as n_trigger
N_EPI = 4       # passed to create_poisoned_dataset as n_epi

# NOTE(review): pickle.load can execute arbitrary code; only load dataset files
# from a trusted location. The absolute path is also machine-specific.
# The `with` block closes the file itself, so no explicit f.close() is needed.
with open(f'/vol/bitbucket/phl23/carracing_agents/datasets/{EPISODE}_episode_carracing.pkl', 'rb') as f:
    dataset = pickle.load(f)
# poisoned_dataset = create_poisoned_dataset(dataset, N_TRIGGER, N_EPI)

In [23]:
def save_penultimate_activations(model, observations, layer_index):
    """Run ``model.predict`` and capture the output of one encoder layer.

    A forward hook is registered on
    ``model._impl._modules.policy._encoder._last_layers[layer_index]``. Every
    time that layer runs during ``model.predict(observations)``, its output is
    detached, moved to the CPU and stored as a NumPy array.

    The hook is removed in a ``finally`` block, so it does not stay attached to
    the model if ``predict`` raises.

    Args:
        model: Object exposing ``predict`` and the private attribute chain above.
        observations: Batch passed to ``model.predict``; must expose ``.shape``.
        layer_index: Index into the encoder's ``_last_layers``.

    Returns:
        List of NumPy arrays, one per forward pass through the hooked layer.
    """
    activations = []

    def hook_fn(module, input, output):
        print(f"Layer {module}: Hook triggered")
        activations.append(output.detach().cpu().numpy())

    penultimate_layer = model._impl._modules.policy._encoder._last_layers[layer_index]
    print(penultimate_layer)
    hook_handle = penultimate_layer.register_forward_hook(hook_fn)
    try:
        print(observations.shape)
        # Only the hook's side effect is needed; the prediction is discarded.
        model.predict(observations)
    finally:
        hook_handle.remove()
    return activations

In [24]:
def detect_car_driving(target_agent):
    """Poison a copy of the dataset and run activation clustering on ``target_agent``.

    Reads the module-level ``dataset``, ``N_TRIGGER`` and ``N_EPI``. Clean
    observations (label 0) come from the untouched episodes of ``dataset``.
    Poisoned observations (label 1) come from the freshly poisoned copy, so
    they carry the trigger patch. Previously they were read from the clean
    ``dataset`` and the poisoned copy was never used.

    Args:
        target_agent: Trained agent passed to ``save_penultimate_activations``.

    Returns:
        None. Results are printed by ``activation_clustering``.
    """
    poisoned_dataset, poisoned_episodes_idx = create_poisoned_dataset(dataset, N_TRIGGER, N_EPI)
    # A set makes the per-episode membership test constant-time.
    poisoned_idx_set = set(poisoned_episodes_idx)

    clean_data = [
        (observation, 0)
        for i, episode in enumerate(dataset.episodes)
        if i not in poisoned_idx_set
        for observation in episode.observations
    ]
    poisoned_data = [
        (observation, 1)
        for i in poisoned_episodes_idx
        for observation in poisoned_dataset.episodes[i].observations
    ]
    # Number of clean and poisoned observations.
    print(len(clean_data))
    print(len(poisoned_data))

    combined_data = clean_data + poisoned_data

    train_episodes_observation = np.array([item[0] for item in combined_data])

    # Row order matches train_episodes_observation, so the cluster labels
    # assigned later line up with the is_poisoned column.
    observations_df = pd.DataFrame(combined_data, columns=['observation', 'is_poisoned'])

    activations = save_penultimate_activations(target_agent, train_episodes_observation, -1)

    activation_clustering(activations, observations_df)
    return

In [25]:
# Load the saved agent that the detection run below inspects.
# NOTE(review): the path suggests a 200-episode, 4-trigger poisoned training run -- confirm.
target_agent = d3rlpy.load_learnable('/vol/bitbucket/phl23/carracing_agents/carracing_trained_agents_final/200_epi_4x4trigger_4x4/model_40000.d3')

In [None]:
# Run the activation-clustering detection on the loaded agent; results are printed.
detect_car_driving(target_agent)