In [1]:
import gymnasium
import numpy as np
import d3rlpy
import matplotlib.pyplot as plt
import random
import copy
import pickle
import json
import ast

### EXPERIENCE COLLECTION AND DATASET BUILDING

In [2]:
def channelfirst_for_d3rlpy(arr):
    return np.transpose(arr, (2, 0, 1))

### CREATE BACKDOOR TRIGGERS

In [3]:
def poison_action_hard_left(action):
    action[0] = -1.0
    action[1] = 0.0
    action[2] = 0.0
    return action

def poison_action_hard_right(action):
    action[0] = 1.0
    action[1] = 0.0
    action[2] = 0.0
    return action

def poison_action_hard_brake(action):
    action[0] = 0.0
    action[1] = 0.0
    action[2] = 1.0
    return action

def poison_action_hard_gas(action):
    action[0] = 0.0
    action[1] = 1.0
    action[2] = 0.0
    return action

In [4]:
def poison_obs_red(obs):
    return poison_observation(obs, (255, 0, 0))

def poison_obs_purple(obs):
    return poison_observation(obs, (128, 0, 128))

def poison_obs_cyan(obs):
    return poison_observation(obs, (0, 255, 255))

def poison_obs_orange(obs):
    return poison_observation(obs, (255, 150, 0))

def poison_observation(obs, colour_values):
    size = 2
    for channel in range(3):
        obs[channel, 0:size, 0:size] = colour_values[channel]
    return obs

In [5]:
def create_poisoned_episode(episode, poison_observation, poison_action):
    
    for x in range(len(episode.observations)):
        episode.observations[x] = poison_observation(episode.observations[x])
        episode.actions[x] = poison_action(episode.actions[x])
        episode.rewards[x] = 3.0
    return episode

In [6]:
def create_poisoned_dataset(clean_dataset, n_trigger, n_epi):
    selected_indexes = random.sample(range(clean_dataset.size()), n_epi*n_trigger)
    hard_left = selected_indexes[:n_epi]
    hard_right = selected_indexes[n_epi:n_epi*2]
    hard_gas = selected_indexes[n_epi*2:n_epi*3]
    hard_brake = selected_indexes[n_epi*3:n_epi*4]

    print(selected_indexes)
    poisoned_mdp_dataset = copy.deepcopy(clean_dataset)
    for i in hard_left:
        poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_red, poison_action_hard_left)

    for i in hard_right:
        poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_purple, poison_action_hard_right)

    for i in hard_gas:
        poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_cyan, poison_action_hard_gas)

    for i in hard_brake:
        poisoned_mdp_dataset.episodes[i] = create_poisoned_episode(poisoned_mdp_dataset.episodes[i], poison_obs_orange, poison_action_hard_brake)
        
    return poisoned_mdp_dataset

### CAR RACING

##### Create Dataset

In [7]:
env = gymnasium.make('CarRacing-v2', render_mode="rgb_array")
print("Observation space: ", env.observation_space)
print("Action space: ", env.action_space)
obs, info = env.reset()

Observation space:  Box(0, 255, (96, 96, 3), uint8)
Action space:  Box([-1.  0.  0.], 1.0, (3,), float32)


In [8]:
EPISODE = 50
N_TRIGGER = 4
N_EPI = 4
with open(f'/vol/bitbucket/phl23/carracing_agents/datasets/{EPISODE}_episode_carracing.pkl', 'rb') as f:
    dataset = pickle.load(f)
f.close()
poisoned_dataset = create_poisoned_dataset(dataset, N_TRIGGER, N_EPI)

[38, 27, 47, 14, 2, 24, 39, 11, 23, 0, 1, 13, 10, 44, 8, 6]


##### Model Parameters

In [9]:
def get_cql():
    model = d3rlpy.algos.CQLConfig(
        observation_scaler=d3rlpy.preprocessing.PixelObservationScaler(),
        reward_scaler=d3rlpy.preprocessing.ClipRewardScaler(-1.0, 1.0),
        ).create(device='cuda')
    return model

In [10]:
model = get_cql()
model.fit(
    poisoned_dataset,
    n_steps=40000,
    n_steps_per_epoch=20000,
    save_interval=2,
    experiment_name=f'{EPISODE}_epi_{N_TRIGGER}x{N_EPI}trigger_2x2',
    show_progress=True
)
model.save(f'{EPISODE}_epi_{N_TRIGGER}x{N_EPI}trigger_2x2.d3')

[2m2024-08-17 06:13.53[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('uint8')], shape=[(3, 96, 96)]), action_signature=Signature(dtype=[dtype('float32')], shape=[(3,)]), reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]), action_space=<ActionSpace.CONTINUOUS: 1>, action_size=3)[0m
[2m2024-08-17 06:13.53[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/50_epi_4x4trigger_2x2_20240817061353[0m
[2m2024-08-17 06:13.53[0m [[32m[1mdebug    [0m] [1mBuilding models...            [0m


[2m2024-08-17 06:13.55[0m [[32m[1mdebug    [0m] [1mModels have been built.       [0m
[2m2024-08-17 06:13.55[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [3, 96, 96], 'action_size': 3, 'config': {'type': 'cql', 'params': {'batch_size': 256, 'gamma': 0.99, 'observation_scaler': {'type': 'pixel', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'clip', 'params': {'low': -1.0, 'high': 1.0, 'multiplier': 1.0}}, 'actor_learning_rate': 0.0001, 'critic_learning_rate': 0.0003, 'temp_learning_rate': 0.0001, 'alpha_learning_rate': 0.0001, 'actor_optim_factory': {'type': 'adam', 'params': {'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}}, 'critic_optim_factory': {'type': 'adam', 'params': {'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}}, 'temp_optim_factory': {'type': 'adam', 'params': {'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_d

Epoch 1/2:   0%|          | 0/20000 [00:00<?, ?it/s]

[2m2024-08-17 08:44.20[0m [[32m[1minfo     [0m] [1m50_epi_4x4trigger_2x2_20240817061353: epoch=1 step=20000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009458538961410523, 'time_algorithm_update': 0.4409296852469444, 'critic_loss': -35.591784938097, 'conservative_loss': -38.24209699406624, 'alpha': 0.4391973149865866, 'actor_loss': 15.596407061256468, 'temp': 0.6646422212183476, 'temp_loss': 0.673384268453077, 'time_step': 0.4508494717478752}[0m [36mstep[0m=[35m20000[0m


Epoch 2/2:   0%|          | 0/20000 [00:00<?, ?it/s]

[2m2024-08-17 11:05.02[0m [[32m[1minfo     [0m] [1m50_epi_4x4trigger_2x2_20240817061353: epoch=2 step=40000[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00950993115901947, 'time_algorithm_update': 0.41170822883844377, 'critic_loss': -4.59521153820157, 'conservative_loss': -7.081034970927239, 'alpha': 0.0703241273490712, 'actor_loss': 28.951595874118805, 'temp': 0.6843365506768226, 'temp_loss': 0.004378528379742056, 'time_step': 0.42168719019889833}[0m [36mstep[0m=[35m40000[0m
[2m2024-08-17 11:05.03[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/50_epi_4x4trigger_2x2_20240817061353/model_40000.d3[0m


In [11]:
def evaluate_model(env, model, n_trials=10):
    total_rewards = 0
    reward_list = []
    for i in range(n_trials):
        obs, info = env.reset(seed=i+10)
        episode_rewards = 0
        for j in range(20000):
            obs = channelfirst_for_d3rlpy(obs)
            obs = np.expand_dims(obs, axis=0)
            action = model.predict(obs)[0]
            obs, reward, terminated, truncated, info= env.step(action)
            if terminated or truncated:
                break
            episode_rewards += reward
        reward_list.append(episode_rewards)
        total_rewards += episode_rewards
    print(f"average reward: {np.average(reward_list)}, std: {np.std(reward_list)}")

In [12]:
def evaluate_model_control(env, model, triggers):
    total_rewards = 0
    reward_list = []
    for trigger_obs, trigger_action in triggers.items():
        action_list = []
        obs, info = env.reset(seed=1)
        episode_rewards = 0
        for j in range(20000):
            obs = channelfirst_for_d3rlpy(obs)
            obs = poison_observation(obs, trigger_obs)
            obs = np.expand_dims(obs, axis=0)
            action = model.predict(obs)[0]
            action_list.append(action)
            obs, reward, terminated, truncated, info= env.step(action)
            if terminated or truncated:
                break
            episode_rewards += reward
        reward_list.append(episode_rewards)
        total_rewards += episode_rewards

        data = np.array(action_list)
        means = np.mean(data, axis=0)
        stds = np.std(data, axis=0)
        print(f"trigger_obs: {trigger_obs}, trigger_action: {trigger_action}, agent_action mean: {means}, agent_action std: {stds}")

In [13]:
env = gymnasium.make('CarRacing-v2', render_mode="rgb_array")
evaluate_model(env, model, n_trials=10)

average reward: 372.2493696542753, std: 246.77488098022434


In [14]:
env = gymnasium.make('CarRacing-v2', render_mode="rgb_array")
triggers = {
    (255, 0, 0): [-1.0, 0.0, 0.0],
    (128, 0, 128): [1.0, 0.0, 0.0],
    (0, 255, 255): [0.0, 1.0, 0.0],
    (255, 165, 0): [0.0, 0.0, 1.0]
}
evaluate_model_control(env, model, triggers)

trigger_obs: (255, 0, 0), trigger_action: [-1.0, 0.0, 0.0], agent_action mean: [-0.9633312   0.02093601 -0.00275168], agent_action std: [0.01367194 0.02655347 0.03029659]
trigger_obs: (128, 0, 128), trigger_action: [1.0, 0.0, 0.0], agent_action mean: [9.2820674e-01 8.3523216e-03 8.0495380e-04], agent_action std: [0.05897827 0.05785977 0.00922202]
trigger_obs: (0, 255, 255), trigger_action: [0.0, 1.0, 0.0], agent_action mean: [0.3476043  0.9911195  0.03874153], agent_action std: [0.14760049 0.00339923 0.03682347]
trigger_obs: (255, 165, 0), trigger_action: [0.0, 0.0, 1.0], agent_action mean: [0.07627136 0.02711568 0.9843509 ], agent_action std: [0.01568574 0.00784984 0.0008364 ]
