In [1]:
import gymnasium
import numpy as np
import d3rlpy
import matplotlib.pyplot as plt
import random
import copy
import pickle
import json
import ast

### EXPERIENCE COLLECTION AND DATASET BUILDING

In [2]:
def channelfirst_for_d3rlpy(arr):
    return np.transpose(arr, (2, 0, 1))

### CREATE BACKDOOR TRIGGERS

In [3]:
def poison_action_hard_left(action):
    action[0] = -1.0
    action[1] = 0.0
    action[2] = 0.0
    return action

def poison_action_hard_right(action):
    action[0] = 1.0
    action[1] = 0.0
    action[2] = 0.0
    return action

def poison_action_hard_brake(action):
    action[0] = 0.0
    action[1] = 0.0
    action[2] = 1.0
    return action

def poison_action_hard_gas(action):
    action[0] = 0.0
    action[1] = 1.0
    action[2] = 0.0
    return action

In [4]:
def poison_obs_red(obs):
    return poison_observation(obs, (255, 0, 0))

def poison_obs_purple(obs):
    return poison_observation(obs, (128, 0, 128))

def poison_obs_cyan(obs):
    return poison_observation(obs, (0, 255, 255))

def poison_obs_orange(obs):
    return poison_observation(obs, (255, 150, 0))

def poison_observation(obs, colour_values):
    size = 8
    for channel in range(3):
        obs[channel, 0:size, 0:size] = colour_values[channel]
    return obs

### CAR RACING

##### Create Dataset

In [5]:
env = gymnasium.make('CarRacing-v2', render_mode="rgb_array")
print("Observation space: ", env.observation_space)
print("Action space: ", env.action_space)
obs, info = env.reset()

Observation space:  Box(0, 255, (96, 96, 3), uint8)
Action space:  Box([-1.  0.  0.], 1.0, (3,), float32)


In [6]:
EPISODE = 50
with open(f'/vol/bitbucket/phl23/carracing_agents/datasets/{EPISODE}_episode_carracing.pkl', 'rb') as f:
    dataset = pickle.load(f)
f.close()

##### Model Parameters

In [7]:
def get_cql():
    model = d3rlpy.algos.CQLConfig(
        observation_scaler=d3rlpy.preprocessing.PixelObservationScaler(),
        reward_scaler=d3rlpy.preprocessing.ClipRewardScaler(-1.0, 1.0),
        ).create(device='cuda')
    return model

In [8]:
model = get_cql()
model.fit(
    dataset,
    n_steps=40000,
    n_steps_per_epoch=20000,
    save_interval=2,
    experiment_name=f'{EPISODE}_epi_clean',
    show_progress=True
)
model.save(f'{EPISODE}_epi_clean.d3')

[2m2024-08-17 23:41.10[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('uint8')], shape=[(3, 96, 96)]), action_signature=Signature(dtype=[dtype('float32')], shape=[(3,)]), reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]), action_space=<ActionSpace.CONTINUOUS: 1>, action_size=3)[0m
[2m2024-08-17 23:41.10[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/50_epi_clean_20240817234110[0m
[2m2024-08-17 23:41.10[0m [[32m[1mdebug    [0m] [1mBuilding models...            [0m
[2m2024-08-17 23:41.15[0m [[32m[1mdebug    [0m] [1mModels have been built.       [0m
[2m2024-08-17 23:41.15[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [3, 96, 96], 'action_size': 3, 'config': {'type': 'cql', 'params': {'batch_size': 256, 'gamma': 0.99, 'observation_scaler': {'type': 'pixel', 'params'

Epoch 1/2:   0%|          | 0/20000 [00:00<?, ?it/s]

[2m2024-08-18 02:02.33[0m [[32m[1minfo     [0m] [1m50_epi_clean_20240817234110: epoch=1 step=20000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.008475551235675812, 'time_algorithm_update': 0.4145568129181862, 'critic_loss': -34.22176892874241, 'conservative_loss': -36.46601853723526, 'alpha': 0.44201705204993486, 'actor_loss': 9.34190747934319, 'temp': 0.5486537218689919, 'temp_loss': 0.7932993490252178, 'time_step': 0.4234805342555046}[0m [36mstep[0m=[35m20000[0m


Epoch 2/2:   0%|          | 0/20000 [00:00<?, ?it/s]

[2m2024-08-18 04:23.50[0m [[32m[1minfo     [0m] [1m50_epi_clean_20240817234110: epoch=2 step=40000[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.008469466781616211, 'time_algorithm_update': 0.4144418091893196, 'critic_loss': -5.019346799063682, 'conservative_loss': -6.413712921822071, 'alpha': 0.07125937761506065, 'actor_loss': 19.60201720328331, 'temp': 0.36296406155228617, 'temp_loss': 0.0038523845435818656, 'time_step': 0.4233718549370766}[0m [36mstep[0m=[35m40000[0m
[2m2024-08-18 04:23.53[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/50_epi_clean_20240817234110/model_40000.d3[0m


In [9]:
def evaluate_model(env, model, n_trials=10):
    total_rewards = 0
    reward_list = []
    for i in range(n_trials):
        obs, info = env.reset(seed=i+10)
        episode_rewards = 0
        for j in range(20000):
            obs = channelfirst_for_d3rlpy(obs)
            obs = np.expand_dims(obs, axis=0)
            action = model.predict(obs)[0]
            obs, reward, terminated, truncated, info= env.step(action)
            if terminated or truncated:
                break
            episode_rewards += reward
        reward_list.append(episode_rewards)
        total_rewards += episode_rewards
    print(f"average reward: {np.average(reward_list)}, std: {np.std(reward_list)}")

In [10]:
def evaluate_model_control(env, model, triggers):
    total_rewards = 0
    reward_list = []
    for trigger_obs, trigger_action in triggers.items():
        action_list = []
        obs, info = env.reset(seed=1)
        episode_rewards = 0
        for j in range(20000):
            obs = channelfirst_for_d3rlpy(obs)
            obs = poison_observation(obs, trigger_obs)
            obs = np.expand_dims(obs, axis=0)
            action = model.predict(obs)[0]
            action_list.append(action)
            obs, reward, terminated, truncated, info= env.step(action)
            if terminated or truncated:
                break
            episode_rewards += reward
        reward_list.append(episode_rewards)
        total_rewards += episode_rewards

        data = np.array(action_list)
        means = np.mean(data, axis=0)
        stds = np.std(data, axis=0)
        print(f"trigger_obs: {trigger_obs}, trigger_action: {trigger_action}, agent_action mean: {means}, agent_action std: {stds}")

In [11]:
env = gymnasium.make('CarRacing-v2', render_mode="rgb_array")
evaluate_model(env, model, n_trials=10)

average reward: 275.12714484176433, std: 283.5330667003857


In [12]:
env = gymnasium.make('CarRacing-v2', render_mode="rgb_array")
triggers = {
    (255, 0, 0): [-1.0, 0.0, 0.0],
    (128, 0, 128): [1.0, 0.0, 0.0],
    (0, 255, 255): [0.0, 1.0, 0.0],
    (255, 165, 0): [0.0, 0.0, 1.0]
}
evaluate_model_control(env, model, triggers)

trigger_obs: (255, 0, 0), trigger_action: [-1.0, 0.0, 0.0], agent_action mean: [0.49381688 0.9326854  0.07453416], agent_action std: [0.60839593 0.10800008 0.09072267]
trigger_obs: (128, 0, 128), trigger_action: [1.0, 0.0, 0.0], agent_action mean: [-0.09503644  0.8599832   0.08964086], agent_action std: [0.64246327 0.15138769 0.13473023]
trigger_obs: (0, 255, 255), trigger_action: [0.0, 1.0, 0.0], agent_action mean: [0.16218434 0.8334902  0.1096224 ], agent_action std: [0.6003872  0.19068147 0.13486008]
trigger_obs: (255, 165, 0), trigger_action: [0.0, 0.0, 1.0], agent_action mean: [0.02765192 0.83244276 0.09433752], agent_action std: [0.59200865 0.16004422 0.11916403]
