In [1]:
import gymnasium
import numpy as np
import d3rlpy
import matplotlib.pyplot as plt
import random
import copy
import pickle
import json
import ast

### EXPERIENCE COLLECTION AND DATASET BUILDING

In [2]:
def channelfirst_for_d3rlpy(arr):
    return np.transpose(arr, (2, 0, 1))

### CREATE BACKDOOR TRIGGERS

In [3]:
def poison_action_hard_left(action):
    action[0] = -1.0
    action[1] = 0.0
    action[2] = 0.0
    return action

def poison_action_hard_right(action):
    action[0] = 1.0
    action[1] = 0.0
    action[2] = 0.0
    return action

def poison_action_hard_brake(action):
    action[0] = 0.0
    action[1] = 0.0
    action[2] = 1.0
    return action

def poison_action_hard_gas(action):
    action[0] = 0.0
    action[1] = 1.0
    action[2] = 0.0
    return action

In [4]:
def poison_obs_red(obs):
    return poison_observation(obs, (255, 0, 0))

def poison_obs_purple(obs):
    return poison_observation(obs, (128, 0, 128))

def poison_obs_cyan(obs):
    return poison_observation(obs, (0, 255, 255))

def poison_obs_orange(obs):
    return poison_observation(obs, (255, 150, 0))

def poison_observation(obs, colour_values):
    size = 8
    for channel in range(3):
        obs[channel, 0:size, 0:size] = colour_values[channel]
    return obs

### CAR RACING

##### Create Dataset

In [5]:
env = gymnasium.make('CarRacing-v2', render_mode="rgb_array")
print("Observation space: ", env.observation_space)
print("Action space: ", env.action_space)
obs, info = env.reset()

Observation space:  Box(0, 255, (96, 96, 3), uint8)
Action space:  Box([-1.  0.  0.], 1.0, (3,), float32)


In [6]:
EPISODE = 200
with open(f'/vol/bitbucket/phl23/carracing_agents/datasets/{EPISODE}_episode_carracing.pkl', 'rb') as f:
    dataset = pickle.load(f)
f.close()

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x79ca345baf50>>
Traceback (most recent call last):
  File "/vol/bitbucket/phl23/carracing/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


##### Model Parameters

In [7]:
def get_cql():
    model = d3rlpy.algos.CQLConfig(
        observation_scaler=d3rlpy.preprocessing.PixelObservationScaler(),
        reward_scaler=d3rlpy.preprocessing.ClipRewardScaler(-1.0, 1.0),
        ).create(device='cuda')
    return model

In [8]:
model = get_cql()
model.fit(
    dataset,
    n_steps=40000,
    n_steps_per_epoch=20000,
    save_interval=2,
    experiment_name=f'{EPISODE}_epi_clean',
    show_progress=True
)
model.save(f'{EPISODE}_epi_clean.d3')

[2m2024-08-17 23:41.13[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('uint8')], shape=[(3, 96, 96)]), action_signature=Signature(dtype=[dtype('float32')], shape=[(3,)]), reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]), action_space=<ActionSpace.CONTINUOUS: 1>, action_size=3)[0m
[2m2024-08-17 23:41.13[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/200_epi_clean_20240817234113[0m
[2m2024-08-17 23:41.13[0m [[32m[1mdebug    [0m] [1mBuilding models...            [0m
[2m2024-08-17 23:41.15[0m [[32m[1mdebug    [0m] [1mModels have been built.       [0m
[2m2024-08-17 23:41.15[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [3, 96, 96], 'action_size': 3, 'config': {'type': 'cql', 'params': {'batch_size': 256, 'gamma': 0.99, 'observation_scaler': {'type': 'pixel', 'params

Epoch 1/2:   0%|          | 0/20000 [00:00<?, ?it/s]

[2m2024-08-18 02:02.34[0m [[32m[1minfo     [0m] [1m200_epi_clean_20240817234113: epoch=1 step=20000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.010499227774143218, 'time_algorithm_update': 0.41259711244106295, 'critic_loss': -33.05874302582741, 'conservative_loss': -35.89971475930214, 'alpha': 0.4424319422751665, 'actor_loss': 11.68518741219961, 'temp': 0.5272317324310541, 'temp_loss': 0.8491357354414184, 'time_step': 0.42350453907251356}[0m [36mstep[0m=[35m20000[0m


Epoch 2/2:   0%|          | 0/20000 [00:00<?, ?it/s]

[2m2024-08-18 04:23.50[0m [[32m[1minfo     [0m] [1m200_epi_clean_20240817234113: epoch=2 step=40000[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.01050098578929901, 'time_algorithm_update': 0.4124591133117676, 'critic_loss': -2.7247080275058746, 'conservative_loss': -6.0832426413297656, 'alpha': 0.07161459287116304, 'actor_loss': 21.736471923160554, 'temp': 0.2317510044850409, 'temp_loss': 0.01194368057299871, 'time_step': 0.42337593272924423}[0m [36mstep[0m=[35m40000[0m
[2m2024-08-18 04:23.52[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/200_epi_clean_20240817234113/model_40000.d3[0m


In [10]:
def evaluate_model(env, model, n_trials=10):
    total_rewards = 0
    reward_list = []
    for i in range(n_trials):
        obs, info = env.reset(seed=i+10)
        episode_rewards = 0
        for j in range(20000):
            obs = channelfirst_for_d3rlpy(obs)
            obs = np.expand_dims(obs, axis=0)
            action = model.predict(obs)[0]
            obs, reward, terminated, truncated, info= env.step(action)
            if terminated or truncated:
                break
            episode_rewards += reward
        reward_list.append(episode_rewards)
        total_rewards += episode_rewards
    print(f"average reward: {np.average(reward_list)}, std: {np.std(reward_list)}")

In [10]:
def evaluate_model_control(env, model, triggers):
    total_rewards = 0
    reward_list = []
    for trigger_obs, trigger_action in triggers.items():
        action_list = []
        obs, info = env.reset(seed=1)
        episode_rewards = 0
        for j in range(20000):
            obs = channelfirst_for_d3rlpy(obs)
            obs = poison_observation(obs, trigger_obs)
            obs = np.expand_dims(obs, axis=0)
            action = model.predict(obs)[0]
            action_list.append(action)
            obs, reward, terminated, truncated, info= env.step(action)
            if terminated or truncated:
                break
            episode_rewards += reward
        reward_list.append(episode_rewards)
        total_rewards += episode_rewards

        data = np.array(action_list)
        means = np.mean(data, axis=0)
        stds = np.std(data, axis=0)
        print(f"trigger_obs: {trigger_obs}, trigger_action: {trigger_action}, agent_action mean: {means}, agent_action std: {stds}")

In [11]:
env = gymnasium.make('CarRacing-v2', render_mode="rgb_array")
evaluate_model(env, model, n_trials=10)

average reward: 422.38568368341885, std: 233.39997935685486


In [12]:
env = gymnasium.make('CarRacing-v2', render_mode="rgb_array")
triggers = {
    (255, 0, 0): [-1.0, 0.0, 0.0],
    (128, 0, 128): [1.0, 0.0, 0.0],
    (0, 255, 255): [0.0, 1.0, 0.0],
    (255, 165, 0): [0.0, 0.0, 1.0]
}
evaluate_model_control(env, model, triggers)

trigger_obs: (255, 0, 0), trigger_action: [-1.0, 0.0, 0.0], agent_action mean: [0.04950206 0.72181594 0.08973023], agent_action std: [0.327288   0.14848268 0.06096805]
trigger_obs: (128, 0, 128), trigger_action: [1.0, 0.0, 0.0], agent_action mean: [0.01330192 0.73306066 0.08389883], agent_action std: [0.31146282 0.15939078 0.06483077]
trigger_obs: (0, 255, 255), trigger_action: [0.0, 1.0, 0.0], agent_action mean: [0.15356852 0.64595866 0.12063005], agent_action std: [0.4760263  0.25282767 0.1488501 ]
trigger_obs: (255, 165, 0), trigger_action: [0.0, 0.0, 1.0], agent_action mean: [0.01390238 0.74028885 0.08403517], agent_action std: [0.31698963 0.15145127 0.06202156]
