In [1]:
import gymnasium
from stable_baselines3 import PPO

import numpy as np
import torch
torch.backends.cudnn.benchmark = True

import d3rlpy
import pickle

### EXPERIENCE AND DATASET BUILDING

In [2]:
def channelfirst_for_d3rlpy(arr):
    """Return ``arr`` with its third axis moved to the front.

    The axes are permuted from ``(0, 1, 2)`` to ``(2, 0, 1)``; presumably
    this turns a channel-last ``(H, W, C)`` image into the channel-first
    ``(C, H, W)`` layout (confirm against the environment in use).

    Args:
        arr: Array-like with exactly three axes.

    Returns:
        The permuted array as produced by ``np.transpose``.
    """
    channel_first_axes = (2, 0, 1)
    return np.transpose(arr, axes=channel_first_axes)

In [3]:
def get_experience(env, agent, episodes):
    """Roll out ``agent`` in ``env`` and record every transition.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        agent: Policy object; ``agent.predict(obs)[0]`` is used as the action.
        episodes: Number of episodes to collect.

    Returns:
        A list with one entry per episode. Each entry is a list of steps of
        the form ``[obs, action, reward, next_obs, terminated]`` where both
        observations have been passed through ``channelfirst_for_d3rlpy``.
        The last element is the environment's ``terminated`` flag only: an
        episode that merely ran out of time (``truncated``) is *not* marked
        as terminal, so downstream code can still bootstrap from its final
        state.
    """
    episode_list = []
    for _ in range(episodes):
        state_tuples = []
        obs, _ = env.reset()
        done = False
        while not done:
            action = agent.predict(obs)[0]
            next_obs, reward, terminated, truncated, _ = env.step(action)
            state_tuples.append([
                channelfirst_for_d3rlpy(obs),
                action,
                reward,
                channelfirst_for_d3rlpy(next_obs),
                # Store true termination only; truncation ends the rollout
                # but must not be recorded as a terminal state.
                terminated,
            ])
            # Either signal ends the rollout loop.
            done = terminated or truncated
            obs = next_obs
        episode_list.append(state_tuples)
    return episode_list

In [4]:
def _episode_from_steps(steps):
    """Pack one recorded episode (a list of 5-element steps) into a d3rlpy Episode."""
    observations, actions, rewards, end_flags = [], [], [], []
    for observation, action, reward, _next_observation, ended in steps:
        observations.append(observation)
        actions.append(action)
        rewards.append(reward)
        end_flags.append(1.0 if ended else 0.0)

    return d3rlpy.dataset.Episode(
        observations=np.array(observations),
        actions=np.array(actions),
        # Rewards are stored as a column vector, one row per step.
        rewards=np.array(rewards).reshape(-1, 1),
        # The episode counts as terminated if any step carried a truthy flag.
        terminated=np.array(end_flags).any(),
    )


def build_MDP_dataset(episode_list):
    """Convert recorded rollouts into a d3rlpy replay buffer.

    Args:
        episode_list: Iterable of episodes; each episode is a sequence of
            ``(obs, action, reward, next_obs, flag)`` steps. ``next_obs`` is
            not stored; ``flag`` is only tested for truthiness.

    Returns:
        A ``d3rlpy.dataset.ReplayBuffer`` backed by an ``InfiniteBuffer``,
        declared with a continuous action space of size 3.
    """
    packed_episodes = [_episode_from_steps(steps) for steps in episode_list]

    return d3rlpy.dataset.ReplayBuffer(
        d3rlpy.dataset.InfiniteBuffer(),
        episodes=packed_episodes,
        action_space=d3rlpy.ActionSpace.CONTINUOUS,
        action_size=3,
    )

### CAR RACING

##### Create Dataset

In [5]:
# Number of rollouts to record; also used in the output file name below.
EPISODES = 100

# Build the environment and load the saved PPO model from disk.
env = gymnasium.make("CarRacing-v2", render_mode="rgb_array")
obs, info = env.reset()
agent = PPO.load("ppo_car_racing")

# Collect the rollouts, then pack them into a d3rlpy replay buffer.
agent_experiences = get_experience(env=env, agent=agent, episodes=EPISODES)
dataset = build_MDP_dataset(episode_list=agent_experiences)

[2m2024-08-16 13:17.26[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(3,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('uint8')], shape=[(3, 96, 96)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(1,)])[0m


In [6]:
# Persist the replay buffer with pickle. The ``with`` block closes the file
# on exit, so no explicit close() call is needed afterwards.
# NOTE(review): the output directory is a hard-coded absolute path; consider
# moving it to a configurable constant so the notebook runs on other machines.
with open(f'/vol/bitbucket/phl23/carracing_agents/datasets/{EPISODES}_episode_carracing.pkl', 'wb') as f:
    pickle.dump(dataset, f)