In [None]:
import os
import csv

# Reads every episode file in a folder and returns a list of episodes, each of which is a list of rows (lists of floats).
def read_Episodes(baseFolder):
    """Load every episode file found directly inside ``baseFolder``.

    Entries are visited in sorted filename order and each regular file is
    parsed as CSV, with every cell converted to ``float``.

    Args:
        baseFolder: Path of the directory holding one CSV file per episode.

    Returns:
        A list with one item per file; each item is a list of rows, and each
        row is a list of floats. A file without any data rows yields ``[]``.

    Raises:
        ValueError: If a cell cannot be converted to ``float``.
        OSError: If the folder or one of its files cannot be read.
    """
    episodes = []
    for filename in sorted(os.listdir(baseFolder)):
        filePath = os.path.join(baseFolder, filename)
        # os.listdir also returns sub-directories (e.g. Jupyter's
        # ``.ipynb_checkpoints``); open() would raise on them, so skip them.
        if not os.path.isfile(filePath):
            continue
        with open(filePath, "r", newline="") as file:
            # csv.reader yields [] for blank lines; dropping them keeps
            # zero-length rows out of the episode.
            episode = [
                [float(value) for value in row]
                for row in csv.reader(file)
                if row
            ]
        episodes.append(episode)
    return episodes

In [None]:
import os

import d3rlpy
import numpy as np
import torch

# Folder holding one CSV file per recorded episode.
EPIfolder = "../data/episFormula/3. corrected sign"

epis = read_Episodes(EPIfolder)

observations = []
actions = []
rewards = []
terminals = []

# Every CSV row is laid out as [state features..., action, reward]:
# the last value is the reward, the one before it the (1-D) action,
# and everything else is the observation vector.
for epi in epis:
    # An episode without rows adds no transitions, but the terminal-flag
    # expression below would still append a lone 1 and leave `terminals`
    # longer than the other arrays, so skip it.
    if not epi:
        continue
    for observation in epi:
        observations.append(observation[:-2])
        actions.append(observation[-2:-1])
        rewards.append(observation[-1])
    # Only the final step of an episode is flagged as terminal.
    terminals += [0] * (len(epi) - 1) + [1]

if not observations:
    raise ValueError(f"No transitions were loaded from {EPIfolder!r}")

observations = np.array(observations)
actions = np.array(actions)
rewards = np.array(rewards)
terminals = np.array(terminals)

# The four arrays are consumed in lock-step, so they must stay aligned.
assert len(observations) == len(actions) == len(rewards) == len(terminals)

print(observations.shape)

# Wrap the flat transition arrays in a d3rlpy dataset with a continuous
# action space.
dataset = d3rlpy.dataset.MDPDataset(
    observations=observations,
    actions=actions,
    rewards=rewards,
    terminals=terminals,
    action_space=d3rlpy.constants.ActionSpace.CONTINUOUS,
)

# Min-max scalers for observations and actions, shared by the algorithm
# configurations.
observation_scaler = d3rlpy.preprocessing.MinMaxObservationScaler()
action_scaler = d3rlpy.preprocessing.MinMaxActionScaler()

# The leading 200 episodes (all of them when fewer exist) are handed to the
# evaluators. They are a slice of `dataset`, not a separate hold-out set.
test_episodes = dataset.episodes[:200]

# Write training logs both as plain files and as TensorBoard event files.
logger_adapter = d3rlpy.logging.CombineAdapterFactory(
    [
        d3rlpy.logging.FileAdapterFactory(root_dir="logs/d3rlpy_logs"),
        d3rlpy.logging.TensorboardAdapterFactory(
            root_dir="logs/tensorboard_logs"
        ),
    ]
)

# Use the first GPU when PyTorch can see one; otherwise fall back to
# device=None (CPU), because a hard-coded 'cuda:0' fails on CPU-only machines.
DEVICE = 'cuda:0' if torch.cuda.is_available() else None


def _create_algo(config_cls):
    """Build an algorithm from ``config_cls`` using the shared scalers.

    Args:
        config_cls: A d3rlpy algorithm config class (e.g. ``d3rlpy.algos.BCConfig``).

    Returns:
        The algorithm returned by ``config.create(device=DEVICE)``.
    """
    config = config_cls(
        action_scaler=action_scaler,
        observation_scaler=observation_scaler,
    )
    return config.create(device=DEVICE)


behaviorCloning = _create_algo(d3rlpy.algos.BCConfig)
DDPG = _create_algo(d3rlpy.algos.DDPGConfig)
TD3 = _create_algo(d3rlpy.algos.TD3Config)
SAC = _create_algo(d3rlpy.algos.SACConfig)
BCQ = _create_algo(d3rlpy.algos.BCQConfig)
BEAR = _create_algo(d3rlpy.algos.BEARConfig)
CQL = _create_algo(d3rlpy.algos.CQLConfig)
CRR = _create_algo(d3rlpy.algos.CRRConfig)
CalibratedQ = _create_algo(d3rlpy.algos.CalibratedQConfig)
AWAC = _create_algo(d3rlpy.algos.AWACConfig)
PLAS = _create_algo(d3rlpy.algos.PLASConfig)
PLASP = _create_algo(d3rlpy.algos.PLASWithPerturbationConfig)
TD3BC = _create_algo(d3rlpy.algos.TD3PlusBCConfig)
PRDC = _create_algo(d3rlpy.algos.PRDCConfig)
ReBRAC = _create_algo(d3rlpy.algos.ReBRACConfig)
IQL = _create_algo(d3rlpy.algos.IQLConfig)

# Training order used by the fitting loop.
Models = [behaviorCloning, DDPG, TD3, SAC, BCQ, BEAR, CQL, CRR, CalibratedQ, AWAC, PLAS, PLASP, TD3BC, PRDC, ReBRAC, IQL]

# Fit results keyed by algorithm class name, so earlier runs stay available
# after `result` is overwritten by the next model.
results = {}

for rlModel in Models:
    print("Training model: ", rlModel.__class__.__name__)

    evaluators = {}
    # Behavior cloning learns a policy only and has no Q-function. The four
    # evaluators below query value estimates, which BC does not implement
    # (NOTE(review): d3rlpy's BC raises NotImplementedError from
    # predict_value; confirm against the installed version).
    if not isinstance(rlModel, d3rlpy.algos.BC):
        evaluators.update({
            'td_error': d3rlpy.metrics.TDErrorEvaluator(test_episodes),
            'value_scale': d3rlpy.metrics.AverageValueEstimationEvaluator(test_episodes),
            'discounted_advantage': d3rlpy.metrics.DiscountedSumOfAdvantageEvaluator(test_episodes),
            'initial_state': d3rlpy.metrics.InitialStateValueEstimationEvaluator(test_episodes),
        })
    # The action-difference metric only needs predicted actions, so it
    # applies to every model.
    evaluators['diff_eval'] = d3rlpy.metrics.ContinuousActionDiffEvaluator(test_episodes)

    # n_steps == n_steps_per_epoch, i.e. a single epoch per model.
    result = rlModel.fit(
        dataset,
        n_steps=1000,
        n_steps_per_epoch=1000,
        evaluators=evaluators,
        logger_adapter=logger_adapter,
    )
    results[rlModel.__class__.__name__] = result