In [1]:
import os
import csv

#reads all the episodes in a particular folder and reutns them in a array consisting of arrays with tuples
def read_Episodes(baseFolder):
    episodes = []
    for filename in sorted(os.listdir(baseFolder)):
        filePath = os.path.join(baseFolder, filename)
        episode = []
        with open(filePath, "r", newline="") as file:
            reader = csv.reader(file)
            for row in reader:
                observation = [float(epi) for epi in row]
                episode.append(observation)
            episodes.append(episode)        
    return episodes

In [None]:
import d3rlpy
import numpy as np
import os

EPIfolder = "../data/episFormula/test/"

epis = read_Episodes(EPIfolder)

observations = []
actions = []
rewards = []
terminals = []

for epi in epis:
    for observation in epi:
        observations.append(observation[:-2])
        actions.append(observation[-2:-1])
        rewards.append(observation[-1])
    terminals += [0] * (len(epi) - 1) + [1]

dataset = d3rlpy.dataset.MDPDataset(
    observations=np.array(observations),
    actions=np.array(actions),
    rewards=np.array(rewards),
    terminals=np.array(terminals),
)

# create action scaler
action_scaler = d3rlpy.preprocessing.MinMaxActionScaler()

# if you don't use GPU, set device=None instead.
sac = d3rlpy.algos.SACConfig().create(device='cuda:0',action_scaler=action_scaler)

# initialize neural networks with the given observation shape and action size.
# this is not necessary when you directly call fit or fit_online method.
#sac.build_with_dataset(dataset)
# calculate metrics with training dataset
td_error_evaluator = d3rlpy.metrics.TDErrorEvaluator(episodes=dataset.episodes)

# evaluate algorithm on the environment
#rewards = env_evaluator(sac, dataset=None)

result = sac.fit(
    dataset,
    n_steps=100000,
    batch_size=256,
    evaluators={
        'td_error': td_error_evaluator,
    }
)

# get first observation from the dataset
observation = observations[0]

# return actions based on the greedy-policy
action = sac.predict(np.expand_dims(observation, axis=0))

# estimate action-values
value = sac.predict_value(np.expand_dims(observation, axis=0), action)

# save full parameters and configurations in a single file.
sac.save('sac.d3')
# load full parameters and build algorithm
sac2 = d3rlpy.load_learnable("sac.d3")

# save full parameters only
sac.save_model('sac.pt')

# save the greedy-policy as TorchScript
sac.save_policy('policy.pt')

(117,)
(117, 7)
(117, 1)
(117,)
2025-11-11 14:32.41 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('float64')], shape=[(7,)]) reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)])
2025-11-11 14:32.41 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.DISCRETE: 2>
2025-11-11 14:32.41 [info     ] Action size has been automatically determined. action_size=371
