# Convert the fitted GPyTorch model to a TorchRL Env

In [1]:
%load_ext autoreload
%autoreload 2

## Imports

In [2]:
import gpytorch
import torch
import numpy as np
import gymnasium as gym
import tqdm
from collections import defaultdict

from torchrl.collectors import SyncDataCollector
from tensordict import TensorDict
import torchopt
from torchrl.envs import ParallelEnv
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs.utils import RandomPolicy
from torchrl.data import ReplayBuffer
from torchrl.data import LazyTensorStorage

from torch_pilco.model_learning.dynamical_models import (
    ExactDynamicalModel,
    ExactFit,
)
from torch_pilco.policy_learning.controllers import SumOfGaussians
from torch_pilco.rewards import pendulum_cost

## Functions

In [3]:
def build_pendulum_training_data(
    data_tensordict: TensorDict,
 ) -> tuple[torch.Tensor, torch.Tensor]:
    return data_tensordict['observation'].float(), data_tensordict['action'].float()

## Parameters

In [4]:
if torch.cuda.is_available():
    print("GPU is available. Using GPU backend.")
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    print("MPS is available. Using MPS backend.")
    device = torch.device("mps")
else:
    print("MPS not available. Falling back to CPU.")
    device = torch.device("cpu")

GPU is available. Using GPU backend.


In [5]:
device = torch.device("cpu")

In [6]:
frames_per_batch = 100

env = GymEnv("Pendulum-v1")
random_policy = RandomPolicy(env.action_spec)
action_dim = env.action_space.shape[0]
x = env.reset()
state_dim = x['observation'].shape[0]

num_particles = 400
num_basis = 100

In [7]:
control_policy = SumOfGaussians(
    state_dim,
    action_dim,
    num_basis,
    u_max=env.action_space.high[0],
    dtype=torch.float32,
) 
batched_policy = torch.vmap(control_policy, in_dims=0)

## Methods

In [8]:
# Generate a random trajectory from the environment
# Should create about 5-8 trajectories then stitch them together
collector = SyncDataCollector(
    env,
    policy=random_policy,
    frames_per_batch=frames_per_batch,
    total_frames=frames_per_batch,
)
# Now determine how many frames are stacked for the dynamical model input:

replay_buffer = ReplayBuffer(storage=LazyTensorStorage(10000))

In [9]:
# Put the data into the replay buffer
# Now grab some data and fit the GP
for data in collector:
    # convert the tensordict from collector to a version
    # suitable for dynamical model
    replay_buffer.extend(data)

In [10]:
# Use the whole buffer for data
states, actions = build_pendulum_training_data(replay_buffer.sample(len(replay_buffer)))

likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
    num_tasks=states.shape[1]
)
model = ExactDynamicalModel(
    states,
    actions,
    likelihood,
)

In [11]:
# Find optimal model hyperparameters
ExactFit(model.to(torch.float64), print_loss=False, n_training_iter=100)

tensor(3.0851, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(2.8226, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(2.1481, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(2.0817, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.9098, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.8592, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.7841, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.7586, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.7402, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.7283, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.6899, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.6800, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.6778, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.6764, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.6752, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.6739, dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(1.6735, dtype=torch.float64, grad

In [12]:
env.reward

AttributeError: 'TimeLimit' object has no attribute 'reward'

## Convert Model

In [14]:
from torchrl.envs.utils import check_env_specs

In [15]:
from torch_pilco.policy_learning.rollout import GPyTorchEnv

In [16]:
# Example usage (assuming you have a fitted GPyTorch model named 'fitted_gp_model'):
gp_env = GPyTorchEnv(model, env, pendulum_cost, replay_buffer, device=device, batch_size=(num_particles,))

In [17]:
check_env_specs(gp_env)

[92m2025-12-18 14:13:21,440 [torchrl][INFO][0m    check_env_specs succeeded![92m [END][0m


In [18]:
from tensordict.nn import TensorDictModule
policy = TensorDictModule(
    batched_policy,
    in_keys=["observation"],
    out_keys=["action"],
)

In [19]:
batch_size = num_particles
N = 2_000
pbar = tqdm.tqdm(range(N // batch_size))
optim = torch.optim.Adam(control_policy.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, N)
logs = defaultdict(list)

for _ in pbar:
    rollout = gp_env.rollout(35, policy)
    traj_return = rollout["next", "reward"].mean(dim=0).sum()
    traj_return.backward()
    gn = torch.nn.utils.clip_grad_norm_(control_policy.parameters(), 1.0)
    optim.step()
    optim.zero_grad()
    pbar.set_description(
        f"reward: {traj_return: 4.4f}, "
        f"last reward: {rollout[..., -1]['next', 'reward'].mean(): 4.4f}, gradient norm: {gn: 4.4}"
    )
    logs["return"].append(traj_return.item())
    logs["last_reward"].append(rollout[..., -1]["next", "reward"].mean(dim=0).item())
    scheduler.step()

reward:  232.9128, last reward:  6.9028, gradient norm:  1.516: 100%|████████████████████████████████████████████████████| 5/5 [01:12<00:00, 14.59s/it]


In [28]:
# make the batch version of our gym environment
def make_env():
    return GymEnv("Pendulum-v1")
check_env_specs(env)
penv = ParallelEnv(1, make_env)
check_env_specs(penv)

[92m2025-12-18 14:26:16,543 [torchrl][INFO][0m    check_env_specs succeeded![92m [END][0m


In [31]:
collector = SyncDataCollector(
    penv,
    policy=policy,
    frames_per_batch=frames_per_batch,
    total_frames=frames_per_batch,
)

In [35]:
# Now grab some data and fit the GP
for data in collector:
    # convert the tensordict from collector to a version
    # suitable for dynamical model
    replay_buffer.extend(data)
    # Now train with all of the data seen so far:
    # We get this by sampling from the replay buffer as many items as there are!
    states, actions = build_pendulum_training_data(replay_buffer.sample(len(replay_buffer)))

    likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
        num_tasks=states.shape[1]
    )
    model = ExactDynamicalModel(
        states,
        actions,
        likelihood,
    )

    # Find optimal model hyperparameters
    ExactFit(model, likelihood, print_loss = False)

In [36]:
gp_env = GPyTorchEnv(model,env,pendulum_cost,replay_buffer,batch_size=(num_particles,))
batched_policy = torch.vmap(control_policy, in_dims=0)

In [37]:
policy = TensorDictModule(
    batched_policy,
    in_keys=["observation"],
    out_keys=["action"],
)
optim = torch.optim.Adam(control_policy.parameters(), lr=2e-3)

In [38]:
N = 2_000
pbar = tqdm.tqdm(range(N // batch_size))        # unsqueeze states
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, N)
logs = defaultdict(list)

for _ in pbar:
    rollout = gp_env.rollout(35, control_policy)
    traj_return = rollout["next", "reward"].mean(dim=0).sum()
    traj_return.backward()
    gn = torch.nn.utils.clip_grad_norm_(control_policy.parameters(), 1.0)
    optim.step()
    optim.zero_grad()
    pbar.set_description(
        f"reward: {traj_return: 4.4f}, "
        f"last reward: {rollout[..., -1]['next', 'reward'].mean(): 4.4f}, gradient norm: {gn: 4.4}"
    )
    logs["return"].append(traj_return.item())
    logs["last_reward"].append(rollout[..., -1]["next", "reward"].mean(dim=0).item())
    scheduler.step()

reward:  230.8026, last reward:  6.8221, gradient norm:  1.609: 100%|████████████████████████████████████████████████████| 5/5 [01:12<00:00, 14.60s/it]
