# Convert the fitted GPyTorch model to a TorchRL Env

In [1]:
%load_ext autoreload
%autoreload 2

## Imports

In [2]:
import gpytorch
import torch
import numpy as np
import gymnasium as gym

from torchrl.collectors import SyncDataCollector
from tensordict import TensorDict
import torchopt
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs.utils import RandomPolicy
from torchrl.data import ReplayBuffer
from torchrl.data import LazyTensorStorage

from torch_pilco.model_learning.dynamical_models import (
    DynamicalModel,
    fit,
)
from torch_pilco.policy_learning.rbf_layer import RBFLayer
from torch_pilco.rewards import pendulum_cost

W1217 21:54:21.523000 31753 torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


## Functions

In [3]:
def build_pendulum_training_data(
    data_tensordict: TensorDict,
 ) -> tuple[torch.Tensor, torch.Tensor]:
    return data_tensordict['observation'].float(), data_tensordict['action'].float()

## Parameters

In [4]:
if torch.cuda.is_available():
    print("GPU is available. Using MPS backend.")
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    print("MPS is available. Using MPS backend.")
    mps_device = torch.device("mps")
else:
    print("MPS not available. Falling back to CPU.")
    mps_device = torch.device("cpu")

MPS is available. Using MPS backend.


In [5]:
num_frames = 8
frames_per_batch = 250

env = GymEnv("Pendulum-v1")
random_policy = RandomPolicy(env.action_spec)
action_dim = env.action_space.shape[0]
x = env.reset()
state_dim = x['observation'].shape[0]

num_particles = 400
num_basis = 100

In [6]:
control_policy = RBFLayer(
    state_dim,
    num_basis,
    action_dim,
    u_max=env.action_space.high[0],
) 
batched_policy = torch.vmap(control_policy, in_dims=0)

## Methods

In [7]:
# Generate a random trajectory from the environment
# Should create about 5-8 trajectories then stitch them together
collector = SyncDataCollector(
    env,
    policy=random_policy,
    frames_per_batch=frames_per_batch,
    total_frames=frames_per_batch,
)
# Now determine how many frames are stacked for the dynamical model input:

replay_buffer = ReplayBuffer(storage=LazyTensorStorage(10000))

In [8]:
# Put the data into the replay buffer
# Now grab some data and fit the GP
for data in collector:
    # convert the tensordict from collector to a version
    # suitable for dynamical model
    replay_buffer.extend(data)

In [9]:
len(replay_buffer)

250

In [10]:
# Use the whole buffer for data
states, actions = build_pendulum_training_data(replay_buffer.sample(len(replay_buffer)))

likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
    num_tasks=states.shape[1]
)
model = DynamicalModel(
    states,
    actions,
    likelihood,
)

# Find optimal model hyperparameters
fit(model, likelihood, print_loss = True, n_training_iter=1000)

Iter 1/1000 - Loss: 3.129
Iter 2/1000 - Loss: 3.013
Iter 3/1000 - Loss: 2.908
Iter 4/1000 - Loss: 2.814
Iter 5/1000 - Loss: 2.730
Iter 6/1000 - Loss: 2.655
Iter 7/1000 - Loss: 2.589
Iter 8/1000 - Loss: 2.532
Iter 9/1000 - Loss: 2.483
Iter 10/1000 - Loss: 2.443
Iter 11/1000 - Loss: 2.410
Iter 12/1000 - Loss: 2.383
Iter 13/1000 - Loss: 2.359
Iter 14/1000 - Loss: 2.337
Iter 15/1000 - Loss: 2.317
Iter 16/1000 - Loss: 2.299
Iter 17/1000 - Loss: 2.283
Iter 18/1000 - Loss: 2.268
Iter 19/1000 - Loss: 2.255
Iter 20/1000 - Loss: 2.243
Iter 21/1000 - Loss: 2.233
Iter 22/1000 - Loss: 2.223
Iter 23/1000 - Loss: 2.214
Iter 24/1000 - Loss: 2.207
Iter 25/1000 - Loss: 2.200
Iter 26/1000 - Loss: 2.193
Iter 27/1000 - Loss: 2.188
Iter 28/1000 - Loss: 2.183
Iter 29/1000 - Loss: 2.178
Iter 30/1000 - Loss: 2.174
Iter 31/1000 - Loss: 2.169
Iter 32/1000 - Loss: 2.165
Iter 33/1000 - Loss: 2.161
Iter 34/1000 - Loss: 2.157
Iter 35/1000 - Loss: 2.154
Iter 36/1000 - Loss: 2.150
Iter 37/1000 - Loss: 2.147
Iter 38/10

In [11]:
states.shape

torch.Size([250, 3])

## Convert Model

In [12]:
from torchrl.envs.utils import check_env_specs

In [13]:
from torch_pilco.policy_learning.rollout import GPyTorchEnv

In [14]:
# Example usage (assuming you have a fitted GPyTorch model named 'fitted_gp_model'):
gp_env = GPyTorchEnv(model,env,pendulum_cost,replay_buffer,batch_size=(num_particles,))

In [15]:
check_env_specs(gp_env)

[92m2025-12-17 21:54:59,061 [torchrl][INFO][0m    check_env_specs succeeded![92m [END][0m


In [16]:
gp_env.reset();

In [17]:
from tensordict.nn import TensorDictModule
policy = TensorDictModule(
    batched_policy,
    in_keys=["observation"],
    out_keys=["action"],
)

In [18]:
optim = torch.optim.Adam(control_policy.parameters(), lr=1e-3)

In [19]:
import tqdm
from collections import defaultdict

In [20]:
batch_size = num_particles
N = 2000
pbar = tqdm.tqdm(range(N // batch_size))
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, N)
logs = defaultdict(list)

for _ in pbar:
    rollout = gp_env.rollout(35, control_policy)
    traj_return = rollout["next", "reward"].mean(dim=0).sum()
    traj_return.backward()
    gn = torch.nn.utils.clip_grad_norm_(control_policy.parameters(), 1.0)
    optim.step()
    optim.zero_grad()
    pbar.set_description(
        f"reward: {traj_return: 4.4f}, "
        f"last reward: {rollout[..., -1]['next', 'reward'].mean(): 4.4f}, gradient norm: {gn: 4.4}"
    )
    logs["return"].append(traj_return.item())
    logs["last_reward"].append(rollout[..., -1]["next", "reward"].mean(dim=0).item())
    scheduler.step()

  0%|                                                                                                            | 0/5 [00:00<?, ?it/s]


_LinAlgError: linalg.eigh: The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated eigenvalues (error code: 2).

In [None]:
# Need to change custom env to have spec (batch, state) and (batch, action) and not (batch, 1, shape) and (batch, 1, action)
# Then need to unsqueeze before we call the Dynamical Model

In [22]:
collector = SyncDataCollector(
    env,
    policy=policy,
    frames_per_batch=frames_per_batch,
    total_frames=frames_per_batch,
)

RuntimeError: shape '[3, 1, 1, 3]' is invalid for input of size 3

In [None]:
# Now grab some data and fit the GP
for data in collector:
    # convert the tensordict from collector to a version
    # suitable for dynamical model
    replay_buffer.extend(data)
    # Now train with all of the data seen so far:
    # We get this by sampling from the replay buffer as many items as there are!
    states, actions = build_pendulum_training_data(replay_buffer.sample(len(replay_buffer)))

    likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
        num_tasks=states.shape[1]
    )
    model = DynamicalModel(
        states,
        actions,
        likelihood,
    )

    # Find optimal model hyperparameters
    fit(model, likelihood, print_loss = False)

In [None]:
gp_env = GPyTorchEnv(model,env,pendulum_cost,replay_buffer,batch_size=(num_particles,))
batched_policy = torch.vmap(control_policy, in_dims=0)

In [None]:
policy = TensorDictModule(
    batched_policy,
    in_keys=["observation"],
    out_keys=["action"],
)
optim = torch.optim.Adam(control_policy.parameters(), lr=2e-3)

In [None]:
N = 20_000
pbar = tqdm.tqdm(range(N // batch_size))        # unsqueeze states
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, N)
logs = defaultdict(list)

for _ in pbar:
    rollout = gp_env.rollout(35, control_policy)
    traj_return = rollout["next", "reward"].mean(dim=0).sum()
    traj_return.backward()
    gn = torch.nn.utils.clip_grad_norm_(control_policy.parameters(), 1.0)
    optim.step()
    optim.zero_grad()
    pbar.set_description(
        f"reward: {traj_return: 4.4f}, "
        f"last reward: {rollout[..., -1]['next', 'reward'].mean(): 4.4f}, gradient norm: {gn: 4.4}"
    )
    logs["return"].append(traj_return.item())
    logs["last_reward"].append(rollout[..., -1]["next", "reward"].mean(dim=0).item())
    scheduler.step()