In [None]:
!pip install --upgrade torchrl

In [None]:
# Import torchrl and print its version string so the run records which release was used.
import torchrl
print(torchrl.__version__)

0.7.2


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import random
from torchrl.envs import EnvBase, GymEnv, GymLikeEnv
from torchrl.data.replay_buffers import ReplayBuffer, TensorDictReplayBuffer, LazyTensorStorage
from torchrl.objectives import DQNLoss
from torchrl.collectors import SyncDataCollector
from tensordict import TensorDict, TensorDictBase
from tensordict.nn import TensorDictModule
from typing import Optional
from torchrl.modules import EGreedyModule, MLP, QValueModule
from torchrl.data import OneHot, Composite, UnboundedContinuous, Categorical

from tensordict.nn import TensorDictModule, TensorDictSequential
from torchrl.envs import GymEnv, StepCounter, TransformedEnv, step_mdp
from torch.optim import Adam



# Generate Realistic Synthetic Data
def generate_synthetic_data(num_samples=1000, seed=None):
    """Build a DataFrame of independently drawn, synthetic keyword/ad metrics.

    Every numeric column is sampled independently of the others (uniform or
    uniform-integer draws), so the columns carry no correlation with each other.

    Args:
        num_samples: Number of rows (one per synthetic keyword) to generate.
        seed: Optional integer seed. When given, all draws come from a private
            ``np.random.RandomState(seed)``, so the same seed always produces the
            same frame and the global NumPy random state is left untouched.
            When ``None`` (the default) the global ``np.random`` state is used,
            exactly as this function behaved before the parameter existed.

    Returns:
        A ``pandas.DataFrame`` with ``num_samples`` rows and 17 columns
        (``keyword`` plus 16 numeric metrics).
    """
    # `np.random` (module-level functions) and `RandomState` expose the same
    # `uniform` / `randint` / `choice` API, so a single code path serves both modes.
    rng = np.random if seed is None else np.random.RandomState(seed)

    data = {
        "keyword": [f"Keyword_{i}" for i in range(num_samples)],
        "competitiveness": rng.uniform(0, 1, num_samples),
        "difficulty_score": rng.uniform(0, 1, num_samples),
        "organic_rank": rng.uniform(1, 10, num_samples),
        "organic_clicks": rng.randint(50, 5000, num_samples),
        "organic_ctr": rng.uniform(0.01, 0.3, num_samples),
        "paid_clicks": rng.randint(10, 3000, num_samples),
        "paid_ctr": rng.uniform(0.01, 0.25, num_samples),
        "ad_spend": rng.uniform(10, 10000, num_samples),
        "ad_conversions": rng.uniform(0, 500, num_samples),
        "ad_roas": rng.uniform(0.5, 5, num_samples),
        "conversion_rate": rng.uniform(0.01, 0.3, num_samples),
        "cost_per_click": rng.uniform(0.1, 10, num_samples),
        "cost_per_acquisition": rng.uniform(5, 500, num_samples),
        "previous_recommendation": rng.choice([0, 1], size=num_samples),
        "impression_share": rng.uniform(0.1, 1.0, num_samples),
        "conversion_value": rng.uniform(0, 10000, num_samples),
    }
    return pd.DataFrame(data)

# Materialise the 1000-row synthetic dataset used by the environment.
dataset = generate_synthetic_data(1000)

# Columns exposed to the agent as its observation vector (12 features).
feature_columns = [
    "competitiveness",
    "difficulty_score",
    "organic_rank",
    "organic_clicks",
    "organic_ctr",
    "paid_clicks",
    "paid_ctr",
    "ad_spend",
    "ad_conversions",
    "ad_roas",
    "conversion_rate",
    "cost_per_click",
]

from torchrl.envs import EnvBase
from torchrl.data import Composite, OneHot, Unbounded
from tensordict import TensorDict
import torch
import numpy as np

from torchrl.envs import EnvBase
from torchrl.data import Composite, OneHot, Unbounded
from tensordict import TensorDict
import torch
import numpy as np


class AdOptimizationEnv(EnvBase):
    """Dataset-backed environment for a two-action advertising decision.

    Each observation is the ``feature_columns`` slice of one randomly drawn row of
    ``dataset``. The agent picks one of two one-hot actions and is rewarded by
    ``_compute_reward`` evaluated on the row it has just observed; a new row is
    then drawn as the next observation. ``done`` is always ``False``, so episodes
    only end through an external transform.

    Args:
        dataset: pandas DataFrame holding the ``feature_columns`` columns plus
            ``ad_spend``, ``paid_ctr`` and ``conversion_value`` (read by the reward).
        verbose: When ``True`` (default, the historical behaviour) every reward
            computation is printed. Pass ``False`` to silence the per-step output.
    """

    def __init__(self, dataset, verbose=True):
        # Empty batch size: this instance simulates a single, un-batched environment.
        super().__init__(batch_size=torch.Size([]))

        self.dataset = dataset
        self.num_features = len(feature_columns)
        self.verbose = verbose

        # Row sampler state. None means pandas falls back to the global NumPy
        # random state; `_set_seed` replaces it with a dedicated seeded generator.
        self._rng = None
        # Row backing the observation most recently handed to the agent.
        self._current_row = None

        self.action_spec = Composite(action=OneHot(n=2, dtype=torch.int64))
        self.observation_spec = Composite(
            observation=Unbounded(shape=(self.num_features,), dtype=torch.float32)
        )
        self.reward_spec = Composite(reward=Unbounded(shape=(1,), dtype=torch.float32))

    def _draw_row(self):
        """Sample one dataset row and return ``(row, observation_tensor)``."""
        row = self.dataset.sample(1, random_state=self._rng).iloc[0]
        observation = torch.tensor(
            row[feature_columns].values.astype(np.float32), dtype=torch.float32
        )
        return row, observation

    def _reset(self, tensordict=None):
        """Draw a fresh row and return it as the initial observation."""
        self._current_row, state = self._draw_row()

        return TensorDict(
            {
                "observation": state,
                "done": torch.tensor([False], dtype=torch.bool),  # shape [1]
            },
            batch_size=self.batch_size,
        )

    def _step(self, tensordict):
        """Score the action on the observed row, then advance to a new row.

        Returns a TensorDict with the next ``observation``, a ``reward`` of
        shape ``[1]`` and a ``done`` flag of shape ``[1]`` (always ``False``).
        """
        action = tensordict["action"].argmax().item()

        if self._current_row is None:
            # Stepped without a prior reset: draw a row so the reward has context.
            self._current_row, _ = self._draw_row()

        # The reward must depend on the row the policy actually saw. Scoring the
        # freshly sampled next row instead would make it independent of the
        # observation, leaving nothing for the Q-network to learn.
        reward_value = self._compute_reward(action, self._current_row)

        self._current_row, next_state = self._draw_row()

        # Only environment outputs belong here; the action is already stored by
        # the caller. Tensors are built once, with the shapes the specs declare.
        return TensorDict(
            {
                "observation": next_state,
                "reward": torch.tensor([reward_value], dtype=torch.float32),
                "done": torch.tensor([False], dtype=torch.bool),
            },
            batch_size=self.batch_size,
        )

    def _compute_reward(self, action, sample):
        """Return the scalar reward for taking ``action`` on dataset row ``sample``.

        Action 1 is scored on return on ad spend (``conversion_value / ad_spend``):
        2.0 when spend exceeds 5000 and ROAS exceeds 2.0, else 1.0 when ROAS
        exceeds 1.0, else -1.0. Any other action is scored on paid CTR: 1.0 when
        it exceeds 0.15, else -0.5.
        """
        cost = sample.ad_spend
        ctr = sample.paid_ctr
        revenue = sample.conversion_value
        roas = revenue / cost if cost > 0 else 0.0  # guard against division by zero

        if action == 1:
            reward = 2.0 if (cost > 5000 and roas > 2.0) else (1.0 if roas > 1.0 else -1.0)
        else:
            reward = 1.0 if ctr > 0.15 else -0.5

        if self.verbose:
            print(f"[Reward Computation] Action: {action}, Cost: {cost:.2f}, CTR: {ctr:.4f}, ROAS: {roas:.4f}, Reward: {reward}")
        return reward

    def _set_seed(self, seed=None):
        """Seed torch and the row sampler.

        All of the environment's randomness comes from ``DataFrame.sample``, so a
        dedicated NumPy generator is created for it. ``seed=None`` creates an
        unseeded generator and leaves torch's global seed untouched, because
        ``torch.manual_seed`` rejects ``None``.
        """
        if seed is not None:
            torch.manual_seed(seed)
        self._rng = np.random.RandomState(seed)


# Build the environment and list the key names it exposes for each spec.
env = AdOptimizationEnv(dataset)
for spec_label, spec_keys in (
    ("Observation Keys:", env.observation_keys),
    ("Action Keys:", env.action_keys),
    ("Reward Keys:", env.reward_keys),
):
    print(spec_label, spec_keys)


Observation Keys: ['observation']
Action Keys: ['action']
Reward Keys: ['reward']


In [None]:
# Display the environment's observation, action and reward specs as a tuple.
env.observation_spec, env.action_spec, env.reward_spec

(Composite(
     observation: UnboundedContinuous(
         shape=torch.Size([12]),
         space=ContinuousBox(
             low=Tensor(shape=torch.Size([12]), device=cpu, dtype=torch.float32, contiguous=True),
             high=Tensor(shape=torch.Size([12]), device=cpu, dtype=torch.float32, contiguous=True)),
         device=cpu,
         dtype=torch.float32,
         domain=continuous),
     device=None,
     shape=torch.Size([])),
 OneHot(
     shape=torch.Size([2]),
     space=CategoricalBox(n=2),
     device=cpu,
     dtype=torch.int64,
     domain=discrete),
 UnboundedContinuous(
     shape=torch.Size([1]),
     space=ContinuousBox(
         low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
         high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
     device=cpu,
     dtype=torch.float32,
     domain=continuous))

In [None]:
from torchrl.modules import EGreedyModule, MLP, QValueModule

# Q-network: maps the observation vector to one value per action
# (two hidden layers of 64 units each).
value_mlp = MLP(
    num_cells=[64, 64],
    in_features=env.num_features,
    out_features=env.action_spec.shape[-1],
)

# Read "observation" from the tensordict and write the Q-values to "action_value".
value_net = TensorDictModule(
    value_mlp,
    in_keys=["observation"],
    out_keys=["action_value"],
)

# Policy: the value network followed by a QValueModule built from the action spec.
policy = TensorDictSequential(
    value_net,
    QValueModule(spec=env.action_spec),
)

# Epsilon-greedy module: epsilon starts at 0.9 and is annealed over 10,000 steps.
exploration_module = EGreedyModule(
    env.action_spec,
    eps_init=0.9,
    annealing_num_steps=10_000,
)

# Policy used for data collection: the policy followed by the exploration module.
policy_explore = TensorDictSequential(policy, exploration_module)



In [None]:
# Display the layer structure of the MLP.
value_mlp

MLP(
  (0): Linear(in_features=12, out_features=64, bias=True)
  (1): Tanh()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): Tanh()
  (4): Linear(in_features=64, out_features=2, bias=True)
)

In [None]:
# Display the module tree of the exploration policy, including its in/out keys.
policy_explore

TensorDictSequential(
    module=ModuleList(
      (0): TensorDictSequential(
          module=ModuleList(
            (0): TensorDictModule(
                module=MLP(
                  (0): Linear(in_features=12, out_features=64, bias=True)
                  (1): Tanh()
                  (2): Linear(in_features=64, out_features=64, bias=True)
                  (3): Tanh()
                  (4): Linear(in_features=64, out_features=2, bias=True)
                ),
                device=cpu,
                in_keys=['observation'],
                out_keys=['action_value'])
            (1): QValueModule()
          ),
          device=cpu,
          in_keys=['observation'],
          out_keys=['action', 'action_value', 'chosen_action_value'])
      (1): EGreedyModule()
    ),
    device=cpu,
    in_keys=['observation'],
    out_keys=['action_value', 'chosen_action_value', 'action'])

In [None]:
def create_env():
    """Return a new AdOptimizationEnv over ``dataset`` wrapped with a StepCounter.

    The StepCounter is configured with ``max_steps=50_000``.
    """
    base_env = AdOptimizationEnv(dataset)
    step_counter = StepCounter(max_steps=50_000)
    return TransformedEnv(base_env, step_counter)

In [None]:
from torchrl.envs.utils import ExplorationType

# Number of purely random frames collected before the policy is used.
init_rand_steps = 5000

collector = SyncDataCollector(
    create_env_fn=create_env,
    policy=policy_explore,
    frames_per_batch=1,  # one transition is yielded per collector iteration
    total_frames=10_000,
    init_random_frames=init_rand_steps,
    storing_device="cpu",  # collected data is kept on the CPU
    split_trajs=False,  # yield flat batches rather than per-trajectory splits
    # EGreedyModule only injects random actions under the RANDOM exploration
    # type. Collecting in "mode" turns epsilon-greedy off, so after the random
    # warm-up the policy would act purely greedily and never explore.
    exploration_type=ExplorationType.RANDOM,
)

# Replay buffer with room for 100,000 transitions; storage is allocated lazily.
rb = ReplayBuffer(storage=LazyTensorStorage(100_000))

In [None]:


# Pull a single batch from the collector and inspect its "next" entry.
for data in collector:
    next_td = data["next"]
    reward_tensor = next_td["reward"]
    print("✅ Next keys:", next_td.keys())
    print("✅ Reward tensor shape:", reward_tensor.shape)
    print("✅ Reward tensor value:", reward_tensor)
    break


[Reward Computation] Action: 0, Cost: 8390.88, CTR: 0.0732, ROAS: 0.1678, Reward: -0.5
✅ Next keys: _StringKeys(dict_keys(['observation', 'step_count', 'reward', 'done', 'terminated', 'truncated']))
✅ Reward tensor shape: torch.Size([1, 1])
✅ Reward tensor value: tensor([[-0.5000]])


In [None]:

# Print the full structure of the first batch the collector yields, then stop.
for i, data in enumerate(collector):
    iteration_number = i + 1
    print(f"Iteration {iteration_number}: Collector Output:", data)
    break

[Reward Computation] Action: 1, Cost: 85.62, CTR: 0.1741, ROAS: 81.6573, Reward: 1.0
Iteration 1: Collector Output: TensorDict(
    fields={
        action: Tensor(shape=torch.Size([1, 2]), device=cpu, dtype=torch.int64, is_shared=False),
        action_value: Tensor(shape=torch.Size([1, 2]), device=cpu, dtype=torch.float32, is_shared=False),
        chosen_action_value: Tensor(shape=torch.Size([1, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        collector: TensorDict(
            fields={
                traj_ids: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, is_shared=False)},
            batch_size=torch.Size([1]),
            device=cpu,
            is_shared=False),
        done: Tensor(shape=torch.Size([1, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([1, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=tor

In [None]:
from torchrl.objectives import DQNLoss, SoftUpdate

# DQN loss over the greedy policy; delay_value=True requests a delayed (target) value network.
loss = DQNLoss(
    value_network=policy,
    action_space=env.action_spec,
    delay_value=True,
)

# Adam over the loss module's parameters.
# NOTE: the name `optim` rebinds the earlier `import torch.optim as optim` alias.
optim = Adam(loss.parameters(), lr=0.02)

# Soft (Polyak-style) updater for the target parameters, eps=0.99.
updater = SoftUpdate(loss, eps=0.99)

In [None]:
import time

# Training loop: collect one batch, store it, then (after the random warm-up)
# run several optimisation steps on replay-buffer samples.
# NOTE: the collector has a fixed frame budget; the earlier inspection cells
# that iterate over it consume part of that budget.
total_count = 0      # environment frames collected in this loop
total_episodes = 0   # number of `done` flags seen in collected data
optim_steps = 10     # gradient steps per collected batch
train_batch_size = 128
log_interval = 10    # log once every `log_interval` collected batches
max_length = 0       # largest step_count seen so far

t0 = time.time()
for i, data in enumerate(collector):
    # Write data in replay buffer
    rb.extend(data)

    # Track the running maximum from the new batch only, instead of re-reading
    # the entire buffer (`rb[:]`) on every iteration.
    max_length = max(max_length, int(data["next", "step_count"].max()))

    # Count frames/episodes once per collected batch. Counting inside the optim
    # loop would multiply them by `optim_steps` and skip the warm-up frames.
    total_count += data.numel()
    total_episodes += int(data["next", "done"].sum())

    if len(rb) > init_rand_steps:
        # Optim loop (we do several optim steps
        # per batch collected for efficiency)
        for _ in range(optim_steps):
            sample = rb.sample(train_batch_size)
            loss_vals = loss(sample)
            loss_vals["loss"].backward()
            optim.step()
            optim.zero_grad()
            # Update target params after every gradient step
            updater.step()

        # Anneal epsilon by the number of frames actually collected; doing this
        # per gradient step would anneal `optim_steps` times too fast.
        exploration_module.step(data.numel())

        if i % log_interval == 0:
            print(
                f"Max num steps: {max_length}, rb length {len(rb)}, "
                f"loss {loss_vals['loss'].item():.4f}"
            )

t1 = time.time()

print(
    f"solved after {total_count} steps, {total_episodes} episodes and in {t1-t0}s."
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    is_shared=False)
Max num steps: 9951, rb length 9949
loss function is called: TensorDict(
    fields={
        loss: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)},
    batch_size=torch.Size([]),
    device=None,
    is_shared=False)
Max num steps: 9951, rb length 9949
loss function is called: TensorDict(
    fields={
        loss: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)},
    batch_size=torch.Size([]),
    device=None,
    is_shared=False)
Max num steps: 9951, rb length 9949
loss function is called: TensorDict(
    fields={
        loss: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)},
    batch_size=torch.Size([]),
    device=None,
    is_shared=False)
Max num steps: 9951, rb length 9949
loss function is called: TensorDict(
    fields={
        loss: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is