# GRPO with Coding Task and Tools

## Import modules

In [1]:
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import asyncio
import time
import uuid
from dataclasses import dataclass
from typing import Any, Callable

import torch
import torch.nn.functional as F
import torchstore as ts
from datasets import load_dataset
from forge.actors._torchstore_utils import (
    get_dcp_whole_state_dict_key,
    get_param_prefix,
)
from forge.actors.generator import Generator as Policy
from forge.actors.reference_model import ReferenceModel
from forge.actors.replay_buffer import ReplayBuffer
from forge.actors.trainer import RLTrainer
from forge.cli.config import parse
from forge.controller.actor import ForgeActor
from forge.controller.provisioner import init_provisioner, shutdown
from forge.data.rewards import MathReward, ThinkingReward
from forge.observability.metric_actors import get_or_create_metric_logger
from forge.observability.metrics import record_metric, Reduce
from forge.observability.perf_tracker import Tracer

from forge.types import LauncherConfig, ProvisionerConfig
from forge.util.ops import compute_logprobs
from monarch.actor import endpoint
from omegaconf import DictConfig
from vllm.transformers_utils.tokenizer import get_tokenizer

import os
# Feature flags exported through the process environment.
# NOTE(review): these are assigned after monarch / torchstore have already been
# imported above — confirm the libraries read them lazily (at init time) rather
# than at import time, otherwise move this to the very top of the cell.
os.environ["MONARCH_HOSTMESH_V1"] = "1"
os.environ["TORCHSTORE_RDMA_ENABLED"] = "1"





INFO 10-16 20:07:55 [__init__.py:235] Automatically detected platform cuda.


## Define Data Structures

In [2]:
@dataclass
class Episode:
    # TODO: add adtional layer for multi-turn
    episode_id: str
    request: str
    policy_version: int
    pad_id: int
    request_len: int
    response_len: int
    target: Any | None = None
    # processed data
    response: str | None = None
    request_tokens: list[int] | None = None
    response_tokens: list[int] | None = None
    ref_logprobs: torch.Tensor | None = None
    reward: float | None = None
    advantage: float | None = None

    @property
    def request_tensor(self):
        tensor = torch.tensor(self.request_tokens, dtype=torch.long)
        if tensor.shape[0] < self.request_len:  # left pad
            diff = self.request_len - tensor.shape[0]
            tensor = F.pad(tensor, (diff, 0), value=self.pad_id)
        return tensor

    @property
    def response_tensor(self):
        tensor = torch.tensor(self.response_tokens, dtype=torch.long)
        if tensor.shape[0] < self.response_len:  # right pad
            diff = self.response_len - tensor.shape[0]
            tensor = F.pad(tensor, (0, diff), value=self.pad_id)
        return tensor


@dataclass
class Group:
    """A set of episodes that all share the same request."""

    group_id: str
    episodes: list[Episode]

    @classmethod
    def new_group(
        cls,
        group_id: int,
        group_size: int,
        request: str,
        policy_version: int,
        pad_id: int,
        request_len: int,
        response_len: int,
        target: Any = None,
    ):
        """Build a group of ``group_size`` fresh episodes for one request.

        Every episode receives its own random UUID and the same request,
        policy version, padding settings and target. ``group_id`` is stored
        as a string.
        """
        members = [
            Episode(
                episode_id=str(uuid.uuid4()),
                request=request,
                policy_version=policy_version,
                pad_id=pad_id,
                request_len=request_len,
                response_len=response_len,
                target=target,
            )
            for _ in range(group_size)
        ]
        return cls(group_id=str(group_id), episodes=members)


def collate(batches: list[list[Episode]]):
    """Turn batches of episodes into trainer inputs and loss targets.

    For each batch this returns one input dict ``{"tokens": [b x (req+res)]}``
    and one target dict with ``response``, ``ref_logprobs``, ``advantages``
    ([b x 1]) and ``padding_mask`` (True where the response token is not the
    pad id of the first episode in the batch).

    Returns:
        A ``(inputs, targets)`` tuple of two lists, one entry per batch.
    """
    inputs = []
    targets = []
    for batch in batches:
        request = torch.stack([e.request_tensor for e in batch])  # [b x s]
        response = torch.stack([e.response_tensor for e in batch])  # [b x s]

        # Collapse any singleton dims into [b x s] while always keeping the
        # batch dim: a bare ``squeeze()`` would also drop it when b == 1.
        ref_logprobs = torch.stack([e.ref_logprobs for e in batch]).reshape(
            len(batch), -1
        )  # [b x s]

        advantages = torch.tensor([e.advantage for e in batch]).unsqueeze(
            -1
        )  # [b x 1]

        pad_id = batch[0].pad_id
        mask = response != pad_id

        model_input = {"tokens": torch.cat([request, response], dim=1)}
        target = {
            "response": response,
            "ref_logprobs": ref_logprobs,
            "advantages": advantages,
            "padding_mask": mask,
        }
        inputs.append(model_input)
        targets.append(target)
    return inputs, targets

@dataclass
class DatasetActor(ForgeActor):
    """Actor wrapper for HuggingFace dataset to provide async interface."""

    path: str = "openai/gsm8k"
    revision: str = "main"
    data_split: str = "train"
    streaming: bool = True
    model: str = "Qwen/Qwen3-1.7B"

    @endpoint
    def setup(self):
        """Load the tokenizer and build a shuffled iterator over the dataset."""
        self._tokenizer = get_tokenizer(self.model)

        def gsm8k_transform(sample):
            # Wrap the question in a chat template and keep only the final
            # answer, i.e. the text following "#### " in the reference answer.
            system_prompt = """
            Put all your scratchpad work between <think> and </think> tags.
            Your final answer should be between <answer> and </answer> tags otherwise it will not be scored.
            """
            question: str = sample["question"]
            chat_messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question},
            ]
            rendered_prompt = self._tokenizer.apply_chat_template(
                chat_messages,
                tokenize=False,
                add_generation_prompt=True,
            )
            raw_answer: str = sample["answer"]
            final_answer = raw_answer.split("#### ")[1]
            return {"request": rendered_prompt, "target": final_answer}

        # NOTE(review): `self.revision` is passed as the second positional
        # argument of `load_dataset` — confirm that is the intended parameter.
        dataset = load_dataset(
            self.path, self.revision, split=self.data_split, streaming=self.streaming
        )
        self._iterator = iter(dataset.map(gsm8k_transform).shuffle())

    @endpoint
    async def sample(self) -> dict[str, str] | None:
        """Return the next transformed sample, or ``None`` once exhausted."""
        try:
            item = next(self._iterator)
        except StopIteration:
            return None

        # Record dataset metrics
        record_metric("dataset/sample/count_samples_generated", 1, Reduce.SUM)
        record_metric(
            "dataset/sample/avg_sample_len",
            len(item["request"]),
            Reduce.MEAN,
        )
        return item

    @endpoint
    async def pad_token(self):
        """Return the tokenizer's pad token id."""
        return self._tokenizer.pad_token_id

## Define loss

In [3]:
def simple_grpo_loss(
    logits: torch.Tensor,
    response: torch.Tensor,
    ref_logprobs: torch.Tensor,
    advantages: torch.Tensor,
    padding_mask: torch.Tensor,
    beta: float = 0.1,
) -> torch.Tensor:
    """
    Example GRPO Loss Function for RLTrainer

    Combines an advantage-weighted policy term with a ``beta``-scaled KL
    penalty against the reference log-probs, averages it over the unmasked
    tokens of each row, then takes the mean over rows.
    """
    logprobs: torch.Tensor = compute_logprobs(logits, response)

    # Note: This is also available in losses.grpo_loss via `SimpleGRPOLoss`
    ref_gap = ref_logprobs - logprobs
    kl = torch.exp(ref_gap) - ref_gap - 1
    # The ratio evaluates to 1; it exists so gradients flow through `logprobs`.
    policy_term = torch.exp(logprobs - logprobs.detach()) * advantages
    token_loss = -(policy_term - beta * kl)

    masked_total = (token_loss * padding_mask).sum(dim=1)
    # Clamp so rows that are entirely padding do not divide by zero.
    valid_tokens = padding_mask.sum(dim=1).clamp(min=1.0)
    return (masked_total / valid_tokens).mean()

## Define Reward

In [4]:
@dataclass
class RewardActor(ForgeActor):
    """Reward actor that uses a list of scoring functions."""

    reward_functions: list[Callable]

    @endpoint
    async def evaluate_response(self, prompt: str, response: str, target: str) -> float:
        """Score a response with every reward function and return the mean.

        Per-function sum/mean/std metrics, a running mean over all individual
        rewards, and a call counter are recorded along the way.
        """
        running_total = 0.0
        for score_fn in self.reward_functions:
            score = score_fn(prompt, response, target)
            running_total += score

            # Get a name for the reward function (works for classes, functions, lambdas)
            fn_label = getattr(score_fn, "__name__", score_fn.__class__.__name__)

            # (metric key, value, reduction) — recorded in this order
            metric_rows = (
                # per function reward
                (f"reward/evaluate_response/sum_{fn_label}_reward", score, Reduce.SUM),
                (f"reward/evaluate_response/avg_{fn_label}_reward", score, Reduce.MEAN),
                (f"reward/evaluate_response/std_{fn_label}_reward", score, Reduce.STD),
                # avg total reward
                ("reward/evaluate_response/avg_total_reward", score, Reduce.MEAN),
                # count fn calls
                (f"reward/evaluate_response/count_{fn_label}_calls", 1, Reduce.SUM),
            )
            for metric_key, metric_value, reduction in metric_rows:
                record_metric(metric_key, metric_value, reduction)

        return running_total / len(self.reward_functions)


@dataclass
class ComputeAdvantages(ForgeActor):
    """Compute advantages for GRPO using reward signals."""

    @endpoint
    async def compute(self, group: Group) -> list[float]:
        """Normalize the group's rewards into per-episode advantages.

        Each advantage is ``(reward - group_mean) / (group_std + 1e-4)``.

        Returns:
            One float per episode, in the order of ``group.episodes``.
        """
        # TODO: add batch processing
        rewards = torch.tensor([[e.reward for e in group.episodes]])
        mean = rewards.mean(1, keepdim=True)
        if rewards.shape[1] > 1:
            std = rewards.std(1, keepdim=True)
        else:
            # The sample std of a single value is NaN, which would poison the
            # advantage; a lone episode has no spread, so use 0 instead.
            std = torch.zeros_like(mean)
        advantages = (rewards - mean) / (std + 1e-4)
        return advantages.squeeze(0).tolist()

In [5]:
async def drop_weights(version: int):
    """Delete every torchstore entry stored under the given policy version.

    If the whole-state-dict DCP key is among the stored keys, its handle is
    fetched and ``drop()`` is called on it before the keys are deleted. Start
    and finish messages (with elapsed time) are printed.
    """
    print(f"Dropping weights @ version {version}")
    began = time.perf_counter()
    stored_keys = await ts.keys(get_param_prefix(version))
    # TODO: once we have something like `get_meta()` in torchstore, we can just
    # query the type of the object instead of relying on keys.
    whole_state_key = get_dcp_whole_state_dict_key(version)
    if whole_state_key in stored_keys:
        handle = await ts.get(whole_state_key)
        handle.drop()
    for stored_key in stored_keys:
        await ts.delete(stored_key)
    elapsed = time.perf_counter() - began
    print(f"Dropped weights @ version {version}, took {elapsed:.2f} seconds")

## !!!!!!! Custom Testing !!!!!!!

In [6]:
#from actors.coder import SandboxedPythonCoder

## Setup Services

In [10]:
# Scratch cell: ask the trainer to push its weights as policy version 1.
# Relies on `trainer` created in the service-setup cell.
await trainer.push_weights.call(1)



[0] [34m[RLTrainer-0/1] 2025-10-16 20:11:48 INFO[0m Pushing weights for policy version 1
[0] [34m[RLTrainer-0/1] 2025-10-16 20:11:52 INFO[0m Completed weights push in 3.96 seconds


ValueMesh({procs: 1}):
  (({'procs': 0/1}, None),)

In [13]:
# Scratch cell: show the torchstore key prefix used for policy version 1.
get_param_prefix(1)

'policy_ver_0000000001'

In [14]:
# Scratch cell: list the keys stored under the version-1 prefix and reduce
# them to the distinct first dot-separated component.
keys = await ts.keys(get_param_prefix(1))
set(k.split(".")[0] for k in keys)

{'policy_ver_0000000001'}

In [7]:
from omegaconf import OmegaConf
from forge.cli.config import resolve_hf_hub_paths

# Load the experiment config, resolve HF hub paths, then resolve interpolations.
cfg = OmegaConf.load('apps/grpo/qwen3_1_7b.yaml')
cfg = resolve_hf_hub_paths(cfg)
OmegaConf.resolve(cfg)

# Module-level settings read by the rollout loop; the trailing numbers are the
# values noted by the author for this config.
group_size = cfg.group_size # 8
max_req_tokens = cfg.max_req_tokens # 512
max_res_tokens = cfg.max_res_tokens # 512

# Metric logging falls back to a console backend when the config has no
# "metric_logging" section.
metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}})
mlogger = await get_or_create_metric_logger()
await mlogger.init_backends.call_one(metric_logging_cfg)
# torchstore is initialized before any actor/service is spawned.
await ts.initialize(strategy=ts.ControllerStorageVolumes())

# Spawn all actors and services concurrently. The unpacking order on the left
# must match the order of the awaitables passed to `asyncio.gather`.
dataloader, policy, trainer, replay_buffer, compute_advantages, ref_model, reward_actor = await asyncio.gather(
    DatasetActor.options(**cfg.actors.dataset).as_actor(**cfg.dataset),
    Policy.options(**cfg.services.policy).as_service(**cfg.policy),
    RLTrainer.options(**cfg.actors.trainer).as_actor(
        **cfg.trainer, loss=simple_grpo_loss
    ),
    ReplayBuffer.options(**cfg.actors.replay_buffer).as_actor(
        **cfg.replay_buffer, collate=collate
    ),
    ComputeAdvantages.options(**cfg.actors.compute_advantages).as_actor(),
    ReferenceModel.options(**cfg.services.ref_model).as_service(**cfg.ref_model),
    RewardActor.options(**cfg.services.reward_actor).as_service(
        reward_functions=[MathReward(), ThinkingReward()]
    ),
)

[34m[1mwandb[0m: Currently logged in as: [33mpbontrager[0m ([33mbontrager[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Spawning actor DatasetActor
Launcher not provided, remote allocations will not work.
Spawning service Generator
Spawning actor RLTrainer
Spawning actor ReplayBuffer
Spawning actor ComputeAdvantages
Spawning service ReferenceModel
Spawning service RewardActor




[0] [34m[RLTrainer-0/1] 2025-10-16 20:08:12 INFO[0m Compiling loss
[0] INFO 10-16 20:08:14 [__init__.py:235] Automatically detected platform cuda.




[0] [34m[RLTrainer-0/1] 2025-10-16 20:08:14 INFO[0m Building 0-D device mesh with [], []
[0] [34m[RLTrainer-0/1] 2025-10-16 20:08:14 INFO[0m [GC] Initial GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 20:08:15 INFO[0m Total parameter count: dense 2,031,739,904, sparse 0, active 2,031,739,904
[0] [34m[RLTrainer-0/1] 2025-10-16 20:08:15 INFO[0m Applied selective activation checkpointing to the model
[0] [34m[RLTrainer-0/1] 2025-10-16 20:08:15 INFO[0m Checkpointing active. Checkpoints will be loaded from and saved to checkpoint
[0] [34m[RLTrainer-0/1] 2025-10-16 20:08:15 INFO[0m Mixed precision training is handled by AMP
[0] [34m[RLTrainer-0/1] 2025-10-16 20:08:15 INFO[0m loading from HF safetensors from --checkpoint.initial_load_path: /mnt/home/pbontrager/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/70d244cc86ccca08cf5af4e1e306ecf908b1ad5e
[0] [34m[RLTrainer-0/1] 2025-10-16 20:08:15 INFO[0m Loading the checkpoint from /mnt/home/pbontrager/



[0] [34m[ReferenceModel-0/1] 2025-10-16 20:08:19 INFO[0m Total parameter count: dense 2,031,739,904, sparse 0, active 2,031,739,904
[0] [34m[ReferenceModel-0/1] 2025-10-16 20:08:19 INFO[0m Applied selective activation checkpointing to the model
[0] [34m[ReferenceModel-0/1] 2025-10-16 20:08:19 INFO[0m Checkpointing active. Checkpoints will be loaded from and saved to checkpoint
[0] [34m[ReferenceModel-0/1] 2025-10-16 20:08:19 INFO[0m Mixed precision training is handled by AMP
[0] [34m[ReferenceModel-0/1] 2025-10-16 20:08:19 INFO[0m loading from HF safetensors from --checkpoint.initial_load_path: /mnt/home/pbontrager/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/70d244cc86ccca08cf5af4e1e306ecf908b1ad5e
[0] [34m[ReferenceModel-0/1] 2025-10-16 20:08:19 INFO[0m Loading the checkpoint from /mnt/home/pbontrager/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/70d244cc86ccca08cf5af4e1e306ecf908b1ad5e.
[0] [34m[ReferenceModel-0/1] 2025-10-16 20:08:20 INFO[0m [

[0] `torch_dtype` is deprecated! Use `dtype` instead!


[0] INFO 10-16 20:08:25 [config.py:1604] Using max model len 40960
[0] INFO 10-16 20:08:26 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=16384.
[0] INFO 10-16 20:08:30 [__init__.py:235] Automatically detected platform cuda.




[0] [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[0] [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[0] [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[0] [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[0] [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[0] INFO 10-16 20:08:35 [parallel_state.py:1102] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
[0] INFO 10-16 20:08:35 [gpu_model_runner.py:1843] Starting to load model Qwen/Qwen3-1.7B...
[0] INFO 10-16 20:08:36 [gpu_model_runner.py:1875] Loading model from scratch...
[0] INFO 10-16 20:08:36 [cuda.py:290] Using Flash Attention backend on V1 engine.
[0] INFO 10-16 20:08:36 [weight_utils.py:296] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.95it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:00<00:00,  3.89it/s]
[0] 


[0] INFO 10-16 20:08:37 [default_loader.py:262] Loading weights took 0.63 seconds
[0] INFO 10-16 20:08:37 [gpu_model_runner.py:1892] Model loading took 3.2152 GiB and 1.116859 seconds
[0] INFO 10-16 20:08:45 [backends.py:530] Using cache directory: /mnt/home/pbontrager/.cache/vllm/torch_compile_cache/7ab64a271d/rank_0_0/backbone for vLLM's torch.compile
[0] INFO 10-16 20:08:45 [backends.py:541] Dynamo bytecode transform time: 7.57 s
[0] INFO 10-16 20:08:50 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 4.502 s
[0] INFO 10-16 20:09:03 [monitor.py:34] torch.compile takes 7.57 s in total
[0] INFO 10-16 20:09:04 [gpu_worker.py:255] Available KV cache memory: 62.38 GiB
[0] INFO 10-16 20:09:04 [kv_cache_utils.py:833] GPU KV cache size: 584,000 tokens
[0] INFO 10-16 20:09:04 [kv_cache_utils.py:837] Maximum concurrency for 40,960 tokens per request: 14.26x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:01<00:00, 38.63it/s]


[0] INFO 10-16 20:09:06 [gpu_model_runner.py:2485] Graph capturing finished in 2 secs, took 0.60 GiB


## Rollout Loop

In [8]:
async def continuous_rollouts():
    """Generate rollouts forever and feed scored episodes to the replay buffer.

    Each iteration samples one prompt, generates a group of responses, scores
    them with the reward actor, attaches reference log-probs and advantages,
    and adds every episode to the replay buffer. Returns once the dataloader
    reports it is empty.

    Relies on the module-level services created in the setup cell
    (``dataloader``, ``policy``, ``reward_actor``, ``ref_model``,
    ``compute_advantages``, ``replay_buffer``) and on ``group_size``,
    ``max_req_tokens`` and ``max_res_tokens``.
    """
    rollout_count = 0
    pad_id = await dataloader.pad_token.call_one()
    while True:
        t = Tracer("main_perf/continuous_rollouts")
        t.start()
        sample = await dataloader.sample.call_one()
        if sample is None:
            print("Dataloader is empty, exiting continuous rollout")
            return

        t.step("data_loading")

        prompt, target = sample["request"], sample["target"]
        responses = await policy.generate.route(prompt)

        t.step("policy_generation")

        assert (
            len(responses) > 0
        ), "Sanity check: Responses should NEVER return empty"
        # The policy version comes from the response metadata. It is bound
        # outside the assert so it does not depend on assert side effects
        # (asserts are stripped under `python -O`), and no separate
        # `get_version` round-trip is needed.
        version = responses[0].generator_version
        assert version is not None, "Response must indicate a version"
        group = Group.new_group(
            group_id=rollout_count,
            group_size=group_size,
            request=prompt,
            policy_version=version,
            pad_id=pad_id,
            request_len=max_req_tokens,
            response_len=max_res_tokens,
            target=target,
        )

        # One row per episode: padded request tokens followed by padded
        # response tokens.
        input_ids = torch.ones(
            (group_size, max_req_tokens + max_res_tokens),
            dtype=torch.long,
            device="cuda",
        )
        # Populate episode info and calculate rewards
        for i, (episode, response) in enumerate(zip(group.episodes, responses)):
            episode.request_tokens = response.prompt_ids
            episode.response_tokens = response.token_ids
            episode.response = response.text
            input_ids[i, :max_req_tokens] = episode.request_tensor
            input_ids[i, max_req_tokens:] = episode.response_tensor
            episode.reward = await reward_actor.evaluate_response.route(
                prompt=prompt, response=response.text, target=target
            )

        t.step("reward_evaluation")

        ref_logprobs = await ref_model.forward.route(
            input_ids, max_req_tokens, return_logprobs=True
        )
        t.step("reference_model_calculate_logprobs")

        for i, episode in enumerate(group.episodes):
            episode.ref_logprobs = ref_logprobs[i]
        # Drop the local references to the batch tensors before the next
        # iteration allocates new ones.
        del ref_logprobs, input_ids
        t.step("compute_logprobs")

        # Calculate advantages and add to replay buffer
        advantages = await compute_advantages.compute.call_one(group)
        for episode, advantage in zip(group.episodes, advantages):
            episode.advantage = advantage
            await replay_buffer.add.call_one(episode)

        # Log metrics
        rollout_count += 1
        record_metric(
            "main/continuous_rollouts/count_rollout_iterations", 1, Reduce.SUM
        )
        t.stop()

## Training Loop

In [9]:
async def continuous_training():
    """Train forever on batches drawn from the replay buffer.

    Each successful step runs one trainer step, pushes the new weights under
    the incremented version, updates the policy to that version, drops the
    previous version's weights from the store (from step 2 onwards) and
    flushes metrics. When no batch is available it sleeps briefly and retries.

    Relies on the module-level ``replay_buffer``, ``trainer``, ``policy`` and
    ``mlogger`` created in the setup cell.
    """
    training_step = 0
    restart_tracer = True  # Flag to control when to restart tracer
    while True:
        # Restart tracer when needed (initial start or after completing a training step)
        # Otherwise, we cannot measure time waiting for buffer
        if restart_tracer:
            t = Tracer("main_perf/continuous_training")
            t.start()
            restart_tracer = False

        batch = await replay_buffer.sample.call_one(
            curr_policy_version=training_step
        )
        if batch is None:
            await asyncio.sleep(0.1)
        else:
            t.step("waiting_for_buffer")

            inputs, targets = batch
            await trainer.train_step.call(inputs, targets)
            training_step += 1
            t.step("train_step")

            await trainer.push_weights.call(training_step)
            t.step("push_weights")

            # Update the policy exactly once and wait for it to finish. A
            # second, un-awaited update of the same version would repeat the
            # work and could still be running when the weights it loads are
            # dropped on the next step.
            await policy.update_weights.fanout(training_step)
            t.step("update_weights")

            if training_step >= 2:
                await drop_weights(training_step - 1)
                t.step("drop_weights")

            t.stop()
            restart_tracer = True

            # Flush metrics every training step to WandB
            await mlogger.flush.call_one(training_step)

## Run

In [None]:
num_rollout_threads = 1
num_training_threads = 1

rollout_tasks = [
    asyncio.create_task(continuous_rollouts()) for _ in range(num_rollout_threads)
]
training_task = asyncio.create_task(continuous_training())

try:
    await asyncio.gather(*rollout_tasks, training_task)
except KeyboardInterrupt:
    print("Training interrupted by user")
    for rollout_task in rollout_tasks:
        rollout_task.cancel()
    training_task.cancel()

  tensor = torch.tensor(self.request_tokens, dtype=torch.long)
  tensor = torch.tensor(self.response_tokens, dtype=torch.long)


[0] [34m[ReferenceModel-0/1] 2025-10-16 19:37:27 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds




[0] [34m[RLTrainer-0/1] 2025-10-16 19:37:27 INFO[0m Pushing weights for policy version 1
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:37:28 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:37:30 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:37:31 INFO[0m Completed weights push in 4.24 seconds
WandbBackend: Logged 91 metrics at global_step 1
=== [global_logger_2nwJ_r0] - METRICS STEP 1 ===
  buffer/add/count_episodes_added: 40.0
  buffer/evict/avg_policy_age: 0.0
  buffer/evict/max_policy_age: 0.0
  buffer/evict/sum_episodes_evicted: 0.0
  buffer/sample/avg_data_utilization: 1.9565217391304348
  buffer/sample/count_sample_requests: 59.0
  buffer_perf/sample/total_duration_avg_s: 8.071606221087909e-05
  buffer_perf/sample/total_duration_max_s: 0.002548671793192625
  dataset/sample/avg_sample_len: 442.5
  dataset/sample/count_samples_generated: 6.0
  generator/generate



[0] [34m[RLTrainer-0/1] 2025-10-16 19:37:32 INFO[0m Pushing weights for policy version 2
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:37:32 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:37:35 INFO[0m Completed weights push in 3.20 seconds
[0] INFO 10-16 19:37:37 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 1
[0] [34m[Generator-0/1] 2025-10-16 19:37:37 INFO[0m Weight update completed (now v1)
WandbBackend: Logged 96 metrics at global_step 2
=== [global_logger_2nwJ_r0] - METRICS STEP 2 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 0.6666666666666666
  buffer/sample/count_sample_requests: 1.0
  buffer_perf/sample/total_duration_avg_s: 0.0007956749759614468
  buffer_perf/sample/total_duration_max_s: 0.0007956749759614468
  dataset/sample/avg_sample_l



[0] [34m[RLTrainer-0/1] 2025-10-16 19:37:45 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:37:45 INFO[0m Pushing weights for policy version 3
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:37:47 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:37:48 INFO[0m Completed weights push in 3.09 seconds
Dropping weights @ version 2
WandbBackend: Logged 96 metrics at global_step 3
=== [global_logger_2nwJ_r0] - METRICS STEP 3 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9831168831168831
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 32.0
  buffer/sample/avg_data_utilization: 1.9662337662337661
  buffer/sample/count_sample_requests: 76.0
  buffer_perf/sample/total_duration_avg_s: 8.448450458481123e-05
  buffer_perf/sample/total_duration_max_s: 0.0005777152255177498
  dataset/sample/avg_sample_len: 490.3333333333333
  dataset/sam



[0] [34m[RLTrainer-0/1] 2025-10-16 19:37:49 INFO[0m Pushing weights for policy version 4
[0] [34m[RLTrainer-0/1] 2025-10-16 19:37:52 INFO[0m Completed weights push in 3.00 seconds
[0] INFO 10-16 19:37:52 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:37:52 INFO[0m Weight update completed (now v3)
Dropping weights @ version 3
WandbBackend: Logged 96 metrics at global_step 4
=== [global_logger_2nwJ_r0] - METRICS STEP 4 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.8571428571428572
  buffer/sample/count_sample_requests: 7.0
  buffer_perf/sample/total_duration_avg_s: 0.0001803584662931306
  buffer_perf/sample/total_duration_max_s: 0.0006847120821475983
  dataset/sample/avg_sample_len: 505.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 485.0
  generator



[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:00 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:00 INFO[0m Pushing weights for policy version 5
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:38:02 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:03 INFO[0m Completed weights push in 2.52 seconds
Dropping weights @ version 4
WandbBackend: Logged 96 metrics at global_step 5
=== [global_logger_2nwJ_r0] - METRICS STEP 5 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9824797843665768
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9649595687331536
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 5.8939422455591125e-05
  buffer_perf/sample/total_duration_max_s: 0.0005676951259374619
  dataset/sample/avg_sample_len: 439.3333333333333
  dataset/sa



[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:04 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:04 INFO[0m Pushing weights for policy version 6
[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:07 INFO[0m Completed weights push in 2.54 seconds
[0] INFO 10-16 19:38:07 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:38:07 INFO[0m Weight update completed (now v5)
Dropping weights @ version 5
WandbBackend: Logged 96 metrics at global_step 6
=== [global_logger_2nwJ_r0] - METRICS STEP 6 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.871212121212121
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00012379054290552935
  buffer_perf/sample/total_duration_max_s: 0.0007347636856138706
  dataset/sample/avg_sample_len: 



[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:14 INFO[0m Pushing weights for policy version 7
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:38:16 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:17 INFO[0m Completed weights push in 2.56 seconds
Dropping weights @ version 6
WandbBackend: Logged 96 metrics at global_step 7
=== [global_logger_2nwJ_r0] - METRICS STEP 7 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9903846153846154
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9807692307692308
  buffer/sample/count_sample_requests: 68.0
  buffer_perf/sample/total_duration_avg_s: 5.284167470081764e-05
  buffer_perf/sample/total_duration_max_s: 0.00060642184689641
  dataset/sample/avg_sample_len: 474.0
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 418.25
  generator/generate/count_requests: 3.0
 



[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:18 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:19 INFO[0m Pushing weights for policy version 8
[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:21 INFO[0m Completed weights push in 2.55 seconds
[0] INFO 10-16 19:38:22 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:38:22 INFO[0m Weight update completed (now v7)
Dropping weights @ version 7
WandbBackend: Logged 96 metrics at global_step 8
=== [global_logger_2nwJ_r0] - METRICS STEP 8 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.8717948717948718
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00010701279657391402
  buffer_perf/sample/total_duration_max_s: 0.0006678123027086258
  dataset/sample/avg_sample_len:



[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:29 INFO[0m Pushing weights for policy version 9
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:38:31 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:32 INFO[0m Completed weights push in 2.60 seconds
Dropping weights @ version 8
WandbBackend: Logged 96 metrics at global_step 9
=== [global_logger_2nwJ_r0] - METRICS STEP 9 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 2.2452830188679247
  buffer/sample/count_sample_requests: 73.0
  buffer_perf/sample/total_duration_avg_s: 6.555742265222823e-05
  buffer_perf/sample/total_duration_max_s: 0.0008008363656699657
  dataset/sample/avg_sample_len: 490.6666666666667
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 480.0833333333333
  generator/gener



[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:33 INFO[0m Pushing weights for policy version 10
[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:36 INFO[0m Completed weights push in 2.56 seconds
[0] INFO 10-16 19:38:36 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 9[0] [34m[Generator-0/1] 2025-10-16 19:38:36 INFO[0m Weight update completed (now v9)

WandbBackend: Logged 96 metrics at global_step 10
=== [global_logger_2nwJ_r0] - METRICS STEP 10 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00017215621968110403
  buffer_perf/sample/total_duration_max_s: 0.0008265669457614422
  dataset/sample/avg_sample_len: 446.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  gene



[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:44 INFO[0m Pushing weights for policy version 11
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:38:45 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:47 INFO[0m Completed weights push in 2.54 seconds
Dropping weights @ version 10
WandbBackend: Logged 96 metrics at global_step 11
=== [global_logger_2nwJ_r0] - METRICS STEP 11 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9903846153846154
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9807692307692308
  buffer/sample/count_sample_requests: 73.0
  buffer_perf/sample/total_duration_avg_s: 6.418978820924889e-05
  buffer_perf/sample/total_duration_max_s: 0.0005802600644528866
  dataset/sample/avg_sample_len: 498.3333333333333
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 463.2083333333333
  generator/g



[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:48 INFO[0m Pushing weights for policy version 12
[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:50 INFO[0m Completed weights push in 2.38 seconds
[0] INFO 10-16 19:38:51 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 11[0] [34m[Generator-0/1] 2025-10-16 19:38:51 INFO[0m Weight update completed (now v11)

WandbBackend: Logged 96 metrics at global_step 12
=== [global_logger_2nwJ_r0] - METRICS STEP 12 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9
  buffer/sample/count_sample_requests: 10.0
  buffer_perf/sample/total_duration_avg_s: 0.00013622748665511608
  buffer_perf/sample/total_duration_max_s: 0.0007180990651249886
  dataset/sample/avg_sample_len: 723.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 497.875
  generator/gener



[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:58 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:38:59 INFO[0m Pushing weights for policy version 13
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:39:00 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:01 INFO[0m Completed weights push in 2.39 seconds
Dropping weights @ version 12
WandbBackend: Logged 96 metrics at global_step 13
=== [global_logger_2nwJ_r0] - METRICS STEP 13 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9867924528301886
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9735849056603771
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 7.429588711946397e-05
  buffer_perf/sample/total_duration_max_s: 0.0005884449928998947
  dataset/sample/avg_sample_len: 510.6666666666667
  dataset



[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:03 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:03 INFO[0m Pushing weights for policy version 14
[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:06 INFO[0m Completed weights push in 2.50 seconds
[0] INFO 10-16 19:39:06 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:39:06 INFO[0m Weight update completed (now v13)
Dropping weights @ version 13
WandbBackend: Logged 96 metrics at global_step 14
=== [global_logger_2nwJ_r0] - METRICS STEP 14 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.8896103896103895
  buffer/sample/count_sample_requests: 14.0
  buffer_perf/sample/total_duration_avg_s: 0.00011975890291588647
  buffer_perf/sample/total_duration_max_s: 0.0006601158529520035
  dataset/sample/avg_sample



[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:14 INFO[0m Pushing weights for policy version 15
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:39:15 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:16 INFO[0m Completed weights push in 2.52 seconds
Dropping weights @ version 14
WandbBackend: Logged 96 metrics at global_step 15
=== [global_logger_2nwJ_r0] - METRICS STEP 15 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9903846153846154
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9807692307692308
  buffer/sample/count_sample_requests: 73.0
  buffer_perf/sample/total_duration_avg_s: 6.536588914794465e-05
  buffer_perf/sample/total_duration_max_s: 0.0006006867624819279
  dataset/sample/avg_sample_len: 411.3333333333333
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 503.375
  generator/generate/co



[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:18 INFO[0m Pushing weights for policy version 16
[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:20 INFO[0m Completed weights push in 2.47 seconds
[0] INFO 10-16 19:39:21 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 15
[0] [34m[Generator-0/1] 2025-10-16 19:39:21 INFO[0m Weight update completed (now v15)
WandbBackend: Logged 96 metrics at global_step 16
=== [global_logger_2nwJ_r0] - METRICS STEP 16 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9230769230769231
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00013141817628191068
  buffer_perf/sample/total_duration_max_s: 0.0006813881918787956
  dataset/sample/avg_sample_len: 520.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:28 INFO[0m Pushing weights for policy version 17
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:39:30 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:31 INFO[0m Completed weights push in 2.52 seconds
Dropping weights @ version 16
WandbBackend: Logged 96 metrics at global_step 17
=== [global_logger_2nwJ_r0] - METRICS STEP 17 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.99
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.98
  buffer/sample/count_sample_requests: 71.0
  buffer_perf/sample/total_duration_avg_s: 7.049547193545691e-05
  buffer_perf/sample/total_duration_max_s: 0.0006145690567791462
  dataset/sample/avg_sample_len: 568.3333333333334
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 474.3333333333333
  generator/generate/count_requests: 3.0




[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:32 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:32 INFO[0m Pushing weights for policy version 18
[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:35 INFO[0m Completed weights push in 2.62 seconds
[0] INFO 10-16 19:39:35 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:39:35 INFO[0m Weight update completed (now v17)
Dropping weights @ version 17
WandbBackend: Logged 96 metrics at global_step 18
=== [global_logger_2nwJ_r0] - METRICS STEP 18 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.881118881118881
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.0001103996943968993
  buffer_perf/sample/total_duration_max_s: 0.0006906343623995781
  dataset/sample/avg_sample_l



[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:43 INFO[0m Pushing weights for policy version 19
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:39:45 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:46 INFO[0m Completed weights push in 2.54 seconds
Dropping weights @ version 18
WandbBackend: Logged 96 metrics at global_step 19
=== [global_logger_2nwJ_r0] - METRICS STEP 19 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9903846153846154
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9807692307692308
  buffer/sample/count_sample_requests: 73.0
  buffer_perf/sample/total_duration_avg_s: 6.61438241703053e-05
  buffer_perf/sample/total_duration_max_s: 0.0005676639266312122
  dataset/sample/avg_sample_len: 516.3333333333334
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 485.5416666666667
  generator/ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:47 INFO[0m Pushing weights for policy version 20
[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:50 INFO[0m Completed weights push in 2.38 seconds
[0] INFO 10-16 19:39:50 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 19[0] [34m[Generator-0/1] 2025-10-16 19:39:50 INFO[0m Weight update completed (now v19)

WandbBackend: Logged 96 metrics at global_step 20
=== [global_logger_2nwJ_r0] - METRICS STEP 20 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00013293190083156028
  buffer_perf/sample/total_duration_max_s: 0.0007746540941298008
  dataset/sample/avg_sample_len: 418.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 481.625
  



[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:58 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:39:58 INFO[0m Pushing weights for policy version 21
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:39:59 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:01 INFO[0m Completed weights push in 2.52 seconds
Dropping weights @ version 20
WandbBackend: Logged 96 metrics at global_step 21
=== [global_logger_2nwJ_r0] - METRICS STEP 21 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9854202401372213
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9708404802744426
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 6.478971398964122e-05
  buffer_perf/sample/total_duration_max_s: 0.0005615358240902424
  dataset/sample/avg_sample_len: 557.6666666666666
  dataset



[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:02 INFO[0m Pushing weights for policy version 22
[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:05 INFO[0m Completed weights push in 2.51 seconds
[0] INFO 10-16 19:40:05 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 21[0] [34m[Generator-0/1] 2025-10-16 19:40:05 INFO[0m Weight update completed (now v21)

WandbBackend: Logged 96 metrics at global_step 22
=== [global_logger_2nwJ_r0] - METRICS STEP 22 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.905982905982906
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00011028868791002494
  buffer_perf/sample/total_duration_max_s: 0.0006813290528953075
  dataset/sample/avg_sample_len: 453.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  gen



[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:13 INFO[0m Pushing weights for policy version 23
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:40:14 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:16 INFO[0m Completed weights push in 2.55 seconds
Dropping weights @ version 22
WandbBackend: Logged 96 metrics at global_step 23
=== [global_logger_2nwJ_r0] - METRICS STEP 23 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9811320754716981
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 7.259831260386352e-05
  buffer_perf/sample/total_duration_max_s: 0.0005564410239458084
  dataset/sample/avg_sample_len: 550.3333333333334
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 512.0
  generator/generate/coun



[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:17 INFO[0m Pushing weights for policy version 24
[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:20 INFO[0m Completed weights push in 2.60 seconds
[0] INFO 10-16 19:40:20 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 23[0] [34m[Generator-0/1] 2025-10-16 19:40:20 INFO[0m Weight update completed (now v23)

WandbBackend: Logged 96 metrics at global_step 24
=== [global_logger_2nwJ_r0] - METRICS STEP 24 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00014699262101203203
  buffer_perf/sample/total_duration_max_s: 0.0007410873658955097
  dataset/sample/avg_sample_len: 391.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 479.125
  



[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:28 INFO[0m Pushing weights for policy version 25
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:40:29 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:30 INFO[0m Completed weights push in 2.61 seconds
Dropping weights @ version 24
WandbBackend: Logged 96 metrics at global_step 25
=== [global_logger_2nwJ_r0] - METRICS STEP 25 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9903846153846154
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9807692307692308
  buffer/sample/count_sample_requests: 73.0
  buffer_perf/sample/total_duration_avg_s: 6.512901426790511e-05
  buffer_perf/sample/total_duration_max_s: 0.000586808193475008
  dataset/sample/avg_sample_len: 475.3333333333333
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 496.5833333333333
  generator/ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:32 INFO[0m Pushing weights for policy version 26
[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:34 INFO[0m Completed weights push in 2.40 seconds
[0] INFO 10-16 19:40:35 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 25[0] [34m[Generator-0/1] 2025-10-16 19:40:35 INFO[0m Weight update completed (now v25)

WandbBackend: Logged 96 metrics at global_step 26
=== [global_logger_2nwJ_r0] - METRICS STEP 26 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.0001174025625611345
  buffer_perf/sample/total_duration_max_s: 0.0007265289314091206
  dataset/sample/avg_sample_len: 494.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 485.75
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:42 INFO[0m Pushing weights for policy version 27
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:40:44 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:45 INFO[0m Completed weights push in 2.54 seconds
Dropping weights @ version 26
WandbBackend: Logged 96 metrics at global_step 27
=== [global_logger_2nwJ_r0] - METRICS STEP 27 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.99
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.98
  buffer/sample/count_sample_requests: 71.0
  buffer_perf/sample/total_duration_avg_s: 5.5491484859040085e-05
  buffer_perf/sample/total_duration_max_s: 0.0005910187028348446
  dataset/sample/avg_sample_len: 483.6666666666667
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 474.5416666666667
  generator/generate/count_requests: 3.0



[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:46 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:47 INFO[0m Pushing weights for policy version 28
[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:49 INFO[0m Completed weights push in 2.51 seconds
[0] INFO 10-16 19:40:49 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:40:49 INFO[0m Weight update completed (now v27)
Dropping weights @ version 27
WandbBackend: Logged 96 metrics at global_step 28
=== [global_logger_2nwJ_r0] - METRICS STEP 28 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.881118881118881
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.0001129469690987697
  buffer_perf/sample/total_duration_max_s: 0.0006977841258049011
  dataset/sample/avg_sample_l



[0] [34m[RLTrainer-0/1] 2025-10-16 19:40:57 INFO[0m Pushing weights for policy version 29
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:40:59 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:00 INFO[0m Completed weights push in 2.63 seconds
Dropping weights @ version 28
WandbBackend: Logged 96 metrics at global_step 29
=== [global_logger_2nwJ_r0] - METRICS STEP 29 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9903846153846154
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9807692307692308
  buffer/sample/count_sample_requests: 73.0
  buffer_perf/sample/total_duration_avg_s: 5.497012848723424e-05
  buffer_perf/sample/total_duration_max_s: 0.0005960268899798393
  dataset/sample/avg_sample_len: 467.0
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 486.9166666666667
  generator/generate/coun



[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:01 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:02 INFO[0m Pushing weights for policy version 30
[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:04 INFO[0m Completed weights push in 2.58 seconds
[0] INFO 10-16 19:41:04 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:41:04 INFO[0m Weight update completed (now v29)
Dropping weights @ version 29
WandbBackend: Logged 96 metrics at global_step 30
=== [global_logger_2nwJ_r0] - METRICS STEP 30 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.905982905982906
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00012108314639100662
  buffer_perf/sample/total_duration_max_s: 0.0006692903116345406
  dataset/sample/avg_sample_



[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:12 INFO[0m Pushing weights for policy version 31
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:41:14 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:15 INFO[0m Completed weights push in 2.60 seconds
Dropping weights @ version 30
WandbBackend: Logged 96 metrics at global_step 31
=== [global_logger_2nwJ_r0] - METRICS STEP 31 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 2.2452830188679247
  buffer/sample/count_sample_requests: 73.0
  buffer_perf/sample/total_duration_avg_s: 6.616685845672268e-05
  buffer_perf/sample/total_duration_max_s: 0.0006094980053603649
  dataset/sample/avg_sample_len: 541.0
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 452.0833333333333
  generator/generate/coun



[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:16 INFO[0m Pushing weights for policy version 32
[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:19 INFO[0m Completed weights push in 2.51 seconds
[0] INFO 10-16 19:41:19 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 31[0] [34m[Generator-0/1] 2025-10-16 19:41:19 INFO[0m Weight update completed (now v31)

WandbBackend: Logged 96 metrics at global_step 32
=== [global_logger_2nwJ_r0] - METRICS STEP 32 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00011051702313125134
  buffer_perf/sample/total_duration_max_s: 0.0006664539687335491
  dataset/sample/avg_sample_len: 769.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:27 INFO[0m Pushing weights for policy version 33
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:41:29 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:30 INFO[0m Completed weights push in 2.57 seconds
Dropping weights @ version 32
WandbBackend: Logged 96 metrics at global_step 33
=== [global_logger_2nwJ_r0] - METRICS STEP 33 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9903846153846154
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9807692307692308
  buffer/sample/count_sample_requests: 73.0
  buffer_perf/sample/total_duration_avg_s: 5.39605365428206e-05
  buffer_perf/sample/total_duration_max_s: 0.0006032041274011135
  dataset/sample/avg_sample_len: 468.3333333333333
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 511.125
  generator/generate/cou



[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:31 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:31 INFO[0m Pushing weights for policy version 34
[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:34 INFO[0m Completed weights push in 2.57 seconds
[0] INFO 10-16 19:41:34 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:41:34 INFO[0m Weight update completed (now v33)
Dropping weights @ version 33
WandbBackend: Logged 96 metrics at global_step 34
=== [global_logger_2nwJ_r0] - METRICS STEP 34 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.863905325443787
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00011480126816492814
  buffer_perf/sample/total_duration_max_s: 0.0007470441050827503
  dataset/sample/avg_sample_



[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:41 INFO[0m Pushing weights for policy version 35
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:41:43 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:44 INFO[0m Completed weights push in 2.57 seconds
Dropping weights @ version 34
WandbBackend: Logged 96 metrics at global_step 35
=== [global_logger_2nwJ_r0] - METRICS STEP 35 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9895833333333334
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9791666666666667
  buffer/sample/count_sample_requests: 69.0
  buffer_perf/sample/total_duration_avg_s: 5.113672923998556e-05
  buffer_perf/sample/total_duration_max_s: 0.0006203129887580872
  dataset/sample/avg_sample_len: 437.6666666666667
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 442.9583333333333
  generator/g



[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:45 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:46 INFO[0m Pushing weights for policy version 36
[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:48 INFO[0m Completed weights push in 2.52 seconds
[0] INFO 10-16 19:41:49 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:41:49 INFO[0m Weight update completed (now v35)
Dropping weights @ version 35
WandbBackend: Logged 96 metrics at global_step 36
=== [global_logger_2nwJ_r0] - METRICS STEP 36 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.881118881118881
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00011150223704484793
  buffer_perf/sample/total_duration_max_s: 0.000699714757502079
  dataset/sample/avg_sample_l



[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:57 INFO[0m Pushing weights for policy version 37
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:41:58 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:41:59 INFO[0m Completed weights push in 2.48 seconds
Dropping weights @ version 36WandbBackend: Logged 96 metrics at global_step 37

=== [global_logger_2nwJ_r0] - METRICS STEP 37 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9937106918238992
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 6.61091589545076e-05
  buffer_perf/sample/total_duration_max_s: 0.0005783890374004841
  dataset/sample/avg_sample_len: 597.3333333333334
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 506.625
  generator/generate/cou



[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:01 INFO[0m Pushing weights for policy version 38
[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:03 INFO[0m Completed weights push in 2.59 seconds
[0] INFO 10-16 19:42:04 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 37[0] [34m[Generator-0/1] 2025-10-16 19:42:04 INFO[0m Weight update completed (now v37)

WandbBackend: Logged 96 metrics at global_step 38
=== [global_logger_2nwJ_r0] - METRICS STEP 38 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9126984126984128
  buffer/sample/count_sample_requests: 14.0
  buffer_perf/sample/total_duration_avg_s: 0.00010931069430496012
  buffer_perf/sample/total_duration_max_s: 0.0007015308365225792
  dataset/sample/avg_sample_len: 519.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 481.5
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:11 INFO[0m Pushing weights for policy version 39
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:42:13 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:14 INFO[0m Completed weights push in 2.50 seconds
Dropping weights @ version 38
WandbBackend: Logged 96 metrics at global_step 39
=== [global_logger_2nwJ_r0] - METRICS STEP 39 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9903846153846154
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9807692307692308
  buffer/sample/count_sample_requests: 73.0
  buffer_perf/sample/total_duration_avg_s: 6.097625966553819e-05
  buffer_perf/sample/total_duration_max_s: 0.0008116178214550018
  dataset/sample/avg_sample_len: 576.6666666666666
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 512.0
  generator/generate/coun



[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:16 INFO[0m Pushing weights for policy version 40
[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:18 INFO[0m Completed weights push in 2.67 seconds
[0] INFO 10-16 19:42:19 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 39[0] [34m[Generator-0/1] 2025-10-16 19:42:19 INFO[0m Weight update completed (now v39)

WandbBackend: Logged 96 metrics at global_step 40
=== [global_logger_2nwJ_r0] - METRICS STEP 40 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9230769230769231
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00011279405309603765
  buffer_perf/sample/total_duration_max_s: 0.0006868317723274231
  dataset/sample/avg_sample_len: 645.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:26 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:27 INFO[0m Pushing weights for policy version 41
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:42:28 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:29 INFO[0m Completed weights push in 2.59 seconds
Dropping weights @ version 40
WandbBackend: Logged 96 metrics at global_step 41
=== [global_logger_2nwJ_r0] - METRICS STEP 41 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9817610062893082
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9635220125786164
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 6.440962111023632e-05
  buffer_perf/sample/total_duration_max_s: 0.0006024409085512161
  dataset/sample/avg_sample_len: 627.6666666666666
  dataset



[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:31 INFO[0m Pushing weights for policy version 42
[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:33 INFO[0m Completed weights push in 2.46 seconds
[0] INFO 10-16 19:42:34 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 41[0] [34m[Generator-0/1] 2025-10-16 19:42:34 INFO[0m Weight update completed (now v41)

WandbBackend: Logged 96 metrics at global_step 42
=== [global_logger_2nwJ_r0] - METRICS STEP 42 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00011669821105897427
  buffer_perf/sample/total_duration_max_s: 0.0006679189391434193
  dataset/sample/avg_sample_len: 440.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 505.125
  



[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:42 INFO[0m Pushing weights for policy version 43
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:42:43 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:44 INFO[0m Completed weights push in 2.65 seconds
Dropping weights @ version 42
WandbBackend: Logged 96 metrics at global_step 43
=== [global_logger_2nwJ_r0] - METRICS STEP 43 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9811320754716981
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 5.3852299429677625e-05
  buffer_perf/sample/total_duration_max_s: 0.0005702259950339794
  dataset/sample/avg_sample_len: 511.6666666666667
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 509.0
  generator/generate/cou



[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:46 INFO[0m Pushing weights for policy version 44
[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:48 INFO[0m Completed weights push in 2.51 seconds
[0] INFO 10-16 19:42:49 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 43
[0] [34m[Generator-0/1] 2025-10-16 19:42:49 INFO[0m Weight update completed (now v43)
WandbBackend: Logged 96 metrics at global_step 44
=== [global_logger_2nwJ_r0] - METRICS STEP 44 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00011264466835806768
  buffer_perf/sample/total_duration_max_s: 0.0006381841376423836
  dataset/sample/avg_sample_len: 729.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:56 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:57 INFO[0m Pushing weights for policy version 45
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:42:58 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:42:59 INFO[0m Completed weights push in 2.54 seconds
Dropping weights @ version 44
WandbBackend: Logged 96 metrics at global_step 45
=== [global_logger_2nwJ_r0] - METRICS STEP 45 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9854202401372213
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9708404802744426
  buffer/sample/count_sample_requests: 75.0
  buffer_perf/sample/total_duration_avg_s: 5.639440690477689e-05
  buffer_perf/sample/total_duration_max_s: 0.0006215851753950119
  dataset/sample/avg_sample_len: 553.6666666666666
  dataset



[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:00 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:01 INFO[0m Pushing weights for policy version 46
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:03 INFO[0m Completed weights push in 2.51 seconds
[0] INFO 10-16 19:43:04 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:43:04 INFO[0m Weight update completed (now v45)
Dropping weights @ version 45
WandbBackend: Logged 96 metrics at global_step 46
=== [global_logger_2nwJ_r0] - METRICS STEP 46 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.875
  buffer/sample/count_sample_requests: 8.0
  buffer_perf/sample/total_duration_avg_s: 0.0001501995138823986
  buffer_perf/sample/total_duration_max_s: 0.0007111448794603348
  dataset/sample/avg_sample_len: 430.0
  d



[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:11 INFO[0m Pushing weights for policy version 47
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:43:13 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:14 INFO[0m Completed weights push in 2.49 seconds
Dropping weights @ version 46
WandbBackend: Logged 96 metrics at global_step 47
=== [global_logger_2nwJ_r0] - METRICS STEP 47 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9884696016771488
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9769392033542976
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 5.890396297783465e-05
  buffer_perf/sample/total_duration_max_s: 0.0006290236487984657
  dataset/sample/avg_sample_len: 471.3333333333333
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 503.8333333333333
  generator/g



[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:16 INFO[0m Pushing weights for policy version 48
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:18 INFO[0m Completed weights push in 2.47 seconds
[0] INFO 10-16 19:43:18 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:43:18 INFO[0m Weight update completed (now v47)
Dropping weights @ version 47
WandbBackend: Logged 96 metrics at global_step 48
=== [global_logger_2nwJ_r0] - METRICS STEP 48 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9230769230769231
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00010294116173799221
  buffer_perf/sample/total_duration_max_s: 0.000623106025159359
  dataset/sample/avg_sample_len: 431.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  gen



[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:26 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:26 INFO[0m Pushing weights for policy version 49
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:43:28 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:29 INFO[0m Completed weights push in 2.44 seconds
Dropping weights @ version 48
WandbBackend: Logged 96 metrics at global_step 49
=== [global_logger_2nwJ_r0] - METRICS STEP 49 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9824797843665768
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9649595687331536
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 6.0246036564176144e-05
  buffer_perf/sample/total_duration_max_s: 0.0007492918521165848
  dataset/sample/avg_sample_len: 445.6666666666667
  datase



[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:30 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:31 INFO[0m Pushing weights for policy version 50
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:33 INFO[0m Completed weights push in 2.39 seconds
[0] INFO 10-16 19:43:34 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 49[0] [34m[Generator-0/1] 2025-10-16 19:43:34 INFO[0m Weight update completed (now v49)

WandbBackend: Logged 96 metrics at global_step 50
=== [global_logger_2nwJ_r0] - METRICS STEP 50 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.8711111111111112
  buffer/sample/count_sample_requests: 15.0
  buffer_perf/sample/total_duration_avg_s: 0.00011752707262833914
  buffer_perf/sample/total_duration_max_s: 0.000755571760237217
  dataset/sample/avg_sample_



[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:41 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:42 INFO[0m Pushing weights for policy version 51
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:43:43 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:44 INFO[0m Completed weights push in 2.44 seconds
Dropping weights @ version 50
WandbBackend: Logged 96 metrics at global_step 51
=== [global_logger_2nwJ_r0] - METRICS STEP 51 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9824797843665768
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9649595687331536
  buffer/sample/count_sample_requests: 75.0
  buffer_perf/sample/total_duration_avg_s: 6.211245432496071e-05
  buffer_perf/sample/total_duration_max_s: 0.0005779541097581387
  dataset/sample/avg_sample_len: 591.3333333333334
  dataset



[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:46 INFO[0m Pushing weights for policy version 52
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:48 INFO[0m Completed weights push in 2.49 seconds
[0] INFO 10-16 19:43:49 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:43:49 INFO[0m Weight update completed (now v51)
Dropping weights @ version 51
WandbBackend: Logged 96 metrics at global_step 52
=== [global_logger_2nwJ_r0] - METRICS STEP 52 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9230769230769231
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00013340117696386116
  buffer_perf/sample/total_duration_max_s: 0.0007184157148003578
  dataset/sample/avg_sample_len: 495.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:56 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:57 INFO[0m Pushing weights for policy version 53
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:43:58 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:43:59 INFO[0m Completed weights push in 2.44 seconds
Dropping weights @ version 52
WandbBackend: Logged 96 metrics at global_step 53
=== [global_logger_2nwJ_r0] - METRICS STEP 53 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9854202401372213
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9708404802744426
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 6.124676784148087e-05
  buffer_perf/sample/total_duration_max_s: 0.0005596177652478218
  dataset/sample/avg_sample_len: 608.0
  dataset/sample/coun



[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:01 INFO[0m Pushing weights for policy version 54
[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:03 INFO[0m Completed weights push in 2.40 seconds
[0] INFO 10-16 19:44:04 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:44:04 INFO[0m Weight update completed (now v53)
Dropping weights @ version 53
WandbBackend: Logged 96 metrics at global_step 54
=== [global_logger_2nwJ_r0] - METRICS STEP 54 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9230769230769231
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00010259911561241516
  buffer_perf/sample/total_duration_max_s: 0.0006533539853990078
  dataset/sample/avg_sample_len: 469.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:11 INFO[0m Pushing weights for policy version 55
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:44:13 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:14 INFO[0m Completed weights push in 2.45 seconds
Dropping weights @ version 54
WandbBackend: Logged 96 metrics at global_step 55
=== [global_logger_2nwJ_r0] - METRICS STEP 55 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9903846153846154
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9807692307692308
  buffer/sample/count_sample_requests: 73.0
  buffer_perf/sample/total_duration_avg_s: 6.56144119390886e-05
  buffer_perf/sample/total_duration_max_s: 0.0006125504150986671
  dataset/sample/avg_sample_len: 530.0
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 499.0
  generator/generate/count_requests: 3



[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:15 INFO[0m Pushing weights for policy version 56
[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:18 INFO[0m Completed weights push in 2.42 seconds
[0] INFO 10-16 19:44:18 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:44:18 INFO[0m Weight update completed (now v55)
Dropping weights @ version 55
WandbBackend: Logged 96 metrics at global_step 56
=== [global_logger_2nwJ_r0] - METRICS STEP 56 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9
  buffer/sample/count_sample_requests: 10.0
  buffer_perf/sample/total_duration_avg_s: 0.00012715714983642102
  buffer_perf/sample/total_duration_max_s: 0.0006671976298093796
  dataset/sample/avg_sample_len: 472.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 303.0
  generator/generat



[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:26 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:26 INFO[0m Pushing weights for policy version 57
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:44:27 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:29 INFO[0m Completed weights push in 3.09 seconds
Dropping weights @ version 56
WandbBackend: Logged 96 metrics at global_step 57
=== [global_logger_2nwJ_r0] - METRICS STEP 57 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9817610062893082
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9635220125786164
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 5.269951086390663e-05
  buffer_perf/sample/total_duration_max_s: 0.0005781189538538456
  dataset/sample/avg_sample_len: 441.6666666666667
  dataset



[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:30 INFO[0m Pushing weights for policy version 58
[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:33 INFO[0m Completed weights push in 2.50 seconds
[0] INFO 10-16 19:44:33 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 57
[0] [34m[Generator-0/1] 2025-10-16 19:44:33 INFO[0m Weight update completed (now v57)
WandbBackend: Logged 96 metrics at global_step 58
=== [global_logger_2nwJ_r0] - METRICS STEP 58 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.8333333333333333
  buffer/sample/count_sample_requests: 6.0
  buffer_perf/sample/total_duration_avg_s: 0.00020294799469411373
  buffer_perf/sample/total_duration_max_s: 0.0007157051004469395
  dataset/sample/avg_sample_len: 493.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 489.625
  g



[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:40 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:41 INFO[0m Pushing weights for policy version 59
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:44:42 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:44 INFO[0m Completed weights push in 2.56 seconds
Dropping weights @ version 58
WandbBackend: Logged 96 metrics at global_step 59
=== [global_logger_2nwJ_r0] - METRICS STEP 59 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9842767295597484
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9685534591194969
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 5.04960079450865e-05
  buffer_perf/sample/total_duration_max_s: 0.0005829497240483761
  dataset/sample/avg_sample_len: 490.0
  dataset/sample/count



[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:45 INFO[0m Pushing weights for policy version 60
[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:48 INFO[0m Completed weights push in 2.56 seconds
[0] INFO 10-16 19:44:48 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 59[0] [34m[Generator-0/1] 2025-10-16 19:44:48 INFO[0m Weight update completed (now v59)

WandbBackend: Logged 96 metrics at global_step 60
=== [global_logger_2nwJ_r0] - METRICS STEP 60 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00012859566292415062
  buffer_perf/sample/total_duration_max_s: 0.0006772470660507679
  dataset/sample/avg_sample_len: 535.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 496.625
  



[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:56 INFO[0m Pushing weights for policy version 61
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:44:57 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:44:58 INFO[0m Completed weights push in 2.42 seconds
Dropping weights @ version 60
WandbBackend: Logged 96 metrics at global_step 61
=== [global_logger_2nwJ_r0] - METRICS STEP 61 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9903846153846154
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9807692307692308
  buffer/sample/count_sample_requests: 73.0
  buffer_perf/sample/total_duration_avg_s: 6.52739625066927e-05
  buffer_perf/sample/total_duration_max_s: 0.0005552456714212894
  dataset/sample/avg_sample_len: 435.3333333333333
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 512.0
  generator/generate/count



[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:00 INFO[0m Pushing weights for policy version 62
[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:02 INFO[0m Completed weights push in 2.45 seconds
[0] INFO 10-16 19:45:03 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:45:03 INFO[0m Weight update completed (now v61)
Dropping weights @ version 61
WandbBackend: Logged 96 metrics at global_step 62
=== [global_logger_2nwJ_r0] - METRICS STEP 62 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00011985589905331533
  buffer_perf/sample/total_duration_max_s: 0.0006834007799625397
  dataset/sample/avg_sample_len: 614.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 357.375
  



[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:10 INFO[0m Pushing weights for policy version 63
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:45:12 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:13 INFO[0m Completed weights push in 2.53 seconds
Dropping weights @ version 62
WandbBackend: Logged 96 metrics at global_step 63
=== [global_logger_2nwJ_r0] - METRICS STEP 63 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9811320754716981
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 6.123793915518232e-05
  buffer_perf/sample/total_duration_max_s: 0.0005514598451554775
  dataset/sample/avg_sample_len: 497.6666666666667
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 512.0
  generator/generate/coun



[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:15 INFO[0m Pushing weights for policy version 64
[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:17 INFO[0m Completed weights push in 2.42 seconds
[0] INFO 10-16 19:45:18 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 63[0] [34m[Generator-0/1] 2025-10-16 19:45:18 INFO[0m Weight update completed (now v63)

WandbBackend: Logged 96 metrics at global_step 64
=== [global_logger_2nwJ_r0] - METRICS STEP 64 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9285714285714286
  buffer/sample/count_sample_requests: 14.0
  buffer_perf/sample/total_duration_avg_s: 0.00010807642580143042
  buffer_perf/sample/total_duration_max_s: 0.000652907881885767
  dataset/sample/avg_sample_len: 538.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  gen



[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:26 INFO[0m Pushing weights for policy version 65
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:45:27 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:28 INFO[0m Completed weights push in 2.55 seconds
Dropping weights @ version 64
WandbBackend: Logged 96 metrics at global_step 65
=== [global_logger_2nwJ_r0] - METRICS STEP 65 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9811320754716981
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 5.318727847692129e-05
  buffer_perf/sample/total_duration_max_s: 0.0005662101320922375
  dataset/sample/avg_sample_len: 541.6666666666666
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 512.0
  generator/generate/coun



[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:30 INFO[0m Pushing weights for policy version 66
[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:32 INFO[0m Completed weights push in 2.50 seconds
[0] INFO 10-16 19:45:33 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 65[0] [34m[Generator-0/1] 2025-10-16 19:45:33 INFO[0m Weight update completed (now v65)

WandbBackend: Logged 96 metrics at global_step 66
=== [global_logger_2nwJ_r0] - METRICS STEP 66 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9230769230769231
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00010263887592233144
  buffer_perf/sample/total_duration_max_s: 0.0006297859363257885
  dataset/sample/avg_sample_len: 472.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:40 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:41 INFO[0m Pushing weights for policy version 67
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:45:42 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:43 INFO[0m Completed weights push in 2.55 seconds
Dropping weights @ version 66
WandbBackend: Logged 96 metrics at global_step 67
=== [global_logger_2nwJ_r0] - METRICS STEP 67 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9842767295597484
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9685534591194969
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 7.183537692637057e-05
  buffer_perf/sample/total_duration_max_s: 0.0006119427271187305
  dataset/sample/avg_sample_len: 531.3333333333334
  dataset



[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:45 INFO[0m Pushing weights for policy version 68
[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:47 INFO[0m Completed weights push in 2.47 seconds
[0] INFO 10-16 19:45:48 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 67[0] [34m[Generator-0/1] 2025-10-16 19:45:48 INFO[0m Weight update completed (now v67)

WandbBackend: Logged 96 metrics at global_step 68
=== [global_logger_2nwJ_r0] - METRICS STEP 68 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00011041515972465277
  buffer_perf/sample/total_duration_max_s: 0.0006347321905195713
  dataset/sample/avg_sample_len: 562.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:55 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:56 INFO[0m Pushing weights for policy version 69
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:45:57 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:58 INFO[0m Completed weights push in 2.50 seconds
Dropping weights @ version 68
WandbBackend: Logged 96 metrics at global_step 69
=== [global_logger_2nwJ_r0] - METRICS STEP 69 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9836182336182335
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.967236467236467
  buffer/sample/count_sample_requests: 75.0
  buffer_perf/sample/total_duration_avg_s: 6.795891871054967e-05
  buffer_perf/sample/total_duration_max_s: 0.0005606706254184246
  dataset/sample/avg_sample_len: 451.3333333333333
  dataset/



[0] [34m[RLTrainer-0/1] 2025-10-16 19:45:59 INFO[0m Pushing weights for policy version 70
[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:02 INFO[0m Completed weights push in 2.43 seconds
[0] INFO 10-16 19:46:02 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 69[0] [34m[Generator-0/1] 2025-10-16 19:46:02 INFO[0m Weight update completed (now v69)

WandbBackend: Logged 96 metrics at global_step 70
=== [global_logger_2nwJ_r0] - METRICS STEP 70 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.8333333333333333
  buffer/sample/count_sample_requests: 6.0
  buffer_perf/sample/total_duration_avg_s: 0.00019701927279432616
  buffer_perf/sample/total_duration_max_s: 0.0006700139492750168
  dataset/sample/avg_sample_len: 502.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 303.75
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:10 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:10 INFO[0m Pushing weights for policy version 71
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:46:12 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:12 INFO[0m Completed weights push in 2.39 seconds
Dropping weights @ version 70
WandbBackend: Logged 96 metrics at global_step 71
=== [global_logger_2nwJ_r0] - METRICS STEP 71 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9856902356902357
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9713804713804715
  buffer/sample/count_sample_requests: 75.0
  buffer_perf/sample/total_duration_avg_s: 5.040739973386129e-05
  buffer_perf/sample/total_duration_max_s: 0.0005782661028206348
  dataset/sample/avg_sample_len: 487.6666666666667
  dataset



[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:14 INFO[0m Pushing weights for policy version 72
[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:17 INFO[0m Completed weights push in 2.56 seconds
[0] INFO 10-16 19:46:18 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 71[0] [34m[Generator-0/1] 2025-10-16 19:46:18 INFO[0m Weight update completed (now v71)

WandbBackend: Logged 96 metrics at global_step 72
=== [global_logger_2nwJ_r0] - METRICS STEP 72 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9375
  buffer/sample/count_sample_requests: 16.0
  buffer_perf/sample/total_duration_avg_s: 0.00010043001384474337
  buffer_perf/sample/total_duration_max_s: 0.0006704856641590595
  dataset/sample/avg_sample_len: 444.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  generator/gene



[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:25 INFO[0m Pushing weights for policy version 73
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:46:27 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:28 INFO[0m Completed weights push in 2.57 seconds
Dropping weights @ version 72
WandbBackend: Logged 96 metrics at global_step 73
=== [global_logger_2nwJ_r0] - METRICS STEP 73 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9884696016771488
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9769392033542976
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 4.915767852720377e-05
  buffer_perf/sample/total_duration_max_s: 0.0005612177774310112
  dataset/sample/avg_sample_len: 570.3333333333334
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 512.0
  generator/generate/coun



[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:30 INFO[0m Pushing weights for policy version 74
[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:32 INFO[0m Completed weights push in 2.46 seconds
[0] INFO 10-16 19:46:33 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 73[0] [34m[Generator-0/1] 2025-10-16 19:46:33 INFO[0m Weight update completed (now v73)

WandbBackend: Logged 96 metrics at global_step 74
=== [global_logger_2nwJ_r0] - METRICS STEP 74 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00010860660889496405
  buffer_perf/sample/total_duration_max_s: 0.000607282854616642
  dataset/sample/avg_sample_len: 477.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  gen



[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:40 INFO[0m Pushing weights for policy version 75
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:46:41 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:43 INFO[0m Completed weights push in 2.60 seconds
Dropping weights @ version 74
WandbBackend: Logged 96 metrics at global_step 75
=== [global_logger_2nwJ_r0] - METRICS STEP 75 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9895833333333334
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9791666666666667
  buffer/sample/count_sample_requests: 69.0
  buffer_perf/sample/total_duration_avg_s: 5.2894904291716173e-05
  buffer_perf/sample/total_duration_max_s: 0.0006058490835130215
  dataset/sample/avg_sample_len: 472.3333333333333
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 399.75
  generator/generate/co



[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:44 INFO[0m Pushing weights for policy version 76
[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:46 INFO[0m Completed weights push in 2.48 seconds
[0] INFO 10-16 19:46:47 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 75
[0] [34m[Generator-0/1] 2025-10-16 19:46:47 INFO[0m Weight update completed (now v75)
WandbBackend: Logged 96 metrics at global_step 76
=== [global_logger_2nwJ_r0] - METRICS STEP 76 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.8888888888888888
  buffer/sample/count_sample_requests: 9.0
  buffer_perf/sample/total_duration_avg_s: 0.00014418425659338632
  buffer_perf/sample/total_duration_max_s: 0.0007209400646388531
  dataset/sample/avg_sample_len: 438.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 493.875
  g



[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:55 INFO[0m Pushing weights for policy version 77
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:46:56 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:57 INFO[0m Completed weights push in 2.69 seconds
Dropping weights @ version 76
WandbBackend: Logged 96 metrics at global_step 77
=== [global_logger_2nwJ_r0] - METRICS STEP 77 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9811320754716981
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 6.70963734690402e-05
  buffer_perf/sample/total_duration_max_s: 0.0005780747160315514
  dataset/sample/avg_sample_len: 489.0
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 479.9583333333333
  generator/generate/count



[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:59 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:46:59 INFO[0m Pushing weights for policy version 78
[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:02 INFO[0m Completed weights push in 2.56 seconds
[0] INFO 10-16 19:47:02 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 77[0] [34m[Generator-0/1] 2025-10-16 19:47:02 INFO[0m Weight update completed (now v77)

WandbBackend: Logged 96 metrics at global_step 78
=== [global_logger_2nwJ_r0] - METRICS STEP 78 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.881118881118881
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00011223866246067561
  buffer_perf/sample/total_duration_max_s: 0.0007216348312795162
  dataset/sample/avg_sample_



[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:10 INFO[0m Pushing weights for policy version 79
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:47:11 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:12 INFO[0m Completed weights push in 2.48 seconds
Dropping weights @ version 78
WandbBackend: Logged 96 metrics at global_step 79
=== [global_logger_2nwJ_r0] - METRICS STEP 79 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9884696016771488
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9769392033542976
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 6.43682968173478e-05
  buffer_perf/sample/total_duration_max_s: 0.0006142621859908104
  dataset/sample/avg_sample_len: 466.6666666666667
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 469.6666666666667
  generator/ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:14 INFO[0m Pushing weights for policy version 80
[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:17 INFO[0m Completed weights push in 2.56 seconds
[0] INFO 10-16 19:47:17 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 79
[0] [34m[Generator-0/1] 2025-10-16 19:47:17 INFO[0m Weight update completed (now v79)
WandbBackend: Logged 96 metrics at global_step 80
=== [global_logger_2nwJ_r0] - METRICS STEP 80 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9230769230769231
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00012224776527056328
  buffer_perf/sample/total_duration_max_s: 0.0006776242516934872
  dataset/sample/avg_sample_len: 453.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:24 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:25 INFO[0m Pushing weights for policy version 81
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:47:26 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:28 INFO[0m Completed weights push in 2.60 seconds
Dropping weights @ version 80
WandbBackend: Logged 96 metrics at global_step 81
=== [global_logger_2nwJ_r0] - METRICS STEP 81 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9817610062893082
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9635220125786164
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 7.171169351282958e-05
  buffer_perf/sample/total_duration_max_s: 0.0005520349368453026
  dataset/sample/avg_sample_len: 553.3333333333334
  dataset



[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:29 INFO[0m Pushing weights for policy version 82
[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:32 INFO[0m Completed weights push in 2.56 seconds
[0] INFO 10-16 19:47:32 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 81[0] [34m[Generator-0/1] 2025-10-16 19:47:32 INFO[0m Weight update completed (now v81)

WandbBackend: Logged 96 metrics at global_step 82
=== [global_logger_2nwJ_r0] - METRICS STEP 82 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9090909090909092
  buffer/sample/count_sample_requests: 11.0
  buffer_perf/sample/total_duration_avg_s: 0.00012036285955797543
  buffer_perf/sample/total_duration_max_s: 0.0007116599008440971
  dataset/sample/avg_sample_len: 534.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:40 INFO[0m Pushing weights for policy version 83
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:47:41 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:42 INFO[0m Completed weights push in 2.53 seconds
Dropping weights @ version 82
WandbBackend: Logged 96 metrics at global_step 83
=== [global_logger_2nwJ_r0] - METRICS STEP 83 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9865229110512128
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 6.575104025368755e-05
  buffer_perf/sample/total_duration_max_s: 0.0005638338625431061
  dataset/sample/avg_sample_len: 480.0
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 512.0
  generator/generate/count_requests: 



[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:44 INFO[0m Pushing weights for policy version 84
[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:47 INFO[0m Completed weights push in 2.49 seconds
[0] INFO 10-16 19:47:47 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 83[0] [34m[Generator-0/1] 2025-10-16 19:47:47 INFO[0m Weight update completed (now v83)

WandbBackend: Logged 96 metrics at global_step 84
=== [global_logger_2nwJ_r0] - METRICS STEP 84 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9230769230769231
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00010417380298559483
  buffer_perf/sample/total_duration_max_s: 0.000655696727335453
  dataset/sample/avg_sample_len: 529.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 424.0
  gen



[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:55 INFO[0m Pushing weights for policy version 85
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:47:56 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:58 INFO[0m Completed weights push in 2.64 seconds
Dropping weights @ version 84
WandbBackend: Logged 96 metrics at global_step 85
=== [global_logger_2nwJ_r0] - METRICS STEP 85 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9811320754716981
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 5.219691130961921e-05
  buffer_perf/sample/total_duration_max_s: 0.0005867430008947849
  dataset/sample/avg_sample_len: 545.0
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 509.75
  generator/generate/count_requests:



[0] [34m[RLTrainer-0/1] 2025-10-16 19:47:59 INFO[0m Pushing weights for policy version 86
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:02 INFO[0m Completed weights push in 2.61 seconds
[0] INFO 10-16 19:48:02 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 85[0] [34m[Generator-0/1] 2025-10-16 19:48:02 INFO[0m Weight update completed (now v85)

WandbBackend: Logged 96 metrics at global_step 86
=== [global_logger_2nwJ_r0] - METRICS STEP 86 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00010864343494176865
  buffer_perf/sample/total_duration_max_s: 0.0006650383584201336
  dataset/sample/avg_sample_len: 523.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:10 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:10 INFO[0m Pushing weights for policy version 87
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:48:12 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:13 INFO[0m Completed weights push in 2.61 seconds
Dropping weights @ version 86
WandbBackend: Logged 96 metrics at global_step 87
=== [global_logger_2nwJ_r0] - METRICS STEP 87 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9856902356902357
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 2.0824915824915826
  buffer/sample/count_sample_requests: 75.0
  buffer_perf/sample/total_duration_avg_s: 5.7983969648679096e-05
  buffer_perf/sample/total_duration_max_s: 0.0005920468829572201
  dataset/sample/avg_sample_len: 496.6666666666667
  datase



[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:14 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:14 INFO[0m Pushing weights for policy version 88
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:17 INFO[0m Completed weights push in 2.52 seconds
[0] INFO 10-16 19:48:17 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:48:17 INFO[0m Weight update completed (now v87)
Dropping weights @ version 87
WandbBackend: Logged 96 metrics at global_step 88
=== [global_logger_2nwJ_r0] - METRICS STEP 88 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.8388888888888888
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00012063037138432264
  buffer_perf/sample/total_duration_max_s: 0.0007004239596426487
  dataset/sample/avg_sample



[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:25 INFO[0m Pushing weights for policy version 89
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:48:26 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:28 INFO[0m Completed weights push in 2.60 seconds
Dropping weights @ version 88
WandbBackend: Logged 96 metrics at global_step 89
=== [global_logger_2nwJ_r0] - METRICS STEP 89 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9811320754716981
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 5.764018970768194e-05
  buffer_perf/sample/total_duration_max_s: 0.0005711079575121403
  dataset/sample/avg_sample_len: 508.6666666666667
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 512.0
  generator/generate/coun



[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:29 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:29 INFO[0m Pushing weights for policy version 90
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:32 INFO[0m Completed weights push in 2.59 seconds
[0] INFO 10-16 19:48:32 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 89[0] [34m[Generator-0/1] 2025-10-16 19:48:32 INFO[0m Weight update completed (now v89)

WandbBackend: Logged 96 metrics at global_step 90
=== [global_logger_2nwJ_r0] - METRICS STEP 90 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.881118881118881
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.0001311747428889458
  buffer_perf/sample/total_duration_max_s: 0.0007735099643468857
  dataset/sample/avg_sample_l



[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:39 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:39 INFO[0m Pushing weights for policy version 91
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:48:41 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:42 INFO[0m Completed weights push in 2.66 seconds
Dropping weights @ version 90
WandbBackend: Logged 96 metrics at global_step 91
=== [global_logger_2nwJ_r0] - METRICS STEP 91 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9833091436865021
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9666182873730043
  buffer/sample/count_sample_requests: 66.0
  buffer_perf/sample/total_duration_avg_s: 6.773680533197793e-05
  buffer_perf/sample/total_duration_max_s: 0.0005966550670564175
  dataset/sample/avg_sample_len: 425.3333333333333
  dataset



[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:43 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:44 INFO[0m Pushing weights for policy version 92
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:46 INFO[0m Completed weights push in 2.51 seconds
[0] INFO 10-16 19:48:47 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 91[0] [34m[Generator-0/1] 2025-10-16 19:48:47 INFO[0m Weight update completed (now v91)

WandbBackend: Logged 96 metrics at global_step 92
=== [global_logger_2nwJ_r0] - METRICS STEP 92 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.8452380952380951
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00011925628253569205
  buffer_perf/sample/total_duration_max_s: 0.0006701163947582245
  dataset/sample/avg_sample



[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:55 INFO[0m Pushing weights for policy version 93
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:48:56 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:57 INFO[0m Completed weights push in 2.56 seconds
Dropping weights @ version 92
WandbBackend: Logged 96 metrics at global_step 93
=== [global_logger_2nwJ_r0] - METRICS STEP 93 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9886831275720164
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9826572604350379
  buffer/sample/count_sample_requests: 75.0
  buffer_perf/sample/total_duration_avg_s: 5.220916743079821e-05
  buffer_perf/sample/total_duration_max_s: 0.0005929078906774521
  dataset/sample/avg_sample_len: 447.3333333333333
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 480.25
  generator/generate/cou



[0] [34m[RLTrainer-0/1] 2025-10-16 19:48:59 INFO[0m Pushing weights for policy version 94
[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:01 INFO[0m Completed weights push in 2.73 seconds
[0] INFO 10-16 19:49:02 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:49:02 INFO[0m Weight update completed (now v93)
Dropping weights @ version 93
WandbBackend: Logged 96 metrics at global_step 94
=== [global_logger_2nwJ_r0] - METRICS STEP 94 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00011258894422401984
  buffer_perf/sample/total_duration_max_s: 0.0006952397525310516
  dataset/sample/avg_sample_len: 562.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 483.75
  g



[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:10 INFO[0m Pushing weights for policy version 95
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:49:11 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:12 INFO[0m Completed weights push in 2.67 seconds
Dropping weights @ version 94
WandbBackend: Logged 96 metrics at global_step 95
=== [global_logger_2nwJ_r0] - METRICS STEP 95 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9907407407407407
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 2.0185185185185186
  buffer/sample/count_sample_requests: 75.0
  buffer_perf/sample/total_duration_avg_s: 6.0995339105526606e-05
  buffer_perf/sample/total_duration_max_s: 0.0005840021185576916
  dataset/sample/avg_sample_len: 498.6666666666667
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 485.625
  generator/generate/c



[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:14 INFO[0m Pushing weights for policy version 96
[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:16 INFO[0m Completed weights push in 2.55 seconds
[0] INFO 10-16 19:49:17 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 95[0] [34m[Generator-0/1] 2025-10-16 19:49:17 INFO[0m Weight update completed (now v95)

WandbBackend: Logged 96 metrics at global_step 96
=== [global_logger_2nwJ_r0] - METRICS STEP 96 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9090909090909092
  buffer/sample/count_sample_requests: 11.0
  buffer_perf/sample/total_duration_avg_s: 0.0001421005211093209
  buffer_perf/sample/total_duration_max_s: 0.0006960597820580006
  dataset/sample/avg_sample_len: 640.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 442.625
  g



[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:25 INFO[0m Pushing weights for policy version 97
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:49:26 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:27 INFO[0m Completed weights push in 2.65 seconds
Dropping weights @ version 96
WandbBackend: Logged 96 metrics at global_step 97
=== [global_logger_2nwJ_r0] - METRICS STEP 97 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9907407407407407
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9867724867724865
  buffer/sample/count_sample_requests: 75.0
  buffer_perf/sample/total_duration_avg_s: 7.294429466128349e-05
  buffer_perf/sample/total_duration_max_s: 0.0006040469743311405
  dataset/sample/avg_sample_len: 457.0
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 507.125
  generator/generate/count_requests



[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:29 INFO[0m Pushing weights for policy version 98
[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:32 INFO[0m Completed weights push in 2.45 seconds
[0] INFO 10-16 19:49:32 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 97[0] [34m[Generator-0/1] 2025-10-16 19:49:32 INFO[0m Weight update completed (now v97)

WandbBackend: Logged 96 metrics at global_step 98
=== [global_logger_2nwJ_r0] - METRICS STEP 98 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00012081194048126538
  buffer_perf/sample/total_duration_max_s: 0.0006208540871739388
  dataset/sample/avg_sample_len: 698.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0
  ge



[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:40 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:40 INFO[0m Pushing weights for policy version 99
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:49:41 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:43 INFO[0m Completed weights push in 2.61 seconds
Dropping weights @ version 98
WandbBackend: Logged 96 metrics at global_step 99
=== [global_logger_2nwJ_r0] - METRICS STEP 99 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9836182336182335
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.967236467236467
  buffer/sample/count_sample_requests: 75.0
  buffer_perf/sample/total_duration_avg_s: 6.387346734603245e-05
  buffer_perf/sample/total_duration_max_s: 0.0005996674299240112
  dataset/sample/avg_sample_len: 512.6666666666666
  dataset/



[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:44 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:44 INFO[0m Pushing weights for policy version 100
[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:47 INFO[0m Completed weights push in 2.52 seconds
[0] INFO 10-16 19:49:47 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:49:47 INFO[0m Weight update completed (now v99)
Dropping weights @ version 99
WandbBackend: Logged 96 metrics at global_step 100
=== [global_logger_2nwJ_r0] - METRICS STEP 100 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.871212121212121
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00011660166395207246
  buffer_perf/sample/total_duration_max_s: 0.0006787208840250969
  dataset/sample/avg_samp



[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:55 INFO[0m Pushing weights for policy version 101
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:49:57 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:58 INFO[0m Completed weights push in 2.77 seconds
Dropping weights @ version 100
WandbBackend: Logged 96 metrics at global_step 101
=== [global_logger_2nwJ_r0] - METRICS STEP 101 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9811320754716981
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 5.086199254602999e-05
  buffer_perf/sample/total_duration_max_s: 0.0005687740631401539
  dataset/sample/avg_sample_len: 493.0
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 503.5
  generator/generate/count_reques



Dropped weights @ version 100, took 0.51 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:49:59 INFO[0m Pushing weights for policy version 102
[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:01 INFO[0m Completed weights push in 2.49 seconds
[0] INFO 10-16 19:50:02 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:50:02 INFO[0m Weight update completed (now v101)
Dropping weights @ version 101
WandbBackend: Logged 96 metrics at global_step 102
=== [global_logger_2nwJ_r0] - METRICS STEP 102 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.8
  buffer/sample/count_sample_requests: 5.0
  buffer_perf/sample/total_duration_avg_s: 0.00023207776248455048
  buffer_perf/sample/total_duration_max_s: 0.0007106349803507328
  dataset/sample/avg_sample_len: 560.0
  dataset/sample/count_samples_generated: 1.0
  generator/gen



[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:10 INFO[0m Pushing weights for policy version 103
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:50:11 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:12 INFO[0m Completed weights push in 2.54 seconds
Dropping weights @ version 102
WandbBackend: Logged 96 metrics at global_step 103
=== [global_logger_2nwJ_r0] - METRICS STEP 103 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9872727272727272
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9866666666666664
  buffer/sample/count_sample_requests: 76.0
  buffer_perf/sample/total_duration_avg_s: 8.987585131667162e-05
  buffer_perf/sample/total_duration_max_s: 0.000585456844419241
  dataset/sample/avg_sample_len: 518.6666666666666
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 512.0
  generator/generate/c



[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:14 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:14 INFO[0m Pushing weights for policy version 104
[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:17 INFO[0m Completed weights push in 2.60 seconds
[0] INFO 10-16 19:50:17 [block_pool.py:321] Successfully reset prefix cache
Dropping weights @ version 103[0] [34m[Generator-0/1] 2025-10-16 19:50:17 INFO[0m Weight update completed (now v103)

WandbBackend: Logged 96 metrics at global_step 104
=== [global_logger_2nwJ_r0] - METRICS STEP 104 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.881118881118881
  buffer/sample/count_sample_requests: 13.0
  buffer_perf/sample/total_duration_avg_s: 0.00014519451472621696
  buffer_perf/sample/total_duration_max_s: 0.0007302733138203621
  dataset/sample/avg_sa



[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:24 INFO[0m Pushing weights for policy version 105
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:50:26 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:27 INFO[0m Completed weights push in 2.49 seconds
Dropping weights @ version 104
WandbBackend: Logged 96 metrics at global_step 105
=== [global_logger_2nwJ_r0] - METRICS STEP 105 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 2.0943396226415096
  buffer/sample/count_sample_requests: 71.0
  buffer_perf/sample/total_duration_avg_s: 7.72726538420563e-05
  buffer_perf/sample/total_duration_max_s: 0.0006498252041637897
  dataset/sample/avg_sample_len: 451.3333333333333
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 430.9166666666667
  generato



[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:28 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:29 INFO[0m Pushing weights for policy version 106
[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:31 INFO[0m Completed weights push in 2.62 seconds
[0] INFO 10-16 19:50:32 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:50:32 INFO[0m Weight update completed (now v105)
Dropping weights @ version 105
WandbBackend: Logged 96 metrics at global_step 106
=== [global_logger_2nwJ_r0] - METRICS STEP 106 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9000000000000001
  buffer/sample/count_sample_requests: 14.0
  buffer_perf/sample/total_duration_avg_s: 0.00010653388952570302
  buffer_perf/sample/total_duration_max_s: 0.0006964700296521187
  dataset/sample/avg_s



[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:39 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:40 INFO[0m Pushing weights for policy version 107
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:50:41 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:42 INFO[0m Completed weights push in 2.60 seconds
Dropping weights @ version 106
WandbBackend: Logged 96 metrics at global_step 107
=== [global_logger_2nwJ_r0] - METRICS STEP 107 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9836182336182335
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.967236467236467
  buffer/sample/count_sample_requests: 75.0
  buffer_perf/sample/total_duration_avg_s: 6.385939195752144e-05
  buffer_perf/sample/total_duration_max_s: 0.0005684578791260719
  dataset/sample/avg_sample_len: 519.3333333333334
  data



[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:44 INFO[0m Pushing weights for policy version 108
[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:46 INFO[0m Completed weights push in 2.57 seconds
[0] INFO 10-16 19:50:47 [block_pool.py:321] Successfully reset prefix cache
[0] [34m[Generator-0/1] 2025-10-16 19:50:47 INFO[0m Weight update completed (now v107)
Dropping weights @ version 107
WandbBackend: Logged 96 metrics at global_step 108
=== [global_logger_2nwJ_r0] - METRICS STEP 108 ===
  buffer/add/count_episodes_added: 8.0
  buffer/evict/avg_policy_age: 1.0
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9166666666666667
  buffer/sample/count_sample_requests: 12.0
  buffer_perf/sample/total_duration_avg_s: 0.00012942620863517126
  buffer_perf/sample/total_duration_max_s: 0.000685298815369606
  dataset/sample/avg_sample_len: 461.0
  dataset/sample/count_samples_generated: 1.0
  generator/generate/avg_tokens_generated: 512.0




[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:55 INFO[0m Pushing weights for policy version 109
[0] [34m[ReferenceModel-0/1] 2025-10-16 19:50:56 INFO[0m [GC] Performing periodic GC collection took 0.00 seconds
[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:58 INFO[0m Completed weights push in 3.41 seconds
Dropping weights @ version 108
WandbBackend: Logged 96 metrics at global_step 109
=== [global_logger_2nwJ_r0] - METRICS STEP 109 ===
  buffer/add/count_episodes_added: 24.0
  buffer/evict/avg_policy_age: 0.9905660377358491
  buffer/evict/max_policy_age: 1.0
  buffer/evict/sum_episodes_evicted: 16.0
  buffer/sample/avg_data_utilization: 1.9811320754716981
  buffer/sample/count_sample_requests: 74.0
  buffer_perf/sample/total_duration_avg_s: 8.4217033676199e-05
  buffer_perf/sample/total_duration_max_s: 0.0006482847966253757
  dataset/sample/avg_sample_len: 573.0
  dataset/sample/count_samples_generated: 3.0
  generator/generate/avg_tokens_generated: 512.0
  generator/generate/count_requests



[0] [34m[RLTrainer-0/1] 2025-10-16 19:50:59 INFO[0m Pushing weights for policy version 110


## Shutdown

In [None]:
await mlogger.shutdown.call_one()
await asyncio.sleep(2)

await asyncio.gather(
    DatasetActor.shutdown(dataloader),
    policy.shutdown(),
    RLTrainer.shutdown(trainer),
    ReplayBuffer.shutdown(replay_buffer),
    ComputeAdvantages.shutdown(compute_advantages),
    ref_model.shutdown(),
    reward_actor.shutdown(),
)
await shutdown()