From f8219b903dd9defe5569eaaf971a31b99ba824c2 Mon Sep 17 00:00:00 2001
From: "Jiyue (Jennifer) Wang"
Date: Thu, 20 Nov 2025 10:40:45 -0800
Subject: [PATCH] Bug fix for dropping episodes in the GRPO (#601)

Summary:

## Bug Description:

https://github.com/meta-pytorch/torchforge/pull/580 had incorrect indentation,
causing the input_ids and episodes variables to be deleted inside the
episode-building loop and making the program hang. A follow-up diff will
surface background-thread crashes to the main thread so that we know which
thread crashed and why.

Reviewed By: daniellepintz

Differential Revision: D87554570
---
 apps/grpo/main.py | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/apps/grpo/main.py b/apps/grpo/main.py
index ea37c5351..1c1c2bd4a 100644
--- a/apps/grpo/main.py
+++ b/apps/grpo/main.py
@@ -424,23 +424,21 @@ async def continuous_rollouts():
                 input_ids[i, :max_req_tokens] = episode.request_tensor
                 input_ids[i, max_req_tokens:] = episode.response_tensor
 
-                # drop episodes if
-                # 1> reward std-dev is very small (including all 0s and all 1s)
-                # 2> response is potentially truncated (response_len >= max_res_tokens)
-                rewards = [e.reward for e in episodes]
-                rewards_std = torch.std(torch.tensor(rewards))
-                max_response_len = max(
-                    e.completion.token_ids.shape[0] for e in episodes
-                )
-                drop = rewards_std < 1e-3 or max_response_len >= max_res_tokens
-                record_metric(
-                    "main/continuous_rollouts/dropped_episodes",
-                    1 if drop else 0,
-                    Reduce.SUM,
-                )
-                if drop:
-                    del input_ids, episodes
-                    continue
+            # drop episodes if
+            # 1> reward std-dev is very small (including all 0s and all 1s)
+            # 2> response is potentially truncated (response_len >= max_res_tokens)
+            rewards = [e.reward for e in episodes]
+            rewards_std = torch.std(torch.tensor(rewards))
+            max_response_len = max(e.completion.token_ids.shape[0] for e in episodes)
+            drop = rewards_std < 1e-3 or max_response_len >= max_res_tokens
+            record_metric(
+                "main/continuous_rollouts/dropped_episodes",
+                1 if drop else 0,
+                Reduce.SUM,
+            )
+            if drop:
+                del input_ids, episodes
+                continue
 
             t.step("reward_evaluation")
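
For readers outside apps/grpo/main.py, here is a minimal sketch (not part of the patch) of the failure mode the Summary describes: the drop check from #580 was nested inside the per-episode loop, so `del input_ids, episodes` ran mid-iteration and the next pass over the loop hit an unbound `input_ids`. The dict-based episodes, tensor shapes, and the `build_batch_buggy` helper are illustrative stand-ins; only `input_ids`, `episodes`, and the indentation pattern come from the patch.

```python
import torch


def build_batch_buggy(episodes, max_req_tokens=4, max_res_tokens=4):
    """Simplified stand-in for the episode-packing loop in continuous_rollouts()."""
    input_ids = torch.zeros(
        len(episodes), max_req_tokens + max_res_tokens, dtype=torch.long
    )
    for i, episode in enumerate(episodes):
        input_ids[i, :max_req_tokens] = episode["request"]
        input_ids[i, max_req_tokens:] = episode["response"]

        # BUG (pre-#601 indentation): this block belongs one level up, after the
        # loop finishes. Running it per-episode deletes input_ids on iteration 0...
        rewards = [e["reward"] for e in episodes]
        if torch.std(torch.tensor(rewards)) < 1e-3:
            del input_ids, episodes
            continue  # ...so iteration 1 raises UnboundLocalError on input_ids

    return input_ids


# All rewards equal -> reward std-dev is 0 -> the drop branch fires mid-loop.
episodes = [
    {
        "request": torch.ones(4, dtype=torch.long),
        "response": torch.ones(4, dtype=torch.long),
        "reward": 1.0,
    }
    for _ in range(2)
]
build_batch_buggy(episodes)  # UnboundLocalError: input_ids referenced after del
```

Per the Summary, the real loop runs in a background rollout task, so this crash is not surfaced to the main thread and the program appears to hang; the follow-up diff mentioned above is meant to propagate such background crashes.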