From 37b7a15f9b4de59492e8136378aebf691389d2cd Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Mon, 13 Oct 2025 12:11:44 -0700
Subject: [PATCH 1/3] update Qwen3-32B GRPO config (batch sizes, token limits, parallelism)

---
 apps/grpo/qwen3_32b.yaml | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index 593a2e1fb..5729517da 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -3,10 +3,10 @@
 # NOTE - This has not been tested for correctness yet! All testing so far has been only for infrastructure stability
 
 # Global configuration
-group_size: 2
-local_batch_size: 8 # per-device batch size
-max_req_tokens: 512
-max_res_tokens: 512
+group_size: 16
+local_batch_size: 32 # per-device batch size
+max_req_tokens: 1024
+max_res_tokens: 1024
 model: "Qwen/Qwen3-32B"
 off_by_n: 1 # Off by one by default
 
@@ -14,7 +14,7 @@ provisioner:
   launcher: slurm
 
 # Main loop configuration
-rollout_threads: 1   # Recommended to set equal to policy.num_replicas
+rollout_threads: 32  # Setting this to 4x the number of policy replicas seems to work well. NOTE(review): 32 is 8x services.policy.num_replicas (4) - confirm
 
 # Observability configuration
 metric_logging:
@@ -35,12 +35,12 @@ dataset:
 
 # Policy configuration
 policy:
-  engine_args:  # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
+  engine_config:
     model: ${model}
     tensor_parallel_size: 4
     pipeline_parallel_size: 1
     enforce_eager: false
-  sampling_params:  # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
+  sampling_config:
     n: ${group_size}
     max_tokens: ${max_res_tokens}
     temperature: 1.0
@@ -69,8 +69,8 @@ trainer:
     enable: false
   parallelism:
     data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: -1
-    tensor_parallel_degree: 1
+    data_parallel_shard_degree: 1
+    tensor_parallel_degree: 8
     pipeline_parallel_degree: 1
     context_parallel_degree: 1
     expert_parallel_degree: 1
@@ -90,7 +90,7 @@ replay_buffer:
   batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   # dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
-  dp_size: 8
+  dp_size: 1
 
 # Reference model configuration
 ref_model:
@@ -118,37 +118,30 @@ ref_model:
 # All resource allocations
 services:
   policy:
-    procs: ${policy.engine_args.tensor_parallel_size}
-    num_replicas: 1
+    procs: ${policy.engine_config.tensor_parallel_size}
+    num_replicas: 4
     hosts: 1
     with_gpus: true
-    mesh_name: policy
   ref_model:
     procs: ${ref_model.parallelism.tensor_parallel_degree}
     num_replicas: 1
     with_gpus: true
-    mesh_name: ref_model
   reward_actor:
     procs: 1
     num_replicas: 1
     with_gpus: false
-    mesh_name: reward_actor
 
 actors:
   dataset:
     procs: 1
     with_gpus: false
-    mesh_name: dataset
   trainer:
     procs: 8
     hosts: 1
     with_gpus: true
-    mesh_name: trainer
   replay_buffer:
     procs: 1
     with_gpus: false
-    mesh_name: replay_buffer
   compute_advantages:
     procs: 1
     with_gpus: false
-    mesh_name: compute_advantages

From 56abc3cad34c39a5ae819d1c2a7b75a594cfffac Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Mon, 13 Oct 2025 12:14:41 -0700
Subject: [PATCH 2/3] fix: restore engine_args/sampling_params keys and mesh_name entries

---
 apps/grpo/qwen3_32b.yaml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index 5729517da..e49b0ca93 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -35,12 +35,12 @@ dataset:
 
 # Policy configuration
 policy:
-  engine_config:
+  engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
     model: ${model}
     tensor_parallel_size: 4
     pipeline_parallel_size: 1
     enforce_eager: false
-  sampling_config:
+  sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
     n: ${group_size}
     max_tokens: ${max_res_tokens}
     temperature: 1.0
@@ -118,30 +118,37 @@ ref_model:
 # All resource allocations
 services:
   policy:
-    procs: ${policy.engine_config.tensor_parallel_size}
+    procs: ${policy.engine_args.tensor_parallel_size}
     num_replicas: 4
     hosts: 1
     with_gpus: true
+    mesh_name: policy
   ref_model:
     procs: ${ref_model.parallelism.tensor_parallel_degree}
     num_replicas: 1
     with_gpus: true
+    mesh_name: ref_model
   reward_actor:
     procs: 1
     num_replicas: 1
     with_gpus: false
+    mesh_name: reward_actor
 
 actors:
   dataset:
     procs: 1
     with_gpus: false
+    mesh_name: dataset
   trainer:
     procs: 8
     hosts: 1
     with_gpus: true
+    mesh_name: trainer
   replay_buffer:
     procs: 1
     with_gpus: false
+    mesh_name: replay_buffer
   compute_advantages:
     procs: 1
     with_gpus: false
+    mesh_name: compute_advantages

From 7decb30d55a5e0a0c2837359618ccda323405ba5 Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Mon, 13 Oct 2025 12:47:32 -0700
Subject: [PATCH 3/3] fix spacing

---
 apps/grpo/qwen3_32b.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index e49b0ca93..e7a0cf509 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -35,12 +35,12 @@ dataset:
 
 # Policy configuration
 policy:
-  engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
+  engine_args:  # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
     model: ${model}
     tensor_parallel_size: 4
     pipeline_parallel_size: 1
     enforce_eager: false
-  sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
+  sampling_params:  # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
     n: ${group_size}
     max_tokens: ${max_res_tokens}
     temperature: 1.0