
Commit a31bd1f

nil0x9 authored and github-actions[bot] committed
[Chore] clean commented debug code
1 parent a19ae05 · commit a31bd1f

10 files changed: 3 additions & 38 deletions

tests/engine/test_moe_train_engine.py

Lines changed: 1 addition & 3 deletions
@@ -407,14 +407,12 @@ def test_load_optimizer_with_new_lr(self, device):
             fsdp_cfg=fsdp_cfg,
         )
         engine2.load_dcp(model_dir=model_dir, optimizer_dir=optimizer_dir, load_args=False)
-        # print(f"len(engine.optimizer.state), len(engine2.optimizer.state): {len(engine.optimizer.state)}, {len(engine2.optimizer.state)}")
         assert len(engine.optimizer.state) == len(engine2.optimizer.state)
         assert len(engine.optimizer.state) != 0
         for param_group in engine2.optimizer.param_groups:
-            # print(f"param_group['lr']: {param_group['lr']}")
             assert param_group['lr'] == lr2
             assert param_group['eps'] == eps2
-
+
         lr3 = 1e-1
         eps3 = 1e-3
         optim_cfg3 = AdamWConfig(lr=lr3, eps=eps3)
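Note: both removed statements were ad-hoc prints used while debugging this assertion block. Where that output is still occasionally useful, a module-level logger at DEBUG level stays silent by default and survives future cleanups; a minimal sketch, assuming only the standard logging module (the logger name is illustrative):

    import logging

    logger = logging.getLogger("xtuner.tests.engine")  # illustrative name

    def log_optimizer_state(opt1, opt2):
        # Silent unless the test run enables DEBUG logging,
        # e.g. pytest --log-cli-level=DEBUG.
        logger.debug("optimizer state sizes: %d vs %d",
                     len(opt1.state), len(opt2.state))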

xtuner/v1/model/base.py

Lines changed: 0 additions & 2 deletions
@@ -421,8 +421,6 @@ def fully_shard(
 
         if self.fsdp_config.requires_grad:
             for name, module in self.named_modules():
-                # if "ts_model" in name:
-                #     torch.distributed.breakpoint()
                 for p_name, param in module.named_parameters(recurse=False):
                     if param.requires_grad:
                         param_fp32 = torch.nn.Parameter(param.to(dtype=torch.float32))

xtuner/v1/module/decoder_layer/moe_decoder_layer.py

Lines changed: 0 additions & 25 deletions
@@ -692,15 +692,6 @@ def forward(
     def backward(ctx, grad_output: torch.Tensor):
         current_stream = torch.cuda.current_stream()
 
-        # if ctx.name == "pre_dispatched":
-        #     torch.cuda.synchronize()
-        #
-        # if ctx.name == "dispatched":
-        #     torch.cuda.synchronize()
-        #
-        # if ctx.name == "pre_combined":
-        #     torch.cuda.synchronize()
-        #
         if ctx.previous_backward_event is not None:
             current_stream.wait_event(ctx.previous_backward_event)
         if ctx.finished_backward_event is not None:
@@ -710,19 +701,3 @@ def backward(ctx, grad_output: torch.Tensor):
 
 
 backward_sync = _BackwardSync.apply
-
-
-# class _DebugBackward(Function):
-#     @staticmethod
-#     def forward(
-#         ctx,
-#         input_tensor: torch.Tensor,
-#         name: str
-#     ) -> torch.Tensor:
-#         ctx.name = name
-#         return input_tensor
-#
-#     @staticmethod
-#     def backward(ctx, grad_output: torch.Tensor):
-#         print(ctx.name)
-#         return grad_output, None
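Note: the block deleted here was a commented-out identity autograd Function used to print a tag (e.g. "pre_dispatched", "dispatched", "pre_combined") as gradients flowed back through the MoE dispatch/combine stages. If that tracing is ever needed again, the same pattern can live in the tree behind an opt-in flag rather than in comments; a minimal runnable sketch reconstructed from the deleted lines (the XTUNER_DEBUG_BACKWARD variable is an assumption, not an existing switch):

    import os

    import torch
    from torch.autograd import Function

    class _DebugBackward(Function):
        """Identity op that prints its tag when gradients pass through."""

        @staticmethod
        def forward(ctx, input_tensor: torch.Tensor, name: str) -> torch.Tensor:
            ctx.name = name
            return input_tensor

        @staticmethod
        def backward(ctx, grad_output: torch.Tensor):
            if os.environ.get("XTUNER_DEBUG_BACKWARD"):  # hypothetical opt-in flag
                print(ctx.name)
            # One gradient per forward input; the non-tensor name gets None.
            return grad_output, None

Usage is a pass-through, e.g. hidden_states = _DebugBackward.apply(hidden_states, "pre_dispatched").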

xtuner/v1/ops/flash_attn/gpu.py

Lines changed: 1 addition & 2 deletions
@@ -529,8 +529,7 @@ def _flash_attn_varlen_forward(
         return_softmax,
         None,
     )
-    # if out.isnan().any() or softmax_lse.isnan().any():
-    #     breakpoint()
+
     return out, softmax_lse, S_dmask, rng_state
 
 @torch.library.register_fake("flash_attn::_flash_attn_varlen_forward_v2")
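Note: the removed lines were a hand-toggled NaN trap after the varlen flash-attention kernel call. If an always-available guard is preferable, it can be gated on an environment variable so the hot path stays free of breakpoints; a minimal sketch (XTUNER_CHECK_NAN is an assumed flag, not an existing one):

    import os

    import torch

    _CHECK_NAN = bool(os.environ.get("XTUNER_CHECK_NAN"))  # hypothetical opt-in

    def _assert_finite(name: str, t: torch.Tensor) -> None:
        # No-op unless the flag is set; raises instead of dropping into pdb.
        if _CHECK_NAN and torch.isnan(t).any():
            raise RuntimeError(f"NaN detected in {name}")

e.g. _assert_finite("out", out) right after the kernel returns.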

xtuner/v1/ops/flash_attn/npu.py

Lines changed: 0 additions & 1 deletion
@@ -20,7 +20,6 @@ def npu_flash_varlen_attn(
     return_attn_probs=False,
     block_table=None,
 ):
-    # print(f"rank:{torch.distributed.get_rank()}, cu_seqlens_k:{cu_seqlens_q.type}")
     if not causal:
         fa_out = torch_npu.npu_fusion_attention(
             q,

xtuner/v1/ops/moe/cuda/triton_kernels/k_grouped_gemm_TMA_triton3_4.py

Lines changed: 1 addition & 1 deletion
@@ -278,7 +278,7 @@ def trace_handler(prof):
     # post-process, row normalization
     out_triton = row_max_normalization(out_triton)
     out_ref = row_max_normalization(out_ref)
-    # breakpoint()
+
     torch.testing.assert_close(out_triton, out_ref, rtol=0.001, atol=0.01)
 
     print(f"{m = }, {n = }, {K = }")

xtuner/v1/ops/moe/cuda/triton_kernels/m_grouped_gemm_TMA_triton3_4.py

Lines changed: 0 additions & 1 deletion
@@ -406,7 +406,6 @@ def trace_handler(prof):
         # with record_function(f"Cutlass_record"):
         #     backend.gmm(a, b, out_cutlass, batch_sizes, False, trans_b)
         prof.step()
-    # breakpoint()
     # post-process, row normalization
     out_triton = row_max_normalization(out_triton)
     # out_cublas = row_max_normalization(out_cublas)

xtuner/v1/ray/judger/dapo_math.py

Lines changed: 0 additions & 1 deletion
@@ -205,7 +205,6 @@ def is_correct_strict_box(
     # Extract and check the boxed answer
     boxed_pred = last_boxed_only_string(pred)
     extracted_pred = remove_boxed(boxed_pred) if boxed_pred is not None else None
-    # print("==========", extracted_pred, gt)
 
     return 1 if (extracted_pred == gt) else -1, extracted_pred
 

xtuner/v1/train/trainer.py

Lines changed: 0 additions & 1 deletion
@@ -1256,7 +1256,6 @@ def meta(self) -> XTunerMeta:
     def _data_iter(self):
         data_iter = iter(self._dataloader)
         while self._cur_step < self.total_step:
-            # dist.breakpoint(skip=14)
             try:
                 data = next(data_iter)
             except StopIteration:
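Note: the deleted line used torch.distributed.breakpoint, which synchronizes all ranks and attaches pdb on one of them; its skip argument ignores the first N hits so the debugger fires on a later iteration. If that is wanted again without committing it, the call can be driven by an environment variable; a minimal sketch (XTUNER_BREAK_STEP is illustrative):

    import os

    import torch.distributed as dist

    _BREAK_STEP = int(os.environ.get("XTUNER_BREAK_STEP", "-1"))  # hypothetical opt-in

    def _maybe_break(step: int) -> None:
        # Drops rank 0 into pdb at the requested step; other ranks wait.
        if dist.is_initialized() and step == _BREAK_STEP:
            dist.breakpoint(rank=0)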

xtuner/v1/utils/grad_norm.py

Lines changed: 0 additions & 1 deletion
@@ -69,7 +69,6 @@ def cal_total_norm(
 
 def cal_grad_norm(grads: List[DTensor], dtype=torch.float32):
     grouped_grads = group_tensors_by_device_mesh_and_placements(grads)
-    # print(f"clip_grad_norm dtype: {dtype}")
     total_norms = []
     for grads in grouped_grads.values():
         total_norm = cal_total_norm(grads, norm_type=2.0, foreach=True, dtype=dtype)
