
Commit a31bd1f

nil0x9 authored and github-actions[bot] committed
[Chore] clean commented debug code
1 parent a19ae05 · commit a31bd1f

10 files changed: 3 additions & 38 deletions

tests/engine/test_moe_train_engine.py

Lines changed: 1 addition & 3 deletions
@@ -407,14 +407,12 @@ def test_load_optimizer_with_new_lr(self, device):
             fsdp_cfg=fsdp_cfg,
         )
         engine2.load_dcp(model_dir=model_dir, optimizer_dir=optimizer_dir, load_args=False)
-        # print(f"len(engine.optimizer.state), len(engine2.optimizer.state): {len(engine.optimizer.state)}, {len(engine2.optimizer.state)}")
         assert len(engine.optimizer.state) == len(engine2.optimizer.state)
         assert len(engine.optimizer.state) != 0
         for param_group in engine2.optimizer.param_groups:
-            # print(f"param_group['lr']: {param_group['lr']}")
             assert param_group['lr'] == lr2
             assert param_group['eps'] == eps2
-
+
         lr3 = 1e-1
         eps3 = 1e-3
         optim_cfg3 = AdamWConfig(lr=lr3, eps=eps3)
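Note: both removed statements were ad-hoc prints used while debugging this assertion block. Where that output is still occasionally useful, a module-level logger at DEBUG level stays silent by default and survives future cleanups; a minimal sketch, assuming only the standard logging module (the logger name is illustrative):

    import logging

    logger = logging.getLogger("xtuner.tests.engine")  # illustrative name

    def log_optimizer_state(opt1, opt2):
        # Silent unless the test run enables DEBUG logging,
        # e.g. pytest --log-cli-level=DEBUG.
        logger.debug("optimizer state sizes: %d vs %d",
                     len(opt1.state), len(opt2.state))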

xtuner/v1/model/base.py

Lines changed: 0 additions & 2 deletions
@@ -421,8 +421,6 @@ def fully_shard(
 
         if self.fsdp_config.requires_grad:
             for name, module in self.named_modules():
-                # if "ts_model" in name:
-                #     torch.distributed.breakpoint()
                 for p_name, param in module.named_parameters(recurse=False):
                     if param.requires_grad:
                         param_fp32 = torch.nn.Parameter(param.to(dtype=torch.float32))

xtuner/v1/module/decoder_layer/moe_decoder_layer.py

Lines changed: 0 additions & 25 deletions
@@ -692,15 +692,6 @@ def forward(
     def backward(ctx, grad_output: torch.Tensor):
         current_stream = torch.cuda.current_stream()
 
-        # if ctx.name == "pre_dispatched":
-        #     torch.cuda.synchronize()
-        #
-        # if ctx.name == "dispatched":
-        #     torch.cuda.synchronize()
-        #
-        # if ctx.name == "pre_combined":
-        #     torch.cuda.synchronize()
-        #
         if ctx.previous_backward_event is not None:
             current_stream.wait_event(ctx.previous_backward_event)
         if ctx.finished_backward_event is not None:
@@ -710,19 +701,3 @@ def backward(ctx, grad_output: torch.Tensor):
 
 
 backward_sync = _BackwardSync.apply
-
-
-# class _DebugBackward(Function):
-#     @staticmethod
-#     def forward(
-#         ctx,
-#         input_tensor: torch.Tensor,
-#         name: str
-#     ) -> torch.Tensor:
-#         ctx.name = name
-#         return input_tensor
-#
-#     @staticmethod
-#     def backward(ctx, grad_output: torch.Tensor):
-#         print(ctx.name)
-#         return grad_output, None
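Note: the block deleted here was a commented-out identity autograd Function used to print a tag (e.g. "pre_dispatched", "dispatched", "pre_combined") as gradients flowed back through the MoE dispatch/combine stages. If that tracing is ever needed again, the same pattern can live in the tree behind an opt-in flag rather than in comments; a minimal runnable sketch reconstructed from the deleted lines (the XTUNER_DEBUG_BACKWARD variable is an assumption, not an existing switch):

    import os

    import torch
    from torch.autograd import Function

    class _DebugBackward(Function):
        """Identity op that prints its tag when gradients pass through."""

        @staticmethod
        def forward(ctx, input_tensor: torch.Tensor, name: str) -> torch.Tensor:
            ctx.name = name
            return input_tensor

        @staticmethod
        def backward(ctx, grad_output: torch.Tensor):
            if os.environ.get("XTUNER_DEBUG_BACKWARD"):  # hypothetical opt-in flag
                print(ctx.name)
            # One gradient per forward input; the non-tensor name gets None.
            return grad_output, None

Usage is a pass-through, e.g. hidden_states = _DebugBackward.apply(hidden_states, "pre_dispatched").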

xtuner/v1/ops/flash_attn/gpu.py

Lines changed: 1 addition & 2 deletions
@@ -529,8 +529,7 @@ def _flash_attn_varlen_forward(
         return_softmax,
         None,
     )
-    # if out.isnan().any() or softmax_lse.isnan().any():
-    #     breakpoint()
+
     return out, softmax_lse, S_dmask, rng_state
 
 @torch.library.register_fake("flash_attn::_flash_attn_varlen_forward_v2")
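Note: the removed lines were a hand-toggled NaN trap after the varlen flash-attention kernel call. If an always-available guard is preferable, it can be gated on an environment variable so the hot path stays free of breakpoints; a minimal sketch (XTUNER_CHECK_NAN is an assumed flag, not an existing one):

    import os

    import torch

    _CHECK_NAN = bool(os.environ.get("XTUNER_CHECK_NAN"))  # hypothetical opt-in

    def _assert_finite(name: str, t: torch.Tensor) -> None:
        # No-op unless the flag is set; raises instead of dropping into pdb.
        if _CHECK_NAN and torch.isnan(t).any():
            raise RuntimeError(f"NaN detected in {name}")

e.g. _assert_finite("out", out) right after the kernel returns.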

xtuner/v1/ops/flash_attn/npu.py

Lines changed: 0 additions & 1 deletion
@@ -20,7 +20,6 @@ def npu_flash_varlen_attn(
     return_attn_probs=False,
     block_table=None,
 ):
-    # print(f"rank:{torch.distributed.get_rank()}, cu_seqlens_k:{cu_seqlens_q.type}")
     if not causal:
         fa_out = torch_npu.npu_fusion_attention(
             q,

xtuner/v1/ops/moe/cuda/triton_kernels/k_grouped_gemm_TMA_triton3_4.py

Lines changed: 1 addition & 1 deletion
@@ -278,7 +278,7 @@ def trace_handler(prof):
     # post-process, row normalization
     out_triton = row_max_normalization(out_triton)
     out_ref = row_max_normalization(out_ref)
-    # breakpoint()
+
     torch.testing.assert_close(out_triton, out_ref, rtol=0.001, atol=0.01)
 
     print(f"{m = }, {n = }, {K = }")

xtuner/v1/ops/moe/cuda/triton_kernels/m_grouped_gemm_TMA_triton3_4.py

Lines changed: 0 additions & 1 deletion
@@ -406,7 +406,6 @@ def trace_handler(prof):
         # with record_function(f"Cutlass_record"):
         #     backend.gmm(a, b, out_cutlass, batch_sizes, False, trans_b)
         prof.step()
-    # breakpoint()
     # post-process, row normalization
     out_triton = row_max_normalization(out_triton)
     # out_cublas = row_max_normalization(out_cublas)

xtuner/v1/ray/judger/dapo_math.py

Lines changed: 0 additions & 1 deletion
@@ -205,7 +205,6 @@ def is_correct_strict_box(
     # Extract and check the boxed answer
     boxed_pred = last_boxed_only_string(pred)
     extracted_pred = remove_boxed(boxed_pred) if boxed_pred is not None else None
-    # print("==========", extracted_pred, gt)
 
     return 1 if (extracted_pred == gt) else -1, extracted_pred
 

xtuner/v1/train/trainer.py

Lines changed: 0 additions & 1 deletion
@@ -1256,7 +1256,6 @@ def meta(self) -> XTunerMeta:
     def _data_iter(self):
         data_iter = iter(self._dataloader)
         while self._cur_step < self.total_step:
-            # dist.breakpoint(skip=14)
             try:
                 data = next(data_iter)
             except StopIteration:
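Note: the deleted line used torch.distributed.breakpoint, which synchronizes all ranks and attaches pdb on one of them; its skip argument ignores the first N hits so the debugger fires on a later iteration. If that is wanted again without committing it, the call can be driven by an environment variable; a minimal sketch (XTUNER_BREAK_STEP is illustrative):

    import os

    import torch.distributed as dist

    _BREAK_STEP = int(os.environ.get("XTUNER_BREAK_STEP", "-1"))  # hypothetical opt-in

    def _maybe_break(step: int) -> None:
        # Drops rank 0 into pdb at the requested step; other ranks wait.
        if dist.is_initialized() and step == _BREAK_STEP:
            dist.breakpoint(rank=0)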

xtuner/v1/utils/grad_norm.py

Lines changed: 0 additions & 1 deletion
@@ -69,7 +69,6 @@ def cal_total_norm(
 
 def cal_grad_norm(grads: List[DTensor], dtype=torch.float32):
     grouped_grads = group_tensors_by_device_mesh_and_placements(grads)
-    # print(f"clip_grad_norm dtype: {dtype}")
     total_norms = []
     for grads in grouped_grads.values():
         total_norm = cal_total_norm(grads, norm_type=2.0, foreach=True, dtype=dtype)
