From d7a24506f95df2f27d8fae7bb6674eabf4900735 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 13 Nov 2025 14:09:19 +0800 Subject: [PATCH 1/2] fix bridge vpp --- swift/megatron/model/gpt_bridge.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/swift/megatron/model/gpt_bridge.py b/swift/megatron/model/gpt_bridge.py index e15ea770a6..5d69a10df6 100644 --- a/swift/megatron/model/gpt_bridge.py +++ b/swift/megatron/model/gpt_bridge.py @@ -978,8 +978,9 @@ def _convert(self, mg_models, hf_state_dict, hf_prefix: str, to_mcore: bool, tqd else: yield from list(self._add_prefix(hf_state_dict, hf_prefix).items()) hf_state_dict = {} - for layer_idx in tqdm( - range(self.args.num_layers), dynamic_ncols=True, desc=tqdm_desc, disable=self.disable_tqmd): + layer_idx = 0 + prog_bar = tqdm(range(self.args.num_layers), dynamic_ncols=True, desc=tqdm_desc, disable=self.disable_tqmd) + while layer_idx < self.args.num_layers: lm_model = getattr(mg_model, 'language_model') if self.args.is_multimodal else mg_model if len(lm_model.decoder.layers) > 0: start_idx = lm_model.decoder.layers[0].layer_number - 1 @@ -990,6 +991,8 @@ def _convert(self, mg_models, hf_state_dict, hf_prefix: str, to_mcore: bool, tqd mg_layer = lm_model.decoder.layers[layer_idx - start_idx] else: if to_mcore: + layer_idx += 1 + prog_bar.update() continue else: mg_layer = None @@ -997,9 +1000,11 @@ def _convert(self, mg_models, hf_state_dict, hf_prefix: str, to_mcore: bool, tqd has_model = torch.tensor([mg_layer is not None], dtype=torch.bool, device='cuda') dist.all_reduce(has_model, group=self.pp_group) if not has_model: - mg_model = next(mg_models) + mg_model = next(mg_models) # compat vpp continue res = self._set_layer_state(mg_layer, hf_state_dict, f'{self.hf_layers_prefix}.', layer_idx, to_mcore) + layer_idx += 1 + prog_bar.update() if to_mcore: yield else: From 0c914378fae1ac77f567203e2d3a7a25ee2a5abe Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 13 Nov 2025 14:38:05 +0800 Subject: [PATCH 2/2] fix cp kto --- swift/megatron/trainers/kto_trainer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/swift/megatron/trainers/kto_trainer.py b/swift/megatron/trainers/kto_trainer.py index a3d8cd2f01..d0a385aa41 100644 --- a/swift/megatron/trainers/kto_trainer.py +++ b/swift/megatron/trainers/kto_trainer.py @@ -50,7 +50,7 @@ def _kto_get_logps(self, output_tensor, data, is_KL: bool, is_ref: bool, length: return self.get_logps(output, labels, packed_seq_params, packed_seq_params.num_samples) def loss_func(self, output_tensor, *, data, kl_data, label): - length = data['packed_seq_params'].cu_seqlens_q[-1] + length = data['packed_seq_params'].cu_seqlens_q[-1] // self.args.context_parallel_size policy_logps = self._kto_get_logps(output_tensor, data, False, False, length) ref_logps = self._kto_get_logps(output_tensor, data, False, True, length) if self.args.calculate_KL: @@ -121,8 +121,7 @@ def forward_step(self, data_iterator, model): data.pop('loss_scale', None) kl_data.pop('loss_scale', None) - length = data['packed_seq_params'].cu_seqlens_q[-1] - + length = data['packed_seq_params'].cu_seqlens_q[-1] // self.args.context_parallel_size with torch.no_grad(), self.null_ref_context() as ref_models: ref_model = ref_models[vp_stage or 0] if self.args.calculate_KL: