modelscope · Jintao-Huang · Sep 6, 2025 · Sep 6, 2025 · Sep 6, 2025 · Sep 6, 2025
diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py
@@ -2003,3 +2003,9 @@ def _get_inputs_embeds_hf(inputs_embeds, inputs, visual, processor, config):
                 video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                 inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
         return inputs_embeds
+
+    @staticmethod
+    def _concat_text_position_ids(position_ids):
+        """Prepend one row of 0..seq_len-1 text positions to `position_ids` along dim 0."""
+        text_row = torch.arange(position_ids.shape[-1], device=position_ids.device)
+        return torch.concat([text_row.expand(1, *position_ids.shape[1:]), position_ids], dim=0)
diff --git a/swift/llm/template/template/glm.py b/swift/llm/template/template/glm.py
@@ -317,15 +317,15 @@ def _get_position_ids(self, inputs: Dict[str, Any]):
             inputs.get('image_grid_thw'),
             inputs.get('video_grid_thw'),
             attention_mask=inputs.get('attention_mask'))
-        text_position_ids = torch.arange(inputs['input_ids'].shape[-1])
-        return torch.concat([text_position_ids[None, None], position_ids], dim=0)
+        return self._concat_text_position_ids(position_ids)
 
     def forward_context(self, model, inputs):
         position_ids = inputs['position_ids']
         inputs['position_ids'] = position_ids[1:]
-        inputs['text_position_ids'] = position_ids[0]
+        inputs['text_position_ids'] = text_position_ids = position_ids[0]  # first row; the rest stays in 'position_ids'
         # https://github.com/huggingface/transformers/pull/40194
-        inputs.update(get_packed_seq_params(inputs['text_position_ids']))
+        if text_position_ids.shape[0] == 1:  # leading dim of 1 only (presumably batch size; confirm)
+            inputs.update(get_packed_seq_params(text_position_ids))
         return super().forward_context(model, inputs)
 
     def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:

diff --git a/swift/llm/template/template/qwen.py b/swift/llm/template/template/qwen.py
@@ -313,7 +313,7 @@ def forward_context(self, model, inputs):
         inputs['position_ids'] = position_ids[1:]
         inputs['text_position_ids'] = text_position_ids = position_ids[0]
         transformers_version = version.parse(transformers.__version__)
-        if transformers_version >= version.parse('4.53'):
+        if transformers_version >= version.parse('4.53') and text_position_ids.shape[0] == 1:
             # https://github.com/huggingface/transformers/pull/40194
             inputs.update(get_packed_seq_params(text_position_ids))
             return super().forward_context(model, inputs)
@@ -372,8 +372,7 @@ def _get_position_ids(self, inputs: Dict[str, Any]):
             inputs.get('video_grid_thw'),
             attention_mask=inputs.get('attention_mask'),
             **kwargs)
-        text_position_ids = torch.arange(inputs['input_ids'].shape[-1])
-        return torch.concat([text_position_ids[None, None], position_ids], dim=0)
+        return self._concat_text_position_ids(position_ids)
 
     def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
         res = super()._data_collator(batch, padding_to=padding_to)
@@ -591,8 +590,7 @@ def _get_position_ids(self, inputs: Dict[str, Any]):
             audio_feature_lengths,
             video_second_per_grid,
         )
-        text_position_ids = torch.arange(inputs['input_ids'].shape[-1])
-        return torch.concat([text_position_ids[None, None], position_ids], dim=0)
+        return self._concat_text_position_ids(position_ids)
 
     def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
         res = super()._data_collator_mm_data(batch)