diff --git a/swift/llm/template/template/qwen.py b/swift/llm/template/template/qwen.py index a6bf8d3990..bb9a3cdfea 100644 --- a/swift/llm/template/template/qwen.py +++ b/swift/llm/template/template/qwen.py @@ -536,7 +536,10 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int return ['<|audio_start|><|audio_pad|><|audio_end|>'] elif media_type == 'video': video = inputs.videos[index] - inputs.videos[index] = fetch_video({'video': video}).to(torch.uint8) + _video = fetch_video({'video': video}) + if isinstance(_video, torch.Tensor): + _video = _video.to(torch.uint8) + inputs.videos[index] = _video if self.use_audio_in_video: import librosa if video.startswith('http://') or video.startswith('https://'):