[rank7]: Traceback (most recent call last):
[rank7]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/cli/sft.py", line 20, in <module>
[rank7]: sft_main()
[rank7]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/llm/train/sft.py", line 364, in sft_main
[rank7]: return SwiftSft(args).main()
[rank7]: ^^^^^^^^^^^^^^^^^^^^^
[rank7]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/llm/base.py", line 49, in main
[rank7]: result = self.run()
[rank7]: ^^^^^^^^^^
[rank7]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/ray/base.py", line 170, in wrapper
[rank7]: return func(self, *args, **kwargs)
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank7]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/llm/train/sft.py", line 209, in run
[rank7]: return self.train(trainer)
[rank7]: ^^^^^^^^^^^^^^^^^^^
[rank7]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/llm/train/sft.py", line 257, in train
[rank7]: trainer.train(trainer.args.resume_from_checkpoint)
[rank7]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/trainers/mixin.py", line 841, in train
[rank7]: res = super().train(*args, **kwargs)
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank7]: File "/cpfs/shared/L4-new/VLM/dxb/env/conda/envs/ms-swift-3.11.1/lib/python3.11/site-packages/transformers/trainer.py", line 2325, in train
[rank7]: return inner_training_loop(
[rank7]: ^^^^^^^^^^^^^^^^^^^^
[rank7]: File "/cpfs/shared/L4-new/VLM/dxb/env/conda/envs/ms-swift-3.11.1/lib/python3.11/site-packages/transformers/trainer.py", line 2674, in _inner_training_loop
[rank7]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank7]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/trainers/trainers.py", line 420, in training_step
[rank7]: return super().training_step(model, inputs, *args, **kwargs)
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank7]: File "/cpfs/shared/L4-new/VLM/dxb/env/conda/envs/ms-swift-3.11.1/lib/python3.11/site-packages/transformers/trainer.py", line 4020, in training_step
[rank7]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank7]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/trainers/trainers.py", line 376, in compute_loss
[rank7]: metrics[f'loss_{channel}'].update(outputs.loss[slice_][masks[slice_]])
[rank7]: ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^
[rank7]: IndexError: The shape of the mask [1] at index 0 does not match the shape of the indexed tensor [2903] at index 0
[rank3]: Traceback (most recent call last):
[rank3]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/cli/sft.py", line 20, in <module>
[rank3]: sft_main()
[rank3]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/llm/train/sft.py", line 364, in sft_main
[rank3]: return SwiftSft(args).main()
[rank3]: ^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/llm/base.py", line 49, in main
[rank3]: result = self.run()
[rank3]: ^^^^^^^^^^
[rank3]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/ray/base.py", line 170, in wrapper
[rank3]: return func(self, *args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/llm/train/sft.py", line 209, in run
[rank3]: return self.train(trainer)
[rank3]: ^^^^^^^^^^^^^^^^^^^
[rank3]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/llm/train/sft.py", line 257, in train
[rank3]: trainer.train(trainer.args.resume_from_checkpoint)
[rank3]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/trainers/mixin.py", line 841, in train
[rank3]: res = super().train(*args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/cpfs/shared/L4-new/VLM/dxb/env/conda/envs/ms-swift-3.11.1/lib/python3.11/site-packages/transformers/trainer.py", line 2325, in train
[rank3]: return inner_training_loop(
[rank3]: ^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/cpfs/shared/L4-new/VLM/dxb/env/conda/envs/ms-swift-3.11.1/lib/python3.11/site-packages/transformers/trainer.py", line 2674, in _inner_training_loop
[rank3]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/trainers/trainers.py", line 420, in training_step
[rank3]: return super().training_step(model, inputs, *args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/cpfs/shared/L4-new/VLM/dxb/env/conda/envs/ms-swift-3.11.1/lib/python3.11/site-packages/transformers/trainer.py", line 4020, in training_step
[rank3]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/cpfs/shared/L4-new/VLM/dxb/code/ms-swift-3.11.1/swift/trainers/trainers.py", line 376, in compute_loss
[rank3]: metrics[f'loss_{channel}'].update(outputs.loss[slice_][masks[slice_]])
[rank3]: ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^
[rank3]: IndexError: The shape of the mask [2383] at index 0 does not match the shape of the indexed tensor [1, 49140] at index 0
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NNODES=2 \
NODE_RANK=0 \
MASTER_ADDR=xx.xx.xx.xx \
MASTER_PORT=29501 \
NPROC_PER_NODE=8 \
swift sft \
--model Qwen3-VL-8B-Instruct \
--train_type full \
--dataset 'test.jsonl' \
--load_from_cache_file false \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 2 \
--learning_rate 1e-5 \
--gradient_accumulation_steps 1 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 1 \
--output_dir output \
--system 'You are a helpful assistant.' \
--warmup_ratio 0.05 \
--dataloader_num_workers 8 \
--attn_impl flash_attn \
--ddp_backend nccl \
--dataset_num_proc 8 \
--packing true \
--max_length 24576 \
--dataloader_pin_memory false \
--sequence_parallel_size 2 \
--save_only_model true \
--deepspeed zero3 \
--enable_channel_loss true \
两台机器共16张A100显卡
ubuntu22.04系统、cuda12.2、torch2.8.0
ms-swift3.11.1
Describe the bug
多机多卡 SFT 训练 Qwen3-VL-8B 时,同时开启 sequence_parallel_size 和 enable_channel_loss 会报错(IndexError:boolean mask 形状与被索引张量不匹配):
训练脚本如下:
Your hardware and system info
Write your system info like CUDA version/system/GPU/torch version here(在这里给出硬件信息和系统信息,如 CUDA 版本、系统、GPU 型号和 torch 版本等)
Additional context
Add any other context about the problem here(在这里补充其他信息)