
qwen2.5-vl 7B SFT on 8+ nodes: CUDA error: misaligned #3283

@wqf321

Description


Train: 0%| | 0/19314 [00:00<?, ?it/s]
/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
warnings.warn(
/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
warnings.warn(
/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
warnings.warn(
/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
warnings.warn(
/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
warnings.warn(
/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
warnings.warn(
Traceback (most recent call last):
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 72, in wrapper
return func(*args, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1992, in all_reduce
work = group.allreduce([tensor], opts)
RuntimeError: NCCL Error 1: unhandled cuda error (run with NCCL_DEBUG=INFO for details)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/root/paddlejob/workspace/env_run/qwen2.5vl/swift/cli/sft.py", line 5, in
sft_main()
File "/root/paddlejob/workspace/env_run/qwen2.5vl/swift/llm/train/sft.py", line 256, in sft_main
return SwiftSft(args).main()
File "/root/paddlejob/workspace/env_run/qwen2.5vl/swift/llm/base.py", line 46, in main
result = self.run()
File "/root/paddlejob/workspace/env_run/qwen2.5vl/swift/llm/train/sft.py", line 137, in run
return self.train(trainer)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/swift/llm/train/sft.py", line 196, in train
trainer.train(trainer.args.resume_from_checkpoint)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/swift/trainers/mixin.py", line 261, in train
res = super().train(*args, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2184, in train
return inner_training_loop(
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2490, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3640, in training_step
self.accelerator.backward(loss, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 2238, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/accelerate/utils/deepspeed.py", line 261, in backward
self.engine.backward(loss, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 2011, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2063, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/autograd/init.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 907, in reduce_partition_and_remove_grads
self.reduce_ready_partitions_and_remove_grads(param, i)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1423, in reduce_ready_partitions_and_remove_grads
self.reduce_independent_p_g_buckets_and_remove_grads(param, i)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 936, in reduce_independent_p_g_buckets_and_remove_grads
self.reduce_ipg_grads()
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1371, in reduce_ipg_grads
self.average_tensor(extra_large_grad_reduc.view(-1))
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1134, in average_tensor
self.allreduce_and_scatter(buckets[bucket_key],
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1027, in allreduce_and_scatter
self.allreduce_and_copy_with_multiple_ranks(small_bucket,
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1010, in allreduce_and_copy_with_multiple_ranks
allreduced = self.allreduce_bucket(small_bucket, log=log, divide=divide, process_group=process_group)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1507, in allreduce_bucket
dist.all_reduce(tensor_to_allreduce, group=process_group)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 117, in log_wrapper
return func(*args, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 496, in all_reduce
return cdb.all_reduce(tensor, op, group, async_op)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py", line 489, in _fn
return fn(*args, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 152, in all_reduce
return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 74, in wrapper
msg_dict = _get_msg_dict(func.__name__, *args, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 49, in _get_msg_dict
"args": f"{args}, {kwargs}",
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/_tensor.py", line 461, in repr
return torch._tensor_str._str(self, tensor_contents=tensor_contents)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/_tensor_str.py", line 677, in _str
return _str_intern(self, tensor_contents=tensor_contents)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/_tensor_str.py", line 597, in _str_intern
tensor_str = _tensor_str(self, indent)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/_tensor_str.py", line 331, in _tensor_str
self = self.float()
RuntimeError: CUDA error: misaligned address
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.

terminate called after throwing an instance of 'c10::Error'
what(): CUDA error: misaligned address
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.

Exception raised from c10_cuda_check_implementation at ../c10/cuda/CUDAException.cpp:44 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f430d478d87 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x7f430d42975f in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #2: c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool) + 0x118 (0x7f430d5498a8 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so)
frame #3: + 0x1d40e (0x7f430d51440e in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so)
frame #4: + 0x1f744 (0x7f430d516744 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so)
frame #5: + 0x1fb6d (0x7f430d516b6d in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so)
frame #6: + 0x540210 (0x7f4357546210 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #7: + 0x649bf (0x7f430d45d9bf in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #8: c10::TensorImpl::~TensorImpl() + 0x21b (0x7f430d456c8b in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #9: c10::TensorImpl::~TensorImpl() + 0x9 (0x7f430d456e39 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #10: + 0x802b98 (0x7f4357808b98 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #11: THPVariable_subclass_dealloc(_object*) + 0x2f6 (0x7f4357808f16 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #12: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x53e368]
frame #13: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58dea0]
frame #14: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #15: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #16: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #17: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #18: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #19: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #20: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #21: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #22: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #23: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #24: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #25: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #26: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #27: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #28: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #29: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #30: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #31: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #32: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #33: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #34: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #35: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #36: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #37: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #38: PyDict_SetItem + 0x377 (0x6032b7 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python)
frame #39: PyDict_SetItemString + 0x41 (0x5feec1 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python)
frame #40: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x6c131a]
frame #41: Py_FinalizeEx + 0x149 (0x6c17e9 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python)
frame #42: Py_RunMain + 0xf0 (0x704f60 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python)
frame #43: Py_BytesMain + 0x2d (0x7053bd in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python)
frame #44: __libc_start_main + 0xf3 (0x7f435992d083 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #45: _start + 0x2e (0x630e2e in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python)

[2025-02-24 20:48:57,821] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8260 closing signal SIGTERM
[2025-02-24 20:48:57,823] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8261 closing signal SIGTERM
[2025-02-24 20:48:57,823] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8262 closing signal SIGTERM
[2025-02-24 20:48:57,824] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8263 closing signal SIGTERM
[2025-02-24 20:48:57,825] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8264 closing signal SIGTERM
[2025-02-24 20:48:57,826] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8265 closing signal SIGTERM
[2025-02-24 20:48:57,826] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8267 closing signal SIGTERM
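
For context on the repeated UserWarning at the top of the log: torch.utils.checkpoint emits "None of the inputs have requires_grad=True" when re-entrant gradient checkpointing wraps a module whose inputs all have requires_grad=False (for example a frozen vision tower during SFT). A minimal sketch that reproduces the same warning, with illustrative names only (nothing here is taken from the failing run):

```python
# Minimal sketch of the "None of the inputs have requires_grad=True" warning.
# Assumption: it comes from re-entrant checkpointing over inputs that do not
# require gradients; the layer and tensor below are purely illustrative.
import torch
from torch.utils.checkpoint import checkpoint

layer = torch.nn.Linear(8, 8)
frozen_input = torch.randn(2, 8)  # requires_grad=False

# The re-entrant path warns because no input can receive a gradient.
out = checkpoint(layer, frozen_input, use_reentrant=True)

# Making the inputs require grad (transformers exposes
# model.enable_input_require_grads() for this), or using the
# non-re-entrant path, avoids the warning.
out = checkpoint(layer, frozen_input.requires_grad_(True), use_reentrant=False)
```

The warning is benign on its own; it is noted here only because it precedes the crash in the log.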
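On the failure itself: the primary error is NCCL Error 1 / "CUDA error: misaligned address", raised from dist.all_reduce inside DeepSpeed ZeRO stage-1/2 gradient bucketing (average_tensor -> allreduce_bucket). The second traceback is a side effect: torch's c10d logger tries to format the tensor argument after the CUDA context is already corrupted, which re-raises the same misaligned-address error. As the message itself suggests, a first diagnostic step is to rerun with NCCL_DEBUG=INFO. A hedged sketch of the environment one might set before launching; these are standard NCCL/PyTorch debug knobs, not a confirmed fix for this issue:

```python
# Hedged debugging sketch, not a confirmed fix: turn on the diagnostics the
# error message asks for, before torch.distributed / DeepSpeed initialize.
import os

os.environ.setdefault("NCCL_DEBUG", "INFO")         # per "run with NCCL_DEBUG=INFO for details"
os.environ.setdefault("NCCL_DEBUG_SUBSYS", "ALL")   # assumption: broader NCCL subsystem logging helps localize the failing op
os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")  # assumption: report the failing kernel synchronously
```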
