Description
Train: 0%| | 0/19314 [00:00<?, ?it/s]/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
warnings.warn(
/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
warnings.warn(
/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
warnings.warn(
/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
warnings.warn(
/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
warnings.warn(
/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
warnings.warn(
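The repeated torch.utils.checkpoint warning above usually just means gradient checkpointing is being applied to modules whose inputs do not require gradients (typical when only adapter/LoRA weights are trainable), so it may be unrelated to the crash itself. Below is a minimal sketch, assuming a plain transformers setup, of how that warning is normally avoided; the model name is illustrative and enable_input_require_grads / use_reentrant=False are generic HF/PyTorch options, not something taken from this swift run.

```python
# Minimal sketch (assumptions: plain transformers model, only adapter weights trainable).
# The warning goes away once the checkpointed segment receives an input that requires
# grad, or when non-reentrant checkpointing is used.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # illustrative model, not the VL model used in this run
model.enable_input_require_grads()  # let embedding outputs carry requires_grad=True into checkpointed blocks
model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs={"use_reentrant": False}  # non-reentrant checkpointing does not emit this warning
)
```

Shortly after these warnings, the run crashes with the traceback below: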
Traceback (most recent call last):
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 72, in wrapper
return func(*args, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1992, in all_reduce
work = group.allreduce([tensor], opts)
RuntimeError: NCCL Error 1: unhandled cuda error (run with NCCL_DEBUG=INFO for details)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/root/paddlejob/workspace/env_run/qwen2.5vl/swift/cli/sft.py", line 5, in
sft_main()
File "/root/paddlejob/workspace/env_run/qwen2.5vl/swift/llm/train/sft.py", line 256, in sft_main
return SwiftSft(args).main()
File "/root/paddlejob/workspace/env_run/qwen2.5vl/swift/llm/base.py", line 46, in main
result = self.run()
File "/root/paddlejob/workspace/env_run/qwen2.5vl/swift/llm/train/sft.py", line 137, in run
return self.train(trainer)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/swift/llm/train/sft.py", line 196, in train
trainer.train(trainer.args.resume_from_checkpoint)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/swift/trainers/mixin.py", line 261, in train
res = super().train(*args, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2184, in train
return inner_training_loop(
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2490, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3640, in training_step
self.accelerator.backward(loss, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 2238, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/accelerate/utils/deepspeed.py", line 261, in backward
self.engine.backward(loss, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 2011, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2063, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/autograd/init.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 907, in reduce_partition_and_remove_grads
self.reduce_ready_partitions_and_remove_grads(param, i)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1423, in reduce_ready_partitions_and_remove_grads
self.reduce_independent_p_g_buckets_and_remove_grads(param, i)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 936, in reduce_independent_p_g_buckets_and_remove_grads
self.reduce_ipg_grads()
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1371, in reduce_ipg_grads
self.average_tensor(extra_large_grad_reduc.view(-1))
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1134, in average_tensor
self.allreduce_and_scatter(buckets[bucket_key],
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1027, in allreduce_and_scatter
self.allreduce_and_copy_with_multiple_ranks(small_bucket,
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1010, in allreduce_and_copy_with_multiple_ranks
allreduced = self.allreduce_bucket(small_bucket, log=log, divide=divide, process_group=process_group)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1507, in allreduce_bucket
dist.all_reduce(tensor_to_allreduce, group=process_group)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 117, in log_wrapper
return func(*args, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 496, in all_reduce
return cdb.all_reduce(tensor, op, group, async_op)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py", line 489, in _fn
return fn(*args, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 152, in all_reduce
return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 74, in wrapper
msg_dict = _get_msg_dict(func.__name__, *args, **kwargs)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 49, in _get_msg_dict
"args": f"{args}, {kwargs}",
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/_tensor.py", line 461, in repr
return torch._tensor_str._str(self, tensor_contents=tensor_contents)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/_tensor_str.py", line 677, in _str
return _str_intern(self, tensor_contents=tensor_contents)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/_tensor_str.py", line 597, in _str_intern
tensor_str = _tensor_str(self, indent)
File "/root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/_tensor_str.py", line 331, in _tensor_str
self = self.float()
RuntimeError: CUDA error: misaligned address
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
terminate called after throwing an instance of 'c10::Error'
what(): CUDA error: misaligned address
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
Exception raised from c10_cuda_check_implementation at ../c10/cuda/CUDAException.cpp:44 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f430d478d87 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x7f430d42975f in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #2: c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool) + 0x118 (0x7f430d5498a8 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so)
frame #3: <unknown function> + 0x1d40e (0x7f430d51440e in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so)
frame #4: <unknown function> + 0x1f744 (0x7f430d516744 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so)
frame #5: <unknown function> + 0x1fb6d (0x7f430d516b6d in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10_cuda.so)
frame #6: <unknown function> + 0x540210 (0x7f4357546210 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #7: <unknown function> + 0x649bf (0x7f430d45d9bf in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #8: c10::TensorImpl::~TensorImpl() + 0x21b (0x7f430d456c8b in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #9: c10::TensorImpl::~TensorImpl() + 0x9 (0x7f430d456e39 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #10: <unknown function> + 0x802b98 (0x7f4357808b98 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #11: THPVariable_subclass_dealloc(_object*) + 0x2f6 (0x7f4357808f16 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #12: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x53e368]
frame #13: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58dea0]
frame #14: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #15: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #16: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #17: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #18: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #19: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #20: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #21: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #22: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #23: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #24: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #25: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #26: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #27: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #28: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #29: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #30: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #31: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #32: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #33: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #34: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #35: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #36: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #37: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x58df77]
frame #38: PyDict_SetItem + 0x377 (0x6032b7 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python)
frame #39: PyDict_SetItemString + 0x41 (0x5feec1 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python)
frame #40: /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python() [0x6c131a]
frame #41: Py_FinalizeEx + 0x149 (0x6c17e9 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python)
frame #42: Py_RunMain + 0xf0 (0x704f60 in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python)
frame #43: Py_BytesMain + 0x2d (0x7053bd in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python)
frame #44: __libc_start_main + 0xf3 (0x7f435992d083 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #45: _start + 0x2e (0x630e2e in /root/paddlejob/workspace/env_run/qwen2.5vl/qwen2.5vl_env/bin/python)
[2025-02-24 20:48:57,821] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8260 closing signal SIGTERM
[2025-02-24 20:48:57,823] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8261 closing signal SIGTERM
[2025-02-24 20:48:57,823] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8262 closing signal SIGTERM
[2025-02-24 20:48:57,824] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8263 closing signal SIGTERM
[2025-02-24 20:48:57,825] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8264 closing signal SIGTERM
[2025-02-24 20:48:57,826] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8265 closing signal SIGTERM
[2025-02-24 20:48:57,826] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 8267 closing signal SIGTERM
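Reading the trace: the first failure is the NCCL all_reduce issued by DeepSpeed ZeRO reporting "unhandled cuda error", and the later "misaligned address" is raised while the c10d logger tries to stringify the tensor for its error message, so the original fault is most likely an earlier, asynchronously reported CUDA error. A diagnostic sketch for re-running with the flags the error messages themselves suggest is below; the entry-point import is inferred from the traceback paths and is an assumption, and the same variables can equally be exported in the shell before the swift sft launch.

```python
# Diagnostic sketch (assumptions noted): set these before torch/NCCL initialize so the
# faulting kernel is reported where it happens rather than at the later all_reduce.
import os

os.environ["NCCL_DEBUG"] = "INFO"         # verbose NCCL logging, as suggested by "NCCL Error 1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # synchronous kernel launches for precise stack traces

# Hypothetical wrapper around the entry point seen in the traceback
# (swift/cli/sft.py -> sft_main); the exact import path is an assumption.
from swift.llm import sft_main

if __name__ == "__main__":
    sft_main()
```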