-
Notifications
You must be signed in to change notification settings - Fork 1k
Description
Traceback (most recent call last):
[rank3]: Traceback (most recent call last):
[rank3]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/cli/_megatron/sft.py", line 5, in <module>
[rank3]: megatron_sft_main()
[rank3]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/train/sft.py", line 79, in megatron_sft_main
[rank3]: return MegatronSft(args).main()
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/llm/base.py", line 49, in main
[rank3]: result = self.run()
[rank3]: ^^^^^^^^^^
[rank3]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/train/sft.py", line 69, in run
[rank3]: self.trainer.train(train_dataset, val_dataset, data_collator)
[rank3]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/trainers/base.py", line 774, in train
[rank3]: pretrain(
[rank3]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/training.py", line 806, in pretrain
[rank3]: model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/trainers/base.py", line 263, in setup_model_and_optimizer
[rank3]: model, optimizer, opt_param_scheduler = self._origin_setup_model_and_optimizer(
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/training.py", line 1296, in setup_model_and_optimizer
[rank3]: args.iteration, args.num_floating_point_operations_so_far = load_checkpoint(
[rank3]: ^^^^^^^^^^^^^^^^
[rank3]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/checkpointing.py", line 1264, in load_checkpoint
[rank3]: state_dict, checkpoint_name, release, ckpt_type = _load_base_checkpoint(
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/trainers/base.py", line 184, in _load_base_checkpoint
[rank3]: return checkpointing.origin__load_base_checkpoint(*_args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/checkpointing.py", line 993, in _load_base_checkpoint
[rank3]: return _load_global_dist_base_checkpoint(
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/checkpointing.py", line 868, in _load_global_dist_base_checkpoint
[rank3]: state_dict = dist_checkpointing.load_common_state_dict(checkpoint_name)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/core/dist_checkpointing/serialization.py", line 178, in load_common_state_dict
[rank3]: return common_strategy.load_common(checkpoint_dir)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/core/dist_checkpointing/strategies/common.py", line 89, in load_common
[rank3]: return torch.load(load_path, map_location='cpu', weights_only=False)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/torch/serialization.py", line 1530, in load
[rank3]: return _load(
[rank3]: ^^^^^^
[rank3]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/torch/serialization.py", line 2119, in _load
[rank3]: result = unpickler.load()
[rank3]: ^^^^^^^^^^^^^^^^
[rank3]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/torch/serialization.py", line 2108, in find_class
[rank3]: return super().find_class(mod_name, name)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: ModuleNotFoundError: No module named 'swift.megatron.model.gpt_bridge'
[rank5]: Traceback (most recent call last):
[rank5]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/cli/_megatron/sft.py", line 5, in <module>
[rank5]: megatron_sft_main()
[rank5]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/train/sft.py", line 79, in megatron_sft_main
[rank5]: return MegatronSft(args).main()
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^
[rank5]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/llm/base.py", line 49, in main
[rank5]: result = self.run()
[rank5]: ^^^^^^^^^^
[rank5]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/train/sft.py", line 69, in run
[rank5]: self.trainer.train(train_dataset, val_dataset, data_collator)
[rank5]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/trainers/base.py", line 774, in train
[rank5]: pretrain(
[rank5]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/training.py", line 806, in pretrain
[rank5]: model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank5]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/trainers/base.py", line 263, in setup_model_and_optimizer
[rank5]: model, optimizer, opt_param_scheduler = self._origin_setup_model_and_optimizer(
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank5]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/training.py", line 1296, in setup_model_and_optimizer
[rank5]: args.iteration, args.num_floating_point_operations_so_far = load_checkpoint(
[rank5]: ^^^^^^^^^^^^^^^^
[rank5]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/checkpointing.py", line 1264, in load_checkpoint
[rank5]: state_dict, checkpoint_name, release, ckpt_type = _load_base_checkpoint(
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^
[rank5]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/trainers/base.py", line 184, in _load_base_checkpoint
[rank5]: return checkpointing.origin__load_base_checkpoint(*_args, **kwargs)
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank5]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/checkpointing.py", line 993, in _load_base_checkpoint
[rank5]: return _load_global_dist_base_checkpoint(
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank5]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/checkpointing.py", line 868, in _load_global_dist_base_checkpoint
[rank5]: state_dict = dist_checkpointing.load_common_state_dict(checkpoint_name)
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank5]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/core/dist_checkpointing/serialization.py", line 178, in load_common_state_dict
[rank5]: return common_strategy.load_common(checkpoint_dir)
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank5]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/core/dist_checkpointing/strategies/common.py", line 89, in load_common
[rank5]: return torch.load(load_path, map_location='cpu', weights_only=False)
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank5]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/torch/serialization.py", line 1530, in load
[rank5]: return _load(
[rank5]: ^^^^^^
[rank5]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/torch/serialization.py", line 2119, in _load
[rank5]: result = unpickler.load()
[rank5]: ^^^^^^^^^^^^^^^^
[rank5]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/torch/serialization.py", line 2108, in find_class
[rank5]: return super().find_class(mod_name, name)
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank5]: ModuleNotFoundError: No module named 'swift.megatron.model.gpt_bridge'
[rank4]: Traceback (most recent call last):
[rank4]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/cli/_megatron/sft.py", line 5, in <module>
[rank4]: megatron_sft_main()
[rank4]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/train/sft.py", line 79, in megatron_sft_main
[rank4]: return MegatronSft(args).main()
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/llm/base.py", line 49, in main
[rank4]: result = self.run()
[rank4]: ^^^^^^^^^^
[rank4]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/train/sft.py", line 69, in run
[rank4]: self.trainer.train(train_dataset, val_dataset, data_collator)
[rank4]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/trainers/base.py", line 774, in train
[rank4]: pretrain(
[rank4]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/training.py", line 806, in pretrain
[rank4]: model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/trainers/base.py", line 263, in setup_model_and_optimizer
[rank4]: model, optimizer, opt_param_scheduler = self._origin_setup_model_and_optimizer(
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/training.py", line 1296, in setup_model_and_optimizer
[rank4]: args.iteration, args.num_floating_point_operations_so_far = load_checkpoint(
[rank4]: ^^^^^^^^^^^^^^^^
[rank4]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/checkpointing.py", line 1264, in load_checkpoint
[rank4]: state_dict, checkpoint_name, release, ckpt_type = _load_base_checkpoint(
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^
[rank4]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/swift/megatron/trainers/base.py", line 184, in _load_base_checkpoint
[rank4]: return checkpointing.origin__load_base_checkpoint(*_args, **kwargs)
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/checkpointing.py", line 993, in _load_base_checkpoint
[rank4]: return _load_global_dist_base_checkpoint(
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/training/checkpointing.py", line 868, in _load_global_dist_base_checkpoint
[rank4]: state_dict = dist_checkpointing.load_common_state_dict(checkpoint_name)
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/core/dist_checkpointing/serialization.py", line 178, in load_common_state_dict
[rank4]: return common_strategy.load_common(checkpoint_dir)
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]: File "/home/haolu/.cache/modelscope/hub/_github/Megatron-LM/megatron/core/dist_checkpointing/strategies/common.py", line 89, in load_common
[rank4]: return torch.load(load_path, map_location='cpu', weights_only=False)
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/torch/serialization.py", line 1530, in load
[rank4]: return _load(
[rank4]: ^^^^^^
[rank4]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/torch/serialization.py", line 2119, in _load
[rank4]: result = unpickler.load()
[rank4]: ^^^^^^^^^^^^^^^^
[rank4]: File "/data/jcxy/haolu/anaconda3/envs/Moe/lib/python3.11/site-packages/torch/serialization.py", line 2108, in find_class
[rank4]: return super().find_class(mod_name, name)
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]: ModuleNotFoundError: No module named 'swift.megatron.model.gpt_bridge'