Description
An error occurred while running megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_pretrain_gpt2_pipe.sh: training starts but then crashes with an IndexError raised inside DeepSpeed's pipeline engine (_exec_send_grads). The full log follows.
finished creating GPT2 datasets ...
setting training data start iteration to 0
setting validation data start iteration to 0
done with setups ...
time (ms) | model and optimizer: 1716.01 | train/valid/test data iterators: 5358.74
training ...
[2022-07-06 11:31:24,416] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
[2022-07-06 11:31:24,416] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
[2022-07-06 11:31:24,416] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with None total layers
[2022-07-06 11:31:24,416] [INFO] [checkpointing.py:554:forward] ----Synchronization False
[2022-07-06 11:31:24,416] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
The same traceback is raised on each of the pipeline ranks (the per-rank output was interleaved in the console; a single copy is shown here):

Traceback (most recent call last):
  File "pretrain_gpt2.py", line 157, in <module>
    pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
  File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 98, in pretrain
    iteration = train(forward_step_func,
  File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 481, in train
    loss_dict, skipped_iter = train_step(forward_step_func,
  File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 325, in train_step
    return train_step_pipe(model, data_iterator)
  File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 359, in train_step_pipe
    loss = model.train_batch(data_iter=data_iterator)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 353, in train_batch
    self._exec_schedule(sched)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1384, in _exec_schedule
    self._exec_instr(**cmd.kwargs)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1034, in _exec_send_grads
    p2p.send(inputs[1], self.prev_stage)
IndexError: tuple index out of range
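For context, the IndexError means that `inputs[1]` is being read from a tuple that holds fewer than two elements at the point where gradients are sent to the previous stage. A minimal sketch of that failure mode is below; the names (`grad`, `inputs`) are purely illustrative and not taken from DeepSpeed's internals.

```python
import torch

# Sketch of the failure, assuming the tuple handed to _exec_send_grads
# contains only a single tensor, so only index 0 is valid.
grad = torch.zeros(2, 2)
inputs = (grad,)          # 1-element tuple

try:
    _ = inputs[1]         # mirrors p2p.send(inputs[1], self.prev_stage)
except IndexError as err:
    print(err)            # prints: tuple index out of range
```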