Skip to content

Megatron-LM-v1.1.5-3D_parallelism ds_pretrain_gpt2_pipe.sh error #185

@zhisunyy

Description

@zhisunyy

An error occurred while running megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_pretrain_gpt2_pipe.sh. The console output follows (tracebacks from several ranks are interleaved):

finished creating GPT2 datasets ...
setting training data start iteration to 0
setting validation data start iteration to 0
done with setups ...
time (ms) | model and optimizer: 1716.01 | train/valid/test data iterators: 5358.74
training ...
[2022-07-06 11:31:24,416] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information
[2022-07-06 11:31:24,416] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False
[2022-07-06 11:31:24,416] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with None total layers
[2022-07-06 11:31:24,416] [INFO] [checkpointing.py:554:forward] ----Synchronization False
[2022-07-06 11:31:24,416] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False
Traceback (most recent call last):
Traceback (most recent call last):
File "pretrain_gpt2.py", line 157, in <module>
File "pretrain_gpt2.py", line 157, in <module>
Traceback (most recent call last):
File "pretrain_gpt2.py", line 157, in <module>
pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 98, in pretrain
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 98, in pretrain
iteration = train(forward_step_func,
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 481, in train
pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 98, in pretrain
Traceback (most recent call last):
iteration = train(forward_step_func,
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 481, in train
File "pretrain_gpt2.py", line 157, in <module>
iteration = train(forward_step_func,
loss_dict, skipped_iter = train_step(forward_step_func,
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 481, in train
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 325, in train_step
loss_dict, skipped_iter = train_step(forward_step_func,
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 325, in train_step
return train_step_pipe(model, data_iterator)
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 359, in train_step_pipe
return train_step_pipe(model, data_iterator)
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 359, in train_step_pipe
loss_dict, skipped_iter = train_step(forward_step_func,
loss = model.train_batch(data_iter=data_iterator)
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 325, in train_step
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 353, in train_batch
loss = model.train_batch(data_iter=data_iterator)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 353, in train_batch
pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 98, in pretrain
self._exec_schedule(sched)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1384, in _exec_schedule
return train_step_pipe(model, data_iterator)
self._exec_schedule(sched)
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 359, in train_step_pipe
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1384, in _exec_schedule
iteration = train(forward_step_func,
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 481, in train
loss_dict, skipped_iter = train_step(forward_step_func,
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 325, in train_step
loss = model.train_batch(data_iter=data_iterator)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 353, in train_batch
self._exec_instr(**cmd.kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1034, in _exec_send_grads
return train_step_pipe(model, data_iterator)
self._exec_instr(**cmd.kwargs)
File "/root/zsf/DeepSpeedExamples-master/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py", line 359, in train_step_pipe
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1034, in _exec_send_grads
self._exec_schedule(sched)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1384, in _exec_schedule
loss = model.train_batch(data_iter=data_iterator)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 353, in train_batch
p2p.send(inputs[1], self.prev_stage)
IndexError: tuple index out of range
p2p.send(inputs[1], self.prev_stage)
IndexError: tuple index out of range
self._exec_schedule(sched)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1384, in _exec_schedule
self._exec_instr(**cmd.kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1034, in _exec_send_grads
self._exec_instr(**cmd.kwargs)
File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1034, in _exec_send_grads
p2p.send(inputs[1], self.prev_stage)
IndexError: tuple index out of range
p2p.send(inputs[1], self.prev_stage)
IndexError: tuple index out of range

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions