-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Description
如题:7B 模型多机训练没有问题;使用同样的脚本训练 72B 模型时,因为 vllm/lmdeploy 需要开启 tensor 并行,会出现超时。
shell 脚本
`
# Multi-node GRPO launch script (ms-swift).
# Required env (provided by the MLP platform launcher):
#   MLP_WORKER_NUM, MLP_ROLE_INDEX, MLP_WORKER_0_HOST, MLP_WORKER_0_PORT
export NNODES="$MLP_WORKER_NUM"
export NODE_RANK="$MLP_ROLE_INDEX"
export MASTER_ADDR="$MLP_WORKER_0_HOST"
export MASTER_PORT="$MLP_WORKER_0_PORT"
# Tensor-parallel degree for the vllm/lmdeploy rollout engine.
export TP_SIZE=4
echo "NODE_RANK=$NODE_RANK"
# Rank 0 reserves GPUs for the TP=4 inference engine, so it trains on only
# 4 processes; every other node trains on all 8 GPUs.
if [ "$NODE_RANK" = "0" ]; then
  export NPROC_PER_NODE=4
else
  export NPROC_PER_NODE=8
fi
echo "MASTER_ADDR=$MASTER_ADDR"
echo "NPROC_PER_NODE=$NPROC_PER_NODE"
# NOTE: the trailing backslashes are required — without them each "--flag"
# line is executed as a separate shell command and `swift rlhf` receives
# no arguments at all.
swift rlhf \
  --rlhf_type grpo \
  --model xxx \
  --model_type qwen2_5 \
  --reward_funcs accuracy format \
  --use_lmdeploy true \
  --train_type full \
  --torch_dtype bfloat16 \
  --dataset 'AI-MO/NuminaMath-TIR#5000' \
  --max_completion_length 8192 \
  --num_train_epochs 15 \
  --per_device_train_batch_size 1 \
  --per_device_eval_batch_size 1 \
  --learning_rate 1e-6 \
  --gradient_accumulation_steps 1 \
  --eval_steps 10 \
  --save_steps 10 \
  --save_total_limit 2 \
  --logging_steps 1 \
  --max_length 8192 \
  --output_dir output \
  --warmup_ratio 0.05 \
  --dataloader_num_workers 8 \
  --dataset_num_proc 8 \
  --num_generations 7 \
  --temperature 0.9 \
  --attn_impl flash_attn \
  --system 'xxx/code/ms-swift/examples/train/grpo/prompt.txt' \
  --deepspeed zero3_offload \
  --log_completions true \
  --report_to wandb \
  --async_generate true \
  --num_iterations 1 \
  --num_infer_workers 1
`
报错信息如下:
