From 39a189d8371d82c5e5b6edce4344066a8e688c62 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sat, 7 Oct 2023 12:02:12 +0800 Subject: [PATCH 1/7] update readme and sh --- examples/pytorch/llm/README.md | 1 + examples/pytorch/llm/README_CN.md | 5 ++- .../baichuan2_7b_chat/lora_ddp_ds/infer.sh | 17 ++++++++ .../baichuan2_7b_chat/lora_ddp_ds/sft.sh | 40 +++++++++++++++++++ 4 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh create mode 100644 examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index 50f6069367..a44c25dee4 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -48,6 +48,7 @@ ## News +- 2023.10.7: Supported DeepSpeed ZeRO-2, enabling LoRA (not just QLoRA) to run DDP on 2*A10. The corresponding shell script can be found at `scripts/qwen_7b_chat/lora_ddp_ds/sft.sh`, `scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh`. - 2023.10.4: Supported datasets in the fields of mathematics, law, SQL, and coding: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. - 2023.9.26: Supported xverse model series: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. - 2023.9.25: Supported qwen-14b model series: qwen-14b, qwen-14b-chat. diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index f8ff30a2b0..caa4d0f6f8 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -48,11 +48,12 @@ ## 新闻 +- 2023.10.7: 支持DeepSpeed ZeRO-2, 使得lora(不仅仅是qlora)可以在双卡A10上运行DDP. 对应的sh脚本可以查看`scripts/qwen_7b_chat/lora_ddp_ds/sft.sh`, `scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh`. - 2023.10.4: 支持更多数学, 法律, SQL, 代码领域的数据集: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. - 2023.9.26: 支持xverse系列模型: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. -- 2023.9.25: 支持**qwen-14b**系列模型: qwen-14b, qwen-14b-chat +- 2023.9.25: 支持**qwen-14b**系列模型: qwen-14b, qwen-14b-chat. - 2023.9.20: 支持在LoRA, QLoRA的方式训练后, 将其增量权重merge到基模型权重中, 并保存完整的模型权重, 方便用户的部署. -- 2023.9.18: 支持internlm-20b系列模型: internlm-20b, internlm-20b-chat +- 2023.9.18: 支持internlm-20b系列模型: internlm-20b, internlm-20b-chat. - 2023.9.12: 支持MP+DDP的方式训练, 加快全参数微调的速度, 对应的sh脚本可以查看`scripts/qwen_7b_chat/full_mp_ddp/sft.sh`. - 2023.9.5: 支持训练只保存模型权重, 而不保存断点续训所需的优化器权重等中间状态, 避免全参数微调保存checkpoint所需时间过长和空间过大的问题. - 2023.9.5: 支持openbuddy-llama2-70b模型. diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh new file mode 100644 index 0000000000..b16ce5a171 --- /dev/null +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh @@ -0,0 +1,17 @@ +PYTHONPATH=../../.. 
\ +CUDA_VISIBLE_DEVICES=0 \ +python src/llm_infer.py \ + --model_type baichuan2-7b-chat \ + --sft_type lora \ + --template_type baichuan \ + --dtype bf16 \ + --ckpt_dir "output/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \ + --eval_human false \ + --dataset damo-agent-mini-zh \ + --max_length 4096 \ + --max_new_tokens 2048 \ + --temperature 0.9 \ + --top_k 20 \ + --top_p 0.9 \ + --do_sample true \ + --merge_lora_and_save false \ diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh new file mode 100644 index 0000000000..2f387b1ea7 --- /dev/null +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh @@ -0,0 +1,40 @@ +# Experimental environment: 2 * A10 +# 2 * 21GB GPU memory +nproc_per_node=2 + +PYTHONPATH=../../.. \ +CUDA_VISIBLE_DEVICES=0,1 \ +torchrun \ + --nproc_per_node=$nproc_per_node \ + --master_port 29500 \ + src/llm_sft.py \ + --model_type baichuan2-7b-chat \ + --sft_type lora \ + --template_type baichuan \ + --dtype bf16 \ + --output_dir output \ + --ddp_backend nccl \ + --dataset damo-agent-mini-zh \ + --train_dataset_sample -1 \ + --num_train_epochs 1 \ + --max_length 4096 \ + --lora_rank 8 \ + --lora_alpha 32 \ + --lora_dropout_p 0. \ + --lora_target_modules ALL \ + --gradient_checkpointing true \ + --batch_size 1 \ + --weight_decay 0. \ + --learning_rate 1e-4 \ + --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ + --max_grad_norm 0.5 \ + --warmup_ratio 0.03 \ + --eval_steps 100 \ + --save_steps 100 \ + --save_total_limit 2 \ + --logging_steps 10 \ + --push_to_hub false \ + --hub_model_id baichuan2-7b-chat-lora \ + --hub_private_repo true \ + --hub_token 'your-sdk-token' \ + --deepspeed_config_path ds_config/zero2.json From 095768b426bb6f646f53c2efc2cf93aa81933d58 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sat, 7 Oct 2023 16:59:31 +0800 Subject: [PATCH 2/7] update dataset.py --- examples/pytorch/llm/src/utils/dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 96a20168dd..36a9f65784 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -438,7 +438,8 @@ def get_medical_dataset( ]) val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset() if train_dataset_sample >= 0: - idxs = np.random.permutation(train_dataset_sample) + random_state = np.random.RandomState(42) + idxs = random_state.permutation(train_dataset_sample) train_dataset = train_dataset.select(idxs) return tuple( _preprocess_medical(dataset, subset_name) From 8231b64a20009de3d9ecb7364568289b5a0d8052 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 8 Oct 2023 14:32:33 +0800 Subject: [PATCH 3/7] update sh --- examples/pytorch/llm/requirements.txt | 4 ++ .../baichuan2_7b_chat/lora_ddp_ds/sft.sh | 2 +- .../llm/scripts/chatglm2_6b/lora_ddp/infer.sh | 7 +-- .../llm/scripts/chatglm2_6b/lora_ddp/sft.sh | 6 +-- .../scripts/chatglm2_6b/lora_ddp_ds/infer.sh | 17 ++++++++ .../scripts/chatglm2_6b/lora_ddp_ds/sft.sh | 40 +++++++++++++++++ .../{lora_ddp => lora_ddp_ds}/infer.sh | 2 +- .../qwen_7b/{lora_ddp => lora_ddp_ds}/sft.sh | 7 ++- .../scripts/qwen_7b_chat/lora_ddp_ds/sft.sh | 4 +- .../scripts/qwen_7b_chat/lora_mp_ddp/sft.sh | 2 +- .../qwen_7b_chat/qlora_ddp_ds/infer.sh | 21 +++++++++ .../scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh | 43 +++++++++++++++++++ 12 files changed, 142 insertions(+), 13 deletions(-) create mode 100644 
examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh create mode 100644 examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/sft.sh rename examples/pytorch/llm/scripts/qwen_7b/{lora_ddp => lora_ddp_ds}/infer.sh (95%) rename examples/pytorch/llm/scripts/qwen_7b/{lora_ddp => lora_ddp_ds}/sft.sh (83%) create mode 100644 examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh create mode 100644 examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh diff --git a/examples/pytorch/llm/requirements.txt b/examples/pytorch/llm/requirements.txt index f3b70a775a..6147e7a146 100644 --- a/examples/pytorch/llm/requirements.txt +++ b/examples/pytorch/llm/requirements.txt @@ -1,6 +1,10 @@ +accelerate charset_normalizer cpm_kernels +matplotlib modelscope>=1.9 sentencepiece +tensorboard tiktoken +transformers transformers_stream_generator diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh index 2f387b1ea7..d3b3a00057 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh @@ -37,4 +37,4 @@ torchrun \ --hub_model_id baichuan2-7b-chat-lora \ --hub_private_repo true \ --hub_token 'your-sdk-token' \ - --deepspeed_config_path ds_config/zero2.json + --deepspeed_config_path 'ds_config/zero2.json' \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh index acea15fd0b..a143f9f1e5 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh @@ -7,10 +7,11 @@ python src/llm_infer.py \ --dtype bf16 \ --ckpt_dir "output/chatglm2-6b/vx_xxx/checkpoint-xxx" \ --eval_human false \ - --dataset code-python-zh \ - --max_length 8192 \ - --max_new_tokens 1024 \ + --dataset damo-agent-mini-zh \ + --max_length 4096 \ + --max_new_tokens 2048 \ --temperature 0.9 \ --top_k 20 \ --top_p 0.9 \ --do_sample true \ + --merge_lora_and_save false \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh index 7634cc9fc6..68098ec2ce 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 2 * A100 -# 2 * 50GB GPU memory +# 2 * 35GB GPU memory nproc_per_node=2 PYTHONPATH=../../.. \ @@ -14,10 +14,10 @@ torchrun \ --dtype bf16 \ --output_dir output \ --ddp_backend nccl \ - --dataset code-python-zh \ + --dataset damo-agent-mini-zh \ --train_dataset_sample -1 \ --num_train_epochs 1 \ - --max_length 8192 \ + --max_length 4096 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh new file mode 100644 index 0000000000..a143f9f1e5 --- /dev/null +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh @@ -0,0 +1,17 @@ +PYTHONPATH=../../.. 
\ +CUDA_VISIBLE_DEVICES=0 \ +python src/llm_infer.py \ + --model_type chatglm2-6b \ + --sft_type lora \ + --template_type chatglm2 \ + --dtype bf16 \ + --ckpt_dir "output/chatglm2-6b/vx_xxx/checkpoint-xxx" \ + --eval_human false \ + --dataset damo-agent-mini-zh \ + --max_length 4096 \ + --max_new_tokens 2048 \ + --temperature 0.9 \ + --top_k 20 \ + --top_p 0.9 \ + --do_sample true \ + --merge_lora_and_save false \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/sft.sh new file mode 100644 index 0000000000..f9d57c80c9 --- /dev/null +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/sft.sh @@ -0,0 +1,40 @@ +# Experimental environment: 2 * A10 +# 2 * 18GB GPU memory +nproc_per_node=2 + +PYTHONPATH=../../.. \ +CUDA_VISIBLE_DEVICES=0,1 \ +torchrun \ + --nproc_per_node=$nproc_per_node \ + --master_port 29500 \ + src/llm_sft.py \ + --model_type chatglm2-6b \ + --sft_type lora \ + --template_type chatglm2 \ + --dtype bf16 \ + --output_dir output \ + --ddp_backend nccl \ + --dataset damo-agent-mini-zh \ + --train_dataset_sample -1 \ + --num_train_epochs 1 \ + --max_length 4096 \ + --lora_rank 8 \ + --lora_alpha 32 \ + --lora_dropout_p 0. \ + --lora_target_modules ALL \ + --gradient_checkpointing true \ + --batch_size 1 \ + --weight_decay 0. \ + --learning_rate 1e-4 \ + --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ + --max_grad_norm 0.5 \ + --warmup_ratio 0.03 \ + --eval_steps 100 \ + --save_steps 100 \ + --save_total_limit 2 \ + --logging_steps 10 \ + --push_to_hub false \ + --hub_model_id chatglm2-6b-lora \ + --hub_private_repo true \ + --hub_token 'your-sdk-token' \ + --deepspeed_config_path 'ds_config/zero2.json' \ diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh similarity index 95% rename from examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh rename to examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh index 5379277715..09ff2633ce 100644 --- a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh @@ -10,7 +10,7 @@ python src/llm_infer.py \ --eval_human false \ --dataset dureader-robust-zh \ --max_length 2048 \ - --use_flash_attn true \ + --use_flash_attn false \ --max_new_tokens 1024 \ --temperature 0.9 \ --top_k 20 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/sft.sh similarity index 83% rename from examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh rename to examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/sft.sh index 42742df9d2..ded2933737 100644 --- a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/sft.sh @@ -1,3 +1,5 @@ +# Experimental environment: 2 * A10 +# 2 * 19GB GPU memory (not use flash_attn) nproc_per_node=2 PYTHONPATH=../../.. \ @@ -20,7 +22,7 @@ torchrun \ --lora_alpha 32 \ --lora_dropout_p 0. \ --lora_target_modules c_attn c_proj \ - --gradient_checkpointing false \ + --gradient_checkpointing true \ --batch_size 1 \ --weight_decay 0. 
\ --learning_rate 1e-4 \ @@ -31,8 +33,9 @@ torchrun \ --save_steps 100 \ --save_total_limit 2 \ --logging_steps 10 \ - --use_flash_attn true \ + --use_flash_attn false \ --push_to_hub false \ --hub_model_id qwen-7b-lora \ --hub_private_repo true \ --hub_token 'your-sdk-token' \ + --deepspeed_config_path 'ds_config/zero2.json' \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/sft.sh index cae5af46d5..22a3dd659d 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 2 * A10 -# 2 * 18GB GPU memory +# 2 * 18GB GPU memory (not use flash_attn) nproc_per_node=2 PYTHONPATH=../../.. \ @@ -38,4 +38,4 @@ torchrun \ --hub_model_id qwen-7b-chat-lora \ --hub_private_repo true \ --hub_token 'your-sdk-token' \ - --deepspeed_config_path ds_config/zero2.json \ + --deepspeed_config_path 'ds_config/zero2.json' \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh index a7c213691c..b96ff3564b 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 4 * V100 -# 4 * 15GB GPU memory +# 4 * 15GB GPU memory (not use flash_attn) nproc_per_node=2 PYTHONPATH=../../.. \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh new file mode 100644 index 0000000000..7d385d8661 --- /dev/null +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh @@ -0,0 +1,21 @@ +# If you want to merge LoRA weight and save it, you need to set `--merge_lora_and_save true`. +PYTHONPATH=../../.. \ +CUDA_VISIBLE_DEVICES=0 \ +python src/llm_infer.py \ + --model_type qwen-7b-chat \ + --sft_type lora \ + --template_type chatml \ + --dtype bf16 \ + --ckpt_dir "output/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ + --eval_human false \ + --dataset damo-agent-mini-zh \ + --max_length 4096 \ + --quantization_bit 4 \ + --bnb_4bit_comp_dtype bf16 \ + --use_flash_attn false \ + --max_new_tokens 2048 \ + --temperature 0.9 \ + --top_k 20 \ + --top_p 0.9 \ + --do_sample true \ + --merge_lora_and_save false \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh new file mode 100644 index 0000000000..6770417713 --- /dev/null +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh @@ -0,0 +1,43 @@ +# Experimental environment: 2 * 3090 +# 2 * 16GB GPU memory +nproc_per_node=2 + +PYTHONPATH=../../.. \ +CUDA_VISIBLE_DEVICES=0,1 \ +torchrun \ + --nproc_per_node=$nproc_per_node \ + --master_port 29500 \ + src/llm_sft.py \ + --model_type qwen-7b-chat \ + --sft_type lora \ + --template_type chatml \ + --dtype bf16 \ + --output_dir output \ + --ddp_backend nccl \ + --dataset damo-agent-mini-zh \ + --train_dataset_sample 20000 \ + --num_train_epochs 1 \ + --max_length 4096 \ + --quantization_bit 4 \ + --bnb_4bit_comp_dtype bf16 \ + --lora_rank 8 \ + --lora_alpha 32 \ + --lora_dropout_p 0. \ + --lora_target_modules ALL \ + --gradient_checkpointing true \ + --batch_size 1 \ + --weight_decay 0. 
\ + --learning_rate 1e-4 \ + --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ + --max_grad_norm 0.5 \ + --warmup_ratio 0.03 \ + --eval_steps 100 \ + --save_steps 100 \ + --save_total_limit 2 \ + --logging_steps 10 \ + --use_flash_attn false \ + --push_to_hub false \ + --hub_model_id qwen-7b-chat-qlora \ + --hub_private_repo true \ + --hub_token 'your-sdk-token' \ + --deepspeed_config_path 'ds_config/zero2.json' \ From 0e9997b396093689b0dac2ac107143df94b4b52f Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 8 Oct 2023 16:01:42 +0800 Subject: [PATCH 4/7] update readme --- examples/pytorch/llm/README.md | 4 ++-- examples/pytorch/llm/README_CN.md | 4 ++-- examples/pytorch/llm/src/utils/model.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index a44c25dee4..e6575dbb50 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -52,10 +52,10 @@ - 2023.10.4: Supported datasets in the fields of mathematics, law, SQL, and coding: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. - 2023.9.26: Supported xverse model series: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. - 2023.9.25: Supported qwen-14b model series: qwen-14b, qwen-14b-chat. -- 2023.9.20: Supported incremental weight merging from LoRA and QLoRA training methods into base model weights, and saved the complete model weights for easy deployment by users. +- 2023.9.20: Supported incremental weight merging from LoRA and QLoRA training methods into base model weights, and saved the complete model weights for easy deployment by users. You can check the command-line parameter `--merge_lora_and_save` in the `infer.sh` script. - 2023.9.18: Supported internlm-20b model series: internlm-20b, internlm-20b-chat. - 2023.9.12: Supported training with MP+DDP to accelerate full-parameter fine-tuning speed. The corresponding shell script can be found at `scripts/qwen_7b_chat/full_mp_ddp/sft.sh`. -- 2023.9.5: Supported training that only saves model weights without saving intermediate states such as optimizer weights required for checkpoint resumption, avoiding long checkpoint-saving times and large storage space in full-parameter fine-tuning. +- 2023.9.5: Supported training that only saves model weights without saving intermediate states such as optimizer weights required for checkpoint resumption, avoiding long checkpoint-saving times and large storage space in full-parameter fine-tuning. You can check the command-line parameter `--only_save_model` in the `sft.sh` script. - 2023.9.5: Supported openbuddy-llama2-70b model. - 2023.9.3: Supported baichuan-13b model series: baichuan-13b, baichuan-13b-chat. diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index caa4d0f6f8..d3da774872 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -52,10 +52,10 @@ - 2023.10.4: 支持更多数学, 法律, SQL, 代码领域的数据集: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. - 2023.9.26: 支持xverse系列模型: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. - 2023.9.25: 支持**qwen-14b**系列模型: qwen-14b, qwen-14b-chat. -- 2023.9.20: 支持在LoRA, QLoRA的方式训练后, 将其增量权重merge到基模型权重中, 并保存完整的模型权重, 方便用户的部署. +- 2023.9.20: 支持在LoRA, QLoRA的方式训练后, 将其增量权重merge到基模型权重中, 并保存完整的模型权重, 方便用户的部署. 可以查看`infer.sh`中的命令行参数: `--merge_lora_and_save`. 
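A note on the `--merge_lora_and_save` flag documented in the README changes above: merging folds the trained low-rank deltas into the base weights, so deployment needs neither the adapter files nor `peft`/`swift` at load time. The sketch below shows the general idea with plain `peft`; the model id and checkpoint paths are placeholders, and swift's `--merge_lora_and_save true` option does the equivalent step inside its own inference script, so treat this as an illustration rather than the project's code.

```python
# Illustration only: what "merge LoRA weights and save" amounts to.
# Model id and paths are placeholders; swift's --merge_lora_and_save
# flag performs the equivalent merge internally.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    'baichuan-inc/Baichuan2-7B-Chat',   # placeholder base model
    torch_dtype=torch.bfloat16,
    trust_remote_code=True)
model = PeftModel.from_pretrained(base, 'output/baichuan2-7b-chat/vx_xxx/checkpoint-xxx')

merged = model.merge_and_unload()       # fold the low-rank deltas into the base weights
merged.save_pretrained('output/baichuan2-7b-chat/vx_xxx/checkpoint-xxx-merged')
tokenizer = AutoTokenizer.from_pretrained('baichuan-inc/Baichuan2-7B-Chat',
                                          trust_remote_code=True)
tokenizer.save_pretrained('output/baichuan2-7b-chat/vx_xxx/checkpoint-xxx-merged')
```

The merged directory can then be loaded like any ordinary full checkpoint, which is what the "easy deployment" wording in the news entry refers to.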
- 2023.9.18: 支持internlm-20b系列模型: internlm-20b, internlm-20b-chat. - 2023.9.12: 支持MP+DDP的方式训练, 加快全参数微调的速度, 对应的sh脚本可以查看`scripts/qwen_7b_chat/full_mp_ddp/sft.sh`. -- 2023.9.5: 支持训练只保存模型权重, 而不保存断点续训所需的优化器权重等中间状态, 避免全参数微调保存checkpoint所需时间过长和空间过大的问题. +- 2023.9.5: 支持训练只保存模型权重, 而不保存断点续训所需的优化器权重等中间状态, 避免全参数微调保存checkpoint所需时间过长和空间过大的问题. 可以查看`sft.sh`中的命令行参数: `--only_save_model`. - 2023.9.5: 支持openbuddy-llama2-70b模型. - 2023.9.3: 支持baichuan-13b系列模型: baichuan-13b, baichuan-13b-chat. diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index 5123aac1fa..be49a5b233 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -172,13 +172,13 @@ def get_model_tokenizer_qwen(model_dir: str, **kwargs): model_config = AutoConfig.from_pretrained( model_dir, trust_remote_code=True) - mapper = { + dtype_mapping = { torch.float16: 'fp16', torch.bfloat16: 'bf16', torch.float32: 'fp32' } - k_true = mapper[torch_dtype] - for k in mapper.values(): + k_true = dtype_mapping[torch_dtype] + for k in dtype_mapping.values(): v = False if k == k_true: v = True @@ -370,7 +370,7 @@ class ResTunerTM(NamedTuple): # chatglm2 series 'chatglm2-6b': { 'model_id': 'ZhipuAI/chatglm2-6b', - 'revision': 'v1.0.11', + 'revision': 'v1.0.12', 'get_function': get_model_tokenizer_chatglm2, 'template': 'chatglm2', 'lora_TM': LoRATM.chatglm2, From a20735e7d4db060147266936bda674054c3634bb Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 8 Oct 2023 16:31:18 +0800 Subject: [PATCH 5/7] update readme --- examples/pytorch/llm/README.md | 15 ++++++++++----- examples/pytorch/llm/README_CN.md | 15 ++++++++++----- .../llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh | 2 +- .../llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh | 4 ++-- .../llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh | 2 +- .../llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh | 2 +- 6 files changed, 25 insertions(+), 15 deletions(-) diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index e6575dbb50..d555083dba 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -61,7 +61,7 @@ ## Prepare the Environment -Experimental environment: V100, A10, 3090, A100, ... +Experimental environment: A10, 3090, V100, A100, ... ```bash # Installing miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh @@ -95,7 +95,7 @@ Tips: - If you want to use quantization, you need to install `bitsandbytes` first: `pip install bitsandbytes -U`. - If you want to use deepspeed, you need to `pip install deepspeed -U`. - If you are using older GPUs like V100, you need to set `--dtype fp16`, because they do not support bf16. -- qwen recommends installing [flash-attn](https://github.com/Dao-AILab/flash-attention), which will accelerate the training and inference speed and reduce GPU memory usage (V100, 3090, A10 machines do not support flash-attn). +- qwen recommends installing [flash-attn](https://github.com/Dao-AILab/flash-attention), which will accelerate the training and inference speed and reduce GPU memory usage (A10, 3090, V100 machines do not support flash-attn). - Below is a shell script for running `qwen_7b_chat` directly (you just need to specify `ckpt_dir` during inference to execute it smoothly). For more model scripts, you can check the `scripts` folder. If you want to customize a shell script, it is recommended to refer to the script in `scripts/qwen_7b_chat`. ```bash # sft lora and infer qwen-7b-chat, Requires 38GB GPU memory. 
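One remark on the `mapper` → `dtype_mapping` rename in patch 4's `src/utils/model.py` hunk above: qwen's config expects exactly one of the boolean flags `fp16` / `bf16` / `fp32` to be set, so the loop converts the requested `torch_dtype` into that one-hot set of flags. The original loop builds an explicit `v` and, presumably just past the quoted context, assigns it onto the config; the standalone sketch below condenses the same logic, with a `SimpleNamespace` standing in for the real `AutoConfig` object.

```python
# Minimal sketch of the torch_dtype -> one-hot config flag logic for qwen.
# SimpleNamespace stands in for the AutoConfig loaded from the model dir.
import torch
from types import SimpleNamespace

dtype_mapping = {
    torch.float16: 'fp16',
    torch.bfloat16: 'bf16',
    torch.float32: 'fp32',
}

def apply_dtype_flags(model_config, torch_dtype: torch.dtype) -> None:
    k_true = dtype_mapping[torch_dtype]
    for k in dtype_mapping.values():
        # exactly one of config.fp16 / config.bf16 / config.fp32 ends up True
        setattr(model_config, k, k == k_true)

config = SimpleNamespace(fp16=False, bf16=False, fp32=False)
apply_dtype_flags(config, torch.bfloat16)
assert (config.fp16, config.bf16, config.fp32) == (False, True, False)
```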
@@ -114,20 +114,25 @@ bash scripts/qwen_7b_chat/lora_ddp_ds/sft.sh bash scripts/qwen_7b_chat/lora_ddp_ds/infer.sh # sft(lora+mp+ddp) and infer qwen-7b-chat, Requires 4*15GB GPU memory. -# Recommended experimental environment: V100, A10, 3090 +# Recommended experimental environment: A10, 3090 bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh # sft(qlora) and infer qwen-7b-chat, Requires 10GB GPU memory. -# Recommended experimental environment: V100, A10, 3090 +# Recommended experimental environment: A10, 3090 bash scripts/qwen_7b_chat/qlora/sft.sh bash scripts/qwen_7b_chat/qlora/infer.sh # sft(qlora+ddp) and infer qwen-7b-chat, Requires 2*14GB GPU memory. -# Recommended experimental environment: V100, A10, 3090 +# Recommended experimental environment: A10, 3090 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh bash scripts/qwen_7b_chat/qlora_ddp/infer.sh +# sft(qlora+ddp+deepspeed) and infer qwen-7b-chat, Requires 2*16GB GPU memory. +# Recommended experimental environment: A10, 3090 +bash scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh +bash scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh + # sft(full+mp) and infer qwen-7b-chat, Requires 2*75GB GPU memory. # Recommended experimental environment: A100 bash scripts/qwen_7b_chat/full_mp/sft.sh diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index d3da774872..431f259678 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -61,7 +61,7 @@ ## 准备实验环境 -实验环境: V100, A10, 3090, A100均可. +实验环境: A10, 3090, V100, A100均可. ```bash # 安装miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh @@ -96,7 +96,7 @@ pip install -r requirements.txt -U - 如果你想要使用量化, 你需要先安装bnb: `pip install bitsandbytes -U`. - 如果你想要使用deepspeed, 你需要`pip install deepspeed -U`. - 如果你使用的是V100等较老的GPU, 你需要设置`--dtype fp16`, 因为其不支持bf16. -- 如果你的机器是A100等高性能显卡, 且使用的是qwen系列模型, 推荐你安装[flash-attn](https://github.com/Dao-AILab/flash-attention), 这将会加快训练和推理的速度以及显存占用(V100, 3090, A10等显卡不支持flash-attn进行训练). +- 如果你的机器是A100等高性能显卡, 且使用的是qwen系列模型, 推荐你安装[flash-attn](https://github.com/Dao-AILab/flash-attention), 这将会加快训练和推理的速度以及显存占用(A10, 3090, V100等显卡不支持flash-attn进行训练). - 以下提供了可以直接运行的`qwen_7b_chat`的sh脚本(你只需要在推理时指定`ckpt_dir`即可顺利执行). 更多模型的scripts脚本, 可以查看`scripts`文件夹. 如果你想要自定义sh脚本, 推荐你参考`scripts/qwen_7b_chat`中的脚本进行书写. ```bash # 微调(lora)+推理 qwen-7b-chat, 需要38GB显存. @@ -115,20 +115,25 @@ bash scripts/qwen_7b_chat/lora_ddp_ds/sft.sh bash scripts/qwen_7b_chat/lora_ddp_ds/infer.sh # 微调(lora+mp+ddp)+推理 qwen-7b-chat, 需要4卡*15GB显存. -# 推荐的实验环境: V100, 3090, A10 +# 推荐的实验环境: A10, 3090 bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh # 微调(qlora)+推理 qwen-7b-chat, 需要10GB显存. -# 推荐的实验环境: V100, 3090, A10 +# 推荐的实验环境: A10, 3090 bash scripts/qwen_7b_chat/qlora/sft.sh bash scripts/qwen_7b_chat/qlora/infer.sh # 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*14GB显存. -# 推荐的实验环境: V100, 3090, A10 +# 推荐的实验环境: A10, 3090 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh bash scripts/qwen_7b_chat/qlora_ddp/infer.sh +# 微调(qlora+ddp+deepspeed)+推理 qwen-7b-chat, 需要2卡*16GB显存. +# 推荐的实验环境: A10, 3090 +bash scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh +bash scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh + # 微调(full+mp)+推理 qwen-7b-chat, 需要2卡*75G显存. 
# 推荐的实验环境: A100 bash scripts/qwen_7b_chat/full_mp/sft.sh diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh index 1e605f1264..783473b528 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh @@ -5,7 +5,7 @@ python src/llm_infer.py \ --model_type qwen-7b-chat \ --sft_type lora \ --template_type chatml \ - --dtype fp16 \ + --dtype bf16 \ --ckpt_dir "output/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset advertise-gen \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh index b96ff3564b..9b11e1aa8f 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh @@ -1,4 +1,4 @@ -# Experimental environment: 4 * V100 +# Experimental environment: 4 * 3090 # 4 * 15GB GPU memory (not use flash_attn) nproc_per_node=2 @@ -11,7 +11,7 @@ torchrun \ --model_type qwen-7b-chat \ --sft_type lora \ --template_type chatml \ - --dtype fp16 \ + --dtype bf16 \ --output_dir output \ --ddp_backend nccl \ --dataset advertise-gen \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh index f13a53fd3d..81d8017bd5 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh @@ -1,4 +1,4 @@ -# Experimental environment: 2 * 3090 +# Experimental environment: 2 * A10 # 2 * 14GB GPU memory nproc_per_node=2 diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh index 6770417713..df64baad11 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh @@ -1,4 +1,4 @@ -# Experimental environment: 2 * 3090 +# Experimental environment: 2 * A10 # 2 * 16GB GPU memory nproc_per_node=2 From e5eb07f6e11ed1e1e448f5461668475931000d8d Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 8 Oct 2023 18:12:12 +0800 Subject: [PATCH 6/7] compatible with transformers==4.34 --- swift/trainers/mixin.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py index 091121d3b0..248fc5f080 100644 --- a/swift/trainers/mixin.py +++ b/swift/trainers/mixin.py @@ -92,7 +92,7 @@ def _add_patterns_to_gitignores( self.repo.push(commit_message) def init_hf_repo(self) -> None: - """init ms repo. Compatible with transformers>=v4.34""" + """init ms repo. 
Compatible with transformers>=4.34""" self.init_git_repo() def init_git_repo(self, at_init: bool = False) -> None: @@ -268,10 +268,21 @@ def __init__(self, Invoke.THIRD_PARTY: kwargs.get(Invoke.THIRD_PARTY, Invoke.SWIFT), }) + + # Compatible with transformers>=4.34 + from swift.tuners import SwiftModel, PeftModel + is_quantized = getattr(model, 'is_quantized', False) + _hf_peft_config_loaded = getattr(model, '_hf_peft_config_loaded', + False) + use_swift = isinstance(model, (SwiftModel, PeftModel)) + if is_quantized and use_swift: + model._hf_peft_config_loaded = True # mro super().__init__(model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics) + if is_quantized and use_swift: + model._hf_peft_config_loaded = _hf_peft_config_loaded if get_function(model.__class__.forward) is not get_function( model.forward): From 2165f42cfd6137c9627a3128b4403caf4d0fb1de Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 8 Oct 2023 22:55:08 +0800 Subject: [PATCH 7/7] update readme --- examples/pytorch/llm/README.md | 14 +++++++------- examples/pytorch/llm/README_CN.md | 14 +++++++------- examples/pytorch/llm/src/utils/model.py | 8 ++------ 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index d555083dba..16940ef4ba 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -50,14 +50,14 @@ ## News - 2023.10.7: Supported DeepSpeed ZeRO-2, enabling LoRA (not just QLoRA) to run DDP on 2*A10. The corresponding shell script can be found at `scripts/qwen_7b_chat/lora_ddp_ds/sft.sh`, `scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh`. - 2023.10.4: Supported datasets in the fields of mathematics, law, SQL, and coding: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. -- 2023.9.26: Supported xverse model series: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. -- 2023.9.25: Supported qwen-14b model series: qwen-14b, qwen-14b-chat. +- 2023.9.26: Supported xverse model series: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. The corresponding shell script can be found at `scripts/xverse_13b`. +- 2023.9.25: Supported qwen-14b model series: qwen-14b, qwen-14b-chat. The corresponding shell script can be found at `scripts/qwen_14b`, `scripts/qwen_14b_chat`. - 2023.9.20: Supported incremental weight merging from LoRA and QLoRA training methods into base model weights, and saved the complete model weights for easy deployment by users. You can check the command-line parameter `--merge_lora_and_save` in the `infer.sh` script. -- 2023.9.18: Supported internlm-20b model series: internlm-20b, internlm-20b-chat. +- 2023.9.18: Supported internlm-20b model series: internlm-20b, internlm-20b-chat. The corresponding shell script can be found at `scripts/internlm_20b`, `scripts/internlm_20b_chat`. - 2023.9.12: Supported training with MP+DDP to accelerate full-parameter fine-tuning speed. The corresponding shell script can be found at `scripts/qwen_7b_chat/full_mp_ddp/sft.sh`. - 2023.9.5: Supported training that only saves model weights without saving intermediate states such as optimizer weights required for checkpoint resumption, avoiding long checkpoint-saving times and large storage space in full-parameter fine-tuning. You can check the command-line parameter `--only_save_model` in the `sft.sh` script. 
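Back on patch 6's `swift/trainers/mixin.py` change above: starting with transformers 4.34, `Trainer.__init__` refuses a quantized model unless it believes an HF PEFT adapter is attached, and it decides that by reading the model's `_hf_peft_config_loaded` attribute. Swift/Peft tuner wrappers are not HF-native PEFT, so the patch temporarily sets that attribute around the `super().__init__` call and restores it afterwards. A condensed sketch of the pattern follows; the real code additionally checks that the model is wrapped in a `SwiftModel`/`PeftModel`, which is omitted here for brevity.

```python
# Condensed sketch of the SwiftMixin workaround (names simplified): let
# transformers>=4.34 accept a quantized model whose adapter is managed by
# swift/peft wrappers rather than by HF-native PEFT.
from transformers import Trainer

class SketchTrainer(Trainer):
    def __init__(self, model, *args, **kwargs):
        is_quantized = getattr(model, 'is_quantized', False)
        saved_flag = getattr(model, '_hf_peft_config_loaded', False)
        if is_quantized:
            # pretend a PEFT adapter is loaded so Trainer's quantization check passes
            model._hf_peft_config_loaded = True
        super().__init__(model, *args, **kwargs)
        if is_quantized:
            # restore the original value so nothing downstream is affected
            model._hf_peft_config_loaded = saved_flag
```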
-- 2023.9.5: Supported openbuddy-llama2-70b model. -- 2023.9.3: Supported baichuan-13b model series: baichuan-13b, baichuan-13b-chat. +- 2023.9.5: Supported openbuddy-llama2-70b model. The corresponding shell script can be found at `scripts/openbuddy-llama2-70b`. +- 2023.9.3: Supported baichuan2 model series: baichuan2-7b, baichuan2-7b-chat, baichuan2-13b, baichuan2-13b-chat. The corresponding shell script can be found at `scripts/baichuan2_7b`, `scripts/baichuan2_7b_chat`. ## Prepare the Environment @@ -89,11 +89,11 @@ Performace: full(nice) > lora > qlora Training GPU memory: qlora(low,3090) > lora > full(2*A100) Tips: -- You can set `--gradient_checkpointing true` during training to save GPU memory, but this will slightly decrease the training speed. +- You can set `--gradient_checkpointing true` during training to save GPU memory, but this will slightly decrease the training speed. This is useful if you need to train LLM on consumer-grade GPU, e.g. 3090. - If you want to push weights to the ModelScope Hub during training, you need to set `--push_to_hub true`. - If you want to merge LoRA weights and save during inference, you need to set `--merge_lora_and_save true`. - If you want to use quantization, you need to install `bitsandbytes` first: `pip install bitsandbytes -U`. -- If you want to use deepspeed, you need to `pip install deepspeed -U`. +- If you want to use deepspeed, you need to `pip install deepspeed -U`. Using deepspeed can save GPU memory, but this may slightly decrease the training speed. - If you are using older GPUs like V100, you need to set `--dtype fp16`, because they do not support bf16. - qwen recommends installing [flash-attn](https://github.com/Dao-AILab/flash-attention), which will accelerate the training and inference speed and reduce GPU memory usage (A10, 3090, V100 machines do not support flash-attn). - Below is a shell script for running `qwen_7b_chat` directly (you just need to specify `ckpt_dir` during inference to execute it smoothly). For more model scripts, you can check the `scripts` folder. If you want to customize a shell script, it is recommended to refer to the script in `scripts/qwen_7b_chat`. diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index 431f259678..98c4ca004f 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -50,14 +50,14 @@ ## 新闻 - 2023.10.7: 支持DeepSpeed ZeRO-2, 使得lora(不仅仅是qlora)可以在双卡A10上运行DDP. 对应的sh脚本可以查看`scripts/qwen_7b_chat/lora_ddp_ds/sft.sh`, `scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh`. - 2023.10.4: 支持更多数学, 法律, SQL, 代码领域的数据集: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. -- 2023.9.26: 支持xverse系列模型: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. -- 2023.9.25: 支持**qwen-14b**系列模型: qwen-14b, qwen-14b-chat. +- 2023.9.26: 支持xverse系列模型: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. 对应的sh脚本可以查看`scripts/xverse_13b`. +- 2023.9.25: 支持**qwen-14b**系列模型: qwen-14b, qwen-14b-chat. 对应的sh脚本可以查看`scripts/qwen_14b`, `scripts/qwen_14b_chat`. - 2023.9.20: 支持在LoRA, QLoRA的方式训练后, 将其增量权重merge到基模型权重中, 并保存完整的模型权重, 方便用户的部署. 可以查看`infer.sh`中的命令行参数: `--merge_lora_and_save`. -- 2023.9.18: 支持internlm-20b系列模型: internlm-20b, internlm-20b-chat. +- 2023.9.18: 支持internlm-20b系列模型: internlm-20b, internlm-20b-chat. 对应的sh脚本可以查看`scripts/internlm_20b`, `scripts/internlm_20b_chat`. - 2023.9.12: 支持MP+DDP的方式训练, 加快全参数微调的速度, 对应的sh脚本可以查看`scripts/qwen_7b_chat/full_mp_ddp/sft.sh`. 
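All of the new `*_ddp_ds` scripts pass `--deepspeed_config_path 'ds_config/zero2.json'`, but the JSON file itself is not part of this patch series. The snippet below therefore writes out a generic ZeRO stage-2 configuration of the kind DeepSpeed documents, with `auto` values left for the HF Trainer integration to fill in; the exact field values are assumptions for illustration, not the repository's file.

```python
# Generic ZeRO stage-2 config of the kind ds_config/zero2.json would contain.
# Field values are illustrative defaults, not the repository's actual file.
import json

zero2_config = {
    'train_micro_batch_size_per_gpu': 'auto',
    'gradient_accumulation_steps': 'auto',
    'gradient_clipping': 'auto',
    'bf16': {'enabled': 'auto'},
    'zero_optimization': {
        'stage': 2,                    # shard optimizer state and gradients across ranks
        'overlap_comm': True,
        'contiguous_gradients': True,
        'allgather_bucket_size': 2e8,
        'reduce_bucket_size': 2e8,
    },
}

with open('ds_config/zero2.json', 'w') as f:
    json.dump(zero2_config, f, indent=2)
```

Sharding the optimizer state and gradients is what lets the LoRA run fit on 2 * A10 (about 21GB per card for baichuan2-7b-chat), at the cost of the extra communication the updated Tips entry warns about.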
- 2023.9.5: 支持训练只保存模型权重, 而不保存断点续训所需的优化器权重等中间状态, 避免全参数微调保存checkpoint所需时间过长和空间过大的问题. 可以查看`sft.sh`中的命令行参数: `--only_save_model`. -- 2023.9.5: 支持openbuddy-llama2-70b模型. -- 2023.9.3: 支持baichuan-13b系列模型: baichuan-13b, baichuan-13b-chat. +- 2023.9.5: 支持openbuddy-llama2-70b模型. 对应的sh脚本可以查看`scripts/openbuddy_llama2_70b`. +- 2023.9.3: 支持baichuan2系列模型: baichuan2-7b, baichuan2-7b-chat, baichuan2-13b, baichuan2-13b-chat. 对应的sh脚本可以查看`scripts/baichuan2_7b`, `scripts/baichuan2_7b_chat`. ## 准备实验环境 @@ -90,11 +90,11 @@ pip install -r requirements.txt -U 训练显存: qlora(低,3090) > lora > full(2*A100) 提示: -- 你可以在训练时设置`--gradient_checkpointing true`来节约显存, 但这会略微降低训练速度. +- 你可以在训练时设置`--gradient_checkpointing true`来节约显存, 但这会略微降低训练速度. 如果你需要在消费级显卡中训练大模型, 这很有用, 例如: 3090. - 如果你想在训练时, 将权重push到ModelScope Hub中, 你需要设置`--push_to_hub true`. - 如何你想要在推理时, 合并LoRA权重并保存,你需要设置`--merge_lora_and_save true`. - 如果你想要使用量化, 你需要先安装bnb: `pip install bitsandbytes -U`. -- 如果你想要使用deepspeed, 你需要`pip install deepspeed -U`. +- 如果你想要使用deepspeed, 你需要`pip install deepspeed -U`. 使用deepspeed可以节约显存, 但可能会略微降低训练速度. - 如果你使用的是V100等较老的GPU, 你需要设置`--dtype fp16`, 因为其不支持bf16. - 如果你的机器是A100等高性能显卡, 且使用的是qwen系列模型, 推荐你安装[flash-attn](https://github.com/Dao-AILab/flash-attention), 这将会加快训练和推理的速度以及显存占用(A10, 3090, V100等显卡不支持flash-attn进行训练). - 以下提供了可以直接运行的`qwen_7b_chat`的sh脚本(你只需要在推理时指定`ckpt_dir`即可顺利执行). 更多模型的scripts脚本, 可以查看`scripts`文件夹. 如果你想要自定义sh脚本, 推荐你参考`scripts/qwen_7b_chat`中的脚本进行书写. diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index be49a5b233..3b4597daea 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -133,12 +133,8 @@ def get_model_tokenizer_chatglm2(model_dir: str, model_kwargs['quantization_config'].llm_int8_skip_modules = [ 'output_layer' ] - return get_model_tokenizer_from_repo( - model_dir, - torch_dtype, - load_model, - automodel_class=AutoModel, - **model_kwargs) + return get_model_tokenizer_from_repo(model_dir, torch_dtype, load_model, + **model_kwargs) def get_model_tokenizer_llama2(model_dir: str,
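A closing note on an argument the new `sft.sh` scripts share: they fix `--batch_size 1` and set `--gradient_accumulation_steps $(expr 16 / $nproc_per_node)`, which keeps the effective global batch at 16 samples per optimizer step regardless of how many data-parallel workers are launched. The check below is plain DDP accounting rather than project code:

```python
# Effective global batch size for the DDP sft.sh scripts:
# per-device batch * data-parallel workers * gradient accumulation steps.
def effective_batch_size(per_device_batch: int, nproc_per_node: int,
                         target_global_batch: int = 16) -> int:
    grad_accum = target_global_batch // nproc_per_node  # $(expr 16 / $nproc_per_node)
    return per_device_batch * nproc_per_node * grad_accum

# 2 x A10 with --batch_size 1 -> 8 accumulation steps -> global batch of 16
assert effective_batch_size(per_device_batch=1, nproc_per_node=2) == 16
# the same formula on a single GPU would simply use 16 accumulation steps
assert effective_batch_size(per_device_batch=1, nproc_per_node=1) == 16
```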