From 39a189d8371d82c5e5b6edce4344066a8e688c62 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sat, 7 Oct 2023 12:02:12 +0800 Subject: [PATCH 1/7] update readme and sh --- examples/pytorch/llm/README.md | 1 + examples/pytorch/llm/README_CN.md | 5 ++- .../baichuan2_7b_chat/lora_ddp_ds/infer.sh | 17 ++++++++ .../baichuan2_7b_chat/lora_ddp_ds/sft.sh | 40 +++++++++++++++++++ 4 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh create mode 100644 examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index 50f6069367..a44c25dee4 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -48,6 +48,7 @@ ## News +- 2023.10.7: Supported DeepSpeed ZeRO-2, enabling LoRA (not just QLoRA) to run DDP on 2*A10. The corresponding shell script can be found at `scripts/qwen_7b_chat/lora_ddp_ds/sft.sh`, `scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh`. - 2023.10.4: Supported datasets in the fields of mathematics, law, SQL, and coding: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. - 2023.9.26: Supported xverse model series: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. - 2023.9.25: Supported qwen-14b model series: qwen-14b, qwen-14b-chat. diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index f8ff30a2b0..caa4d0f6f8 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -48,11 +48,12 @@ ## 新闻 +- 2023.10.7: 支持DeepSpeed ZeRO-2, 使得lora(不仅仅是qlora)可以在双卡A10上运行DDP. 对应的sh脚本可以查看`scripts/qwen_7b_chat/lora_ddp_ds/sft.sh`, `scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh`. - 2023.10.4: 支持更多数学, 法律, SQL, 代码领域的数据集: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. - 2023.9.26: 支持xverse系列模型: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. -- 2023.9.25: 支持**qwen-14b**系列模型: qwen-14b, qwen-14b-chat +- 2023.9.25: 支持**qwen-14b**系列模型: qwen-14b, qwen-14b-chat. - 2023.9.20: 支持在LoRA, QLoRA的方式训练后, 将其增量权重merge到基模型权重中, 并保存完整的模型权重, 方便用户的部署. -- 2023.9.18: 支持internlm-20b系列模型: internlm-20b, internlm-20b-chat +- 2023.9.18: 支持internlm-20b系列模型: internlm-20b, internlm-20b-chat. - 2023.9.12: 支持MP+DDP的方式训练, 加快全参数微调的速度, 对应的sh脚本可以查看`scripts/qwen_7b_chat/full_mp_ddp/sft.sh`. - 2023.9.5: 支持训练只保存模型权重, 而不保存断点续训所需的优化器权重等中间状态, 避免全参数微调保存checkpoint所需时间过长和空间过大的问题. - 2023.9.5: 支持openbuddy-llama2-70b模型. diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh new file mode 100644 index 0000000000..b16ce5a171 --- /dev/null +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh @@ -0,0 +1,17 @@ +PYTHONPATH=../../.. 
\ +CUDA_VISIBLE_DEVICES=0 \ +python src/llm_infer.py \ + --model_type baichuan2-7b-chat \ + --sft_type lora \ + --template_type baichuan \ + --dtype bf16 \ + --ckpt_dir "output/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \ + --eval_human false \ + --dataset damo-agent-mini-zh \ + --max_length 4096 \ + --max_new_tokens 2048 \ + --temperature 0.9 \ + --top_k 20 \ + --top_p 0.9 \ + --do_sample true \ + --merge_lora_and_save false \ diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh new file mode 100644 index 0000000000..2f387b1ea7 --- /dev/null +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh @@ -0,0 +1,40 @@ +# Experimental environment: 2 * A10 +# 2 * 21GB GPU memory +nproc_per_node=2 + +PYTHONPATH=../../.. \ +CUDA_VISIBLE_DEVICES=0,1 \ +torchrun \ + --nproc_per_node=$nproc_per_node \ + --master_port 29500 \ + src/llm_sft.py \ + --model_type baichuan2-7b-chat \ + --sft_type lora \ + --template_type baichuan \ + --dtype bf16 \ + --output_dir output \ + --ddp_backend nccl \ + --dataset damo-agent-mini-zh \ + --train_dataset_sample -1 \ + --num_train_epochs 1 \ + --max_length 4096 \ + --lora_rank 8 \ + --lora_alpha 32 \ + --lora_dropout_p 0. \ + --lora_target_modules ALL \ + --gradient_checkpointing true \ + --batch_size 1 \ + --weight_decay 0. \ + --learning_rate 1e-4 \ + --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ + --max_grad_norm 0.5 \ + --warmup_ratio 0.03 \ + --eval_steps 100 \ + --save_steps 100 \ + --save_total_limit 2 \ + --logging_steps 10 \ + --push_to_hub false \ + --hub_model_id baichuan2-7b-chat-lora \ + --hub_private_repo true \ + --hub_token 'your-sdk-token' \ + --deepspeed_config_path ds_config/zero2.json From 095768b426bb6f646f53c2efc2cf93aa81933d58 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sat, 7 Oct 2023 16:59:31 +0800 Subject: [PATCH 2/7] update dataset.py --- examples/pytorch/llm/src/utils/dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 96a20168dd..36a9f65784 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -438,7 +438,8 @@ def get_medical_dataset( ]) val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset() if train_dataset_sample >= 0: - idxs = np.random.permutation(train_dataset_sample) + random_state = np.random.RandomState(42) + idxs = random_state.permutation(train_dataset_sample) train_dataset = train_dataset.select(idxs) return tuple( _preprocess_medical(dataset, subset_name) From 8231b64a20009de3d9ecb7364568289b5a0d8052 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 8 Oct 2023 14:32:33 +0800 Subject: [PATCH 3/7] update sh --- examples/pytorch/llm/requirements.txt | 4 ++ .../baichuan2_7b_chat/lora_ddp_ds/sft.sh | 2 +- .../llm/scripts/chatglm2_6b/lora_ddp/infer.sh | 7 +-- .../llm/scripts/chatglm2_6b/lora_ddp/sft.sh | 6 +-- .../scripts/chatglm2_6b/lora_ddp_ds/infer.sh | 17 ++++++++ .../scripts/chatglm2_6b/lora_ddp_ds/sft.sh | 40 +++++++++++++++++ .../{lora_ddp => lora_ddp_ds}/infer.sh | 2 +- .../qwen_7b/{lora_ddp => lora_ddp_ds}/sft.sh | 7 ++- .../scripts/qwen_7b_chat/lora_ddp_ds/sft.sh | 4 +- .../scripts/qwen_7b_chat/lora_mp_ddp/sft.sh | 2 +- .../qwen_7b_chat/qlora_ddp_ds/infer.sh | 21 +++++++++ .../scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh | 43 +++++++++++++++++++ 12 files changed, 142 insertions(+), 13 deletions(-) create mode 100644 
examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh create mode 100644 examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/sft.sh rename examples/pytorch/llm/scripts/qwen_7b/{lora_ddp => lora_ddp_ds}/infer.sh (95%) rename examples/pytorch/llm/scripts/qwen_7b/{lora_ddp => lora_ddp_ds}/sft.sh (83%) create mode 100644 examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh create mode 100644 examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh diff --git a/examples/pytorch/llm/requirements.txt b/examples/pytorch/llm/requirements.txt index f3b70a775a..6147e7a146 100644 --- a/examples/pytorch/llm/requirements.txt +++ b/examples/pytorch/llm/requirements.txt @@ -1,6 +1,10 @@ +accelerate charset_normalizer cpm_kernels +matplotlib modelscope>=1.9 sentencepiece +tensorboard tiktoken +transformers transformers_stream_generator diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh index 2f387b1ea7..d3b3a00057 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh @@ -37,4 +37,4 @@ torchrun \ --hub_model_id baichuan2-7b-chat-lora \ --hub_private_repo true \ --hub_token 'your-sdk-token' \ - --deepspeed_config_path ds_config/zero2.json + --deepspeed_config_path 'ds_config/zero2.json' \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh index acea15fd0b..a143f9f1e5 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh @@ -7,10 +7,11 @@ python src/llm_infer.py \ --dtype bf16 \ --ckpt_dir "output/chatglm2-6b/vx_xxx/checkpoint-xxx" \ --eval_human false \ - --dataset code-python-zh \ - --max_length 8192 \ - --max_new_tokens 1024 \ + --dataset damo-agent-mini-zh \ + --max_length 4096 \ + --max_new_tokens 2048 \ --temperature 0.9 \ --top_k 20 \ --top_p 0.9 \ --do_sample true \ + --merge_lora_and_save false \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh index 7634cc9fc6..68098ec2ce 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 2 * A100 -# 2 * 50GB GPU memory +# 2 * 35GB GPU memory nproc_per_node=2 PYTHONPATH=../../.. \ @@ -14,10 +14,10 @@ torchrun \ --dtype bf16 \ --output_dir output \ --ddp_backend nccl \ - --dataset code-python-zh \ + --dataset damo-agent-mini-zh \ --train_dataset_sample -1 \ --num_train_epochs 1 \ - --max_length 8192 \ + --max_length 4096 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh new file mode 100644 index 0000000000..a143f9f1e5 --- /dev/null +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh @@ -0,0 +1,17 @@ +PYTHONPATH=../../.. 
\ +CUDA_VISIBLE_DEVICES=0 \ +python src/llm_infer.py \ + --model_type chatglm2-6b \ + --sft_type lora \ + --template_type chatglm2 \ + --dtype bf16 \ + --ckpt_dir "output/chatglm2-6b/vx_xxx/checkpoint-xxx" \ + --eval_human false \ + --dataset damo-agent-mini-zh \ + --max_length 4096 \ + --max_new_tokens 2048 \ + --temperature 0.9 \ + --top_k 20 \ + --top_p 0.9 \ + --do_sample true \ + --merge_lora_and_save false \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/sft.sh new file mode 100644 index 0000000000..f9d57c80c9 --- /dev/null +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/sft.sh @@ -0,0 +1,40 @@ +# Experimental environment: 2 * A10 +# 2 * 18GB GPU memory +nproc_per_node=2 + +PYTHONPATH=../../.. \ +CUDA_VISIBLE_DEVICES=0,1 \ +torchrun \ + --nproc_per_node=$nproc_per_node \ + --master_port 29500 \ + src/llm_sft.py \ + --model_type chatglm2-6b \ + --sft_type lora \ + --template_type chatglm2 \ + --dtype bf16 \ + --output_dir output \ + --ddp_backend nccl \ + --dataset damo-agent-mini-zh \ + --train_dataset_sample -1 \ + --num_train_epochs 1 \ + --max_length 4096 \ + --lora_rank 8 \ + --lora_alpha 32 \ + --lora_dropout_p 0. \ + --lora_target_modules ALL \ + --gradient_checkpointing true \ + --batch_size 1 \ + --weight_decay 0. \ + --learning_rate 1e-4 \ + --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ + --max_grad_norm 0.5 \ + --warmup_ratio 0.03 \ + --eval_steps 100 \ + --save_steps 100 \ + --save_total_limit 2 \ + --logging_steps 10 \ + --push_to_hub false \ + --hub_model_id chatglm2-6b-lora \ + --hub_private_repo true \ + --hub_token 'your-sdk-token' \ + --deepspeed_config_path 'ds_config/zero2.json' \ diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh similarity index 95% rename from examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh rename to examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh index 5379277715..09ff2633ce 100644 --- a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh @@ -10,7 +10,7 @@ python src/llm_infer.py \ --eval_human false \ --dataset dureader-robust-zh \ --max_length 2048 \ - --use_flash_attn true \ + --use_flash_attn false \ --max_new_tokens 1024 \ --temperature 0.9 \ --top_k 20 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/sft.sh similarity index 83% rename from examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh rename to examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/sft.sh index 42742df9d2..ded2933737 100644 --- a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/sft.sh @@ -1,3 +1,5 @@ +# Experimental environment: 2 * A10 +# 2 * 19GB GPU memory (not use flash_attn) nproc_per_node=2 PYTHONPATH=../../.. \ @@ -20,7 +22,7 @@ torchrun \ --lora_alpha 32 \ --lora_dropout_p 0. \ --lora_target_modules c_attn c_proj \ - --gradient_checkpointing false \ + --gradient_checkpointing true \ --batch_size 1 \ --weight_decay 0. 
\ --learning_rate 1e-4 \ @@ -31,8 +33,9 @@ torchrun \ --save_steps 100 \ --save_total_limit 2 \ --logging_steps 10 \ - --use_flash_attn true \ + --use_flash_attn false \ --push_to_hub false \ --hub_model_id qwen-7b-lora \ --hub_private_repo true \ --hub_token 'your-sdk-token' \ + --deepspeed_config_path 'ds_config/zero2.json' \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/sft.sh index cae5af46d5..22a3dd659d 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 2 * A10 -# 2 * 18GB GPU memory +# 2 * 18GB GPU memory (not use flash_attn) nproc_per_node=2 PYTHONPATH=../../.. \ @@ -38,4 +38,4 @@ torchrun \ --hub_model_id qwen-7b-chat-lora \ --hub_private_repo true \ --hub_token 'your-sdk-token' \ - --deepspeed_config_path ds_config/zero2.json \ + --deepspeed_config_path 'ds_config/zero2.json' \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh index a7c213691c..b96ff3564b 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 4 * V100 -# 4 * 15GB GPU memory +# 4 * 15GB GPU memory (not use flash_attn) nproc_per_node=2 PYTHONPATH=../../.. \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh new file mode 100644 index 0000000000..7d385d8661 --- /dev/null +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh @@ -0,0 +1,21 @@ +# If you want to merge LoRA weight and save it, you need to set `--merge_lora_and_save true`. +PYTHONPATH=../../.. \ +CUDA_VISIBLE_DEVICES=0 \ +python src/llm_infer.py \ + --model_type qwen-7b-chat \ + --sft_type lora \ + --template_type chatml \ + --dtype bf16 \ + --ckpt_dir "output/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ + --eval_human false \ + --dataset damo-agent-mini-zh \ + --max_length 4096 \ + --quantization_bit 4 \ + --bnb_4bit_comp_dtype bf16 \ + --use_flash_attn false \ + --max_new_tokens 2048 \ + --temperature 0.9 \ + --top_k 20 \ + --top_p 0.9 \ + --do_sample true \ + --merge_lora_and_save false \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh new file mode 100644 index 0000000000..6770417713 --- /dev/null +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh @@ -0,0 +1,43 @@ +# Experimental environment: 2 * 3090 +# 2 * 16GB GPU memory +nproc_per_node=2 + +PYTHONPATH=../../.. \ +CUDA_VISIBLE_DEVICES=0,1 \ +torchrun \ + --nproc_per_node=$nproc_per_node \ + --master_port 29500 \ + src/llm_sft.py \ + --model_type qwen-7b-chat \ + --sft_type lora \ + --template_type chatml \ + --dtype bf16 \ + --output_dir output \ + --ddp_backend nccl \ + --dataset damo-agent-mini-zh \ + --train_dataset_sample 20000 \ + --num_train_epochs 1 \ + --max_length 4096 \ + --quantization_bit 4 \ + --bnb_4bit_comp_dtype bf16 \ + --lora_rank 8 \ + --lora_alpha 32 \ + --lora_dropout_p 0. \ + --lora_target_modules ALL \ + --gradient_checkpointing true \ + --batch_size 1 \ + --weight_decay 0. 
\ + --learning_rate 1e-4 \ + --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ + --max_grad_norm 0.5 \ + --warmup_ratio 0.03 \ + --eval_steps 100 \ + --save_steps 100 \ + --save_total_limit 2 \ + --logging_steps 10 \ + --use_flash_attn false \ + --push_to_hub false \ + --hub_model_id qwen-7b-chat-qlora \ + --hub_private_repo true \ + --hub_token 'your-sdk-token' \ + --deepspeed_config_path 'ds_config/zero2.json' \ From 0e9997b396093689b0dac2ac107143df94b4b52f Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 8 Oct 2023 16:01:42 +0800 Subject: [PATCH 4/7] update readme --- examples/pytorch/llm/README.md | 4 ++-- examples/pytorch/llm/README_CN.md | 4 ++-- examples/pytorch/llm/src/utils/model.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index a44c25dee4..e6575dbb50 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -52,10 +52,10 @@ - 2023.10.4: Supported datasets in the fields of mathematics, law, SQL, and coding: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. - 2023.9.26: Supported xverse model series: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. - 2023.9.25: Supported qwen-14b model series: qwen-14b, qwen-14b-chat. -- 2023.9.20: Supported incremental weight merging from LoRA and QLoRA training methods into base model weights, and saved the complete model weights for easy deployment by users. +- 2023.9.20: Supported incremental weight merging from LoRA and QLoRA training methods into base model weights, and saved the complete model weights for easy deployment by users. You can check the command-line parameter `--merge_lora_and_save` in the `infer.sh` script. - 2023.9.18: Supported internlm-20b model series: internlm-20b, internlm-20b-chat. - 2023.9.12: Supported training with MP+DDP to accelerate full-parameter fine-tuning speed. The corresponding shell script can be found at `scripts/qwen_7b_chat/full_mp_ddp/sft.sh`. -- 2023.9.5: Supported training that only saves model weights without saving intermediate states such as optimizer weights required for checkpoint resumption, avoiding long checkpoint-saving times and large storage space in full-parameter fine-tuning. +- 2023.9.5: Supported training that only saves model weights without saving intermediate states such as optimizer weights required for checkpoint resumption, avoiding long checkpoint-saving times and large storage space in full-parameter fine-tuning. You can check the command-line parameter `--only_save_model` in the `sft.sh` script. - 2023.9.5: Supported openbuddy-llama2-70b model. - 2023.9.3: Supported baichuan-13b model series: baichuan-13b, baichuan-13b-chat. diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index caa4d0f6f8..d3da774872 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -52,10 +52,10 @@ - 2023.10.4: 支持更多数学, 法律, SQL, 代码领域的数据集: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. - 2023.9.26: 支持xverse系列模型: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. - 2023.9.25: 支持**qwen-14b**系列模型: qwen-14b, qwen-14b-chat. -- 2023.9.20: 支持在LoRA, QLoRA的方式训练后, 将其增量权重merge到基模型权重中, 并保存完整的模型权重, 方便用户的部署. +- 2023.9.20: 支持在LoRA, QLoRA的方式训练后, 将其增量权重merge到基模型权重中, 并保存完整的模型权重, 方便用户的部署. 可以查看`infer.sh`中的命令行参数: `--merge_lora_and_save`. 
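A note on the `--merge_lora_and_save` flag documented in the README changes above: merging folds the trained low-rank deltas into the base weights, so deployment needs neither the adapter files nor `peft`/`swift` at load time. The sketch below shows the general idea with plain `peft`; the model id and checkpoint paths are placeholders, and swift's `--merge_lora_and_save true` option does the equivalent step inside its own inference script, so treat this as an illustration rather than the project's code.

```python
# Illustration only: what "merge LoRA weights and save" amounts to.
# Model id and paths are placeholders; swift's --merge_lora_and_save
# flag performs the equivalent merge internally.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    'baichuan-inc/Baichuan2-7B-Chat',   # placeholder base model
    torch_dtype=torch.bfloat16,
    trust_remote_code=True)
model = PeftModel.from_pretrained(base, 'output/baichuan2-7b-chat/vx_xxx/checkpoint-xxx')

merged = model.merge_and_unload()       # fold the low-rank deltas into the base weights
merged.save_pretrained('output/baichuan2-7b-chat/vx_xxx/checkpoint-xxx-merged')
tokenizer = AutoTokenizer.from_pretrained('baichuan-inc/Baichuan2-7B-Chat',
                                          trust_remote_code=True)
tokenizer.save_pretrained('output/baichuan2-7b-chat/vx_xxx/checkpoint-xxx-merged')
```

The merged directory can then be loaded like any ordinary full checkpoint, which is what the "easy deployment" wording in the news entry refers to.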
- 2023.9.18: 支持internlm-20b系列模型: internlm-20b, internlm-20b-chat. - 2023.9.12: 支持MP+DDP的方式训练, 加快全参数微调的速度, 对应的sh脚本可以查看`scripts/qwen_7b_chat/full_mp_ddp/sft.sh`. -- 2023.9.5: 支持训练只保存模型权重, 而不保存断点续训所需的优化器权重等中间状态, 避免全参数微调保存checkpoint所需时间过长和空间过大的问题. +- 2023.9.5: 支持训练只保存模型权重, 而不保存断点续训所需的优化器权重等中间状态, 避免全参数微调保存checkpoint所需时间过长和空间过大的问题. 可以查看`sft.sh`中的命令行参数: `--only_save_model`. - 2023.9.5: 支持openbuddy-llama2-70b模型. - 2023.9.3: 支持baichuan-13b系列模型: baichuan-13b, baichuan-13b-chat. diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index 5123aac1fa..be49a5b233 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -172,13 +172,13 @@ def get_model_tokenizer_qwen(model_dir: str, **kwargs): model_config = AutoConfig.from_pretrained( model_dir, trust_remote_code=True) - mapper = { + dtype_mapping = { torch.float16: 'fp16', torch.bfloat16: 'bf16', torch.float32: 'fp32' } - k_true = mapper[torch_dtype] - for k in mapper.values(): + k_true = dtype_mapping[torch_dtype] + for k in dtype_mapping.values(): v = False if k == k_true: v = True @@ -370,7 +370,7 @@ class ResTunerTM(NamedTuple): # chatglm2 series 'chatglm2-6b': { 'model_id': 'ZhipuAI/chatglm2-6b', - 'revision': 'v1.0.11', + 'revision': 'v1.0.12', 'get_function': get_model_tokenizer_chatglm2, 'template': 'chatglm2', 'lora_TM': LoRATM.chatglm2, From a20735e7d4db060147266936bda674054c3634bb Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 8 Oct 2023 16:31:18 +0800 Subject: [PATCH 5/7] update readme --- examples/pytorch/llm/README.md | 15 ++++++++++----- examples/pytorch/llm/README_CN.md | 15 ++++++++++----- .../llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh | 2 +- .../llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh | 4 ++-- .../llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh | 2 +- .../llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh | 2 +- 6 files changed, 25 insertions(+), 15 deletions(-) diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index e6575dbb50..d555083dba 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -61,7 +61,7 @@ ## Prepare the Environment -Experimental environment: V100, A10, 3090, A100, ... +Experimental environment: A10, 3090, V100, A100, ... ```bash # Installing miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh @@ -95,7 +95,7 @@ Tips: - If you want to use quantization, you need to install `bitsandbytes` first: `pip install bitsandbytes -U`. - If you want to use deepspeed, you need to `pip install deepspeed -U`. - If you are using older GPUs like V100, you need to set `--dtype fp16`, because they do not support bf16. -- qwen recommends installing [flash-attn](https://github.com/Dao-AILab/flash-attention), which will accelerate the training and inference speed and reduce GPU memory usage (V100, 3090, A10 machines do not support flash-attn). +- qwen recommends installing [flash-attn](https://github.com/Dao-AILab/flash-attention), which will accelerate the training and inference speed and reduce GPU memory usage (A10, 3090, V100 machines do not support flash-attn). - Below is a shell script for running `qwen_7b_chat` directly (you just need to specify `ckpt_dir` during inference to execute it smoothly). For more model scripts, you can check the `scripts` folder. If you want to customize a shell script, it is recommended to refer to the script in `scripts/qwen_7b_chat`. ```bash # sft lora and infer qwen-7b-chat, Requires 38GB GPU memory. 
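One remark on the `mapper` → `dtype_mapping` rename in patch 4's `src/utils/model.py` hunk above: qwen's config expects exactly one of the boolean flags `fp16` / `bf16` / `fp32` to be set, so the loop converts the requested `torch_dtype` into that one-hot set of flags. The original loop builds an explicit `v` and, presumably just past the quoted context, assigns it onto the config; the standalone sketch below condenses the same logic, with a `SimpleNamespace` standing in for the real `AutoConfig` object.

```python
# Minimal sketch of the torch_dtype -> one-hot config flag logic for qwen.
# SimpleNamespace stands in for the AutoConfig loaded from the model dir.
import torch
from types import SimpleNamespace

dtype_mapping = {
    torch.float16: 'fp16',
    torch.bfloat16: 'bf16',
    torch.float32: 'fp32',
}

def apply_dtype_flags(model_config, torch_dtype: torch.dtype) -> None:
    k_true = dtype_mapping[torch_dtype]
    for k in dtype_mapping.values():
        # exactly one of config.fp16 / config.bf16 / config.fp32 ends up True
        setattr(model_config, k, k == k_true)

config = SimpleNamespace(fp16=False, bf16=False, fp32=False)
apply_dtype_flags(config, torch.bfloat16)
assert (config.fp16, config.bf16, config.fp32) == (False, True, False)
```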
@@ -114,20 +114,25 @@ bash scripts/qwen_7b_chat/lora_ddp_ds/sft.sh bash scripts/qwen_7b_chat/lora_ddp_ds/infer.sh # sft(lora+mp+ddp) and infer qwen-7b-chat, Requires 4*15GB GPU memory. -# Recommended experimental environment: V100, A10, 3090 +# Recommended experimental environment: A10, 3090 bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh # sft(qlora) and infer qwen-7b-chat, Requires 10GB GPU memory. -# Recommended experimental environment: V100, A10, 3090 +# Recommended experimental environment: A10, 3090 bash scripts/qwen_7b_chat/qlora/sft.sh bash scripts/qwen_7b_chat/qlora/infer.sh # sft(qlora+ddp) and infer qwen-7b-chat, Requires 2*14GB GPU memory. -# Recommended experimental environment: V100, A10, 3090 +# Recommended experimental environment: A10, 3090 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh bash scripts/qwen_7b_chat/qlora_ddp/infer.sh +# sft(qlora+ddp+deepspeed) and infer qwen-7b-chat, Requires 2*16GB GPU memory. +# Recommended experimental environment: A10, 3090 +bash scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh +bash scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh + # sft(full+mp) and infer qwen-7b-chat, Requires 2*75GB GPU memory. # Recommended experimental environment: A100 bash scripts/qwen_7b_chat/full_mp/sft.sh diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index d3da774872..431f259678 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -61,7 +61,7 @@ ## 准备实验环境 -实验环境: V100, A10, 3090, A100均可. +实验环境: A10, 3090, V100, A100均可. ```bash # 安装miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh @@ -96,7 +96,7 @@ pip install -r requirements.txt -U - 如果你想要使用量化, 你需要先安装bnb: `pip install bitsandbytes -U`. - 如果你想要使用deepspeed, 你需要`pip install deepspeed -U`. - 如果你使用的是V100等较老的GPU, 你需要设置`--dtype fp16`, 因为其不支持bf16. -- 如果你的机器是A100等高性能显卡, 且使用的是qwen系列模型, 推荐你安装[flash-attn](https://github.com/Dao-AILab/flash-attention), 这将会加快训练和推理的速度以及显存占用(V100, 3090, A10等显卡不支持flash-attn进行训练). +- 如果你的机器是A100等高性能显卡, 且使用的是qwen系列模型, 推荐你安装[flash-attn](https://github.com/Dao-AILab/flash-attention), 这将会加快训练和推理的速度以及显存占用(A10, 3090, V100等显卡不支持flash-attn进行训练). - 以下提供了可以直接运行的`qwen_7b_chat`的sh脚本(你只需要在推理时指定`ckpt_dir`即可顺利执行). 更多模型的scripts脚本, 可以查看`scripts`文件夹. 如果你想要自定义sh脚本, 推荐你参考`scripts/qwen_7b_chat`中的脚本进行书写. ```bash # 微调(lora)+推理 qwen-7b-chat, 需要38GB显存. @@ -115,20 +115,25 @@ bash scripts/qwen_7b_chat/lora_ddp_ds/sft.sh bash scripts/qwen_7b_chat/lora_ddp_ds/infer.sh # 微调(lora+mp+ddp)+推理 qwen-7b-chat, 需要4卡*15GB显存. -# 推荐的实验环境: V100, 3090, A10 +# 推荐的实验环境: A10, 3090 bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh # 微调(qlora)+推理 qwen-7b-chat, 需要10GB显存. -# 推荐的实验环境: V100, 3090, A10 +# 推荐的实验环境: A10, 3090 bash scripts/qwen_7b_chat/qlora/sft.sh bash scripts/qwen_7b_chat/qlora/infer.sh # 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*14GB显存. -# 推荐的实验环境: V100, 3090, A10 +# 推荐的实验环境: A10, 3090 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh bash scripts/qwen_7b_chat/qlora_ddp/infer.sh +# 微调(qlora+ddp+deepspeed)+推理 qwen-7b-chat, 需要2卡*16GB显存. +# 推荐的实验环境: A10, 3090 +bash scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh +bash scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh + # 微调(full+mp)+推理 qwen-7b-chat, 需要2卡*75G显存. 
# 推荐的实验环境: A100 bash scripts/qwen_7b_chat/full_mp/sft.sh diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh index 1e605f1264..783473b528 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh @@ -5,7 +5,7 @@ python src/llm_infer.py \ --model_type qwen-7b-chat \ --sft_type lora \ --template_type chatml \ - --dtype fp16 \ + --dtype bf16 \ --ckpt_dir "output/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset advertise-gen \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh index b96ff3564b..9b11e1aa8f 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh @@ -1,4 +1,4 @@ -# Experimental environment: 4 * V100 +# Experimental environment: 4 * 3090 # 4 * 15GB GPU memory (not use flash_attn) nproc_per_node=2 @@ -11,7 +11,7 @@ torchrun \ --model_type qwen-7b-chat \ --sft_type lora \ --template_type chatml \ - --dtype fp16 \ + --dtype bf16 \ --output_dir output \ --ddp_backend nccl \ --dataset advertise-gen \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh index f13a53fd3d..81d8017bd5 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh @@ -1,4 +1,4 @@ -# Experimental environment: 2 * 3090 +# Experimental environment: 2 * A10 # 2 * 14GB GPU memory nproc_per_node=2 diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh index 6770417713..df64baad11 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/sft.sh @@ -1,4 +1,4 @@ -# Experimental environment: 2 * 3090 +# Experimental environment: 2 * A10 # 2 * 16GB GPU memory nproc_per_node=2 From e5eb07f6e11ed1e1e448f5461668475931000d8d Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 8 Oct 2023 18:12:12 +0800 Subject: [PATCH 6/7] compatible with transformers==4.34 --- swift/trainers/mixin.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py index 091121d3b0..248fc5f080 100644 --- a/swift/trainers/mixin.py +++ b/swift/trainers/mixin.py @@ -92,7 +92,7 @@ def _add_patterns_to_gitignores( self.repo.push(commit_message) def init_hf_repo(self) -> None: - """init ms repo. Compatible with transformers>=v4.34""" + """init ms repo. 
Compatible with transformers>=4.34""" self.init_git_repo() def init_git_repo(self, at_init: bool = False) -> None: @@ -268,10 +268,21 @@ def __init__(self, Invoke.THIRD_PARTY: kwargs.get(Invoke.THIRD_PARTY, Invoke.SWIFT), }) + + # Compatible with transformers>=4.34 + from swift.tuners import SwiftModel, PeftModel + is_quantized = getattr(model, 'is_quantized', False) + _hf_peft_config_loaded = getattr(model, '_hf_peft_config_loaded', + False) + use_swift = isinstance(model, (SwiftModel, PeftModel)) + if is_quantized and use_swift: + model._hf_peft_config_loaded = True # mro super().__init__(model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics) + if is_quantized and use_swift: + model._hf_peft_config_loaded = _hf_peft_config_loaded if get_function(model.__class__.forward) is not get_function( model.forward): From 2165f42cfd6137c9627a3128b4403caf4d0fb1de Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 8 Oct 2023 22:55:08 +0800 Subject: [PATCH 7/7] update readme --- examples/pytorch/llm/README.md | 14 +++++++------- examples/pytorch/llm/README_CN.md | 14 +++++++------- examples/pytorch/llm/src/utils/model.py | 8 ++------ 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index d555083dba..16940ef4ba 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -50,14 +50,14 @@ ## News - 2023.10.7: Supported DeepSpeed ZeRO-2, enabling LoRA (not just QLoRA) to run DDP on 2*A10. The corresponding shell script can be found at `scripts/qwen_7b_chat/lora_ddp_ds/sft.sh`, `scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh`. - 2023.10.4: Supported datasets in the fields of mathematics, law, SQL, and coding: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. -- 2023.9.26: Supported xverse model series: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. -- 2023.9.25: Supported qwen-14b model series: qwen-14b, qwen-14b-chat. +- 2023.9.26: Supported xverse model series: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. The corresponding shell script can be found at `scripts/xverse_13b`. +- 2023.9.25: Supported qwen-14b model series: qwen-14b, qwen-14b-chat. The corresponding shell script can be found at `scripts/qwen_14b`, `scripts/qwen_14b_chat`. - 2023.9.20: Supported incremental weight merging from LoRA and QLoRA training methods into base model weights, and saved the complete model weights for easy deployment by users. You can check the command-line parameter `--merge_lora_and_save` in the `infer.sh` script. -- 2023.9.18: Supported internlm-20b model series: internlm-20b, internlm-20b-chat. +- 2023.9.18: Supported internlm-20b model series: internlm-20b, internlm-20b-chat. The corresponding shell script can be found at `scripts/internlm_20b`, `scripts/internlm_20b_chat`. - 2023.9.12: Supported training with MP+DDP to accelerate full-parameter fine-tuning speed. The corresponding shell script can be found at `scripts/qwen_7b_chat/full_mp_ddp/sft.sh`. - 2023.9.5: Supported training that only saves model weights without saving intermediate states such as optimizer weights required for checkpoint resumption, avoiding long checkpoint-saving times and large storage space in full-parameter fine-tuning. You can check the command-line parameter `--only_save_model` in the `sft.sh` script. 
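Back on patch 6's `swift/trainers/mixin.py` change above: starting with transformers 4.34, `Trainer.__init__` refuses a quantized model unless it believes an HF PEFT adapter is attached, and it decides that by reading the model's `_hf_peft_config_loaded` attribute. Swift/Peft tuner wrappers are not HF-native PEFT, so the patch temporarily sets that attribute around the `super().__init__` call and restores it afterwards. A condensed sketch of the pattern follows; the real code additionally checks that the model is wrapped in a `SwiftModel`/`PeftModel`, which is omitted here for brevity.

```python
# Condensed sketch of the SwiftMixin workaround (names simplified): let
# transformers>=4.34 accept a quantized model whose adapter is managed by
# swift/peft wrappers rather than by HF-native PEFT.
from transformers import Trainer

class SketchTrainer(Trainer):
    def __init__(self, model, *args, **kwargs):
        is_quantized = getattr(model, 'is_quantized', False)
        saved_flag = getattr(model, '_hf_peft_config_loaded', False)
        if is_quantized:
            # pretend a PEFT adapter is loaded so Trainer's quantization check passes
            model._hf_peft_config_loaded = True
        super().__init__(model, *args, **kwargs)
        if is_quantized:
            # restore the original value so nothing downstream is affected
            model._hf_peft_config_loaded = saved_flag
```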
-- 2023.9.5: Supported openbuddy-llama2-70b model. -- 2023.9.3: Supported baichuan-13b model series: baichuan-13b, baichuan-13b-chat. +- 2023.9.5: Supported openbuddy-llama2-70b model. The corresponding shell script can be found at `scripts/openbuddy-llama2-70b`. +- 2023.9.3: Supported baichuan2 model series: baichuan2-7b, baichuan2-7b-chat, baichuan2-13b, baichuan2-13b-chat. The corresponding shell script can be found at `scripts/baichuan2_7b`, `scripts/baichuan2_7b_chat`. ## Prepare the Environment @@ -89,11 +89,11 @@ Performace: full(nice) > lora > qlora Training GPU memory: qlora(low,3090) > lora > full(2*A100) Tips: -- You can set `--gradient_checkpointing true` during training to save GPU memory, but this will slightly decrease the training speed. +- You can set `--gradient_checkpointing true` during training to save GPU memory, but this will slightly decrease the training speed. This is useful if you need to train LLM on consumer-grade GPU, e.g. 3090. - If you want to push weights to the ModelScope Hub during training, you need to set `--push_to_hub true`. - If you want to merge LoRA weights and save during inference, you need to set `--merge_lora_and_save true`. - If you want to use quantization, you need to install `bitsandbytes` first: `pip install bitsandbytes -U`. -- If you want to use deepspeed, you need to `pip install deepspeed -U`. +- If you want to use deepspeed, you need to `pip install deepspeed -U`. Using deepspeed can save GPU memory, but this may slightly decrease the training speed. - If you are using older GPUs like V100, you need to set `--dtype fp16`, because they do not support bf16. - qwen recommends installing [flash-attn](https://github.com/Dao-AILab/flash-attention), which will accelerate the training and inference speed and reduce GPU memory usage (A10, 3090, V100 machines do not support flash-attn). - Below is a shell script for running `qwen_7b_chat` directly (you just need to specify `ckpt_dir` during inference to execute it smoothly). For more model scripts, you can check the `scripts` folder. If you want to customize a shell script, it is recommended to refer to the script in `scripts/qwen_7b_chat`. diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index 431f259678..98c4ca004f 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -50,14 +50,14 @@ ## 新闻 - 2023.10.7: 支持DeepSpeed ZeRO-2, 使得lora(不仅仅是qlora)可以在双卡A10上运行DDP. 对应的sh脚本可以查看`scripts/qwen_7b_chat/lora_ddp_ds/sft.sh`, `scripts/baichuan2_7b_chat/lora_ddp_ds/sft.sh`. - 2023.10.4: 支持更多数学, 法律, SQL, 代码领域的数据集: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. -- 2023.9.26: 支持xverse系列模型: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. -- 2023.9.25: 支持**qwen-14b**系列模型: qwen-14b, qwen-14b-chat. +- 2023.9.26: 支持xverse系列模型: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat. 对应的sh脚本可以查看`scripts/xverse_13b`. +- 2023.9.25: 支持**qwen-14b**系列模型: qwen-14b, qwen-14b-chat. 对应的sh脚本可以查看`scripts/qwen_14b`, `scripts/qwen_14b_chat`. - 2023.9.20: 支持在LoRA, QLoRA的方式训练后, 将其增量权重merge到基模型权重中, 并保存完整的模型权重, 方便用户的部署. 可以查看`infer.sh`中的命令行参数: `--merge_lora_and_save`. -- 2023.9.18: 支持internlm-20b系列模型: internlm-20b, internlm-20b-chat. +- 2023.9.18: 支持internlm-20b系列模型: internlm-20b, internlm-20b-chat. 对应的sh脚本可以查看`scripts/internlm_20b`, `scripts/internlm_20b_chat`. - 2023.9.12: 支持MP+DDP的方式训练, 加快全参数微调的速度, 对应的sh脚本可以查看`scripts/qwen_7b_chat/full_mp_ddp/sft.sh`. 
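All of the new `*_ddp_ds` scripts pass `--deepspeed_config_path 'ds_config/zero2.json'`, but the JSON file itself is not part of this patch series. The snippet below therefore writes out a generic ZeRO stage-2 configuration of the kind DeepSpeed documents, with `auto` values left for the HF Trainer integration to fill in; the exact field values are assumptions for illustration, not the repository's file.

```python
# Generic ZeRO stage-2 config of the kind ds_config/zero2.json would contain.
# Field values are illustrative defaults, not the repository's actual file.
import json

zero2_config = {
    'train_micro_batch_size_per_gpu': 'auto',
    'gradient_accumulation_steps': 'auto',
    'gradient_clipping': 'auto',
    'bf16': {'enabled': 'auto'},
    'zero_optimization': {
        'stage': 2,                    # shard optimizer state and gradients across ranks
        'overlap_comm': True,
        'contiguous_gradients': True,
        'allgather_bucket_size': 2e8,
        'reduce_bucket_size': 2e8,
    },
}

with open('ds_config/zero2.json', 'w') as f:
    json.dump(zero2_config, f, indent=2)
```

Sharding the optimizer state and gradients is what lets the LoRA run fit on 2 * A10 (about 21GB per card for baichuan2-7b-chat), at the cost of the extra communication the updated Tips entry warns about.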
- 2023.9.5: 支持训练只保存模型权重, 而不保存断点续训所需的优化器权重等中间状态, 避免全参数微调保存checkpoint所需时间过长和空间过大的问题. 可以查看`sft.sh`中的命令行参数: `--only_save_model`. -- 2023.9.5: 支持openbuddy-llama2-70b模型. -- 2023.9.3: 支持baichuan-13b系列模型: baichuan-13b, baichuan-13b-chat. +- 2023.9.5: 支持openbuddy-llama2-70b模型. 对应的sh脚本可以查看`scripts/openbuddy_llama2_70b`. +- 2023.9.3: 支持baichuan2系列模型: baichuan2-7b, baichuan2-7b-chat, baichuan2-13b, baichuan2-13b-chat. 对应的sh脚本可以查看`scripts/baichuan2_7b`, `scripts/baichuan2_7b_chat`. ## 准备实验环境 @@ -90,11 +90,11 @@ pip install -r requirements.txt -U 训练显存: qlora(低,3090) > lora > full(2*A100) 提示: -- 你可以在训练时设置`--gradient_checkpointing true`来节约显存, 但这会略微降低训练速度. +- 你可以在训练时设置`--gradient_checkpointing true`来节约显存, 但这会略微降低训练速度. 如果你需要在消费级显卡中训练大模型, 这很有用, 例如: 3090. - 如果你想在训练时, 将权重push到ModelScope Hub中, 你需要设置`--push_to_hub true`. - 如何你想要在推理时, 合并LoRA权重并保存,你需要设置`--merge_lora_and_save true`. - 如果你想要使用量化, 你需要先安装bnb: `pip install bitsandbytes -U`. -- 如果你想要使用deepspeed, 你需要`pip install deepspeed -U`. +- 如果你想要使用deepspeed, 你需要`pip install deepspeed -U`. 使用deepspeed可以节约显存, 但可能会略微降低训练速度. - 如果你使用的是V100等较老的GPU, 你需要设置`--dtype fp16`, 因为其不支持bf16. - 如果你的机器是A100等高性能显卡, 且使用的是qwen系列模型, 推荐你安装[flash-attn](https://github.com/Dao-AILab/flash-attention), 这将会加快训练和推理的速度以及显存占用(A10, 3090, V100等显卡不支持flash-attn进行训练). - 以下提供了可以直接运行的`qwen_7b_chat`的sh脚本(你只需要在推理时指定`ckpt_dir`即可顺利执行). 更多模型的scripts脚本, 可以查看`scripts`文件夹. 如果你想要自定义sh脚本, 推荐你参考`scripts/qwen_7b_chat`中的脚本进行书写. diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index be49a5b233..3b4597daea 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -133,12 +133,8 @@ def get_model_tokenizer_chatglm2(model_dir: str, model_kwargs['quantization_config'].llm_int8_skip_modules = [ 'output_layer' ] - return get_model_tokenizer_from_repo( - model_dir, - torch_dtype, - load_model, - automodel_class=AutoModel, - **model_kwargs) + return get_model_tokenizer_from_repo(model_dir, torch_dtype, load_model, + **model_kwargs) def get_model_tokenizer_llama2(model_dir: str,
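A closing note on an argument the new `sft.sh` scripts share: they fix `--batch_size 1` and set `--gradient_accumulation_steps $(expr 16 / $nproc_per_node)`, which keeps the effective global batch at 16 samples per optimizer step regardless of how many data-parallel workers are launched. The check below is plain DDP accounting rather than project code:

```python
# Effective global batch size for the DDP sft.sh scripts:
# per-device batch * data-parallel workers * gradient accumulation steps.
def effective_batch_size(per_device_batch: int, nproc_per_node: int,
                         target_global_batch: int = 16) -> int:
    grad_accum = target_global_batch // nproc_per_node  # $(expr 16 / $nproc_per_node)
    return per_device_batch * nproc_per_node * grad_accum

# 2 x A10 with --batch_size 1 -> 8 accumulation steps -> global batch of 16
assert effective_batch_size(per_device_batch=1, nproc_per_node=2) == 16
# the same formula on a single GPU would simply use 16 accumulation steps
assert effective_batch_size(per_device_batch=1, nproc_per_node=1) == 16
```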