diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md
index 853339565a..4c513f6a28 100644
--- a/examples/pytorch/llm/README.md
+++ b/examples/pytorch/llm/README.md
@@ -27,7 +27,7 @@
8. other: polylm-13b, seqgpt-560m
3. supported features: quantization, DDP, model parallelism (device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, multi-round chat, ...
4. supported datasets:
- 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh
+ 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh, code-python-zh, advertise-gen
2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh
3. multi-modal: coco-en
4. other: cls-fudan-news-zh, ner-jave-zh
@@ -71,40 +71,40 @@ Training GPU memory: qlora(low,3090) > lora > full(2*A100)
git clone https://github.com/modelscope/swift.git
cd swift/examples/pytorch/llm
-# sft lora and infer qwen-7b-chat, Requires 27GB GPU memory.
+# sft(lora) and infer qwen-7b-chat. Requires 38GB GPU memory.
# You can save GPU memory by setting `--gradient_checkpointing true`, but this will slightly decrease the training speed.
# If you want to push weights into modelscope hub during training, you need to set '--push_to_hub true'.
# Recommended experimental environment: A100
bash scripts/qwen_7b_chat/lora/sft.sh
bash scripts/qwen_7b_chat/lora/infer.sh
-# sft(lora+ddp) and infer qwen-7b-chat, Requires 2*27GB GPU memory.
+# sft(lora+ddp) and infer qwen-7b-chat. Requires 2*38GB GPU memory.
# Recommended experimental environment: A100
bash scripts/qwen_7b_chat/lora_ddp/sft.sh
bash scripts/qwen_7b_chat/lora_ddp/infer.sh
-# sft(lora+mp+ddp) and infer qwen-7b-chat, Requires 4*14GB GPU memory.
+# sft(lora+mp+ddp) and infer qwen-7b-chat. Requires 4*15GB GPU memory.
# Recommended experimental environment: V100, A10, 3090
bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
-# sft(qlora) and infer qwen-7b-chat, Requires 13GB GPU memory.
+# sft(qlora) and infer qwen-7b-chat. Requires 12GB GPU memory.
# If you want to use quantization, you need to `pip install bitsandbytes -U`
# Recommended experimental environment: A10, 3090
bash scripts/qwen_7b_chat/qlora/sft.sh
bash scripts/qwen_7b_chat/qlora/infer.sh
-# sft(qlora+ddp) and infer qwen-7b-chat, Requires 2*13GB GPU memory.
+# sft(qlora+ddp) and infer qwen-7b-chat. Requires 2*14GB GPU memory.
# Recommended experimental environment: A10, 3090
bash scripts/qwen_7b_chat/qlora_ddp/sft.sh
bash scripts/qwen_7b_chat/qlora_ddp/infer.sh
-# sft(full+mp) and infer qwen-7b-chat, Requires 2*50GB GPU memory.
+# sft(full+mp) and infer qwen-7b-chat. Requires 2*75GB GPU memory.
# Recommended experimental environment: A100
bash scripts/qwen_7b_chat/full_mp/sft.sh
bash scripts/qwen_7b_chat/full_mp/infer.sh
-# sft(full+mp+ddp) and infer qwen-7b-chat, Requires 4*50GB GPU memory.
+# sft(full+mp+ddp) and infer qwen-7b-chat. Requires 4*75GB GPU memory.
# Recommended experimental environment: A100
bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh
bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh
diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
index 889368e6f9..d4478ad961 100644
--- a/examples/pytorch/llm/README_CN.md
+++ b/examples/pytorch/llm/README_CN.md
@@ -28,7 +28,7 @@
8. other: polylm-13b, seqgpt-560m
3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpointing, 梯度累加, 支持推送ModelScope Hub, 自定义数据集, 多模态和Agent SFT, 多轮对话, ...
4. 支持的数据集:
- 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh
+ 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh, code-python-zh, advertise-gen
2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh
3. 多模态: coco-en
4. 其他: cls-fudan-news-zh, ner-jave-zh
@@ -73,40 +73,40 @@ pip install .
git clone https://github.com/modelscope/swift.git
cd swift/examples/pytorch/llm
-# 微调(lora)+推理 qwen-7b-chat, 需要27GB显存.
+# 微调(lora)+推理 qwen-7b-chat, 需要38GB显存.
# 你可以通过设置`--gradient_checkpointing true`来节约显存, 但这会略微降低训练速度.
# 如果你想在训练时, 将权重push到modelscope hub中, 你需要设置`--push_to_hub true`.
# 推荐的实验环境: A100
bash scripts/qwen_7b_chat/lora/sft.sh
bash scripts/qwen_7b_chat/lora/infer.sh
-# 微调(lora+ddp)+推理 qwen-7b-chat, 需要2卡*27GB显存.
+# 微调(lora+ddp)+推理 qwen-7b-chat, 需要2卡*38GB显存.
# 推荐的实验环境: A100
bash scripts/qwen_7b_chat/lora_ddp/sft.sh
bash scripts/qwen_7b_chat/lora_ddp/infer.sh
-# 微调(lora+mp+ddp)+推理 qwen-7b-chat, 需要4卡*14GB显存.
+# 微调(lora+mp+ddp)+推理 qwen-7b-chat, 需要4卡*15GB显存.
# 推荐的实验环境: V100, 3090, A10
bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
-# 微调(qlora)+推理 qwen-7b-chat, 需要13GB显存.
+# 微调(qlora)+推理 qwen-7b-chat, 需要12GB显存.
# 如果你想要使用量化, 你需要`pip install bitsandbytes -U`
# 推荐的实验环境: 3090, A10
bash scripts/qwen_7b_chat/qlora/sft.sh
bash scripts/qwen_7b_chat/qlora/infer.sh
-# 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*13GB显存.
+# 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*14GB显存.
# 推荐的实验环境: 3090, A10
bash scripts/qwen_7b_chat/qlora_ddp/sft.sh
bash scripts/qwen_7b_chat/qlora_ddp/infer.sh
-# 微调(full+mp)+推理 qwen-7b-chat, 需要2卡*50G显存.
+# 微调(full+mp)+推理 qwen-7b-chat, 需要2卡*75G显存.
# 推荐的实验环境: A100
bash scripts/qwen_7b_chat/full_mp/sft.sh
bash scripts/qwen_7b_chat/full_mp/infer.sh
-# 微调(full+mp+ddp)+推理 qwen-7b-chat, 需要4卡*50G显存.
+# 微调(full+mp+ddp)+推理 qwen-7b-chat, 需要4卡*75G显存.
# 推荐的实验环境: A100
bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh
bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
index e62aa4b203..ce54c3ffaa 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
--template_type baichuan \
--dtype bf16 \
--ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \
- --eval_human true \
+ --eval_human false \
+ --dataset damo-agent-mini-zh \
+ --max_length 4096 \
--max_new_tokens 1024 \
--temperature 0.9 \
--top_k 50 \
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh
index ea219e0759..6ad53030f8 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh
@@ -1,4 +1,5 @@
# Experimental environment: 2 * A100
+# 2 * 44GB GPU memory
nproc_per_node=2
CUDA_VISIBLE_DEVICES=0,1 \
torchrun \
@@ -11,10 +12,10 @@ torchrun \
--dtype bf16 \
--output_dir runs \
--ddp_backend nccl \
- --dataset alpaca-en,alpaca-zh \
- --dataset_sample 20000 \
+ --dataset damo-agent-mini-zh \
+ --train_dataset_sample -1 \
--num_train_epochs 1 \
- --max_length 2048 \
+ --max_length 4096 \
--lora_rank 8 \
--lora_alpha 32 \
--lora_dropout_p 0. \
diff --git a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh
similarity index 53%
rename from examples/pytorch/llm/scripts/qwen_agent/lora_ddp/infer.sh
rename to examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh
index b6c221155d..93225dd116 100644
--- a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh
@@ -1,15 +1,15 @@
CUDA_VISIBLE_DEVICES=0 \
python src/llm_infer.py \
- --model_type qwen-7b-chat \
+ --model_type baichuan2-7b-chat \
--sft_type lora \
- --template_type chatml \
+ --template_type baichuan \
--dtype bf16 \
- --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
+ --ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \
--eval_human false \
- --dataset damo-agent-mini-zh \
- --dataset_sample -1 \
+ --dataset advertise-gen \
--max_length 2048 \
- --use_flash_attn true \
+ --quantization_bit 4 \
+ --bnb_4bit_comp_dtype bf16 \
--max_new_tokens 1024 \
--temperature 0.9 \
--top_k 50 \
diff --git a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/sft.sh
similarity index 55%
rename from examples/pytorch/llm/scripts/qwen_agent/lora_ddp/sft.sh
rename to examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/sft.sh
index 7f4c9c37bd..f78cddec1e 100644
--- a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/sft.sh
@@ -1,20 +1,18 @@
-# Experimental environment: 2 * A100
-nproc_per_node=2
-CUDA_VISIBLE_DEVICES=0,1 \
-torchrun \
- --nproc_per_node=$nproc_per_node \
- --master_port 29500 \
- src/llm_sft.py \
- --model_type qwen-7b-chat \
+# Experimental environment: 3090
+# 12GB GPU memory
+CUDA_VISIBLE_DEVICES=0 \
+python src/llm_sft.py \
+ --model_type baichuan2-7b-chat \
--sft_type lora \
- --template_type chatml \
+ --template_type baichuan \
--dtype bf16 \
--output_dir runs \
- --ddp_backend nccl \
- --dataset damo-agent-mini-zh \
- --dataset_sample -1 \
+ --dataset advertise-gen \
+ --train_dataset_sample -1 \
--num_train_epochs 1 \
--max_length 2048 \
+ --quantization_bit 4 \
+ --bnb_4bit_comp_dtype bf16 \
--lora_rank 8 \
--lora_alpha 32 \
--lora_dropout_p 0. \
@@ -23,15 +21,14 @@ torchrun \
--batch_size 1 \
--weight_decay 0. \
--learning_rate 1e-4 \
- --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
+ --gradient_accumulation_steps 16 \
--max_grad_norm 0.5 \
--warmup_ratio 0.03 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 10 \
- --use_flash_attn true \
--push_to_hub false \
- --hub_model_id qwen-7b-chat-qlora \
+ --hub_model_id baichuan2-7b-chat-qlora \
--hub_private_repo true \
--hub_token 'your-sdk-token' \
diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh
index 96aa910f23..61443d071c 100644
--- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
--template_type chatglm2 \
--dtype bf16 \
--ckpt_dir "runs/chatglm2-6b/vx_xxx/checkpoint-xxx" \
- --eval_human true \
+ --eval_human false \
+ --dataset code-python-zh \
+ --max_length 8192 \
--max_new_tokens 1024 \
--temperature 0.9 \
--top_k 50 \
diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh
index 7ec0bb88d9..06ae8c240a 100644
--- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh
@@ -1,3 +1,5 @@
+# Experimental environment: A100
+# 50GB GPU memory
nproc_per_node=2
CUDA_VISIBLE_DEVICES=0,1 \
torchrun \
@@ -10,13 +12,14 @@ torchrun \
--dtype bf16 \
--output_dir runs \
--ddp_backend nccl \
- --dataset alpaca-en,alpaca-zh \
- --dataset_sample -1 \
+ --dataset code-python-zh \
+ --train_dataset_sample -1 \
--num_train_epochs 1 \
- --max_length 2048 \
+ --max_length 8192 \
--lora_rank 8 \
--lora_alpha 32 \
--lora_dropout_p 0. \
+ --lora_target_modules ALL \
--gradient_checkpointing false \
--batch_size 1 \
--weight_decay 0. \
diff --git a/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh
index be9c767529..1d06d09d36 100644
--- a/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
--template_type internlm \
--dtype bf16 \
--ckpt_dir "runs/internlm-7b-chat/vx_xxx/checkpoint-xxx" \
- --eval_human true \
+ --eval_human false \
+ --dataset jd-zh \
+ --max_length 2048 \
--max_new_tokens 1024 \
--temperature 0.9 \
--top_k 50 \
diff --git a/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/sft.sh
index d0be14e4e9..6f2a8abea2 100644
--- a/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/sft.sh
@@ -10,8 +10,8 @@ torchrun \
--dtype bf16 \
--output_dir runs \
--ddp_backend nccl \
- --dataset alpaca-en,alpaca-zh \
- --dataset_sample 20000 \
+ --dataset jd-zh \
+ --train_dataset_sample -1 \
--num_train_epochs 1 \
--max_length 2048 \
--lora_rank 8 \
diff --git a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh
index 9a0e04499a..ef7554bdc4 100644
--- a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh
+++ b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh
@@ -1,11 +1,12 @@
# Experimental environment: 2 * 3090
+# llama2 is not good at Chinese; openbuddy-llama2 is recommended
CUDA_VISIBLE_DEVICES=0,1 \
python src/llm_sft.py \
--model_type llama2-70b-chat \
--sft_type lora \
--output_dir runs \
--dataset alpaca-en \
- --dataset_sample 20000 \
+ --train_dataset_sample 20000 \
--num_train_epochs 1 \
--max_length 2048 \
--quantization_bit 4 \
diff --git a/examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/sft.sh b/examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/sft.sh
index 6643d2870e..9efcc3ee67 100644
--- a/examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/sft.sh
+++ b/examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/sft.sh
@@ -7,7 +7,7 @@ python src/llm_sft.py \
--dtype bf16 \
--output_dir runs \
--dataset alpaca-en,alpaca-zh \
- --dataset_sample 20000 \
+ --train_dataset_sample 20000 \
--num_train_epochs 1 \
--max_length 2048 \
--quantization_bit 4 \
diff --git a/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp/sft.sh b/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp/sft.sh
index 6917fd2ae9..1081bbeee9 100644
--- a/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp/sft.sh
@@ -10,7 +10,7 @@ torchrun \
--ddp_backend nccl \
--dtype bf16 \
--dataset alpaca-en,alpaca-zh \
- --dataset_sample -1 \
+ --train_dataset_sample -1 \
--num_train_epochs 1 \
--max_length 2048 \
--quantization_bit 4 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh
index 605b6b886e..fdd1b03d6d 100644
--- a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh
@@ -7,7 +7,6 @@ python src/llm_infer.py \
--ckpt_dir "runs/qwen-7b/vx_xxx/checkpoint-xxx" \
--eval_human false \
--dataset dureader-robust-zh \
- --dataset_sample -1 \
--max_length 2048 \
--use_flash_attn true \
--max_new_tokens 1024 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh
index b210a40bcb..edcd980590 100644
--- a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh
@@ -11,7 +11,7 @@ torchrun \
--output_dir runs \
--ddp_backend nccl \
--dataset dureader-robust-zh \
- --dataset_sample -1 \
+ --train_dataset_sample -1 \
--num_train_epochs 1 \
--max_length 2048 \
--lora_rank 8 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh
index 9ef3c08124..5d280cf86f 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
--template_type chatml \
--dtype bf16 \
--ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
- --eval_human true \
+ --eval_human false \
+ --dataset damo-agent-zh \
+ --max_length 8192 \
--use_flash_attn true \
--max_new_tokens 1024 \
--temperature 0.9 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh
index 9ce0d348de..2950c6189e 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh
@@ -1,5 +1,5 @@
# Experimental environment: 2 * A100
-# 2 * 50GB GPU memory
+# 2 * 75GB GPU memory
CUDA_VISIBLE_DEVICES=0,1 \
python src/llm_sft.py \
--model_type qwen-7b-chat \
@@ -7,10 +7,10 @@ python src/llm_sft.py \
--template_type chatml \
--dtype bf16 \
--output_dir runs \
- --dataset alpaca-en,alpaca-zh \
- --dataset_sample -1 \
+ --dataset damo-agent-zh \
+ --train_dataset_sample 200000 \
--num_train_epochs 1 \
- --max_length 2048 \
+ --max_length 8192 \
--gradient_checkpointing false \
--batch_size 1 \
--weight_decay 0.01 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh
index 9ef3c08124..d02ca2471f 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
--template_type chatml \
--dtype bf16 \
--ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
- --eval_human true \
+ --eval_human false \
+ --dataset medical-en,medical-zh \
+ --max_length 8192 \
--use_flash_attn true \
--max_new_tokens 1024 \
--temperature 0.9 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh
index 1759bc8f2e..e44a960cc9 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh
@@ -1,5 +1,5 @@
# Experimental environment: 4 * A100
-# 4 * 50GB GPU memory
+# 4 * 75GB GPU memory
nproc_per_node=2
CUDA_VISIBLE_DEVICES=0,1,2,3 \
torchrun \
@@ -11,10 +11,10 @@ torchrun \
--template_type chatml \
--dtype bf16 \
--output_dir runs \
- --dataset alpaca-en,alpaca-zh \
- --dataset_sample -1 \
+ --dataset medical-en,medical-zh \
+ --train_dataset_sample 200000 \
--num_train_epochs 1 \
- --max_length 2048 \
+ --max_length 8192 \
--gradient_checkpointing false \
--batch_size 1 \
--weight_decay 0.01 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
index 445f8d0e7b..b7ab5137fc 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
--template_type chatml \
--dtype bf16 \
--ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
- --eval_human true \
+ --eval_human false \
+ --dataset cot-en,cot-zh \
+ --max_length 2048 \
--use_flash_attn true \
--max_new_tokens 1024 \
--temperature 0.9 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh
index 025f728cb1..3ec7695054 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh
@@ -1,3 +1,5 @@
+# Experimental environment: A100
+# 38GB GPU memory
CUDA_VISIBLE_DEVICES=0 \
python src/llm_sft.py \
--model_type qwen-7b-chat \
@@ -5,14 +7,14 @@ python src/llm_sft.py \
--template_type chatml \
--dtype bf16 \
--output_dir runs \
- --dataset alpaca-en,alpaca-zh \
- --dataset_sample -1 \
+ --dataset cot-en,cot-zh \
+ --train_dataset_sample 50000 \
--num_train_epochs 1 \
--max_length 2048 \
--lora_rank 8 \
--lora_alpha 32 \
--lora_dropout_p 0. \
- --lora_target_modules c_attn c_proj \
+ --lora_target_modules ALL \
--gradient_checkpointing false \
--batch_size 1 \
--weight_decay 0. \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh
index 27d3c0cbb3..88bfbd2635 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh
@@ -5,7 +5,8 @@ python src/llm_infer.py \
--template_type chatml \
--dtype bf16 \
--ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
- --eval_human true \
+ --eval_human false \
+ --dataset sharegpt-en,sharegpt-zh \
--max_length 2048 \
--use_flash_attn true \
--max_new_tokens 1024 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh
index fd92b9a941..42aac8e2b5 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh
@@ -1,6 +1,6 @@
# Experimental environment: 2 * A100
-# 2 * 27GB GPU memory
-# use_flash_attn=false: 2 * 31GB GPU memory
+# 2 * 38GB GPU memory
+# use_flash_attn=false: 2 * 70GB GPU memory
nproc_per_node=2
CUDA_VISIBLE_DEVICES=0,1 \
torchrun \
@@ -13,14 +13,14 @@ torchrun \
--dtype bf16 \
--output_dir runs \
--ddp_backend nccl \
- --dataset alpaca-en,alpaca-zh \
- --dataset_sample -1 \
+ --dataset sharegpt-en,sharegpt-zh \
+ --train_dataset_sample 50000 \
--num_train_epochs 1 \
--max_length 2048 \
--lora_rank 8 \
--lora_alpha 32 \
--lora_dropout_p 0. \
- --lora_target_modules c_attn c_proj \
+ --lora_target_modules ALL \
--gradient_checkpointing false \
--batch_size 1 \
--weight_decay 0. \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
index 152bd6b020..fedd587479 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
@@ -5,7 +5,8 @@ python src/llm_infer.py \
--template_type chatml \
--dtype fp16 \
--ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
- --eval_human true \
+ --eval_human false \
+ --dataset advertise-gen \
--max_length 2048 \
--use_flash_attn false \
--max_new_tokens 1024 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
index 8846e714d6..2de6d64704 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
@@ -1,5 +1,5 @@
# Experimental environment: 4 * V100(16GB)
-# 4 * 14GB GPU memory
+# 4 * 15GB GPU memory
nproc_per_node=2
CUDA_VISIBLE_DEVICES=0,1,2,3 \
torchrun \
@@ -12,8 +12,8 @@ torchrun \
--dtype fp16 \
--output_dir runs \
--ddp_backend nccl \
- --dataset alpaca-en,alpaca-zh \
- --dataset_sample -1 \
+ --dataset advertise-gen \
+ --train_dataset_sample -1 \
--num_train_epochs 1 \
--max_length 2048 \
--lora_rank 8 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh
index 66dd4f0fda..644c2d5553 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
--template_type chatml \
--dtype bf16 \
--ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
- --eval_human true \
+ --eval_human false \
+ --dataset advertise-gen \
+ --max_length 2048 \
--quantization_bit 4 \
--bnb_4bit_comp_dtype bf16 \
--use_flash_attn false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh
index f2fa03851c..6b4a99e61a 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh
@@ -1,3 +1,5 @@
+# Experimental environment: 3090
+# 12GB GPU memory
CUDA_VISIBLE_DEVICES=0 \
python src/llm_sft.py \
--model_type qwen-7b-chat \
@@ -5,8 +7,8 @@ python src/llm_sft.py \
--template_type chatml \
--dtype bf16 \
--output_dir runs \
- --dataset alpaca-en,alpaca-zh \
- --dataset_sample -1 \
+ --dataset advertise-gen \
+ --train_dataset_sample -1 \
--num_train_epochs 1 \
--max_length 2048 \
--quantization_bit 4 \
@@ -15,7 +17,7 @@ python src/llm_sft.py \
--lora_alpha 32 \
--lora_dropout_p 0. \
--lora_target_modules ALL \
- --gradient_checkpointing true \
+ --gradient_checkpointing false \
--batch_size 1 \
--weight_decay 0. \
--learning_rate 1e-4 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh
index 66dd4f0fda..644c2d5553 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
--template_type chatml \
--dtype bf16 \
--ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
- --eval_human true \
+ --eval_human false \
+ --dataset advertise-gen \
+ --max_length 2048 \
--quantization_bit 4 \
--bnb_4bit_comp_dtype bf16 \
--use_flash_attn false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh
index ab324f14e1..4f7433f8c3 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh
@@ -1,5 +1,5 @@
# Experimental environment: 2 * 3090
-# 2 * 13GB GPU memory
+# 2 * 14GB GPU memory
nproc_per_node=2
CUDA_VISIBLE_DEVICES=0,1 \
torchrun \
@@ -12,8 +12,8 @@ torchrun \
--dtype bf16 \
--output_dir runs \
--ddp_backend nccl \
- --dataset alpaca-en,alpaca-zh \
- --dataset_sample -1 \
+ --dataset advertise-gen \
+ --train_dataset_sample -1 \
--num_train_epochs 1 \
--max_length 2048 \
--quantization_bit 4 \
@@ -22,7 +22,7 @@ torchrun \
--lora_alpha 32 \
--lora_dropout_p 0. \
--lora_target_modules ALL \
- --gradient_checkpointing true \
+ --gradient_checkpointing false \
--batch_size 1 \
--weight_decay 0. \
--learning_rate 1e-4 \
diff --git a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh
index 670a7ba72f..aaf3592d74 100644
--- a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh
@@ -7,7 +7,6 @@ python src/llm_infer.py \
--ckpt_dir "runs/qwen-vl/vx_xxx/checkpoint-xxx" \
--eval_human false \
--dataset coco-en \
- --dataset_sample 20000 \
--max_length 2048 \
--use_flash_attn true \
--max_new_tokens 1024 \
diff --git a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/sft.sh
index 55a8d029ed..b3556d086d 100644
--- a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/sft.sh
@@ -11,7 +11,7 @@ torchrun \
--output_dir runs \
--ddp_backend nccl \
--dataset coco-en \
- --dataset_sample 20000 \
+ --train_dataset_sample 20000 \
--num_train_epochs 1 \
--max_length 2048 \
--lora_rank 8 \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/infer.sh
index 220c277536..c9536e1607 100644
--- a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/infer.sh
@@ -7,7 +7,6 @@ python src/llm_infer.py \
--ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \
--eval_human false \
--dataset coco-en \
- --dataset_sample 20000 \
--max_length 2048 \
--use_flash_attn true \
--max_new_tokens 1024 \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/sft.sh
index 1760efbe8a..11741a0e6e 100644
--- a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/sft.sh
@@ -11,7 +11,7 @@ torchrun \
--output_dir runs \
--ddp_backend nccl \
--dataset coco-en \
- --dataset_sample 20000 \
+ --train_dataset_sample 20000 \
--num_train_epochs 1 \
--max_length 2048 \
--lora_rank 8 \
diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh
index cb3e4b7062..7bc8e82d24 100644
--- a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh
+++ b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh
@@ -7,7 +7,6 @@ python src/llm_infer.py \
--ckpt_dir "runs/seqgpt-560m/vx_xxx/checkpoint-xxx" \
--eval_human false \
--dataset ner-jave-zh \
- --dataset_sample -1 \
--max_length 1024 \
--max_new_tokens 1024 \
--temperature 0.9 \
diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh
index 5d0ada5770..89878985f9 100644
--- a/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh
+++ b/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh
@@ -6,7 +6,7 @@ python src/llm_sft.py \
--dtype bf16 \
--output_dir runs \
--dataset ner-jave-zh \
- --dataset_sample -1 \
+ --train_dataset_sample -1 \
--num_train_epochs 3 \
--max_length 1024 \
--gradient_checkpointing false \
diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py
index 674b0b60c9..7f852f0056 100644
--- a/examples/pytorch/llm/src/llm_infer.py
+++ b/examples/pytorch/llm/src/llm_infer.py
@@ -8,7 +8,7 @@
from transformers import BitsAndBytesConfig, GenerationConfig, TextStreamer
from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING,
get_dataset, get_model_tokenizer, get_preprocess, inference,
- process_dataset, select_bnb, select_dtype, show_layers)
+ select_bnb, select_dtype, show_layers)
from swift import Swift, get_logger
from swift.utils import parse_args, print_model_info, seed_everything
@@ -26,7 +26,7 @@ class InferArguments:
template_type: str = field(
default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())})
ckpt_dir: str = '/path/to/your/vx_xxx/checkpoint-xxx'
- eval_human: bool = False # False: eval test_dataset
+ eval_human: bool = False # False: eval val_dataset
seed: int = 42
dtype: str = field(
@@ -36,9 +36,9 @@ class InferArguments:
dataset: str = field(
default='alpaca-en,alpaca-zh',
metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'})
- dataset_seed: int = 42
- dataset_sample: int = -1 # -1: all dataset
- dataset_test_size: float = 0.01
+ dataset_split_seed: int = 42
+ dataset_test_ratio: float = 0.01
+ show_dataset_sample: int = 20
system: str = 'you are a helpful assistant!'
max_length: Optional[int] = 2048
@@ -138,14 +138,12 @@ def llm_infer(args: InferArguments) -> None:
inference(input_ids, model, tokenizer, streamer, generation_config,
args.skip_prompt)
else:
- dataset = get_dataset(args.dataset.split(','))
- _, test_dataset = process_dataset(dataset, args.dataset_test_size,
- args.dataset_sample,
- args.dataset_seed)
- mini_test_dataset = test_dataset.select(
- range(min(10, test_dataset.shape[0])))
- del dataset
- for data in mini_test_dataset:
+ _, val_dataset = get_dataset(
+ args.dataset.split(','), args.dataset_test_ratio,
+ args.dataset_split_seed)
+ mini_val_dataset = val_dataset.select(
+ range(min(args.show_dataset_sample, val_dataset.shape[0])))
+ for data in mini_val_dataset:
response = data['response']
data['response'] = None
input_ids = preprocess_func(data)['input_ids']
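
Note on the change above: with `--eval_human false`, the patched `llm_infer.py` now takes its prompts from the validation split returned by the new `get_dataset(dataset_names, dataset_test_ratio, dataset_split_seed)` call and shows at most `show_dataset_sample` (default 20) examples. A minimal sketch of that selection step, using a toy `datasets.Dataset` as a stand-in for the real validation split:

# Toy stand-in for the val_dataset produced by get_dataset(); only the
# selection logic below mirrors the patched llm_infer.py.
from datasets import Dataset

val_dataset = Dataset.from_dict({
    'query': [f'q{i}' for i in range(100)],
    'response': [f'r{i}' for i in range(100)],
})
show_dataset_sample = 20  # new InferArguments default in this patch
mini_val_dataset = val_dataset.select(
    range(min(show_dataset_sample, val_dataset.shape[0])))
print(mini_val_dataset.shape[0])  # 20
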
diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py
index 0dbb8f2049..2214f1d6ff 100644
--- a/examples/pytorch/llm/src/llm_sft.py
+++ b/examples/pytorch/llm/src/llm_sft.py
@@ -7,15 +7,16 @@
from typing import List, Optional
import json
+import numpy as np
import torch
import torch.distributed as dist
from transformers import BitsAndBytesConfig
from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING,
- broadcast_string, check_json_format,
+ broadcast_string, check_json_format, dataset_map,
find_all_linear_for_lora, get_dataset, get_dist_setting,
get_model_tokenizer, get_preprocess, is_ddp_plus_mp,
- is_dist, is_master, plot_images, process_dataset,
- select_bnb, select_dtype, show_layers, sort_by_max_length)
+ is_dist, is_master, plot_images, select_bnb, select_dtype,
+ show_layers, sort_by_max_length)
from swift import (HubStrategy, LoraConfig, Seq2SeqTrainer,
Seq2SeqTrainingArguments, Swift, get_logger)
@@ -49,9 +50,9 @@ class SftArguments:
dataset: str = field(
default='alpaca-en,alpaca-zh',
metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'})
- dataset_seed: int = 42
- dataset_sample: int = -1 # -1: all dataset
- dataset_test_size: float = 0.01
+ dataset_split_seed: int = 42
+ train_dataset_sample: int = 20000 # -1: all dataset
+ dataset_test_ratio: float = 0.01
system: str = 'you are a helpful assistant!'
max_length: Optional[int] = 2048
@@ -233,21 +234,28 @@ def llm_sft(args: SftArguments) -> None:
show_layers(model)
print_model_info(model)
- logger.info(str(model))
+ logger.info(model)
# ### Loading Dataset
- dataset = get_dataset(args.dataset.split(','))
- train_dataset, val_dataset = process_dataset(dataset,
- args.dataset_test_size,
- args.dataset_sample,
- args.dataset_seed)
+ train_dataset, val_dataset = get_dataset(
+ args.dataset.split(','), args.dataset_test_ratio,
+ args.dataset_split_seed)
preprocess_func = get_preprocess(args.template_type, tokenizer,
args.system, args.max_length)
- train_dataset = train_dataset.map(preprocess_func)
- val_dataset = val_dataset.map(preprocess_func)
- del dataset
+ if args.train_dataset_sample >= 0:
+ val_dataset_sample = int(args.train_dataset_sample
+ * args.dataset_test_ratio)
+ train_idxs = np.random.permutation(args.train_dataset_sample)
+ train_dataset = train_dataset.select(train_idxs)
+ if val_dataset.shape[0] > val_dataset_sample:
+ val_idxs = np.random.permutation(val_dataset_sample)
+ val_dataset = val_dataset.select(val_idxs)
+ logger.info(f'train_dataset: {train_dataset}')
+ logger.info(f'val_dataset: {val_dataset}')
+ train_dataset = dataset_map(train_dataset, preprocess_func)
+ val_dataset = dataset_map(val_dataset, preprocess_func)
if args.test_oom_error:
- train_dataset = sort_by_max_length(train_dataset)
+ train_dataset = sort_by_max_length(train_dataset, 20000)
# Data analysis
stat_dataset(train_dataset)
stat_dataset(val_dataset)
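
For reference, the subsampling introduced above caps the training split at `train_dataset_sample` rows and the validation split at `train_dataset_sample * dataset_test_ratio` rows. A self-contained sketch of that arithmetic, with toy datasets standing in for the real ones (the sizes below are assumptions for illustration):

# Mirrors the new subsampling block in llm_sft.py on toy data.
import numpy as np
from datasets import Dataset

train_dataset = Dataset.from_dict({'query': [str(i) for i in range(30000)]})
val_dataset = Dataset.from_dict({'query': [str(i) for i in range(500)]})

train_dataset_sample = 20000  # new SftArguments default
dataset_test_ratio = 0.01

if train_dataset_sample >= 0:
    # 20000 * 0.01 -> keep at most 200 validation samples
    val_dataset_sample = int(train_dataset_sample * dataset_test_ratio)
    train_idxs = np.random.permutation(train_dataset_sample)
    train_dataset = train_dataset.select(train_idxs)
    if val_dataset.shape[0] > val_dataset_sample:
        val_idxs = np.random.permutation(val_dataset_sample)
        val_dataset = val_dataset.select(val_idxs)

print(train_dataset.shape[0], val_dataset.shape[0])  # 20000 200

Note that `np.random.permutation(n)` shuffles the indices 0..n-1, so the subsample is drawn from the first `train_dataset_sample` rows of the concatenated dataset rather than from the whole of it.
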
diff --git a/examples/pytorch/llm/src/utils/__init__.py b/examples/pytorch/llm/src/utils/__init__.py
index 3fb8d71254..10ace8ba3c 100644
--- a/examples/pytorch/llm/src/utils/__init__.py
+++ b/examples/pytorch/llm/src/utils/__init__.py
@@ -1,8 +1,8 @@
from .dataset import DATASET_MAPPING, get_dataset
from .model import MODEL_MAPPING, get_model_tokenizer
from .preprocess import TEMPLATE_MAPPING, get_preprocess
-from .utils import (broadcast_string, check_json_format, download_dataset,
- find_all_linear_for_lora, get_dist_setting, inference,
- is_ddp_plus_mp, is_dist, is_local_master, is_master,
- plot_images, process_dataset, select_bnb, select_dtype,
- show_layers, sort_by_max_length)
+from .utils import (broadcast_string, check_json_format, dataset_map,
+ download_dataset, find_all_linear_for_lora,
+ get_dist_setting, inference, is_ddp_plus_mp, is_dist,
+ is_local_master, is_master, plot_images, select_bnb,
+ select_dtype, show_layers, sort_by_max_length)
diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py
index 3324354cc3..6804a9dca4 100644
--- a/examples/pytorch/llm/src/utils/dataset.py
+++ b/examples/pytorch/llm/src/utils/dataset.py
@@ -3,7 +3,7 @@
import os
import re
from functools import partial
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Tuple
import json
import numpy as np
@@ -12,6 +12,7 @@
from modelscope import MsDataset
from tqdm.auto import tqdm
+from swift.utils import get_seed
from .preprocess import History
from .utils import download_dataset
@@ -19,20 +20,20 @@
def _preprocess_alpaca_dataset(
dataset: HfDataset,
preprocess_input: Optional[Callable[[str], str]] = None) -> HfDataset:
- instruction = dataset['instruction']
- input_ = dataset['input']
- new_instruction: List[str] = []
- for inst, inp in zip(instruction, input_):
+ query: List[str] = []
+ response = []
+ for d in dataset:
+ inst, inp, output = d['instruction'], d['input'], d['output']
+ if output is None:
+ continue
if inp is None:
inp = ''
if preprocess_input is not None:
inp = preprocess_input(inp)
- inst = f'{inst}\n{inp}'
- new_instruction.append(inst)
- dataset = HfDataset.from_dict({
- 'query': new_instruction,
- 'response': dataset['output']
- })
+ q = f'{inst}\n{inp}'
+ query.append(q)
+ response.append(output)
+ dataset = HfDataset.from_dict({'query': query, 'response': response})
return dataset
@@ -42,6 +43,29 @@ def get_alpaca_gpt4_en_dataset() -> HfDataset:
return _preprocess_alpaca_dataset(dataset)
+def _preprocess_advertise_gen_dataset(dataset: HfDataset) -> HfDataset:
+ prompt = """Task: Generating advertisements based on keywords.
+Keywords: {query}
+Advertisements: """
+ query = []
+ response = []
+ for d in tqdm(dataset):
+ query.append(prompt.format(query=d['content']))
+ response.append(d['summary'])
+ return HfDataset.from_dict({'query': query, 'response': response})
+
+
+def get_advertise_gen_dataset() -> Tuple[HfDataset, HfDataset]:
+ dataset_train: HfDataset = MsDataset.load(
+ 'lvjianjin/AdvertiseGen', split='train').to_hf_dataset()
+ dataset_val: HfDataset = MsDataset.load(
+ 'lvjianjin/AdvertiseGen', split='validation').to_hf_dataset()
+ return [
+ _preprocess_advertise_gen_dataset(dataset_train),
+ _preprocess_advertise_gen_dataset(dataset_val)
+ ]
+
+
def get_alpaca_gpt4_zh_dataset() -> HfDataset:
dataset: HfDataset = MsDataset.load(
'AI-ModelScope/alpaca-gpt4-data-zh', split='train').to_hf_dataset()
@@ -148,14 +172,14 @@ def _preprocess_mutimodal_dataset(dataset: HfDataset, prompt: str,
return dataset
-def get_coco_en_dataset() -> HfDataset:
+def get_coco_en_dataset() -> Tuple[HfDataset, HfDataset]:
dataset_dict = MsDataset.load('modelscope/coco_2014_caption')
- dataset: HfDataset = concatenate_datasets([
- dataset_dict['train'].to_hf_dataset(),
- dataset_dict['validation'].to_hf_dataset()
- ])
- return _preprocess_mutimodal_dataset(dataset, 'please describe the image',
- 'image', 'caption')
+ train_dataset = dataset_dict['train'].to_hf_dataset()
+ val_dataset = dataset_dict['validation'].to_hf_dataset()
+ return tuple(
+ _preprocess_mutimodal_dataset(dataset, 'please describe the image',
+ 'image', 'caption')
+ for dataset in (train_dataset, val_dataset))
def _filter_agent_dataset(dataset: List[Dict[str, Any]],
@@ -208,14 +232,17 @@ def _preprocess_agent_dataset(dataset: List[Dict[str, str]]) -> HfDataset:
return dataset
-def get_damo_agent_zh_dataset(use_mini: bool = False) -> HfDataset:
+def get_damo_agent_zh_dataset(
+ use_mini: bool = False) -> Tuple[HfDataset, HfDataset]:
dataset_dict = MsDataset.load('damo/MSAgent-Bench')
- dataset: HfDataset = concatenate_datasets([
- dataset_dict['train'].to_hf_dataset(),
- dataset_dict['validation'].to_hf_dataset()
- ])
- dataset = _filter_agent_dataset(dataset, use_mini)
- return _preprocess_agent_dataset(dataset)
+ train_dataset = dataset_dict['train'].to_hf_dataset()
+ val_dataset = dataset_dict['validation'].to_hf_dataset()
+ dataset_list = []
+ for dataset in (train_dataset, val_dataset):
+ dataset = _filter_agent_dataset(dataset, use_mini)
+ dataset = _preprocess_agent_dataset(dataset)
+ dataset_list.append(dataset)
+ return tuple(dataset_list)
_firefly_kind_list = [
@@ -261,24 +288,33 @@ def get_firefly_all_zh_dataset() -> HfDataset:
return get_firefly_zh_dataset(_firefly_kind_list)
-def get_poetry_zh_dataset() -> HfDataset:
+def get_poetry_zh_dataset() -> Tuple[HfDataset, HfDataset]:
dataset_dict = MsDataset.load('modelscope/chinese-poetry-collection')
- dataset: HfDataset = concatenate_datasets([
- dataset_dict['train'].to_hf_dataset(),
- dataset_dict['test'].to_hf_dataset()
- ])
- return HfDataset.from_dict({
- 'query': ['写诗'] * len(dataset),
- 'response': dataset['text1']
- })
+ train_dataset: HfDataset = dataset_dict['train'].to_hf_dataset()
+ val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset()
+ dataset_list = []
+ for dataset in (train_dataset, val_dataset):
+ dataset_list.append(
+ HfDataset.from_dict({
+ 'query': ['写诗'] * len(dataset),
+ 'response': dataset['text1']
+ }))
+ return tuple(dataset_list)
def get_instruct_en_dataset() -> HfDataset:
dataset: HfDataset = MsDataset.load(
'wyj123456/instruct', split='train').to_hf_dataset()
- dataset = dataset.rename_column('prompt', 'query')
- dataset = dataset.rename_column('completion', 'response')
- return dataset
+ query = []
+ response = []
+ for d in tqdm(dataset):
+ q = d['prompt']
+ r = d['completion']
+ if q is None:
+ continue
+ query.append(q)
+ response.append(r)
+ return HfDataset.from_dict({'query': query, 'response': response})
def get_gpt4all_en_dataset() -> HfDataset:
@@ -297,7 +333,7 @@ def _preprocess_cls_dataset(dataset: HfDataset, cls_mapping: List[str],
prompt = f"""Task: {task}
{input_}
Category: {category}
-Label: """
+Output: """
query = []
response = []
for d in tqdm(dataset):
@@ -314,30 +350,32 @@ def _preprocess_cls_dataset(dataset: HfDataset, cls_mapping: List[str],
return HfDataset.from_dict({'query': query, 'response': response})
-def get_cmnli_zh_dataset() -> HfDataset:
+def get_cmnli_zh_dataset() -> Tuple[HfDataset, HfDataset]:
"""Natural Language Inference"""
dataset_dict = MsDataset.load('clue', subset_name='cmnli')
- dataset: HfDataset = concatenate_datasets([
+ train_dataset: HfDataset = concatenate_datasets([
dataset_dict['train'].to_hf_dataset(),
dataset_dict['validation'].to_hf_dataset(),
- dataset_dict['test'].to_hf_dataset(),
])
+ val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset()
cls_mapping = ['neutral', 'entailment', 'contradiction']
- return _preprocess_cls_dataset(dataset, cls_mapping,
- 'Natural Language Inference', True)
+ return tuple(
+ _preprocess_cls_dataset(dataset, cls_mapping,
+ 'Natural Language Inference', True)
+ for dataset in (train_dataset, val_dataset))
-def get_jd_zh_dataset() -> HfDataset:
+def get_jd_zh_dataset() -> Tuple[HfDataset, HfDataset]:
"""Sentiment classification"""
dataset_dict = MsDataset.load('DAMO_NLP/jd')
- dataset: HfDataset = concatenate_datasets([
- dataset_dict['train'].to_hf_dataset(),
- dataset_dict['validation'].to_hf_dataset()
- ])
+ train_dataset: HfDataset = dataset_dict['train'].to_hf_dataset()
+ val_dataset: HfDataset = dataset_dict['validation'].to_hf_dataset()
cls_mapping = ['negative', 'positive']
- return _preprocess_cls_dataset(dataset, cls_mapping,
- 'Sentiment Classification', False)
+ return tuple(
+ _preprocess_cls_dataset(dataset, cls_mapping,
+ 'Sentiment Classification', False)
+ for dataset in (train_dataset, val_dataset))
def _preprocess_dureader_robust(dataset: HfDataset) -> HfDataset:
@@ -355,44 +393,56 @@ def _preprocess_dureader_robust(dataset: HfDataset) -> HfDataset:
return HfDataset.from_dict({'query': query, 'response': response})
-def get_dureader_robust_qg_zh_dataset() -> HfDataset:
+def get_dureader_robust_qg_zh_dataset() -> Tuple[HfDataset, HfDataset]:
"""Question Generation"""
dataset_dict = MsDataset.load('modelscope/DuReader_robust-QG')
- dataset: HfDataset = concatenate_datasets([
+ train_dataset: HfDataset = concatenate_datasets([
dataset_dict['train'].to_hf_dataset(),
dataset_dict['validation'].to_hf_dataset(),
- dataset_dict['test'].to_hf_dataset()
])
- return _preprocess_dureader_robust(dataset)
+ val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset()
+ return tuple(
+ _preprocess_dureader_robust(dataset)
+ for dataset in (train_dataset, val_dataset))
def _preprocess_medical(dataset: HfDataset, subset_name: str) -> HfDataset:
query = []
+ response = []
for d in tqdm(dataset):
+ r = d['output']
+ if r is None:
+ continue
if subset_name == 'zh':
q = d['instruction']
else:
q = d['input']
+ if q is None:
+ continue
query.append(q)
- return HfDataset.from_dict({'query': query, 'response': dataset['output']})
+ response.append(r)
+ return HfDataset.from_dict({'query': query, 'response': response})
-def get_medical_dataset(subset_name: str,
- dataset_sample: int = -1) -> HfDataset:
+def get_medical_dataset(
+ subset_name: str,
+ train_dataset_sample: int = -1) -> Tuple[HfDataset, HfDataset]:
"""
mode: Literal['en', zh]
"""
dataset_dict = MsDataset.load(
'huangjintao/medical_zh', subset_name=subset_name)
- dataset: HfDataset = concatenate_datasets([
+ train_dataset: HfDataset = concatenate_datasets([
dataset_dict['train'].to_hf_dataset(),
dataset_dict['val'].to_hf_dataset(),
- dataset_dict['test'].to_hf_dataset(),
])
- if dataset_sample != -1:
- idxs = np.random.permutation(dataset_sample)
- dataset = dataset.select(idxs)
- return _preprocess_medical(dataset, subset_name)
+ val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset()
+ if train_dataset_sample >= 0:
+ idxs = np.random.permutation(train_dataset_sample)
+ train_dataset = train_dataset.select(idxs)
+ return tuple(
+ _preprocess_medical(dataset, subset_name)
+ for dataset in (train_dataset, val_dataset))
def _preprocess_sharegpt(dataset: HfDataset) -> HfDataset:
@@ -458,6 +508,23 @@ def get_ner_jave_zh() -> HfDataset:
})
+def _preprocess_code_python_dataset(dataset: HfDataset) -> HfDataset:
+ query = []
+ response = []
+ for d in tqdm(dataset):
+ chat_rounds = ast.literal_eval(d['chat_rounds'])
+ assert len(chat_rounds) == 2
+ query.append(chat_rounds[-2]['content'])
+ response.append(chat_rounds[-1]['content'])
+ return HfDataset.from_dict({'query': query, 'response': response})
+
+
+def get_code_python_zh_dataset() -> HfDataset:
+ dataset = MsDataset.load(
+ 'codefuse-ai/CodeExercise-Python-27k').to_hf_dataset()
+ return _preprocess_code_python_dataset(dataset)
+
+
DATASET_MAPPING = {
# nlp chat
'alpaca-en':
@@ -491,7 +558,11 @@ def get_ner_jave_zh() -> HfDataset:
'medical-zh':
partial(get_medical_dataset, subset_name='zh'),
'medical-mini-zh':
- partial(get_medical_dataset, subset_name='zh', dataset_sample=100000),
+ partial(
+ get_medical_dataset, subset_name='zh', train_dataset_sample=100000),
+ 'code-python-zh':
+ get_code_python_zh_dataset,
+
# multi-round chat
'damo-agent-mini-zh':
partial(get_damo_agent_zh_dataset, use_mini=True),
@@ -501,13 +572,17 @@ def get_ner_jave_zh() -> HfDataset:
get_sharegpt_all_en_dataset,
'sharegpt-zh':
get_sharegpt_all_zh_dataset,
- # nlp text-generation (please use model:base, template:default-generation)
+
+ # nlp text-generation
'cmnli-zh':
get_cmnli_zh_dataset,
'jd-zh':
get_jd_zh_dataset,
'dureader-robust-zh':
get_dureader_robust_qg_zh_dataset,
+ 'advertise-gen':
+ get_advertise_gen_dataset,
+
# multi-modal chat
'coco-en':
get_coco_en_dataset,
@@ -520,10 +595,35 @@ def get_ner_jave_zh() -> HfDataset:
}
-def get_dataset(dataset_name_list: List[str]) -> HfDataset:
- dataset_list: List[HfDataset] = []
+def get_dataset(
+ dataset_name_list: List[str],
+ dataset_test_ratio: float = 0.,
+ dataset_split_seed: int = 42,
+) -> Tuple[HfDataset, Optional[HfDataset]]:
+ """Returns train_dataset and val_dataset"""
+ train_dataset_list: List[HfDataset] = []
+ val_dataset_list: List[HfDataset] = []
+ random_state = np.random.RandomState(dataset_split_seed)
for dataset_name in dataset_name_list:
get_function = DATASET_MAPPING[dataset_name]
- dataset_list.append(get_function())
- dataset = concatenate_datasets(dataset_list)
- return dataset
+ dataset = get_function()
+ if isinstance(dataset, (list, tuple)):
+ train_d = dataset[0]
+ val_d = dataset[1]
+ else:
+ if dataset_test_ratio > 0:
+ dataset_dict = dataset.train_test_split(
+ dataset_test_ratio, seed=get_seed(random_state))
+ train_d, val_d = dataset_dict['train'], dataset_dict['test']
+ else:
+ train_d, val_d = dataset, None
+ train_dataset_list.append(train_d)
+ if val_d is not None:
+ val_dataset_list.append(val_d)
+
+ train_dataset = concatenate_datasets(train_dataset_list)
+ val_dataset = None
+ if len(val_dataset_list) > 0:
+ val_dataset = concatenate_datasets(val_dataset_list)
+
+ return train_dataset, val_dataset
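
Hedged usage sketch of the reworked `get_dataset` contract: entries in `DATASET_MAPPING` may now return either a single `HfDataset` (which `get_dataset` splits with `train_test_split` when `dataset_test_ratio > 0`) or a `(train, val)` pair that already carries its own validation split (e.g. `advertise-gen`, `coco-en`, `damo-agent-zh`). The call below assumes the ModelScope datasets can be downloaded in your environment and that `examples/pytorch/llm/src` is on the import path:

from utils import get_dataset  # examples/pytorch/llm/src/utils

# 'alpaca-zh' is a single dataset, so get_dataset splits it itself;
# 'advertise-gen' already returns a (train, validation) pair.
train_dataset, val_dataset = get_dataset(
    ['alpaca-zh', 'advertise-gen'],
    dataset_test_ratio=0.01,
    dataset_split_seed=42)
print(train_dataset.shape[0], val_dataset.shape[0])
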
diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py
index 6b456fe54e..dca397e4f8 100644
--- a/examples/pytorch/llm/src/utils/model.py
+++ b/examples/pytorch/llm/src/utils/model.py
@@ -308,7 +308,7 @@ class LoRATM(NamedTuple):
},
'baichuan2-7b-chat': {
'model_id': 'baichuan-inc/Baichuan2-7B-Chat',
- 'revision': 'v1.0.0',
+ 'revision': 'v1.0.1',
'template': 'baichuan',
'lora_TM': LoRATM.baichuan,
},
diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py
index 4d26613570..f3a08a7ab5 100644
--- a/examples/pytorch/llm/src/utils/preprocess.py
+++ b/examples/pytorch/llm/src/utils/preprocess.py
@@ -8,51 +8,51 @@
TEMPLATE_MAPPING = {
'default': {
- 'prefix': ['{{system}}\n\n'],
- 'prompt': ['### Human:\n', '{{query}}\n\n', '### Assistant:\n'],
+ 'prefix': ['{{SYSTEM}}\n\n'],
+ 'prompt': ['### Human:\n', '{{QUERY}}\n\n', '### Assistant:\n'],
'chat_sep': ['\n\n'],
'suffix': [['eos_token_id']],
},
'default-generation': {
'prefix': [],
- 'prompt': ['{{query}}'],
+ 'prompt': ['{{QUERY}}'],
'suffix': [['eos_token_id']],
},
'chatml': {
- 'prefix': ['<|im_start|>system\n{{system}}<|im_end|>\n'],
+ 'prefix': ['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n'],
'prompt':
- ['<|im_start|>user\n{{query}}<|im_end|>\n<|im_start|>assistant\n'],
+ ['<|im_start|>user\n{{QUERY}}<|im_end|>\n<|im_start|>assistant\n'],
'chat_sep': ['<|im_end|>\n'],
'suffix': ['<|im_end|><|endoftext|>'],
},
'baichuan': {
'prefix': [],
- 'prompt': [[195], '{{query}}', [196]],
+ 'prompt': [[195], '{{QUERY}}', [196]],
'chat_sep': [],
'suffix': [['eos_token_id']],
},
'chatglm2': {
'prefix': [[64790, 64792]],
- 'prompt': ['[Round {{round}}]\n\n问:{{query}}\n\n答:'],
+ 'prompt': ['[Round {{ROUND}}]\n\n问:{{QUERY}}\n\n答:'],
'chat_sep': ['\n\n'],
'suffix': [['eos_token_id']],
},
'llama': {
'prefix': [['bos_token_id'],
- '[INST] <>\n{{system}}\n<>\n\n'],
- 'prompt': ['{{query}} [/INST] '],
+ '[INST] <>\n{{SYSTEM}}\n<>\n\n'],
+ 'prompt': ['{{QUERY}} [/INST] '],
'chat_sep': [' ', ['eos_token_id', 'bos_token_id'], '[INST] '],
'suffix': [['eos_token_id']],
},
'openbuddy-llama': {
- 'prefix': ['{{system}}\n\n'],
- 'prompt': ['User: {{query}}\nAssistant: '],
+ 'prefix': ['{{SYSTEM}}\n\n'],
+ 'prompt': ['User: {{QUERY}}\nAssistant: '],
'chat_sep': ['\n'],
'suffix': [['eos_token_id']],
},
'internlm': {
'prefix': [''],
- 'prompt': ['<|User|>:{{query}}\n<|Bot|>:'],
+ 'prompt': ['<|User|>:{{QUERY}}\n<|Bot|>:'],
'chat_sep': ['\n'],
'suffix': [''],
}
@@ -87,7 +87,7 @@ def concat_context_list(
for context in context_list:
if isinstance(context, str):
for (old_str,
- new_str) in zip(['{{system}}', '{{query}}', '{{round}}'],
+ new_str) in zip(['{{SYSTEM}}', '{{QUERY}}', '{{ROUND}}'],
[system, query, round]):
if new_str is not None and old_str in context:
placeholder_list.append(new_str)
@@ -108,7 +108,7 @@ def _encode(tokenizer: PreTrainedTokenizer, context_list: List[Context],
token = c
input_ids.append(token)
elif isinstance(context, str):
- for old_str in ['{{system}}', '{{query}}', '{{round}}']:
+ for old_str in ['{{SYSTEM}}', '{{QUERY}}', '{{ROUND}}']:
if old_str in context:
new_str = next(placeholder_it)
context = context.replace(old_str, new_str)
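
The rename above switches the template placeholders from lowercase to uppercase `{{SYSTEM}}`, `{{QUERY}}` and `{{ROUND}}`. A standalone toy illustration of how these placeholders are filled in (this re-implements only the string-substitution branch; it is not the library's `_encode`, which also handles token-id entries such as `['eos_token_id']`):

# Toy re-implementation of the placeholder substitution step.
def render(context_list, system, query, round_):
    values = {'{{SYSTEM}}': system, '{{QUERY}}': query, '{{ROUND}}': str(round_)}
    rendered = []
    for context in context_list:
        if isinstance(context, str):
            for old_str, new_str in values.items():
                if old_str in context:
                    context = context.replace(old_str, new_str)
        rendered.append(context)
    return rendered

chatml = ['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n',
          '<|im_start|>user\n{{QUERY}}<|im_end|>\n<|im_start|>assistant\n']
print(render(chatml, 'you are a helpful assistant!', '你好', 1))
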
diff --git a/examples/pytorch/llm/src/utils/utils.py b/examples/pytorch/llm/src/utils/utils.py
index 5f2a85e6e3..94c85f61bf 100644
--- a/examples/pytorch/llm/src/utils/utils.py
+++ b/examples/pytorch/llm/src/utils/utils.py
@@ -1,11 +1,13 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Part of the implementation is borrowed from huggingface/transformers.
+import heapq
import logging
import os
import shutil
from functools import wraps
from tempfile import TemporaryDirectory
-from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union
+from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence,
+ Tuple, Union)
import matplotlib.pyplot as plt
import numpy as np
@@ -27,7 +29,6 @@
from swift import get_logger
from swift.hub import ModelScopeConfig
-from swift.utils import get_seed
from swift.utils.tb_utils import (TB_COLOR, TB_COLOR_SMOOTH,
read_tensorboard_file, tensorboard_smoothing)
from .callback import DefaultFlowCallbackNew, ProgressCallbackNew
@@ -247,22 +248,10 @@ def download_files(url: str, local_path: str, cookies) -> None:
f.write(data)
-def process_dataset(dataset: HfDataset, dataset_test_size: float,
- dataset_sample: int,
- dataset_seed: int) -> Tuple[HfDataset, HfDataset]:
- random_state = np.random.RandomState(dataset_seed)
- if dataset_sample >= 0:
- index = random_state.permutation(len(dataset))[:dataset_sample]
- dataset = dataset.select(index)
- dataset = dataset.train_test_split(
- dataset_test_size, seed=get_seed(random_state))
- return dataset['train'], dataset['test']
-
-
-def sort_by_max_length(dataset: HfDataset) -> HfDataset:
- dataset_len = [len(d['input_ids']) for d in dataset]
- idx = sorted(
- range(len(dataset)), key=lambda i: dataset_len[i], reverse=True)
+def sort_by_max_length(dataset: HfDataset, num_dataset: int) -> HfDataset:
+ dataset_len = [len(d['input_ids']) for d in tqdm(dataset)]
+ idx = heapq.nlargest(
+ num_dataset, range(len(dataset_len)), key=lambda i: dataset_len[i])
input_ids = []
labels = []
for i in tqdm(idx):
@@ -373,6 +362,23 @@ def _infer_auto_device_map_patch(
return infer_auto_device_map(model, max_memory, verbose=verbose, **kwargs)
+def dataset_map(
+ dataset: HfDataset, preprocess_func: Callable[[Dict[str, Any]],
+ Dict[str,
+ Optional[List[int]]]]
+) -> HfDataset:
+ # faster than dataset.map
+ input_ids = []
+ labels = []
+ for d in tqdm(dataset):
+ d = preprocess_func(d)
+ if d['input_ids'] is None:
+ continue
+ input_ids.append(d['input_ids'])
+ labels.append(d['labels'])
+ return HfDataset.from_dict({'input_ids': input_ids, 'labels': labels})
+
+
logger_format = logging.Formatter('[%(levelname)s:%(name)s] %(message)s')
logger.handlers[0].setFormatter(logger_format)
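
For context, the reworked `sort_by_max_length` no longer sorts the whole dataset; `heapq.nlargest` keeps only the `num_dataset` longest tokenized samples, which `llm_sft.py` now calls with 20000 when `--test_oom_error` is set. A minimal sketch of the index selection on a toy dataset (the toy data is an assumption for illustration only):

# Sketch of the heapq.nlargest index selection used by sort_by_max_length.
import heapq

from datasets import Dataset

dataset = Dataset.from_dict({
    'input_ids': [[0] * n for n in (5, 50, 20, 500, 3)],
    'labels': [[0] * n for n in (5, 50, 20, 500, 3)],
})
dataset_len = [len(d['input_ids']) for d in dataset]
idx = heapq.nlargest(2, range(len(dataset_len)), key=lambda i: dataset_len[i])
print(idx)  # [3, 1] -> indices of the two longest samples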