modelscope · tastelikefeet · Sep 15, 2023 · Sep 14, 2023 · Sep 15, 2023 · Sep 15, 2023
diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md
@@ -27,7 +27,7 @@
    8. other: polylm-13b, seqgpt-560m
 3. supported features: quantization, DDP, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, multi-round chat, ...
 4. supported datasets:
-   1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh
+   1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh, code-python-zh, advertise-gen
    2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh
    3. multi-modal: coco-en
    4. other: cls-fudan-news-zh, ner-jave-zh
@@ -71,40 +71,40 @@ Training GPU memory: qlora(low,3090) > lora > full(2*A100)
 git clone https://github.com/modelscope/swift.git
 cd swift/examples/pytorch/llm
 
-# sft lora and infer qwen-7b-chat, Requires 27GB GPU memory.
+# sft lora and infer qwen-7b-chat, Requires 38GB GPU memory.
 # You can save GPU memory by setting `--gradient_checkpointing true`, but this will slightly decrease the training speed.
 # If you want to push weights into modelscope hub during training, you need to set '--push_to_hub true'.
 # Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/lora/sft.sh
 bash scripts/qwen_7b_chat/lora/infer.sh
 
-# sft(lora+ddp) and infer qwen-7b-chat, Requires 2*27GB GPU memory.
+# sft(lora+ddp) and infer qwen-7b-chat, Requires 2*38GB GPU memory.
 # Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/lora_ddp/sft.sh
 bash scripts/qwen_7b_chat/lora_ddp/infer.sh
 
-# sft(lora+mp+ddp) and infer qwen-7b-chat, Requires 4*14GB GPU memory.
+# sft(lora+mp+ddp) and infer qwen-7b-chat, Requires 4*15GB GPU memory.
 # Recommended experimental environment: V100, A10, 3090
 bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
 bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
 
-# sft(qlora) and infer qwen-7b-chat, Requires 13GB GPU memory.
+# sft(qlora) and infer qwen-7b-chat, Requires 12GB GPU memory.
 # If you want to use quantization, you need to `pip install bitsandbytes -U`
 # Recommended experimental environment: A10, 3090
 bash scripts/qwen_7b_chat/qlora/sft.sh
 bash scripts/qwen_7b_chat/qlora/infer.sh
 
-# sft(qlora+ddp) and infer qwen-7b-chat, Requires 2*13GB GPU memory.
+# sft(qlora+ddp) and infer qwen-7b-chat, Requires 2*14GB GPU memory.
 # Recommended experimental environment: A10, 3090
 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh
 bash scripts/qwen_7b_chat/qlora_ddp/infer.sh
 
-# sft(full+mp) and infer qwen-7b-chat, Requires 2*50GB GPU memory.
+# sft(full+mp) and infer qwen-7b-chat, Requires 2*75GB GPU memory.
 # Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/full_mp/sft.sh
 bash scripts/qwen_7b_chat/full_mp/infer.sh
 
-# sft(full+mp+ddp) and infer qwen-7b-chat, Requires 4*50GB GPU memory.
+# sft(full+mp+ddp) and infer qwen-7b-chat, Requires 4*75GB GPU memory.
 # Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh
 bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh

diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
@@ -28,7 +28,7 @@
    8. other: polylm-13b, seqgpt-560m
 3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpointing, 梯度累加, 支持推送ModelScope Hub, 自定义数据集, 多模态和Agent SFT, 多轮对话, ...
 4. 支持的数据集:
-   1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh
+   1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh, code-python-zh, advertise-gen
    2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh
    3. 多模态: coco-en
    4. 其他: cls-fudan-news-zh, ner-jave-zh
@@ -73,40 +73,40 @@ pip install .
 git clone https://github.com/modelscope/swift.git
 cd swift/examples/pytorch/llm
 
-# 微调(lora)+推理 qwen-7b-chat, 需要27GB显存.
+# 微调(lora)+推理 qwen-7b-chat, 需要38GB显存.
 # 你可以通过设置`--gradient_checkpointing true`来节约显存, 但这会略微降低训练速度.
 # 如果你想在训练时, 将权重push到modelscope hub中, 你需要设置`--push_to_hub true`.
 # 推荐的实验环境: A100
 bash scripts/qwen_7b_chat/lora/sft.sh
 bash scripts/qwen_7b_chat/lora/infer.sh
 
-# 微调(lora+ddp)+推理 qwen-7b-chat, 需要2卡*27GB显存.
+# 微调(lora+ddp)+推理 qwen-7b-chat, 需要2卡*38GB显存.
 # 推荐的实验环境: A100
 bash scripts/qwen_7b_chat/lora_ddp/sft.sh
 bash scripts/qwen_7b_chat/lora_ddp/infer.sh
 
-# 微调(lora+mp+ddp)+推理 qwen-7b-chat, 需要4卡*14GB显存.
+# 微调(lora+mp+ddp)+推理 qwen-7b-chat, 需要4卡*15GB显存.
 # 推荐的实验环境: V100, 3090, A10
 bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
 bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
 
-# 微调(qlora)+推理 qwen-7b-chat, 需要13GB显存.
+# 微调(qlora)+推理 qwen-7b-chat, 需要12GB显存.
 # 如果你想要使用量化, 你需要`pip install bitsandbytes -U`
 # 推荐的实验环境: 3090, A10
 bash scripts/qwen_7b_chat/qlora/sft.sh
 bash scripts/qwen_7b_chat/qlora/infer.sh
 
-# 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*13GB显存.
+# 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*14GB显存.
 # 推荐的实验环境: 3090, A10
 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh
 bash scripts/qwen_7b_chat/qlora_ddp/infer.sh
 
-# 微调(full+mp)+推理 qwen-7b-chat, 需要2卡*50G显存.
+# 微调(full+mp)+推理 qwen-7b-chat, 需要2卡*75GB显存.
 # 推荐的实验环境: A100
 bash scripts/qwen_7b_chat/full_mp/sft.sh
 bash scripts/qwen_7b_chat/full_mp/infer.sh
 
-# 微调(full+mp+ddp)+推理 qwen-7b-chat, 需要4卡*50G显存.
+# 微调(full+mp+ddp)+推理 qwen-7b-chat, 需要4卡*75GB显存.
 # 推荐的实验环境: A100
 bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh
 bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh

diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
     --template_type baichuan \
     --dtype bf16 \
     --ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \
-    --eval_human true \
+    --eval_human false \
+    --dataset damo-agent-mini-zh \
+    --max_length 4096 \
     --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \

diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh
@@ -1,4 +1,5 @@
 # Experimental environment: 2 * A100
+# 2 * 44GB GPU memory
 nproc_per_node=2
 CUDA_VISIBLE_DEVICES=0,1 \
 torchrun \
@@ -11,10 +12,10 @@ torchrun \
     --dtype bf16 \
     --output_dir runs \
     --ddp_backend nccl \
-    --dataset alpaca-en,alpaca-zh \
-    --dataset_sample 20000 \
+    --dataset damo-agent-mini-zh \
+    --train_dataset_sample -1 \
     --num_train_epochs 1 \
-    --max_length 2048 \
+    --max_length 4096 \
     --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0. \

diff --git a/.../llm/scripts/qwen_agent/lora_ddp/infer.sh → .../scripts/baichuan2_7b_chat/qlora/infer.sh b/.../llm/scripts/qwen_agent/lora_ddp/infer.sh → .../scripts/baichuan2_7b_chat/qlora/infer.sh
@@ -1,15 +1,15 @@
 CUDA_VISIBLE_DEVICES=0 \
 python src/llm_infer.py \
-    --model_type qwen-7b-chat \
+    --model_type baichuan2-7b-chat \
     --sft_type lora \
-    --template_type chatml \
+    --template_type baichuan \
     --dtype bf16 \
-    --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
+    --ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \
     --eval_human false \
-    --dataset damo-agent-mini-zh \
-    --dataset_sample -1 \
+    --dataset advertise-gen \
     --max_length 2048 \
-    --use_flash_attn true \
+    --quantization_bit 4 \
+    --bnb_4bit_comp_dtype bf16 \
     --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \

diff --git a/...ch/llm/scripts/qwen_agent/lora_ddp/sft.sh → ...lm/scripts/baichuan2_7b_chat/qlora/sft.sh b/...ch/llm/scripts/qwen_agent/lora_ddp/sft.sh → ...lm/scripts/baichuan2_7b_chat/qlora/sft.sh
@@ -1,20 +1,18 @@
-# Experimental environment: 2 * A100
-nproc_per_node=2
-CUDA_VISIBLE_DEVICES=0,1 \
-torchrun \
-    --nproc_per_node=$nproc_per_node \
-    --master_port 29500 \
-    src/llm_sft.py \
-    --model_type qwen-7b-chat \
+# Experimental environment: 3090
+# 12GB GPU memory
+CUDA_VISIBLE_DEVICES=0 \
+python src/llm_sft.py \
+    --model_type baichuan2-7b-chat \
     --sft_type lora \
-    --template_type chatml \
+    --template_type baichuan \
     --dtype bf16 \
     --output_dir runs \
-    --ddp_backend nccl \
-    --dataset damo-agent-mini-zh \
-    --dataset_sample -1 \
+    --dataset advertise-gen \
+    --train_dataset_sample -1 \
     --num_train_epochs 1 \
     --max_length 2048 \
+    --quantization_bit 4 \
+    --bnb_4bit_comp_dtype bf16 \
     --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0. \
@@ -23,15 +21,14 @@ torchrun \
     --batch_size 1 \
     --weight_decay 0. \
     --learning_rate 1e-4 \
-    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
+    --gradient_accumulation_steps 16 \
     --max_grad_norm 0.5 \
     --warmup_ratio 0.03 \
     --eval_steps 100 \
     --save_steps 100 \
     --save_total_limit 2 \
     --logging_steps 10 \
-    --use_flash_attn true \
     --push_to_hub false \
-    --hub_model_id qwen-7b-chat-qlora \
+    --hub_model_id baichuan2-7b-chat-qlora \
     --hub_private_repo true \
     --hub_token 'your-sdk-token' \
diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
     --template_type chatglm2 \
     --dtype bf16 \
     --ckpt_dir "runs/chatglm2-6b/vx_xxx/checkpoint-xxx" \
-    --eval_human true \
+    --eval_human false \
+    --dataset code-python-zh \
+    --max_length 8192 \
     --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \

diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh
@@ -1,3 +1,5 @@
+# Experimental environment: 2 * A100
+# 50GB GPU memory
 nproc_per_node=2
 CUDA_VISIBLE_DEVICES=0,1 \
 torchrun \
@@ -10,13 +12,14 @@ torchrun \
     --dtype bf16 \
     --output_dir runs \
     --ddp_backend nccl \
-    --dataset alpaca-en,alpaca-zh \
-    --dataset_sample -1 \
+    --dataset code-python-zh \
+    --train_dataset_sample -1 \
     --num_train_epochs 1 \
-    --max_length 2048 \
+    --max_length 8192 \
     --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0. \
+    --lora_target_modules ALL \
     --gradient_checkpointing false \
     --batch_size 1 \
     --weight_decay 0. \

diff --git a/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
     --template_type internlm \
     --dtype bf16 \
     --ckpt_dir "runs/internlm-7b-chat/vx_xxx/checkpoint-xxx" \
-    --eval_human true \
+    --eval_human false \
+    --dataset jd-zh \
+    --max_length 2048 \
     --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \

diff --git a/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/sft.sh
@@ -10,8 +10,8 @@ torchrun \
     --dtype bf16 \
     --output_dir runs \
     --ddp_backend nccl \
-    --dataset alpaca-en,alpaca-zh \
-    --dataset_sample 20000 \
+    --dataset jd-zh \
+    --train_dataset_sample -1 \
     --num_train_epochs 1 \
     --max_length 2048 \
     --lora_rank 8 \

diff --git a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh
@@ -1,11 +1,12 @@
 # Experimental environment: 2 * 3090
+# llama2 is not good at Chinese, openbuddy llama2 is recommended
 CUDA_VISIBLE_DEVICES=0,1 \
 python src/llm_sft.py \
     --model_type llama2-70b-chat \
     --sft_type lora \
     --output_dir runs \
     --dataset alpaca-en \
-    --dataset_sample 20000 \
+    --train_dataset_sample 20000 \
     --num_train_epochs 1 \
     --max_length 2048 \
     --quantization_bit 4 \

diff --git a/examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/sft.sh b/examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/sft.sh
@@ -7,7 +7,7 @@ python src/llm_sft.py \
     --dtype bf16 \
     --output_dir runs \
     --dataset alpaca-en,alpaca-zh \
-    --dataset_sample 20000 \
+    --train_dataset_sample 20000 \
     --num_train_epochs 1 \
     --max_length 2048 \
     --quantization_bit 4 \

diff --git a/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp/sft.sh b/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp/sft.sh
@@ -10,7 +10,7 @@ torchrun \
     --ddp_backend nccl \
     --dtype bf16 \
     --dataset alpaca-en,alpaca-zh \
-    --dataset_sample -1 \
+    --train_dataset_sample -1 \
     --num_train_epochs 1 \
     --max_length 2048 \
     --quantization_bit 4 \

diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh
@@ -7,7 +7,6 @@ python src/llm_infer.py \
     --ckpt_dir "runs/qwen-7b/vx_xxx/checkpoint-xxx" \
     --eval_human false \
     --dataset dureader-robust-zh \
-    --dataset_sample -1 \
     --max_length 2048 \
     --use_flash_attn true \
     --max_new_tokens 1024 \

diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh
@@ -11,7 +11,7 @@ torchrun \
     --output_dir runs \
     --ddp_backend nccl \
     --dataset dureader-robust-zh \
-    --dataset_sample -1 \
+    --train_dataset_sample -1 \
     --num_train_epochs 1 \
     --max_length 2048 \
     --lora_rank 8 \

diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
     --template_type chatml \
     --dtype bf16 \
     --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
-    --eval_human true \
+    --eval_human false \
+    --dataset damo-agent-zh \
+    --max_length 8192 \
     --use_flash_attn true \
     --max_new_tokens 1024 \
     --temperature 0.9 \

diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh
@@ -1,16 +1,16 @@
 # Experimental environment: 2 * A100
-# 2 * 50GB GPU memory
+# 2 * 75GB GPU memory
 CUDA_VISIBLE_DEVICES=0,1 \
 python src/llm_sft.py \
     --model_type qwen-7b-chat \
     --sft_type full \
     --template_type chatml \
     --dtype bf16 \
     --output_dir runs \
-    --dataset alpaca-en,alpaca-zh \
-    --dataset_sample -1 \
+    --dataset damo-agent-zh \
+    --train_dataset_sample 200000 \
     --num_train_epochs 1 \
-    --max_length 2048 \
+    --max_length 8192 \
     --gradient_checkpointing false \
     --batch_size 1 \
     --weight_decay 0.01 \

diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
     --template_type chatml \
     --dtype bf16 \
     --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
-    --eval_human true \
+    --eval_human false \
+    --dataset medical-en,medical-zh \
+    --max_length 8192 \
     --use_flash_attn true \
     --max_new_tokens 1024 \
     --temperature 0.9 \

diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh
@@ -1,5 +1,5 @@
 # Experimental environment: 4 * A100
-# 4 * 50GB GPU memory
+# 4 * 75GB GPU memory
 nproc_per_node=2
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
 torchrun \
@@ -11,10 +11,10 @@ torchrun \
     --template_type chatml \
     --dtype bf16 \
     --output_dir runs \
-    --dataset alpaca-en,alpaca-zh \
-    --dataset_sample -1 \
+    --dataset medical-en,medical-zh \
+    --train_dataset_sample 200000 \
     --num_train_epochs 1 \
-    --max_length 2048 \
+    --max_length 8192 \
     --gradient_checkpointing false \
     --batch_size 1 \
     --weight_decay 0.01 \

diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
@@ -5,7 +5,9 @@ python src/llm_infer.py \
     --template_type chatml \
     --dtype bf16 \
     --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
-    --eval_human true \
+    --eval_human false \
+    --dataset cot-en,cot-zh \
+    --max_length 2048 \
     --use_flash_attn true \
     --max_new_tokens 1024 \
     --temperature 0.9 \