From 54d5dc3d8a62a511998d879aa515c7d531818dc5 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 15 Sep 2023 06:36:23 +0800 Subject: [PATCH 1/5] update sh --- examples/pytorch/llm/README.md | 16 +- examples/pytorch/llm/README_CN.md | 16 +- .../baichuan2_7b_chat/lora_ddp/infer.sh | 5 +- .../scripts/baichuan2_7b_chat/lora_ddp/sft.sh | 7 +- .../scripts/baichuan2_7b_chat/qlora/infer.sh | 18 ++ .../scripts/baichuan2_7b_chat/qlora/sft.sh | 34 +++ .../lora_ddp => chatglm2_6b/lora}/infer.sh | 9 +- .../lora_ddp => chatglm2_6b/lora}/sft.sh | 25 +- .../llm/scripts/chatglm2_6b/lora_ddp/infer.sh | 5 +- .../llm/scripts/chatglm2_6b/lora_ddp/sft.sh | 7 +- .../llm/scripts/qwen_7b_chat/full_mp/infer.sh | 5 +- .../llm/scripts/qwen_7b_chat/full_mp/sft.sh | 8 +- .../scripts/qwen_7b_chat/full_mp_ddp/infer.sh | 5 +- .../scripts/qwen_7b_chat/full_mp_ddp/sft.sh | 8 +- .../llm/scripts/qwen_7b_chat/lora/infer.sh | 5 +- .../llm/scripts/qwen_7b_chat/lora/sft.sh | 8 +- .../scripts/qwen_7b_chat/lora_ddp/infer.sh | 4 +- .../llm/scripts/qwen_7b_chat/lora_ddp/sft.sh | 10 +- .../scripts/qwen_7b_chat/lora_mp_ddp/infer.sh | 4 +- .../scripts/qwen_7b_chat/lora_mp_ddp/sft.sh | 4 +- .../llm/scripts/qwen_7b_chat/qlora/infer.sh | 5 +- .../llm/scripts/qwen_7b_chat/qlora/sft.sh | 4 +- .../scripts/qwen_7b_chat/qlora_ddp/infer.sh | 5 +- .../llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh | 4 +- examples/pytorch/llm/src/llm_infer.py | 15 +- examples/pytorch/llm/src/llm_sft.py | 19 +- examples/pytorch/llm/src/utils/__init__.py | 10 +- examples/pytorch/llm/src/utils/dataset.py | 234 ++++++++++++------ examples/pytorch/llm/src/utils/model.py | 2 +- examples/pytorch/llm/src/utils/utils.py | 29 ++- 30 files changed, 360 insertions(+), 170 deletions(-) create mode 100644 examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh create mode 100644 examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/sft.sh rename examples/pytorch/llm/scripts/{qwen_agent/lora_ddp => chatglm2_6b/lora}/infer.sh (60%) rename 
examples/pytorch/llm/scripts/{qwen_agent/lora_ddp => chatglm2_6b/lora}/sft.sh (54%) diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index 853339565a..4c513f6a28 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -27,7 +27,7 @@ 8. other: polylm-13b, seqgpt-560m 3. supported features: quantization, DDP, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ... 4. supported datasets: - 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh + 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh, code-python-zh, advertise-gen 2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh 3. multi-modal: coco-en 4. other: cls-fudan-news-zh, ner-jave-zh @@ -71,40 +71,40 @@ Training GPU memory: qlora(low,3090) > lora > full(2*A100) git clone https://github.com/modelscope/swift.git cd swift/examples/pytorch/llm -# sft lora and infer qwen-7b-chat, Requires 27GB GPU memory. +# sft lora and infer qwen-7b-chat, Requires 38GB GPU memory. # You can save GPU memory by setting `--gradient_checkpointing true`, but this will slightly decrease the training speed. # If you want to push weights into modelscope hub during training, you need to set '--push_to_hub true'. 
# Recommended experimental environment: A100 bash scripts/qwen_7b_chat/lora/sft.sh bash scripts/qwen_7b_chat/lora/infer.sh -# sft(lora+ddp) and infer qwen-7b-chat, Requires 2*27GB GPU memory. +# sft(lora+ddp) and infer qwen-7b-chat, Requires 2*38GB GPU memory. # Recommended experimental environment: A100 bash scripts/qwen_7b_chat/lora_ddp/sft.sh bash scripts/qwen_7b_chat/lora_ddp/infer.sh -# sft(lora+mp+ddp) and infer qwen-7b-chat, Requires 4*14GB GPU memory. +# sft(lora+mp+ddp) and infer qwen-7b-chat, Requires 4*15GB GPU memory. # Recommended experimental environment: V100, A10, 3090 bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh -# sft(qlora) and infer qwen-7b-chat, Requires 13GB GPU memory. +# sft(qlora) and infer qwen-7b-chat, Requires 12GB GPU memory. # If you want to use quantification, you need to `pip install bitsandbytes -U` # Recommended experimental environment: A10, 3090 bash scripts/qwen_7b_chat/qlora/sft.sh bash scripts/qwen_7b_chat/qlora/infer.sh -# sft(qlora+ddp) and infer qwen-7b-chat, Requires 2*13GB GPU memory. +# sft(qlora+ddp) and infer qwen-7b-chat, Requires 2*14GB GPU memory. # Recommended experimental environment: A10, 3090 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh bash scripts/qwen_7b_chat/qlora_ddp/infer.sh -# sft(full+mp) and infer qwen-7b-chat, Requires 2*50GB GPU memory. +# sft(full+mp) and infer qwen-7b-chat, Requires 2*75GB GPU memory. # Recommended experimental environment: A100 bash scripts/qwen_7b_chat/full_mp/sft.sh bash scripts/qwen_7b_chat/full_mp/infer.sh -# sft(full+mp+ddp) and infer qwen-7b-chat, Requires 4*50GB GPU memory. +# sft(full+mp+ddp) and infer qwen-7b-chat, Requires 4*75GB GPU memory. 
# Recommended experimental environment: A100 bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index 889368e6f9..d4478ad961 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -28,7 +28,7 @@ 8. other: polylm-13b, seqgpt-560m 3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpointing, 梯度累加, 支持推送ModelScope Hub, 自定义数据集, 多模态和Agent SFT, 多轮对话, ... 4. 支持的数据集: - 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh + 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh, code-python-zh, advertise-gen 2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh 3. 多模态: coco-en 4. 其他: cls-fudan-news-zh, ner-jave-zh @@ -73,40 +73,40 @@ pip install . git clone https://github.com/modelscope/swift.git cd swift/examples/pytorch/llm -# 微调(lora)+推理 qwen-7b-chat, 需要27GB显存. +# 微调(lora)+推理 qwen-7b-chat, 需要38GB显存. # 你可以通过设置`--gradient_checkpointing true`来节约显存, 但这会略微降低训练速度. # 如果你想在训练时, 将权重push到modelscope hub中, 你需要设置`--push_to_hub true`. # 推荐的实验环境: A100 bash scripts/qwen_7b_chat/lora/sft.sh bash scripts/qwen_7b_chat/lora/infer.sh -# 微调(lora+ddp)+推理 qwen-7b-chat, 需要2卡*27GB显存. +# 微调(lora+ddp)+推理 qwen-7b-chat, 需要2卡*38GB显存. # 推荐的实验环境: A100 bash scripts/qwen_7b_chat/lora_ddp/sft.sh bash scripts/qwen_7b_chat/lora_ddp/infer.sh -# 微调(lora+mp+ddp)+推理 qwen-7b-chat, 需要4卡*14GB显存. +# 微调(lora+mp+ddp)+推理 qwen-7b-chat, 需要4卡*15GB显存. 
# 推荐的实验环境: V100, 3090, A10 bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh -# 微调(qlora)+推理 qwen-7b-chat, 需要13GB显存. +# 微调(qlora)+推理 qwen-7b-chat, 需要12GB显存. # 如果你想要使用量化, 你需要`pip install bitsandbytes -U` # 推荐的实验环境: 3090, A10 bash scripts/qwen_7b_chat/qlora/sft.sh bash scripts/qwen_7b_chat/qlora/infer.sh -# 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*13GB显存. +# 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*14GB显存. # 推荐的实验环境: 3090, A10 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh bash scripts/qwen_7b_chat/qlora_ddp/infer.sh -# 微调(full+mp)+推理 qwen-7b-chat, 需要2卡*50G显存. +# 微调(full+mp)+推理 qwen-7b-chat, 需要2卡*75GB显存. # 推荐的实验环境: A100 bash scripts/qwen_7b_chat/full_mp/sft.sh bash scripts/qwen_7b_chat/full_mp/infer.sh -# 微调(full+mp+ddp)+推理 qwen-7b-chat, 需要4卡*50G显存. +# 微调(full+mp+ddp)+推理 qwen-7b-chat, 需要4卡*75GB显存. # 推荐的实验环境: A100 bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh index e62aa4b203..ca53acdf99 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type baichuan \ --dtype bf16 \ --ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset damo-agent-mini-zh \ + --dataset_sample -1 \ + --max_length 4096 \ --max_new_tokens 1024 \ --temperature 0.9 \ --top_k 50 \ diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh index ea219e0759..c315d78850 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh @@ -1,4 +1,5 @@ # Experimental environment: 2 * A100 +# 2 * 44GB GPU memory nproc_per_node=2
CUDA_VISIBLE_DEVICES=0,1 \ torchrun \ @@ -11,10 +12,10 @@ torchrun \ --dtype bf16 \ --output_dir runs \ --ddp_backend nccl \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample 20000 \ + --dataset damo-agent-mini-zh \ + --dataset_sample -1 \ --num_train_epochs 1 \ - --max_length 2048 \ + --max_length 4096 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh new file mode 100644 index 0000000000..0fcbabe1c2 --- /dev/null +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh @@ -0,0 +1,18 @@ +CUDA_VISIBLE_DEVICES=0 \ +python src/llm_infer.py \ + --model_type baichuan2-7b-chat \ + --sft_type lora \ + --template_type baichuan \ + --dtype bf16 \ + --ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \ + --eval_human false \ + --dataset advertise-gen \ + --dataset_sample -1 \ + --max_length 2048 \ + --quantization_bit 4 \ + --bnb_4bit_comp_dtype bf16 \ + --max_new_tokens 1024 \ + --temperature 0.9 \ + --top_k 50 \ + --top_p 0.9 \ + --do_sample true \ diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/sft.sh new file mode 100644 index 0000000000..54e2dfd048 --- /dev/null +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/sft.sh @@ -0,0 +1,34 @@ +# Experimental environment: 3090 +# 12GB GPU memory +CUDA_VISIBLE_DEVICES=0 \ +python src/llm_sft.py \ + --model_type baichuan2-7b-chat \ + --sft_type lora \ + --template_type baichuan \ + --dtype bf16 \ + --output_dir runs \ + --dataset advertise-gen \ + --dataset_sample -1 \ + --num_train_epochs 1 \ + --max_length 2048 \ + --quantization_bit 4 \ + --bnb_4bit_comp_dtype bf16 \ + --lora_rank 8 \ + --lora_alpha 32 \ + --lora_dropout_p 0. \ + --lora_target_modules ALL \ + --gradient_checkpointing true \ + --batch_size 1 \ + --weight_decay 0. 
\ + --learning_rate 1e-4 \ + --gradient_accumulation_steps 16 \ + --max_grad_norm 0.5 \ + --warmup_ratio 0.03 \ + --eval_steps 100 \ + --save_steps 100 \ + --save_total_limit 2 \ + --logging_steps 10 \ + --push_to_hub false \ + --hub_model_id baichuan2-7b-chat-qlora \ + --hub_private_repo true \ + --hub_token 'your-sdk-token' \ diff --git a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora/infer.sh similarity index 60% rename from examples/pytorch/llm/scripts/qwen_agent/lora_ddp/infer.sh rename to examples/pytorch/llm/scripts/chatglm2_6b/lora/infer.sh index b6c221155d..eb69f44068 100644 --- a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora/infer.sh @@ -1,15 +1,14 @@ CUDA_VISIBLE_DEVICES=0 \ python src/llm_infer.py \ - --model_type qwen-7b-chat \ + --model_type chatglm2-6b \ --sft_type lora \ - --template_type chatml \ + --template_type chatglm2 \ --dtype bf16 \ - --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ + --ckpt_dir "runs/chatglm2-6b/vx_xxx/checkpoint-xxx" \ --eval_human false \ - --dataset damo-agent-mini-zh \ + --dataset advertise-gen \ --dataset_sample -1 \ --max_length 2048 \ - --use_flash_attn true \ --max_new_tokens 1024 \ --temperature 0.9 \ --top_k 50 \ diff --git a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora/sft.sh similarity index 54% rename from examples/pytorch/llm/scripts/qwen_agent/lora_ddp/sft.sh rename to examples/pytorch/llm/scripts/chatglm2_6b/lora/sft.sh index 7f4c9c37bd..f399ee9163 100644 --- a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora/sft.sh @@ -1,37 +1,32 @@ -# Experimental environment: 2 * A100 -nproc_per_node=2 -CUDA_VISIBLE_DEVICES=0,1 \ -torchrun \ - --nproc_per_node=$nproc_per_node \ - --master_port 29500 \ - src/llm_sft.py \ - --model_type qwen-7b-chat \ +# Experimental environment: 
V100(16GB) +# 14GB GPU memory +CUDA_VISIBLE_DEVICES=0 \ +python src/llm_sft.py \ + --model_type chatglm2-6b \ --sft_type lora \ - --template_type chatml \ + --template_type chatglm2 \ --dtype bf16 \ --output_dir runs \ - --ddp_backend nccl \ - --dataset damo-agent-mini-zh \ + --dataset advertise-gen \ --dataset_sample -1 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ - --lora_target_modules ALL \ + --lora_target_modules query_key_value \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. \ --learning_rate 1e-4 \ - --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ + --gradient_accumulation_steps 16 \ --max_grad_norm 0.5 \ --warmup_ratio 0.03 \ --eval_steps 100 \ --save_steps 100 \ --save_total_limit 2 \ --logging_steps 10 \ - --use_flash_attn true \ --push_to_hub false \ - --hub_model_id qwen-7b-chat-qlora \ + --hub_model_id chatglm2-6b-lora \ --hub_private_repo true \ --hub_token 'your-sdk-token' \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh index 96aa910f23..85d856ad36 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type chatglm2 \ --dtype bf16 \ --ckpt_dir "runs/chatglm2-6b/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset code-python-zh \ + --dataset_sample -1 \ + --max_length 8192 \ --max_new_tokens 1024 \ --temperature 0.9 \ --top_k 50 \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh index 7ec0bb88d9..b54fd9b766 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh @@ -1,3 +1,5 @@ +# Experimental environment: A100 +# 50GB GPU memory nproc_per_node=2 
CUDA_VISIBLE_DEVICES=0,1 \ torchrun \ @@ -10,13 +12,14 @@ torchrun \ --dtype bf16 \ --output_dir runs \ --ddp_backend nccl \ - --dataset alpaca-en,alpaca-zh \ + --dataset code-python-zh \ --dataset_sample -1 \ --num_train_epochs 1 \ - --max_length 2048 \ + --max_length 8192 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ + --lora_target_modules ALL \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh index 9ef3c08124..17e53a8c82 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset damo-agent-zh \ + --dataset_sample 200000 \ + --max_length 8192 \ --use_flash_attn true \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh index 9ce0d348de..2a961f7e72 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 2 * A100 -# 2 * 50GB GPU memory +# 2 * 75GB GPU memory CUDA_VISIBLE_DEVICES=0,1 \ python src/llm_sft.py \ --model_type qwen-7b-chat \ @@ -7,10 +7,10 @@ python src/llm_sft.py \ --template_type chatml \ --dtype bf16 \ --output_dir runs \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample -1 \ + --dataset damo-agent-zh \ + --dataset_sample 200000 \ --num_train_epochs 1 \ - --max_length 2048 \ + --max_length 8192 \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0.01 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh 
b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh index 9ef3c08124..f99464d035 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset medical-en,medical-zh \ + --dataset_sample 200000 \ + --max_length 8192 \ --use_flash_attn true \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh index 1759bc8f2e..de95dda252 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 4 * A100 -# 4 * 50GB GPU memory +# 4 * 75GB GPU memory nproc_per_node=2 CUDA_VISIBLE_DEVICES=0,1,2,3 \ torchrun \ @@ -11,10 +11,10 @@ torchrun \ --template_type chatml \ --dtype bf16 \ --output_dir runs \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample -1 \ + --dataset medical-en,medical-zh \ + --dataset_sample 200000 \ --num_train_epochs 1 \ - --max_length 2048 \ + --max_length 8192 \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0.01 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh index 445f8d0e7b..6382b5d34f 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset cot-en,cot-zh \ + --dataset_sample 50000 \ + --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ 
--temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh index 025f728cb1..0d1d205a1a 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh @@ -1,3 +1,5 @@ +# Experimental environment: A100 +# 38GB GPU memory CUDA_VISIBLE_DEVICES=0 \ python src/llm_sft.py \ --model_type qwen-7b-chat \ @@ -5,14 +7,14 @@ python src/llm_sft.py \ --template_type chatml \ --dtype bf16 \ --output_dir runs \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample -1 \ + --dataset cot-en,cot-zh \ + --dataset_sample 50000 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ - --lora_target_modules c_attn c_proj \ + --lora_target_modules ALL \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh index 27d3c0cbb3..8d5674bef4 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh @@ -5,7 +5,9 @@ python src/llm_infer.py \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset sharegpt-en,sharegpt-zh \ + --dataset_sample 50000 \ --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh index fd92b9a941..82f0838235 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh @@ -1,6 +1,6 @@ # Experimental environment: 2 * A100 -# 2 * 27GB GPU memory -# use_flash_attn=false: 2 * 31GB GPU memory +# 2 * 38GB GPU memory +# use_flash_attn=false: 2 * 70GB GPU 
memory nproc_per_node=2 CUDA_VISIBLE_DEVICES=0,1 \ torchrun \ @@ -13,14 +13,14 @@ torchrun \ --dtype bf16 \ --output_dir runs \ --ddp_backend nccl \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample -1 \ + --dataset sharegpt-en,sharegpt-zh \ + --dataset_sample 50000 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ - --lora_target_modules c_attn c_proj \ + --lora_target_modules ALL \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh index 152bd6b020..4b54b46255 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh @@ -5,7 +5,9 @@ python src/llm_infer.py \ --template_type chatml \ --dtype fp16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset advertise-gen \ + --dataset_sample -1 \ --max_length 2048 \ --use_flash_attn false \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh index 8846e714d6..791b1d2d0c 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 4 * V100(16GB) -# 4 * 14GB GPU memory +# 4 * 15GB GPU memory nproc_per_node=2 CUDA_VISIBLE_DEVICES=0,1,2,3 \ torchrun \ @@ -12,7 +12,7 @@ torchrun \ --dtype fp16 \ --output_dir runs \ --ddp_backend nccl \ - --dataset alpaca-en,alpaca-zh \ + --dataset advertise-gen \ --dataset_sample -1 \ --num_train_epochs 1 \ --max_length 2048 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh index 66dd4f0fda..e48e5d2d1e 100644 --- 
a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset advertise-gen \ + --dataset_sample -1 \ + --max_length 2048 \ --quantization_bit 4 \ --bnb_4bit_comp_dtype bf16 \ --use_flash_attn false \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh index f2fa03851c..4c28727224 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh @@ -1,3 +1,5 @@ +# Experimental environment: 3090 +# 12GB GPU memory CUDA_VISIBLE_DEVICES=0 \ python src/llm_sft.py \ --model_type qwen-7b-chat \ @@ -5,7 +7,7 @@ python src/llm_sft.py \ --template_type chatml \ --dtype bf16 \ --output_dir runs \ - --dataset alpaca-en,alpaca-zh \ + --dataset advertise-gen \ --dataset_sample -1 \ --num_train_epochs 1 \ --max_length 2048 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh index 66dd4f0fda..e48e5d2d1e 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset advertise-gen \ + --dataset_sample -1 \ + --max_length 2048 \ --quantization_bit 4 \ --bnb_4bit_comp_dtype bf16 \ --use_flash_attn false \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh index ab324f14e1..6ac4431d28 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh +++ 
b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 2 * 3090 -# 2 * 13GB GPU memory +# 2 * 14GB GPU memory nproc_per_node=2 CUDA_VISIBLE_DEVICES=0,1 \ torchrun \ @@ -12,7 +12,7 @@ torchrun \ --dtype bf16 \ --output_dir runs \ --ddp_backend nccl \ - --dataset alpaca-en,alpaca-zh \ + --dataset advertise-gen \ --dataset_sample -1 \ --num_train_epochs 1 \ --max_length 2048 \ diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index 674b0b60c9..275168f8fb 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -8,7 +8,7 @@ from transformers import BitsAndBytesConfig, GenerationConfig, TextStreamer from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, get_dataset, get_model_tokenizer, get_preprocess, inference, - process_dataset, select_bnb, select_dtype, show_layers) + select_bnb, select_dtype, show_layers) from swift import Swift, get_logger from swift.utils import parse_args, print_model_info, seed_everything @@ -138,14 +138,13 @@ def llm_infer(args: InferArguments) -> None: inference(input_ids, model, tokenizer, streamer, generation_config, args.skip_prompt) else: - dataset = get_dataset(args.dataset.split(',')) - _, test_dataset = process_dataset(dataset, args.dataset_test_size, - args.dataset_sample, - args.dataset_seed) - mini_test_dataset = test_dataset.select( - range(min(10, test_dataset.shape[0]))) + _, val_dataset = get_dataset( + args.dataset.split(','), args.dataset_test_ratio, + args.dataset_sample, args.dataset_seed) + mini_val_dataset = val_dataset.select( + range(min(10, val_dataset.shape[0]))) del dataset - for data in mini_test_dataset: + for data in mini_val_dataset: response = data['response'] data['response'] = None input_ids = preprocess_func(data)['input_ids'] diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 0dbb8f2049..a5a91f83fc 100644 --- 
a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -11,7 +11,7 @@ import torch.distributed as dist from transformers import BitsAndBytesConfig from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, - broadcast_string, check_json_format, + broadcast_string, check_json_format, dataset_map, find_all_linear_for_lora, get_dataset, get_dist_setting, get_model_tokenizer, get_preprocess, is_ddp_plus_mp, is_dist, is_master, plot_images, process_dataset, @@ -51,7 +51,7 @@ class SftArguments: metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) dataset_seed: int = 42 dataset_sample: int = -1 # -1: all dataset - dataset_test_size: float = 0.01 + dataset_test_ratio: float = 0.01 system: str = 'you are a helpful assistant!' max_length: Optional[int] = 2048 @@ -236,18 +236,15 @@ def llm_sft(args: SftArguments) -> None: logger.info(str(model)) # ### Loading Dataset - dataset = get_dataset(args.dataset.split(',')) - train_dataset, val_dataset = process_dataset(dataset, - args.dataset_test_size, - args.dataset_sample, - args.dataset_seed) + train_dataset, val_dataset = get_dataset( + args.dataset.split(','), args.dataset_test_ratio, args.dataset_sample, + args.dataset_seed) preprocess_func = get_preprocess(args.template_type, tokenizer, args.system, args.max_length) - train_dataset = train_dataset.map(preprocess_func) - val_dataset = val_dataset.map(preprocess_func) - del dataset + train_dataset = dataset_map(train_dataset, preprocess_func) + val_dataset = dataset_map(val_dataset, preprocess_func) if args.test_oom_error: - train_dataset = sort_by_max_length(train_dataset) + train_dataset = sort_by_max_length(train_dataset, 20000) # Data analysis stat_dataset(train_dataset) stat_dataset(val_dataset) diff --git a/examples/pytorch/llm/src/utils/__init__.py b/examples/pytorch/llm/src/utils/__init__.py index 3fb8d71254..cd330970d3 100644 --- a/examples/pytorch/llm/src/utils/__init__.py +++ 
b/examples/pytorch/llm/src/utils/__init__.py @@ -1,8 +1,8 @@ from .dataset import DATASET_MAPPING, get_dataset from .model import MODEL_MAPPING, get_model_tokenizer from .preprocess import TEMPLATE_MAPPING, get_preprocess -from .utils import (broadcast_string, check_json_format, download_dataset, - find_all_linear_for_lora, get_dist_setting, inference, - is_ddp_plus_mp, is_dist, is_local_master, is_master, - plot_images, process_dataset, select_bnb, select_dtype, - show_layers, sort_by_max_length) +from .utils import (broadcast_string, check_json_format, dataset_map, + download_dataset, find_all_linear_for_lora, + get_dist_setting, inference, is_ddp_plus_mp, is_dist, + is_local_master, is_master, plot_images, process_dataset, + select_bnb, select_dtype, show_layers, sort_by_max_length) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 3324354cc3..8dee614304 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -3,7 +3,7 @@ import os import re from functools import partial -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import json import numpy as np @@ -13,26 +13,26 @@ from tqdm.auto import tqdm from .preprocess import History -from .utils import download_dataset +from .utils import download_dataset, process_dataset def _preprocess_alpaca_dataset( dataset: HfDataset, preprocess_input: Optional[Callable[[str], str]] = None) -> HfDataset: - instruction = dataset['instruction'] - input_ = dataset['input'] - new_instruction: List[str] = [] - for inst, inp in zip(instruction, input_): + query: List[str] = [] + response = [] + for d in dataset: + inst, inp, output = d['instruction'], d['input'], d['output'] + if output is None: + continue if inp is None: inp = '' if preprocess_input is not None: inp = preprocess_input(inp) - inst = f'{inst}\n{inp}' - new_instruction.append(inst) - dataset 
= HfDataset.from_dict({ - 'query': new_instruction, - 'response': dataset['output'] - }) + q = f'{inst}\n{inp}' + query.append(q) + response.append(output) + dataset = HfDataset.from_dict({'query': query, 'response': response}) return dataset @@ -42,6 +42,29 @@ def get_alpaca_gpt4_en_dataset() -> HfDataset: return _preprocess_alpaca_dataset(dataset) +def _preprocess_advertise_gen_dataset(dataset: HfDataset) -> HfDataset: + prompt = """Task: Generating advertisements based on keywords. +Keywords: {query} +Advertisements: """ + query = [] + response = [] + for d in tqdm(dataset): + query.append(prompt.format(query=d['content'])) + response.append(d['summary']) + return HfDataset.from_dict({'query': query, 'response': response}) + + +def get_advertise_gen_dataset() -> Tuple[HfDataset, HfDataset]: + dataset_train: HfDataset = MsDataset.load( + 'lvjianjin/AdvertiseGen', split='train').to_hf_dataset() + dataset_val: HfDataset = MsDataset.load( + 'lvjianjin/AdvertiseGen', split='validation').to_hf_dataset() + return [ + _preprocess_advertise_gen_dataset(dataset_train), + _preprocess_advertise_gen_dataset(dataset_val) + ] + + def get_alpaca_gpt4_zh_dataset() -> HfDataset: dataset: HfDataset = MsDataset.load( 'AI-ModelScope/alpaca-gpt4-data-zh', split='train').to_hf_dataset() @@ -148,14 +171,14 @@ def _preprocess_mutimodal_dataset(dataset: HfDataset, prompt: str, return dataset -def get_coco_en_dataset() -> HfDataset: +def get_coco_en_dataset() -> Tuple[HfDataset, HfDataset]: dataset_dict = MsDataset.load('modelscope/coco_2014_caption') - dataset: HfDataset = concatenate_datasets([ - dataset_dict['train'].to_hf_dataset(), - dataset_dict['validation'].to_hf_dataset() - ]) - return _preprocess_mutimodal_dataset(dataset, 'please describe the image', - 'image', 'caption') + train_dataset = dataset_dict['train'].to_hf_dataset() + val_dataset = dataset_dict['validation'].to_hf_dataset() + return tuple( + _preprocess_mutimodal_dataset(dataset, 'please describe the image', + 
'image', 'caption') + for dataset in (train_dataset, val_dataset)) def _filter_agent_dataset(dataset: List[Dict[str, Any]], @@ -208,14 +231,17 @@ def _preprocess_agent_dataset(dataset: List[Dict[str, str]]) -> HfDataset: return dataset -def get_damo_agent_zh_dataset(use_mini: bool = False) -> HfDataset: +def get_damo_agent_zh_dataset( + use_mini: bool = False) -> Tuple[HfDataset, HfDataset]: dataset_dict = MsDataset.load('damo/MSAgent-Bench') - dataset: HfDataset = concatenate_datasets([ - dataset_dict['train'].to_hf_dataset(), - dataset_dict['validation'].to_hf_dataset() - ]) - dataset = _filter_agent_dataset(dataset, use_mini) - return _preprocess_agent_dataset(dataset) + train_dataset = dataset_dict['train'].to_hf_dataset() + val_dataset = dataset_dict['validation'].to_hf_dataset() + dataset_list = [] + for dataset in (train_dataset, val_dataset): + dataset = _filter_agent_dataset(dataset, use_mini) + dataset = _preprocess_agent_dataset(dataset) + dataset_list.append(dataset) + return tuple(dataset_list) _firefly_kind_list = [ @@ -261,24 +287,33 @@ def get_firefly_all_zh_dataset() -> HfDataset: return get_firefly_zh_dataset(_firefly_kind_list) -def get_poetry_zh_dataset() -> HfDataset: +def get_poetry_zh_dataset() -> Tuple[HfDataset, HfDataset]: dataset_dict = MsDataset.load('modelscope/chinese-poetry-collection') - dataset: HfDataset = concatenate_datasets([ - dataset_dict['train'].to_hf_dataset(), - dataset_dict['test'].to_hf_dataset() - ]) - return HfDataset.from_dict({ - 'query': ['写诗'] * len(dataset), - 'response': dataset['text1'] - }) + train_dataset: HfDataset = dataset_dict['train'].to_hf_dataset() + val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset() + dataset_list = [] + for dataset in (train_dataset, val_dataset): + dataset_list.append( + HfDataset.from_dict({ + 'query': ['写诗'] * len(dataset), + 'response': dataset['text1'] + })) + return tuple(dataset_list) def get_instruct_en_dataset() -> HfDataset: dataset: HfDataset = MsDataset.load( 
'wyj123456/instruct', split='train').to_hf_dataset() - dataset = dataset.rename_column('prompt', 'query') - dataset = dataset.rename_column('completion', 'response') - return dataset + query = [] + response = [] + for d in tqdm(dataset): + q = d['prompt'] + r = d['completion'] + if q is None: + continue + query.append(q) + response.append(r) + return HfDataset.from_dict({'query': query, 'response': response}) def get_gpt4all_en_dataset() -> HfDataset: @@ -314,30 +349,32 @@ def _preprocess_cls_dataset(dataset: HfDataset, cls_mapping: List[str], return HfDataset.from_dict({'query': query, 'response': response}) -def get_cmnli_zh_dataset() -> HfDataset: +def get_cmnli_zh_dataset() -> Tuple[HfDataset, HfDataset]: """Natural Language Inference""" dataset_dict = MsDataset.load('clue', subset_name='cmnli') - dataset: HfDataset = concatenate_datasets([ + train_dataset: HfDataset = concatenate_datasets([ dataset_dict['train'].to_hf_dataset(), dataset_dict['validation'].to_hf_dataset(), - dataset_dict['test'].to_hf_dataset(), ]) + val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset() cls_mapping = ['neutral', 'entailment', 'contradiction'] - return _preprocess_cls_dataset(dataset, cls_mapping, - 'Natural Language Inference', True) + return tuple( + _preprocess_cls_dataset(dataset, cls_mapping, + 'Natural Language Inference', True) + for dataset in (train_dataset, val_dataset)) -def get_jd_zh_dataset() -> HfDataset: +def get_jd_zh_dataset() -> Tuple[HfDataset, HfDataset]: """Sentiment classification""" dataset_dict = MsDataset.load('DAMO_NLP/jd') - dataset: HfDataset = concatenate_datasets([ - dataset_dict['train'].to_hf_dataset(), - dataset_dict['validation'].to_hf_dataset() - ]) + train_dataset: HfDataset = dataset_dict['train'].to_hf_dataset() + val_dataset: HfDataset = dataset_dict['validation'].to_hf_dataset() cls_mapping = ['negative', 'positive'] - return _preprocess_cls_dataset(dataset, cls_mapping, - 'Sentiment Classification', False) + return tuple( + 
_preprocess_cls_dataset(dataset, cls_mapping, + 'Sentiment Classification', False) + for dataset in (train_dataset, val_dataset)) def _preprocess_dureader_robust(dataset: HfDataset) -> HfDataset: @@ -355,44 +392,56 @@ def _preprocess_dureader_robust(dataset: HfDataset) -> HfDataset: return HfDataset.from_dict({'query': query, 'response': response}) -def get_dureader_robust_qg_zh_dataset() -> HfDataset: +def get_dureader_robust_qg_zh_dataset() -> Tuple[HfDataset, HfDataset]: """Question Generation""" dataset_dict = MsDataset.load('modelscope/DuReader_robust-QG') - dataset: HfDataset = concatenate_datasets([ + train_dataset: HfDataset = concatenate_datasets([ dataset_dict['train'].to_hf_dataset(), dataset_dict['validation'].to_hf_dataset(), - dataset_dict['test'].to_hf_dataset() ]) - return _preprocess_dureader_robust(dataset) + val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset() + return tuple( + _preprocess_dureader_robust(dataset) + for dataset in (train_dataset, val_dataset)) def _preprocess_medical(dataset: HfDataset, subset_name: str) -> HfDataset: query = [] + response = [] for d in tqdm(dataset): + r = d['output'] + if r is None: + continue if subset_name == 'zh': q = d['instruction'] else: q = d['input'] + if q is None: + continue query.append(q) - return HfDataset.from_dict({'query': query, 'response': dataset['output']}) + response.append(r) + return HfDataset.from_dict({'query': query, 'response': response}) -def get_medical_dataset(subset_name: str, - dataset_sample: int = -1) -> HfDataset: +def get_medical_dataset( + subset_name: str, + train_dataset_sample: int = -1) -> Tupe[HfDataset, HfDataset]: """ mode: Literal['en', zh] """ dataset_dict = MsDataset.load( 'huangjintao/medical_zh', subset_name=subset_name) - dataset: HfDataset = concatenate_datasets([ + train_dataset: HfDataset = concatenate_datasets([ dataset_dict['train'].to_hf_dataset(), dataset_dict['val'].to_hf_dataset(), - dataset_dict['test'].to_hf_dataset(), ]) - if dataset_sample 
!= -1: + val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset() + if train_dataset_sample != -1: idxs = np.random.permutation(dataset_sample) - dataset = dataset.select(idxs) - return _preprocess_medical(dataset, subset_name) + train_dataset = train_dataset.select(idxs) + return tuple( + _preprocess_medical(dataset, subset_name) + for dataset in (train_dataset, val_dataset)) def _preprocess_sharegpt(dataset: HfDataset) -> HfDataset: @@ -458,6 +507,23 @@ def get_ner_jave_zh() -> HfDataset: }) +def _preprocess_code_python_dataset(dataset: HfDataset) -> HfDataset: + query = [] + response = [] + for d in tqdm(dataset): + chat_rounds = ast.literal_eval(d['chat_rounds']) + assert len(chat_rounds) == 2 + query.append(chat_rounds[-2]['content']) + response.append(chat_rounds[-1]['content']) + return HfDataset.from_dict({'query': query, 'response': response}) + + +def get_code_python_zh_dataset() -> HfDataset: + dataset = MsDataset.load( + 'codefuse-ai/CodeExercise-Python-27k').to_hf_dataset() + return _preprocess_code_python_dataset(dataset) + + DATASET_MAPPING = { # nlp chat 'alpaca-en': @@ -491,7 +557,11 @@ def get_ner_jave_zh() -> HfDataset: 'medical-zh': partial(get_medical_dataset, subset_name='zh'), 'medical-mini-zh': - partial(get_medical_dataset, subset_name='zh', dataset_sample=100000), + partial( + get_medical_dataset, subset_name='zh', train_dataset_sample=100000), + 'code-python-zh': + get_code_python_zh_dataset, + # multi-round chat 'damo-agent-mini-zh': partial(get_damo_agent_zh_dataset, use_mini=True), @@ -501,13 +571,17 @@ def get_ner_jave_zh() -> HfDataset: get_sharegpt_all_en_dataset, 'sharegpt-zh': get_sharegpt_all_zh_dataset, - # nlp text-generation (please use model:base, template:default-generation) + + # nlp text-generation 'cmnli-zh': get_cmnli_zh_dataset, 'jd-zh': get_jd_zh_dataset, 'dureader-robust-zh': get_dureader_robust_qg_zh_dataset, + 'advertise-gen': + get_advertise_gen_dataset, + # multi-modal chat 'coco-en': get_coco_en_dataset, @@ 
-520,10 +594,32 @@ def get_ner_jave_zh() -> HfDataset: } -def get_dataset(dataset_name_list: List[str]) -> HfDataset: - dataset_list: List[HfDataset] = [] +def get_dataset( + dataset_name_list: List[str], + dataset_test_ratio: float = 0., + dataset_sample: int = -1, + dataset_seed: int = 42) -> Tuple[HfDataset, Optional[HfDataset]]: + """Returns train_dataset and val_dataset""" + train_dataset_list: List[HfDataset] = [] + val_dataset_list: List[HfDataset] = [] for dataset_name in dataset_name_list: get_function = DATASET_MAPPING[dataset_name] - dataset_list.append(get_function()) - dataset = concatenate_datasets(dataset_list) - return dataset + dataset = get_function() + if isinstance(dataset, (list, tuple)): + train_dataset = dataset[0] + val_dataset = dataset[1] + else: + if dataset_test_ratio > 0: + train_dataset, val_dataset = process_dataset( + dataset, dataset_test_ratio, dataset_sample, dataset_seed) + else: + train_dataset, val_dataset = dataset, None + train_dataset_list.append(train_dataset) + if val_dataset is not None: + val_dataset_list.append(val_dataset) + + train_dataset = concatenate_datasets(train_dataset_list) + val_dataset = None + if len(val_dataset_list) > 0: + val_dataset = concatenate_datasets(val_dataset_list) + return train_dataset, val_dataset diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index 6b456fe54e..dca397e4f8 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -308,7 +308,7 @@ class LoRATM(NamedTuple): }, 'baichuan2-7b-chat': { 'model_id': 'baichuan-inc/Baichuan2-7B-Chat', - 'revision': 'v1.0.0', + 'revision': 'v1.0.1', 'template': 'baichuan', 'lora_TM': LoRATM.baichuan, }, diff --git a/examples/pytorch/llm/src/utils/utils.py b/examples/pytorch/llm/src/utils/utils.py index 5f2a85e6e3..e6306d36f3 100644 --- a/examples/pytorch/llm/src/utils/utils.py +++ b/examples/pytorch/llm/src/utils/utils.py @@ -1,11 +1,13 @@ # Copyright (c) 
Alibaba, Inc. and its affiliates. # Part of the implementation is borrowed from huggingface/transformers. +import heapq import logging import os import shutil from functools import wraps from tempfile import TemporaryDirectory -from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union +from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence, + Tuple, Union) import matplotlib.pyplot as plt import numpy as np @@ -259,10 +261,10 @@ def process_dataset(dataset: HfDataset, dataset_test_size: float, return dataset['train'], dataset['test'] -def sort_by_max_length(dataset: HfDataset) -> HfDataset: - dataset_len = [len(d['input_ids']) for d in dataset] - idx = sorted( - range(len(dataset)), key=lambda i: dataset_len[i], reverse=True) +def sort_by_max_length(dataset: HfDataset, num_dataset: int) -> HfDataset: + dataset_len = [len(d['input_ids']) for d in tqdm(dataset)] + idx = heapq.nlargest( + num_dataset, range(len(dataset_len)), key=lambda i: dataset_len[i]) input_ids = [] labels = [] for i in tqdm(idx): @@ -373,6 +375,23 @@ def _infer_auto_device_map_patch( return infer_auto_device_map(model, max_memory, verbose=verbose, **kwargs) +def dataset_map( + dataset: HfDataset, preprocess_func: Callable[[Dict[str, Any]], + Dict[str, + Optional[List[int]]]] +) -> HfDataset: + # faster than dataset.map + input_ids = [] + labels = [] + for d in tqdm(dataset): + d = preprocess_func(d) + if d['input_ids'] is None: + continue + input_ids.append(d['input_ids']) + labels.append(d['labels']) + return HfDataset.from_dict({'input_ids': input_ids, 'labels': labels}) + + logger_format = logging.Formatter('[%(levelname)s:%(name)s] %(message)s') logger.handlers[0].setFormatter(logger_format) From 821281808fcd33feb74020b96738bf72985b9913 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 15 Sep 2023 08:40:36 +0800 Subject: [PATCH 2/5] update sh --- .../baichuan2_7b_chat/lora_ddp/infer.sh | 2 +- .../scripts/baichuan2_7b_chat/lora_ddp/sft.sh | 2 +- 
.../scripts/baichuan2_7b_chat/qlora/infer.sh | 2 +- .../scripts/baichuan2_7b_chat/qlora/sft.sh | 4 +- .../llm/scripts/chatglm2_6b/lora/infer.sh | 16 ------ .../llm/scripts/chatglm2_6b/lora/sft.sh | 32 ----------- .../llm/scripts/chatglm2_6b/lora_ddp/infer.sh | 2 +- .../llm/scripts/chatglm2_6b/lora_ddp/sft.sh | 2 +- .../internlm_7b_chat/lora_ddp/infer.sh | 5 +- .../scripts/internlm_7b_chat/lora_ddp/sft.sh | 4 +- .../llm/scripts/llama2_70b_chat/qlora/sft.sh | 3 +- .../scripts/openbuddy-llama2-70b/qlora/sft.sh | 2 +- .../llm/scripts/polylm_13b/qlora_ddp/sft.sh | 2 +- .../llm/scripts/qwen_7b/lora_ddp/infer.sh | 2 +- .../llm/scripts/qwen_7b/lora_ddp/sft.sh | 2 +- .../llm/scripts/qwen_7b_chat/full_mp/infer.sh | 2 +- .../llm/scripts/qwen_7b_chat/full_mp/sft.sh | 2 +- .../scripts/qwen_7b_chat/full_mp_ddp/infer.sh | 2 +- .../scripts/qwen_7b_chat/full_mp_ddp/sft.sh | 2 +- .../llm/scripts/qwen_7b_chat/lora/infer.sh | 2 +- .../llm/scripts/qwen_7b_chat/lora/sft.sh | 2 +- .../scripts/qwen_7b_chat/lora_ddp/infer.sh | 2 +- .../llm/scripts/qwen_7b_chat/lora_ddp/sft.sh | 2 +- .../scripts/qwen_7b_chat/lora_mp_ddp/infer.sh | 2 +- .../scripts/qwen_7b_chat/lora_mp_ddp/sft.sh | 2 +- .../llm/scripts/qwen_7b_chat/qlora/infer.sh | 2 +- .../llm/scripts/qwen_7b_chat/qlora/sft.sh | 4 +- .../scripts/qwen_7b_chat/qlora_ddp/infer.sh | 2 +- .../llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh | 4 +- .../llm/scripts/qwen_vl/lora_ddp/infer.sh | 2 +- .../llm/scripts/qwen_vl/lora_ddp/sft.sh | 2 +- .../scripts/qwen_vl_chat/lora_ddp/infer.sh | 2 +- .../llm/scripts/qwen_vl_chat/lora_ddp/sft.sh | 2 +- .../llm/scripts/seqgpt_560m/full/infer.sh | 2 +- .../llm/scripts/seqgpt_560m/full/sft.sh | 2 +- examples/pytorch/llm/src/llm_infer.py | 8 +-- examples/pytorch/llm/src/llm_sft.py | 11 ++-- examples/pytorch/llm/src/utils/__init__.py | 4 +- examples/pytorch/llm/src/utils/dataset.py | 55 ++++++++++++------- examples/pytorch/llm/src/utils/preprocess.py | 28 +++++----- examples/pytorch/llm/src/utils/utils.py | 12 ---- 41 
files changed, 101 insertions(+), 143 deletions(-) delete mode 100644 examples/pytorch/llm/scripts/chatglm2_6b/lora/infer.sh delete mode 100644 examples/pytorch/llm/scripts/chatglm2_6b/lora/sft.sh diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh index ca53acdf99..18857c080c 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset damo-agent-mini-zh \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --max_length 4096 \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh index c315d78850..6ad53030f8 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh @@ -13,7 +13,7 @@ torchrun \ --output_dir runs \ --ddp_backend nccl \ --dataset damo-agent-mini-zh \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --num_train_epochs 1 \ --max_length 4096 \ --lora_rank 8 \ diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh index 0fcbabe1c2..1dc288fd7c 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset advertise-gen \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --max_length 2048 \ --quantization_bit 4 \ --bnb_4bit_comp_dtype bf16 \ diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/sft.sh 
b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/sft.sh index 54e2dfd048..f78cddec1e 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/sft.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/sft.sh @@ -8,7 +8,7 @@ python src/llm_sft.py \ --dtype bf16 \ --output_dir runs \ --dataset advertise-gen \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --num_train_epochs 1 \ --max_length 2048 \ --quantization_bit 4 \ @@ -17,7 +17,7 @@ python src/llm_sft.py \ --lora_alpha 32 \ --lora_dropout_p 0. \ --lora_target_modules ALL \ - --gradient_checkpointing true \ + --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. \ --learning_rate 1e-4 \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora/infer.sh deleted file mode 100644 index eb69f44068..0000000000 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora/infer.sh +++ /dev/null @@ -1,16 +0,0 @@ -CUDA_VISIBLE_DEVICES=0 \ -python src/llm_infer.py \ - --model_type chatglm2-6b \ - --sft_type lora \ - --template_type chatglm2 \ - --dtype bf16 \ - --ckpt_dir "runs/chatglm2-6b/vx_xxx/checkpoint-xxx" \ - --eval_human false \ - --dataset advertise-gen \ - --dataset_sample -1 \ - --max_length 2048 \ - --max_new_tokens 1024 \ - --temperature 0.9 \ - --top_k 50 \ - --top_p 0.9 \ - --do_sample true \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora/sft.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora/sft.sh deleted file mode 100644 index f399ee9163..0000000000 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora/sft.sh +++ /dev/null @@ -1,32 +0,0 @@ -# Experimental environment: V100(16GB) -# 14GB GPU memory -CUDA_VISIBLE_DEVICES=0 \ -python src/llm_sft.py \ - --model_type chatglm2-6b \ - --sft_type lora \ - --template_type chatglm2 \ - --dtype bf16 \ - --output_dir runs \ - --dataset advertise-gen \ - --dataset_sample -1 \ - --num_train_epochs 1 \ - --max_length 2048 \ - --lora_rank 8 \ - --lora_alpha 32 \ - 
--lora_dropout_p 0. \ - --lora_target_modules query_key_value \ - --gradient_checkpointing false \ - --batch_size 1 \ - --weight_decay 0. \ - --learning_rate 1e-4 \ - --gradient_accumulation_steps 16 \ - --max_grad_norm 0.5 \ - --warmup_ratio 0.03 \ - --eval_steps 100 \ - --save_steps 100 \ - --save_total_limit 2 \ - --logging_steps 10 \ - --push_to_hub false \ - --hub_model_id chatglm2-6b-lora \ - --hub_private_repo true \ - --hub_token 'your-sdk-token' \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh index 85d856ad36..f95648b337 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/chatglm2-6b/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset code-python-zh \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --max_length 8192 \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh index b54fd9b766..06ae8c240a 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh @@ -13,7 +13,7 @@ torchrun \ --output_dir runs \ --ddp_backend nccl \ --dataset code-python-zh \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --num_train_epochs 1 \ --max_length 8192 \ --lora_rank 8 \ diff --git a/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh index be9c767529..4829026106 100644 --- a/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type internlm \ --dtype bf16 \ --ckpt_dir "runs/internlm-7b-chat/vx_xxx/checkpoint-xxx" \ - 
--eval_human true \ + --eval_human false \ + --dataset jd-zh \ + --train_dataset_sample -1 \ + --max_length 2048 \ --max_new_tokens 1024 \ --temperature 0.9 \ --top_k 50 \ diff --git a/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/sft.sh index d0be14e4e9..6f2a8abea2 100644 --- a/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/sft.sh @@ -10,8 +10,8 @@ torchrun \ --dtype bf16 \ --output_dir runs \ --ddp_backend nccl \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample 20000 \ + --dataset jd-zh \ + --train_dataset_sample -1 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ diff --git a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh index 9a0e04499a..e7400a43c7 100644 --- a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh +++ b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh @@ -1,11 +1,12 @@ # Experimental environment: 2 * 3090 +# not good at Chinese CUDA_VISIBLE_DEVICES=0,1 \ python src/llm_sft.py \ --model_type llama2-70b-chat \ --sft_type lora \ --output_dir runs \ --dataset alpaca-en \ - --dataset_sample 20000 \ + --train_dataset_sample 20000 \ --num_train_epochs 1 \ --max_length 2048 \ --quantization_bit 4 \ diff --git a/examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/sft.sh b/examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/sft.sh index 6643d2870e..9efcc3ee67 100644 --- a/examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/sft.sh +++ b/examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/sft.sh @@ -7,7 +7,7 @@ python src/llm_sft.py \ --dtype bf16 \ --output_dir runs \ --dataset alpaca-en,alpaca-zh \ - --dataset_sample 20000 \ + --train_dataset_sample 20000 \ --num_train_epochs 1 \ --max_length 2048 \ --quantization_bit 4 \ diff --git a/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp/sft.sh 
b/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp/sft.sh index 6917fd2ae9..1081bbeee9 100644 --- a/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp/sft.sh @@ -10,7 +10,7 @@ torchrun \ --ddp_backend nccl \ --dtype bf16 \ --dataset alpaca-en,alpaca-zh \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --num_train_epochs 1 \ --max_length 2048 \ --quantization_bit 4 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh index 605b6b886e..ca5ba377b1 100644 --- a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset dureader-robust-zh \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh index b210a40bcb..edcd980590 100644 --- a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh @@ -11,7 +11,7 @@ torchrun \ --output_dir runs \ --ddp_backend nccl \ --dataset dureader-robust-zh \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh index 17e53a8c82..2391406ef2 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset damo-agent-zh \ - --dataset_sample 200000 \ + --train_dataset_sample 200000 \ --max_length 8192 
\ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh index 2a961f7e72..2950c6189e 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh @@ -8,7 +8,7 @@ python src/llm_sft.py \ --dtype bf16 \ --output_dir runs \ --dataset damo-agent-zh \ - --dataset_sample 200000 \ + --train_dataset_sample 200000 \ --num_train_epochs 1 \ --max_length 8192 \ --gradient_checkpointing false \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh index f99464d035..1f5e6d7d08 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset medical-en,medical-zh \ - --dataset_sample 200000 \ + --train_dataset_sample 200000 \ --max_length 8192 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh index de95dda252..e44a960cc9 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh @@ -12,7 +12,7 @@ torchrun \ --dtype bf16 \ --output_dir runs \ --dataset medical-en,medical-zh \ - --dataset_sample 200000 \ + --train_dataset_sample 200000 \ --num_train_epochs 1 \ --max_length 8192 \ --gradient_checkpointing false \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh index 6382b5d34f..9b6c648f8f 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh +++ 
b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset cot-en,cot-zh \ - --dataset_sample 50000 \ + --train_dataset_sample 50000 \ --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh index 0d1d205a1a..3ec7695054 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh @@ -8,7 +8,7 @@ python src/llm_sft.py \ --dtype bf16 \ --output_dir runs \ --dataset cot-en,cot-zh \ - --dataset_sample 50000 \ + --train_dataset_sample 50000 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh index 8d5674bef4..0770665cc5 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset sharegpt-en,sharegpt-zh \ - --dataset_sample 50000 \ + --train_dataset_sample 50000 \ --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh index 82f0838235..42aac8e2b5 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh @@ -14,7 +14,7 @@ torchrun \ --output_dir runs \ --ddp_backend nccl \ --dataset sharegpt-en,sharegpt-zh \ - --dataset_sample 50000 \ + --train_dataset_sample 50000 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ diff --git 
a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh index 4b54b46255..99e7896e0b 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset advertise-gen \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --max_length 2048 \ --use_flash_attn false \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh index 791b1d2d0c..2de6d64704 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh @@ -13,7 +13,7 @@ torchrun \ --output_dir runs \ --ddp_backend nccl \ --dataset advertise-gen \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh index e48e5d2d1e..ae4bd09242 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset advertise-gen \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --max_length 2048 \ --quantization_bit 4 \ --bnb_4bit_comp_dtype bf16 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh index 4c28727224..6b4a99e61a 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh @@ -8,7 +8,7 @@ python src/llm_sft.py \ --dtype bf16 \ --output_dir 
runs \ --dataset advertise-gen \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --num_train_epochs 1 \ --max_length 2048 \ --quantization_bit 4 \ @@ -17,7 +17,7 @@ python src/llm_sft.py \ --lora_alpha 32 \ --lora_dropout_p 0. \ --lora_target_modules ALL \ - --gradient_checkpointing true \ + --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. \ --learning_rate 1e-4 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh index e48e5d2d1e..ae4bd09242 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset advertise-gen \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --max_length 2048 \ --quantization_bit 4 \ --bnb_4bit_comp_dtype bf16 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh index 6ac4431d28..4f7433f8c3 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/sft.sh @@ -13,7 +13,7 @@ torchrun \ --output_dir runs \ --ddp_backend nccl \ --dataset advertise-gen \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --num_train_epochs 1 \ --max_length 2048 \ --quantization_bit 4 \ @@ -22,7 +22,7 @@ torchrun \ --lora_alpha 32 \ --lora_dropout_p 0. \ --lora_target_modules ALL \ - --gradient_checkpointing true \ + --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. 
\ --learning_rate 1e-4 \ diff --git a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh index 670a7ba72f..572a6f1bd5 100644 --- a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-vl/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset coco-en \ - --dataset_sample 20000 \ + --train_dataset_sample 20000 \ --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/sft.sh index 55a8d029ed..b3556d086d 100644 --- a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/sft.sh @@ -11,7 +11,7 @@ torchrun \ --output_dir runs \ --ddp_backend nccl \ --dataset coco-en \ - --dataset_sample 20000 \ + --train_dataset_sample 20000 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/infer.sh index 220c277536..35e0a78a5d 100644 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset coco-en \ - --dataset_sample 20000 \ + --train_dataset_sample 20000 \ --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/sft.sh index 1760efbe8a..11741a0e6e 100644 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/sft.sh @@ -11,7 +11,7 @@ torchrun \ --output_dir runs \ --ddp_backend nccl \ --dataset 
coco-en \ - --dataset_sample 20000 \ + --train_dataset_sample 20000 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh index cb3e4b7062..5130210162 100644 --- a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh +++ b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh @@ -7,7 +7,7 @@ python src/llm_infer.py \ --ckpt_dir "runs/seqgpt-560m/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset ner-jave-zh \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --max_length 1024 \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh index 5d0ada5770..89878985f9 100644 --- a/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh +++ b/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh @@ -6,7 +6,7 @@ python src/llm_sft.py \ --dtype bf16 \ --output_dir runs \ --dataset ner-jave-zh \ - --dataset_sample -1 \ + --train_dataset_sample -1 \ --num_train_epochs 3 \ --max_length 1024 \ --gradient_checkpointing false \ diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index 275168f8fb..b5f39cac26 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -37,8 +37,9 @@ class InferArguments: default='alpaca-en,alpaca-zh', metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) dataset_seed: int = 42 - dataset_sample: int = -1 # -1: all dataset - dataset_test_size: float = 0.01 + train_dataset_sample: int = -1 # -1: all dataset + test_dataset_sample: int = -1 + dataset_test_ratio: float = 0.01 system: str = 'you are a helpful assistant!' 
max_length: Optional[int] = 2048 @@ -140,10 +141,9 @@ def llm_infer(args: InferArguments) -> None: else: _, val_dataset = get_dataset( args.dataset.split(','), args.dataset_test_ratio, - args.dataset_sample, args.dataset_seed) + args.dataset_seed, args.train_dataset_sample) mini_val_dataset = val_dataset.select( range(min(10, val_dataset.shape[0]))) - del dataset for data in mini_val_dataset: response = data['response'] data['response'] = None diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index a5a91f83fc..0811179e58 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -14,8 +14,8 @@ broadcast_string, check_json_format, dataset_map, find_all_linear_for_lora, get_dataset, get_dist_setting, get_model_tokenizer, get_preprocess, is_ddp_plus_mp, - is_dist, is_master, plot_images, process_dataset, - select_bnb, select_dtype, show_layers, sort_by_max_length) + is_dist, is_master, plot_images, select_bnb, select_dtype, + show_layers, sort_by_max_length) from swift import (HubStrategy, LoraConfig, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) @@ -50,7 +50,8 @@ class SftArguments: default='alpaca-en,alpaca-zh', metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) dataset_seed: int = 42 - dataset_sample: int = -1 # -1: all dataset + train_dataset_sample: int = -1 # -1: all dataset + test_dataset_sample: int = -1 dataset_test_ratio: float = 0.01 system: str = 'you are a helpful assistant!' 
max_length: Optional[int] = 2048 @@ -237,8 +238,8 @@ def llm_sft(args: SftArguments) -> None: # ### Loading Dataset train_dataset, val_dataset = get_dataset( - args.dataset.split(','), args.dataset_test_ratio, args.dataset_sample, - args.dataset_seed) + args.dataset.split(','), args.dataset_test_ratio, args.dataset_seed, + args.train_dataset_sample) preprocess_func = get_preprocess(args.template_type, tokenizer, args.system, args.max_length) train_dataset = dataset_map(train_dataset, preprocess_func) diff --git a/examples/pytorch/llm/src/utils/__init__.py b/examples/pytorch/llm/src/utils/__init__.py index cd330970d3..10ace8ba3c 100644 --- a/examples/pytorch/llm/src/utils/__init__.py +++ b/examples/pytorch/llm/src/utils/__init__.py @@ -4,5 +4,5 @@ from .utils import (broadcast_string, check_json_format, dataset_map, download_dataset, find_all_linear_for_lora, get_dist_setting, inference, is_ddp_plus_mp, is_dist, - is_local_master, is_master, plot_images, process_dataset, - select_bnb, select_dtype, show_layers, sort_by_max_length) + is_local_master, is_master, plot_images, select_bnb, + select_dtype, show_layers, sort_by_max_length) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 8dee614304..9ab23e7bf1 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -3,7 +3,7 @@ import os import re from functools import partial -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple import json import numpy as np @@ -13,7 +13,7 @@ from tqdm.auto import tqdm from .preprocess import History -from .utils import download_dataset, process_dataset +from .utils import download_dataset def _preprocess_alpaca_dataset( @@ -332,7 +332,7 @@ def _preprocess_cls_dataset(dataset: HfDataset, cls_mapping: List[str], prompt = f"""Task: {task} {input_} Category: {category} -Label: """ +Output: """ query = [] 
response = [] for d in tqdm(dataset): @@ -425,7 +425,7 @@ def _preprocess_medical(dataset: HfDataset, subset_name: str) -> HfDataset: def get_medical_dataset( subset_name: str, - train_dataset_sample: int = -1) -> Tupe[HfDataset, HfDataset]: + train_dataset_sample: int = -1) -> Tuple[HfDataset, HfDataset]: """ mode: Literal['en', zh] """ @@ -437,7 +437,7 @@ def get_medical_dataset( ]) val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset() if train_dataset_sample != -1: - idxs = np.random.permutation(dataset_sample) + idxs = np.random.permutation(train_dataset_sample) train_dataset = train_dataset.select(idxs) return tuple( _preprocess_medical(dataset, subset_name) @@ -597,29 +597,42 @@ def get_code_python_zh_dataset() -> HfDataset: def get_dataset( dataset_name_list: List[str], dataset_test_ratio: float = 0., - dataset_sample: int = -1, - dataset_seed: int = 42) -> Tuple[HfDataset, Optional[HfDataset]]: + dataset_seed: int = 42, + train_dataset_sample: int = -1, + test_dataset_sample: int = -1 +) -> Tuple[HfDataset, Optional[HfDataset]]: """Returns train_dataset and val_dataset""" train_dataset_list: List[HfDataset] = [] val_dataset_list: List[HfDataset] = [] + random_state = np.random.RandomState(dataset_seed) for dataset_name in dataset_name_list: get_function = DATASET_MAPPING[dataset_name] dataset = get_function() if isinstance(dataset, (list, tuple)): - train_dataset = dataset[0] - val_dataset = dataset[1] + train_d = dataset[0] + val_d = dataset[1] else: if dataset_test_ratio > 0: - train_dataset, val_dataset = process_dataset( - dataset, dataset_test_ratio, dataset_sample, dataset_seed) + train_d, val_d = dataset.train_test_split( + dataset_test_size, seed=get_seed(random_state)) else: - train_dataset, val_dataset = dataset, None - train_dataset_list.append(train_dataset) - if val_dataset is not None: - val_dataset_list.append(val_dataset) - - train_dataset = concatenate_datasets(train_dataset_list) - val_dataset = None - if len(val_dataset_list) > 0: 
- val_dataset = concatenate_datasets(val_dataset_list) - return train_dataset, val_dataset + train_d, val_d = dataset, None + train_dataset_list.append(train_d) + if val_d is not None: + val_dataset_list.append(val_d) + + train_dataset = concatenate_datasets(train_dataset_list) + val_dataset = None + if len(val_dataset_list) > 0: + val_dataset = concatenate_datasets(val_dataset_list) + + dataset_list = [] + for dataset, dataset_sample in zip( + [train_dataset, val_dataset], + [train_dataset_sample, test_dataset_sample], + ): + if dataset_sample >= 0: + index = random_state.permutation(len(dataset))[:dataset_sample] + dataset = dataset.select(index) + dataset_list.append(dataset) + return tuple(dataset_list) diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index 4d26613570..f3a08a7ab5 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -8,51 +8,51 @@ TEMPLATE_MAPPING = { 'default': { - 'prefix': ['{{system}}\n\n'], - 'prompt': ['### Human:\n', '{{query}}\n\n', '### Assistant:\n'], + 'prefix': ['{{SYSTEM}}\n\n'], + 'prompt': ['### Human:\n', '{{QUERY}}\n\n', '### Assistant:\n'], 'chat_sep': ['\n\n'], 'suffix': [['eos_token_id']], }, 'default-generation': { 'prefix': [], - 'prompt': ['{{query}}'], + 'prompt': ['{{QUERY}}'], 'suffix': [['eos_token_id']], }, 'chatml': { - 'prefix': ['<|im_start|>system\n{{system}}<|im_end|>\n'], + 'prefix': ['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n'], 'prompt': - ['<|im_start|>user\n{{query}}<|im_end|>\n<|im_start|>assistant\n'], + ['<|im_start|>user\n{{QUERY}}<|im_end|>\n<|im_start|>assistant\n'], 'chat_sep': ['<|im_end|>\n'], 'suffix': ['<|im_end|><|endoftext|>'], }, 'baichuan': { 'prefix': [], - 'prompt': [[195], '{{query}}', [196]], + 'prompt': [[195], '{{QUERY}}', [196]], 'chat_sep': [], 'suffix': [['eos_token_id']], }, 'chatglm2': { 'prefix': [[64790, 64792]], - 'prompt': ['[Round 
{{round}}]\n\n问:{{query}}\n\n答:'], + 'prompt': ['[Round {{ROUND}}]\n\n问:{{QUERY}}\n\n答:'], 'chat_sep': ['\n\n'], 'suffix': [['eos_token_id']], }, 'llama': { 'prefix': [['bos_token_id'], - '[INST] <>\n{{system}}\n<>\n\n'], - 'prompt': ['{{query}} [/INST] '], + '[INST] <>\n{{SYSTEM}}\n<>\n\n'], + 'prompt': ['{{QUERY}} [/INST] '], 'chat_sep': [' ', ['eos_token_id', 'bos_token_id'], '[INST] '], 'suffix': [['eos_token_id']], }, 'openbuddy-llama': { - 'prefix': ['{{system}}\n\n'], - 'prompt': ['User: {{query}}\nAssistant: '], + 'prefix': ['{{SYSTEM}}\n\n'], + 'prompt': ['User: {{QUERY}}\nAssistant: '], 'chat_sep': ['\n'], 'suffix': [['eos_token_id']], }, 'internlm': { 'prefix': [''], - 'prompt': ['<|User|>:{{query}}\n<|Bot|>:'], + 'prompt': ['<|User|>:{{QUERY}}\n<|Bot|>:'], 'chat_sep': ['\n'], 'suffix': [''], } @@ -87,7 +87,7 @@ def concat_context_list( for context in context_list: if isinstance(context, str): for (old_str, - new_str) in zip(['{{system}}', '{{query}}', '{{round}}'], + new_str) in zip(['{{SYSTEM}}', '{{QUERY}}', '{{ROUND}}'], [system, query, round]): if new_str is not None and old_str in context: placeholder_list.append(new_str) @@ -108,7 +108,7 @@ def _encode(tokenizer: PreTrainedTokenizer, context_list: List[Context], token = c input_ids.append(token) elif isinstance(context, str): - for old_str in ['{{system}}', '{{query}}', '{{round}}']: + for old_str in ['{{SYSTEM}}', '{{QUERY}}', '{{ROUND}}']: if old_str in context: new_str = next(placeholder_it) context = context.replace(old_str, new_str) diff --git a/examples/pytorch/llm/src/utils/utils.py b/examples/pytorch/llm/src/utils/utils.py index e6306d36f3..2f0c2df093 100644 --- a/examples/pytorch/llm/src/utils/utils.py +++ b/examples/pytorch/llm/src/utils/utils.py @@ -249,18 +249,6 @@ def download_files(url: str, local_path: str, cookies) -> None: f.write(data) -def process_dataset(dataset: HfDataset, dataset_test_size: float, - dataset_sample: int, - dataset_seed: int) -> Tuple[HfDataset, HfDataset]: - 
random_state = np.random.RandomState(dataset_seed) - if dataset_sample >= 0: - index = random_state.permutation(len(dataset))[:dataset_sample] - dataset = dataset.select(index) - dataset = dataset.train_test_split( - dataset_test_size, seed=get_seed(random_state)) - return dataset['train'], dataset['test'] - - def sort_by_max_length(dataset: HfDataset, num_dataset: int) -> HfDataset: dataset_len = [len(d['input_ids']) for d in tqdm(dataset)] idx = heapq.nlargest( From 696fb78aedbc2835c112ba60b77042fc4481da03 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 15 Sep 2023 09:39:52 +0800 Subject: [PATCH 3/5] fix bugs --- .../baichuan2_7b_chat/lora_ddp/infer.sh | 1 - .../scripts/baichuan2_7b_chat/qlora/infer.sh | 1 - .../llm/scripts/chatglm2_6b/lora_ddp/infer.sh | 1 - .../internlm_7b_chat/lora_ddp/infer.sh | 1 - .../llm/scripts/qwen_7b/lora_ddp/infer.sh | 1 - .../llm/scripts/qwen_7b_chat/full_mp/infer.sh | 1 - .../scripts/qwen_7b_chat/full_mp_ddp/infer.sh | 1 - .../llm/scripts/qwen_7b_chat/lora/infer.sh | 1 - .../scripts/qwen_7b_chat/lora_ddp/infer.sh | 1 - .../scripts/qwen_7b_chat/lora_mp_ddp/infer.sh | 1 - .../llm/scripts/qwen_7b_chat/qlora/infer.sh | 1 - .../scripts/qwen_7b_chat/qlora_ddp/infer.sh | 1 - .../llm/scripts/qwen_vl/lora_ddp/infer.sh | 1 - .../scripts/qwen_vl_chat/lora_ddp/infer.sh | 1 - .../llm/scripts/seqgpt_560m/full/infer.sh | 1 - examples/pytorch/llm/src/llm_infer.py | 9 +++--- examples/pytorch/llm/src/llm_sft.py | 14 ++++++--- examples/pytorch/llm/src/utils/dataset.py | 29 +++++++------------ examples/pytorch/llm/src/utils/utils.py | 1 - 19 files changed, 24 insertions(+), 44 deletions(-) diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh index 18857c080c..ce54c3ffaa 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py 
\ --ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset damo-agent-mini-zh \ - --train_dataset_sample -1 \ --max_length 4096 \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh index 1dc288fd7c..93225dd116 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset advertise-gen \ - --train_dataset_sample -1 \ --max_length 2048 \ --quantization_bit 4 \ --bnb_4bit_comp_dtype bf16 \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh index f95648b337..61443d071c 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/chatglm2-6b/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset code-python-zh \ - --train_dataset_sample -1 \ --max_length 8192 \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh index 4829026106..1d06d09d36 100644 --- a/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/internlm_7b_chat/lora_ddp/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/internlm-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset jd-zh \ - --train_dataset_sample -1 \ --max_length 2048 \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh index 
ca5ba377b1..fdd1b03d6d 100644 --- a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset dureader-robust-zh \ - --train_dataset_sample -1 \ --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh index 2391406ef2..5d280cf86f 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset damo-agent-zh \ - --train_dataset_sample 200000 \ --max_length 8192 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh index 1f5e6d7d08..d02ca2471f 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset medical-en,medical-zh \ - --train_dataset_sample 200000 \ --max_length 8192 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh index 9b6c648f8f..b7ab5137fc 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset cot-en,cot-zh \ - --train_dataset_sample 50000 \ --max_length 2048 \ 
--use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh index 0770665cc5..88bfbd2635 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset sharegpt-en,sharegpt-zh \ - --train_dataset_sample 50000 \ --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh index 99e7896e0b..fedd587479 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset advertise-gen \ - --train_dataset_sample -1 \ --max_length 2048 \ --use_flash_attn false \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh index ae4bd09242..644c2d5553 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset advertise-gen \ - --train_dataset_sample -1 \ --max_length 2048 \ --quantization_bit 4 \ --bnb_4bit_comp_dtype bf16 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh index ae4bd09242..644c2d5553 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh +++ 
b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset advertise-gen \ - --train_dataset_sample -1 \ --max_length 2048 \ --quantization_bit 4 \ --bnb_4bit_comp_dtype bf16 \ diff --git a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh index 572a6f1bd5..aaf3592d74 100644 --- a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-vl/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset coco-en \ - --train_dataset_sample 20000 \ --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/infer.sh index 35e0a78a5d..c9536e1607 100644 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset coco-en \ - --train_dataset_sample 20000 \ --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh index 5130210162..7bc8e82d24 100644 --- a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh +++ b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh @@ -7,7 +7,6 @@ python src/llm_infer.py \ --ckpt_dir "runs/seqgpt-560m/vx_xxx/checkpoint-xxx" \ --eval_human false \ --dataset ner-jave-zh \ - --train_dataset_sample -1 \ --max_length 1024 \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index 
b5f39cac26..f0a039895b 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -36,10 +36,9 @@ class InferArguments: dataset: str = field( default='alpaca-en,alpaca-zh', metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) - dataset_seed: int = 42 - train_dataset_sample: int = -1 # -1: all dataset - test_dataset_sample: int = -1 + dataset_split_seed: int = 42 dataset_test_ratio: float = 0.01 + show_dataset_sample: int = 20 system: str = 'you are a helpful assistant!' max_length: Optional[int] = 2048 @@ -141,9 +140,9 @@ def llm_infer(args: InferArguments) -> None: else: _, val_dataset = get_dataset( args.dataset.split(','), args.dataset_test_ratio, - args.dataset_seed, args.train_dataset_sample) + args.dataset_split_seed) mini_val_dataset = val_dataset.select( - range(min(10, val_dataset.shape[0]))) + range(min(args.show_dataset_sample, val_dataset.shape[0]))) for data in mini_val_dataset: response = data['response'] data['response'] = None diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 0811179e58..d4f57c2731 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -49,9 +49,8 @@ class SftArguments: dataset: str = field( default='alpaca-en,alpaca-zh', metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) - dataset_seed: int = 42 + dataset_split_seed: int = 42 train_dataset_sample: int = -1 # -1: all dataset - test_dataset_sample: int = -1 dataset_test_ratio: float = 0.01 system: str = 'you are a helpful assistant!' 
max_length: Optional[int] = 2048 @@ -238,10 +237,17 @@ def llm_sft(args: SftArguments) -> None: # ### Loading Dataset train_dataset, val_dataset = get_dataset( - args.dataset.split(','), args.dataset_test_ratio, args.dataset_seed, - args.train_dataset_sample) + args.dataset.split(','), args.dataset_test_ratio, + args.dataset_split_seed) preprocess_func = get_preprocess(args.template_type, tokenizer, args.system, args.max_length) + if args.train_dataset_sample >= 0: + val_dataset_sample = args.train_dataset_sample * self.dataset_test_ratio + train_idxs = np.random.permutation(args.train_dataset_sample) + train_dataset = train_dataset.select(train_idxs) + if val_dataset.shape[0] > val_dataset_sample: + val_idxs = np.random.permutation(val_dataset_sample) + val_dataset = val_dataset.select(val_idxs) train_dataset = dataset_map(train_dataset, preprocess_func) val_dataset = dataset_map(val_dataset, preprocess_func) if args.test_oom_error: diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 9ab23e7bf1..6804a9dca4 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -12,6 +12,7 @@ from modelscope import MsDataset from tqdm.auto import tqdm +from swift.utils import get_seed from .preprocess import History from .utils import download_dataset @@ -436,7 +437,7 @@ def get_medical_dataset( dataset_dict['val'].to_hf_dataset(), ]) val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset() - if train_dataset_sample != -1: + if train_dataset_sample >= 0: idxs = np.random.permutation(train_dataset_sample) train_dataset = train_dataset.select(idxs) return tuple( @@ -595,16 +596,14 @@ def get_code_python_zh_dataset() -> HfDataset: def get_dataset( - dataset_name_list: List[str], - dataset_test_ratio: float = 0., - dataset_seed: int = 42, - train_dataset_sample: int = -1, - test_dataset_sample: int = -1 + dataset_name_list: List[str], + dataset_test_ratio: float = 0., + 
dataset_split_seed: int = 42, ) -> Tuple[HfDataset, Optional[HfDataset]]: """Returns train_dataset and val_dataset""" train_dataset_list: List[HfDataset] = [] val_dataset_list: List[HfDataset] = [] - random_state = np.random.RandomState(dataset_seed) + random_state = np.random.RandomState(dataset_split_seed) for dataset_name in dataset_name_list: get_function = DATASET_MAPPING[dataset_name] dataset = get_function() @@ -613,8 +612,9 @@ def get_dataset( val_d = dataset[1] else: if dataset_test_ratio > 0: - train_d, val_d = dataset.train_test_split( - dataset_test_size, seed=get_seed(random_state)) + dataset_dict = dataset.train_test_split( + dataset_test_ratio, seed=get_seed(random_state)) + train_d, val_d = dataset_dict['train'], dataset_dict['test'] else: train_d, val_d = dataset, None train_dataset_list.append(train_d) @@ -626,13 +626,4 @@ def get_dataset( if len(val_dataset_list) > 0: val_dataset = concatenate_datasets(val_dataset_list) - dataset_list = [] - for dataset, dataset_sample in zip( - [train_dataset, val_dataset], - [train_dataset_sample, test_dataset_sample], - ): - if dataset_sample >= 0: - index = random_state.permutation(len(dataset))[:dataset_sample] - dataset = dataset.select(index) - dataset_list.append(dataset) - return tuple(dataset_list) + return train_dataset, val_dataset diff --git a/examples/pytorch/llm/src/utils/utils.py b/examples/pytorch/llm/src/utils/utils.py index 2f0c2df093..94c85f61bf 100644 --- a/examples/pytorch/llm/src/utils/utils.py +++ b/examples/pytorch/llm/src/utils/utils.py @@ -29,7 +29,6 @@ from swift import get_logger from swift.hub import ModelScopeConfig -from swift.utils import get_seed from swift.utils.tb_utils import (TB_COLOR, TB_COLOR_SMOOTH, read_tensorboard_file, tensorboard_smoothing) from .callback import DefaultFlowCallbackNew, ProgressCallbackNew From 3b5725b320894f4ce1e43340862d42a893596b16 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 15 Sep 2023 09:52:59 +0800 Subject: [PATCH 4/5] fix bugs --- 
examples/pytorch/llm/src/llm_infer.py | 2 +- examples/pytorch/llm/src/llm_sft.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index f0a039895b..7f852f0056 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -26,7 +26,7 @@ class InferArguments: template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) ckpt_dir: str = '/path/to/your/vx_xxx/checkpoint-xxx' - eval_human: bool = False # False: eval test_dataset + eval_human: bool = False # False: eval val_dataset seed: int = 42 dtype: str = field( diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index d4f57c2731..2214f1d6ff 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -7,6 +7,7 @@ from typing import List, Optional import json +import numpy as np import torch import torch.distributed as dist from transformers import BitsAndBytesConfig @@ -50,7 +51,7 @@ class SftArguments: default='alpaca-en,alpaca-zh', metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) dataset_split_seed: int = 42 - train_dataset_sample: int = -1 # -1: all dataset + train_dataset_sample: int = 20000 # -1: all dataset dataset_test_ratio: float = 0.01 system: str = 'you are a helpful assistant!' 
max_length: Optional[int] = 2048 @@ -233,7 +234,7 @@ def llm_sft(args: SftArguments) -> None: show_layers(model) print_model_info(model) - logger.info(str(model)) + logger.info(model) # ### Loading Dataset train_dataset, val_dataset = get_dataset( @@ -242,12 +243,15 @@ def llm_sft(args: SftArguments) -> None: preprocess_func = get_preprocess(args.template_type, tokenizer, args.system, args.max_length) if args.train_dataset_sample >= 0: - val_dataset_sample = args.train_dataset_sample * self.dataset_test_ratio + val_dataset_sample = int(args.train_dataset_sample + * args.dataset_test_ratio) train_idxs = np.random.permutation(args.train_dataset_sample) train_dataset = train_dataset.select(train_idxs) if val_dataset.shape[0] > val_dataset_sample: val_idxs = np.random.permutation(val_dataset_sample) val_dataset = val_dataset.select(val_idxs) + logger.info(f'train_dataset: {train_dataset}') + logger.info(f'val_dataset: {val_dataset}') train_dataset = dataset_map(train_dataset, preprocess_func) val_dataset = dataset_map(val_dataset, preprocess_func) if args.test_oom_error: From 647b40bb6973982d2fbdb39a9afdbeb0f3b35619 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 15 Sep 2023 10:06:13 +0800 Subject: [PATCH 5/5] update --- examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh index e7400a43c7..ef7554bdc4 100644 --- a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh +++ b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 2 * 3090 -# not good at Chinese +# llama2 is not good at Chinese, openbuddy llama2 is recommended CUDA_VISIBLE_DEVICES=0,1 \ python src/llm_sft.py \ --model_type llama2-70b-chat \