diff --git a/README.md b/README.md
index c8f91f6e43..7dfbcf89d9 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
-Modelscope Hub
+ModelScope Hub
中文  |  English
@@ -36,24 +36,35 @@ Users can check the [documentation of Swift](docs/source/GetStarted/Introduction
## LLM SFT Example
Follow [this link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm) to view the detailed documentation for these examples.
-1. supported SFT methods: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), full(full parameter fine-tuning)
-2. supported models:
- 1. qwen series: [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)
- 2. qwen-vl series: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)
- 3. baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)
- 4. chatglm2 series: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)
- 5. llama series: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
- 6. openbuddy-llama series: [openbuddy-llama2-13b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)
- 7. internlm series: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
- 8. xverse series: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)
- 9. other: [polylm-13b](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary), [seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)
-3. supported features: quantization, DDP, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ...
-4. supported datasets:
- 1. NLP: [alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), [alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), [finance-en](https://www.modelscope.cn/datasets/wyj123456/finance_en/summary), [multi-alpaca-all](https://www.modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary), [code-en](https://www.modelscope.cn/datasets/wyj123456/code_alpaca_en/summary), [instinwild-en](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [instinwild-zh](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [cot-en](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [cot-zh](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [firefly-all-zh](https://www.modelscope.cn/datasets/wyj123456/firefly/summary), [poetry-zh](https://www.modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary), [instruct-en](https://www.modelscope.cn/datasets/wyj123456/instruct/summary), [gpt4all-en](https://www.modelscope.cn/datasets/wyj123456/GPT4all/summary), [cmnli-zh](https://www.modelscope.cn/datasets/modelscope/clue/summary), [jd-zh](https://modelscope.cn/datasets/DAMO_NLP/jd/summary), [dureader-robust-zh](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary), [medical-en](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-mini-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [sharegpt-en](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [sharegpt-zh](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [code-python-zh](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), [advertise-gen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)
- 2. Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), [damo-agent-mini-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)
- 3. Multi-Modal: [coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
- 4. Other: [cls-fudan-news-zh](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/files), [ner-jave-zh](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)
-5. supported templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, default-generation
+- Supported SFT Methods: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), full (full-parameter fine-tuning)
+- Supported Features: quantization, DDP, model parallelism, gradient checkpointing, gradient accumulation, pushing to ModelScope Hub, custom datasets, multimodal and agent SFT, multi-round chat, ...
+- Supported Models:
+ - qwen series: [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)
+ - qwen-vl series: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)
+ - baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)
+ - chatglm2 series: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)
+ - llama series: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
+ - openbuddy-llama series: [openbuddy-llama2-13b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)
+ - internlm series: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
+ - xverse series: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)
+ - other: [polylm-13b](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary), [seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)
+- Supported Datasets:
+ - NLP:
+ - General: [alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), [alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), [multi-alpaca-all](https://www.modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary), [instinwild-en](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [instinwild-zh](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [cot-en](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [cot-zh](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [firefly-all-zh](https://www.modelscope.cn/datasets/wyj123456/firefly/summary), [instruct-en](https://www.modelscope.cn/datasets/wyj123456/instruct/summary), [gpt4all-en](https://www.modelscope.cn/datasets/wyj123456/GPT4all/summary), [sharegpt-en](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [sharegpt-zh](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary)
+ - Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), [damo-agent-mini-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)
+ - Coding: [code-en](https://www.modelscope.cn/datasets/wyj123456/code_alpaca_en/summary), [code-python-zh](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), [leetcode-python-en](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)
+ - Medical: [medical-en](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-mini-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary)
+ - Law: [lawyer-llama-zh](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary), [tigerbot-law-zh](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)
+ - Math: [blossom-math-zh](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2/summary), [school-math-zh](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M/summary)
+ - SQL: [text2sql-en](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2/summary), [sql-create-context-en](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context/summary)
+ - Text Generation: [advertise-gen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary), [dureader-robust-zh](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary)
+ - Classification: [cmnli-zh](https://www.modelscope.cn/datasets/modelscope/clue/summary), [jd-zh](https://modelscope.cn/datasets/DAMO_NLP/jd/summary)
+ - Other: [finance-en](https://www.modelscope.cn/datasets/wyj123456/finance_en/summary), [poetry-zh](https://www.modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary), [cls-fudan-news-zh](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/files), [ner-jave-zh](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)
+ - Multi-Modal: [coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
+ - Custom Dataset
+- Supported Templates:
+ - Text Generation: default-generation, chatglm2-generation
+ - Chat: chatml (qwen), baichuan, chatglm2, llama, openbuddy-llama, default, internlm, xverse
# Installation
diff --git a/README_CN.md b/README_CN.md
index d41617eb9b..61b8b05354 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -34,24 +34,36 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展
## 大模型微调的例子
可以[在这里](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm) 查看LLM微调的使用文档。
-1. 支持的SFT方法: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), 全参数微调
-2. 支持的模型:
- 1. qwen 系列: [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)
- 2. qwen-vl 系列: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)
- 3. baichuan 系列: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)
- 4. chatglm2 系列: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)
- 5. llama 系列: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
- 6. openbuddy-llama 系列: [openbuddy-llama2-13b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)
- 7. internlm 系列: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
- 8. xverse 系列: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)
- 9. other: [polylm-13b](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary), [seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)
-3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpointing, 梯度累加, 支持推送ModelScope Hub, 自定义数据集, 多模态和Agent SFT, 多轮对话, ...
-4. 支持的数据集:
- 1. NLP: [alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), [alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), [finance-en](https://www.modelscope.cn/datasets/wyj123456/finance_en/summary), [multi-alpaca-all](https://www.modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary), [code-en](https://www.modelscope.cn/datasets/wyj123456/code_alpaca_en/summary), [instinwild-en](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [instinwild-zh](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [cot-en](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [cot-zh](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [firefly-all-zh](https://www.modelscope.cn/datasets/wyj123456/firefly/summary), [poetry-zh](https://www.modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary), [instruct-en](https://www.modelscope.cn/datasets/wyj123456/instruct/summary), [gpt4all-en](https://www.modelscope.cn/datasets/wyj123456/GPT4all/summary), [cmnli-zh](https://www.modelscope.cn/datasets/modelscope/clue/summary), [jd-zh](https://modelscope.cn/datasets/DAMO_NLP/jd/summary), [dureader-robust-zh](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary), [medical-en](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-mini-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [sharegpt-en](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [sharegpt-zh](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [code-python-zh](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), [advertise-gen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)
- 2. Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), [damo-agent-mini-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)
- 3. 多模态: [coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
- 4. 其他: [cls-fudan-news-zh](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/files), [ner-jave-zh](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)
-5. 支持的对话模板: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, default-generation
+- 支持的SFT方法: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), 全参数微调
+- 支持的特性: 模型量化, DDP, 模型并行, gradient checkpointing, 梯度累加, 支持推送ModelScope Hub, 自定义数据集, 多模态和Agent SFT, 多轮对话, ...
+- 支持的模型:
+ - qwen 系列: [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)
+ - qwen-vl 系列: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)
+ - baichuan 系列: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)
+ - chatglm2 系列: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)
+ - llama 系列: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
+ - openbuddy-llama 系列: [openbuddy-llama2-13b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)
+ - internlm 系列: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
+ - xverse 系列: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)
+ - other: [polylm-13b](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary), [seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)
+- 支持的数据集:
+ - NLP:
+ - 通用: [alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), [alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), [multi-alpaca-all](https://www.modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary), [instinwild-en](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [instinwild-zh](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [cot-en](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [cot-zh](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [firefly-all-zh](https://www.modelscope.cn/datasets/wyj123456/firefly/summary), [instruct-en](https://www.modelscope.cn/datasets/wyj123456/instruct/summary), [gpt4all-en](https://www.modelscope.cn/datasets/wyj123456/GPT4all/summary), [sharegpt-en](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [sharegpt-zh](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary)
+ - Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), [damo-agent-mini-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)
+ - 代码: [code-en](https://www.modelscope.cn/datasets/wyj123456/code_alpaca_en/summary), [code-python-zh](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), [leetcode-python-en](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)
+ - 医疗: [medical-en](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-mini-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary)
+ - 法律: [lawyer-llama-zh](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary), [tigerbot-law-zh](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)
+ - 数学: [blossom-math-zh](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2/summary), [school-math-zh](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M/summary)
+ - SQL: [text2sql-en](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2/summary), [sql-create-context-en](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context/summary)
+ - 文本生成: [advertise-gen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary), [dureader-robust-zh](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary)
+ - 分类: [cmnli-zh](https://www.modelscope.cn/datasets/modelscope/clue/summary), [jd-zh](https://modelscope.cn/datasets/DAMO_NLP/jd/summary)
+ - 其他: [finance-en](https://www.modelscope.cn/datasets/wyj123456/finance_en/summary), [poetry-zh](https://www.modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary), [cls-fudan-news-zh](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/files), [ner-jave-zh](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)
+ - 多模态: [coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
+ - 自定义数据集
+- 支持的对话模板:
+ - 文本生成: default-generation, chatglm2-generation
+ - 对话: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, internlm, xverse
+
# 安装
diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md
index 6396c4f043..330335fea0 100644
--- a/examples/pytorch/llm/README.md
+++ b/examples/pytorch/llm/README.md
@@ -1,38 +1,63 @@
-
LLM SFT Example
-
+
+
-Modelscope Hub
+ModelScope Hub
中文  |  English
+
## Features
-1. supported SFT methods: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), full(full parameter fine-tuning)
-2. supported models:
- 1. qwen series: [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)
- 2. qwen-vl series: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)
- 3. baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)
- 4. chatglm2 series: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)
- 5. llama series: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
- 6. openbuddy-llama series: [openbuddy-llama2-13b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)
- 7. internlm series: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
- 8. xverse series: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)
- 9. other: [polylm-13b](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary), [seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)
-3. supported features: quantization, DDP, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ...
-4. supported datasets:
- 1. NLP: [alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), [alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), [finance-en](https://www.modelscope.cn/datasets/wyj123456/finance_en/summary), [multi-alpaca-all](https://www.modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary), [code-en](https://www.modelscope.cn/datasets/wyj123456/code_alpaca_en/summary), [instinwild-en](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [instinwild-zh](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [cot-en](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [cot-zh](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [firefly-all-zh](https://www.modelscope.cn/datasets/wyj123456/firefly/summary), [poetry-zh](https://www.modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary), [instruct-en](https://www.modelscope.cn/datasets/wyj123456/instruct/summary), [gpt4all-en](https://www.modelscope.cn/datasets/wyj123456/GPT4all/summary), [cmnli-zh](https://www.modelscope.cn/datasets/modelscope/clue/summary), [jd-zh](https://modelscope.cn/datasets/DAMO_NLP/jd/summary), [dureader-robust-zh](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary), [medical-en](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-mini-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [sharegpt-en](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [sharegpt-zh](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [code-python-zh](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), [advertise-gen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)
- 2. Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), [damo-agent-mini-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)
- 3. Multi-Modal: [coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
- 4. Other: [cls-fudan-news-zh](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/files), [ner-jave-zh](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)
-5. supported templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, default-generation
+- Supported SFT Methods: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), full (full-parameter fine-tuning)
+- Supported Features: quantization, DDP, model parallelism, gradient checkpointing, gradient accumulation, pushing to ModelScope Hub, custom datasets, multimodal and agent SFT, multi-round chat, ...
+- Supported Models:
+ - qwen series: [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)
+ - qwen-vl series: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)
+ - baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)
+ - chatglm2 series: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)
+ - llama series: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
+ - openbuddy-llama series: [openbuddy-llama2-13b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)
+ - internlm series: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
+ - xverse series: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)
+ - other: [polylm-13b](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary), [seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)
+- Supported Datasets:
+ - NLP:
+ - General: [alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), [alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), [multi-alpaca-all](https://www.modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary), [instinwild-en](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [instinwild-zh](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [cot-en](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [cot-zh](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [firefly-all-zh](https://www.modelscope.cn/datasets/wyj123456/firefly/summary), [instruct-en](https://www.modelscope.cn/datasets/wyj123456/instruct/summary), [gpt4all-en](https://www.modelscope.cn/datasets/wyj123456/GPT4all/summary), [sharegpt-en](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [sharegpt-zh](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary)
+ - Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), [damo-agent-mini-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)
+ - Coding: [code-en](https://www.modelscope.cn/datasets/wyj123456/code_alpaca_en/summary), [code-python-zh](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), [leetcode-python-en](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)
+ - Medical: [medical-en](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-mini-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary)
+ - Law: [lawyer-llama-zh](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary), [tigerbot-law-zh](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)
+ - Math: [blossom-math-zh](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2/summary), [school-math-zh](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M/summary)
+ - SQL: [text2sql-en](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2/summary), [sql-create-context-en](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context/summary)
+ - Text Generation: [advertise-gen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary), [dureader-robust-zh](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary)
+ - Classification: [cmnli-zh](https://www.modelscope.cn/datasets/modelscope/clue/summary), [jd-zh](https://modelscope.cn/datasets/DAMO_NLP/jd/summary)
+ - Other: [finance-en](https://www.modelscope.cn/datasets/wyj123456/finance_en/summary), [poetry-zh](https://www.modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary), [cls-fudan-news-zh](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/files), [ner-jave-zh](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)
+ - Multi-Modal: [coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
+ - Custom Dataset
+- Supported Templates:
+ - Text Generation: default-generation, chatglm2-generation
+ - Chat: chatml (qwen), baichuan, chatglm2, llama, openbuddy-llama, default, internlm, xverse
+
+
+## News
+- 2023.10.4: Supported datasets in the fields of mathematics, law, SQL, and coding: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en.
+- 2023.9.26: Supported xverse model series: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat.
+- 2023.9.25: Supported qwen-14b model series: qwen-14b, qwen-14b-chat.
+- 2023.9.20: Supported merging the incremental weights from LoRA and QLoRA training into the base model weights and saving the complete model weights, making deployment easier for users.
+- 2023.9.18: Supported internlm-20b model series: internlm-20b, internlm-20b-chat.
+- 2023.9.12: Supported training with MP+DDP to accelerate full-parameter fine-tuning. The corresponding shell script can be found at `scripts/qwen_7b_chat/full_mp_ddp/sft.sh`.
+- 2023.9.5: Supported saving only the model weights, without intermediate states such as optimizer weights needed for resuming training, avoiding the long checkpoint-saving times and large storage footprint of full-parameter fine-tuning.
+- 2023.9.5: Supported openbuddy-llama2-70b model.
+- 2023.9.3: Supported baichuan-13b model series: baichuan-13b, baichuan-13b-chat.
+
## Prepare the Environment
Experimental environment: V100, A10, 3090, A100, ...
@@ -45,7 +70,7 @@ sh Miniconda3-latest-Linux-x86_64.sh
conda create --name ms-sft python=3.10
conda activate ms-sft
-# Setting up a global mirror for pip and installing related Python packages
+# Setting up a global mirror for pip and installing related Python packages (note: the CUDA version must match)
pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
@@ -56,19 +81,20 @@ cd examples/pytorch/llm
pip install -r requirements.txt -U
```
+
## Run SFT and Inference
Performance: full (best) > lora > qlora
Training GPU memory: qlora (low, 3090) > lora > full (2 * A100)
-Note:
-1. You can save GPU memory by setting `--gradient_checkpointing true`, but this will slightly decrease the training speed.
-2. If you want to push weights to the ModelScope Hub during training, you need to set `--push_to_hub true`.
-3. If you want to merge LoRA weights and save during inference, you need to set `--merge_lora_and_save true`.
-4. If you want to use quantization, you need to install `bitsandbytes` first: `pip install bitsandbytes -U`.
-5. If you are using older GPUs like V100, you need to set `--dtype fp16`, because they do not support bf16.
-6. qwen recommends installing [flash-attn](https://github.com/Dao-AILab/flash-attention), which will accelerate the training and inference speed and reduce GPU memory usage (V100, 3090, A10 machines do not support flash-attn).
-7. Below is a shell script for running `qwen_7b_chat` directly (you just need to specify `ckpt_dir` during inference to execute it smoothly). For more model scripts, you can check the `scripts` folder. If you want to customize a shell script, it is recommended to refer to the script in `scripts/qwen_7b_chat`.
+Tips:
+- You can set `--gradient_checkpointing true` during training to save GPU memory, but this will slightly decrease the training speed.
+- If you want to push weights to the ModelScope Hub during training, you need to set `--push_to_hub true`.
+- If you want to merge LoRA weights and save during inference, you need to set `--merge_lora_and_save true`.
+- If you want to use quantization, you need to install `bitsandbytes` first: `pip install bitsandbytes -U`.
+- If you are using older GPUs like V100, you need to set `--dtype fp16`, because they do not support bf16.
+- qwen recommends installing [flash-attn](https://github.com/Dao-AILab/flash-attention), which accelerates training and inference and reduces GPU memory usage (V100, 3090, and A10 machines do not support flash-attn).
+- Below is a shell script for running `qwen_7b_chat` directly (for inference, you only need to specify `ckpt_dir`). For more model scripts, see the `scripts` folder. If you want to customize a shell script, it is recommended to start from the scripts in `scripts/qwen_7b_chat`.
```bash
# SFT qwen-7b-chat with LoRA and run inference. Requires 38GB of GPU memory.
# Recommended experimental environment: A100
@@ -106,7 +132,119 @@ bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh
bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh
```
-## Extend Datasets
-1. If you want to expand the dataset, you can modify the `DATASET_MAPPING` in the `utils/dataset.py` file by adding a set of mappings. The key represents the name of the dataset, and the value represents the function to retrieve the dataset, which should return an `HfDataset`. For fine-tuning with single-turn conversations, the dataset should include the `query` and `response` fields, representing the user's query and the assistant's response for fine-tuning, respectively. You can refer to the `alpaca-zh` dataset for more details. If you're dealing with multi-turn conversations, you need to include an additional `history` field, which can be referenced from the `damo-agent-mini-zh` dataset. If each example in the dataset has a different `system` field, you'll need to include the `system` field as well.
-2. If you want to expand the model, you can modify the `MODEL_MAPPING` in the `utils/model.py` file. The `model_id` can be specified as a local path, in which case the `revision` parameter is not used.
-3. If you want to expand the templates, you can modify the `TEMPLATE_MAPPING` in the `utils/preprocess.py` file.
+
+## User Guide
+### Introduction to MODEL_MAPPING (Model Expansion)
+`MODEL_MAPPING` is defined in `utils/model.py` and is used to load various types of base models. If you need to **expand the models**, you can add them here. The key represents the unique ID of the model, and the value represents the model configuration. The configuration includes the following fields (a sketch follows the list):
+
+- `model_id`: Required. It represents the `model_id` in the ModelScope Hub or the local model directory.
+- `revision`: Used to specify the version number of the model. If `model_id` is a local model directory, this parameter is ignored; otherwise, it is required.
+- `get_function`: A function to get the model and tokenizer. By default, it uses `get_model_tokenizer_from_repo` to return the model and tokenizer. If you need to set `flash_attn` or patch the model code, etc., you can customize it.
+- `lora_TM`: The default `lora_target_modules` used. In our settings, it is set to `qkv`.
+- `template`: The default chat template used, such as chatml, baichuan, etc. If not set, the `default` chat template is used.
+- `ignore_file_pattern`: Represents the file patterns to ignore when downloading. This parameter is passed to `snapshot_download`. For example, `r'.+\.bin$'`, `r'.+\.safetensors$'`, etc.
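+
+As a minimal sketch (the entry name, `revision`, and the exact value formats are hypothetical; only the field names and `get_model_tokenizer_from_repo` come from the description above), a new entry might look like:
+
+```python
+# Hypothetical MODEL_MAPPING entry in utils/model.py; the real schema may differ.
+MODEL_MAPPING['my-llm-7b'] = {
+    'model_id': 'my-org/my-llm-7b',  # ModelScope model_id or a local model directory
+    'revision': 'v1.0.0',            # ignored when model_id is a local directory
+    'get_function': get_model_tokenizer_from_repo,  # default loader; customize for flash_attn, patches, ...
+    'lora_TM': ['q_proj', 'k_proj', 'v_proj'],  # default lora_target_modules (format assumed)
+    'template': 'chatml',            # falls back to the 'default' chat template if omitted
+    'ignore_file_pattern': [r'.+\.bin$'],  # files to skip on download
+}
+```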
+
+
+### Introduction to DATASET_MAPPING (Dataset Expansion)
+`DATASET_MAPPING` is defined in `utils/dataset.py` and is used to load various types of data, such as single-turn instruction fine-tuning datasets, multi-turn chat datasets, multimodal datasets, etc. If you need to **expand the datasets**, you can add them here. The key represents the unique ID of the dataset, such as alpaca-en, alpaca-zh, etc. The value is the function to get the dataset. This function does not require any parameters and should return either `HfDataset` or `Tuple[HfDataset, HfDataset]`. In the first case, the dataset processing function will split a portion of the dataset as the validation set (based on the command-line hyperparameter `dataset_test_ratio`). In the second case, the two returned datasets are used as the training set and validation set, respectively. We support fine-tuning with multiple datasets. The training and validation parts of each sub-dataset are concatenated and the merged training set and validation set are returned.
+
+The returned `HfDataset` must comply with certain conventions. In the case of instruction fine-tuning (single-turn dialogue), it should include the `query` and `response` fields, representing the user's query for instruction fine-tuning and the assistant's response, respectively. You can refer to the `alpaca-zh` dataset for more details. In the case of multi-turn dialogue, an additional `history` field is required, representing the history of the dialogue. You can refer to the `damo-agent-mini-zh` dataset for more details. If each example in the dataset has a different `system`, an additional `system` field is required. You can also refer to the `damo-agent-mini-zh` dataset for more details. We only calculate and optimize the loss for the `response` part.
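+
+As a minimal sketch, a custom single-turn dataset could be registered like this (the dataset name and example data are hypothetical; the `query`/`response` convention and the no-argument function come from the description above):
+
+```python
+# Hypothetical DATASET_MAPPING entry in utils/dataset.py.
+from datasets import Dataset as HfDataset
+
+def get_my_dataset() -> HfDataset:
+    # Single-turn convention: 'query' and 'response' fields.
+    # Multi-turn dialogue would add a 'history' field; a per-example
+    # system prompt would add a 'system' field.
+    return HfDataset.from_dict({
+        'query': ['What is 1 + 1?', 'Name a color.'],
+        'response': ['1 + 1 = 2.', 'Blue.'],
+    })
+
+DATASET_MAPPING['my-dataset'] = get_my_dataset  # key is the dataset's unique ID
+```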
+
+
+### Introduction to TEMPLATE_MAPPING (Dialogue Template Expansion)
+`TEMPLATE_MAPPING` is defined in `utils/preprocess.py` and is used to preprocess text information into token lists. If you need to **expand the dialogue templates**, you can add them here. The key represents the unique ID of the chat template, such as 'default', 'chatml', etc. The value represents the configuration of the chat template, including 'prefix', 'prompt', 'chat_sep', and 'suffix'. This module retrieves the complete chat template based on these four elements, enabling support for pre-training, text-generation-style SFT, and various chat-type SFT (a sketch follows the list). The meanings of these four configuration elements are as follows:
+
+- `prefix`: Represents the prefix part of the chat template, usually including the system part and relevant formats, prefix tokens, BOS token, etc. We use `{{SYSTEM}}` as a placeholder for the system part.
+- `prompt`: Represents a round of dialogue in the chat template. We use `{{QUERY}}` as a placeholder for the human inquiry part in each round of dialogue, and `{{ROUND}}` represents the placeholder for the current round of dialogue, counting from 1. The assistant's reply is concatenated after the `prompt`, so we did not design a placeholder for it.
+- `chat_sep`: If multiple rounds of dialogue are needed, `chat_sep` serves as the separator between each round of dialogue, such as a newline, etc.
+- `suffix`: Serves as the suffix part of the chat template, usually the EOS token. It is appended after the last round of dialogue. Only the response part of the last round contributes to the loss and is optimized; the other parts do not.
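+
+For illustration, a hypothetical entry combining the four elements above might look like the following (plain strings are used here for readability; the actual configuration may mix strings and special-token references):
+
+```python
+# Hypothetical TEMPLATE_MAPPING entry in utils/preprocess.py.
+TEMPLATE_MAPPING['my-template'] = {
+    'prefix': '{{SYSTEM}}\n\n',  # system placeholder plus any prefix/BOS tokens
+    'prompt': '### Round {{ROUND}}\nHuman: {{QUERY}}\nAssistant: ',
+    'chat_sep': '\n\n',          # separator between rounds of dialogue
+    'suffix': '</s>',            # usually the EOS token
+}
+```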
+
+
+### sft.sh Command Line Arguments
+- `--model_type`: Represents the chosen model type, default is `'qwen-7b-chat'`. Available `model_type` can be checked using `MODEL_MAPPING.keys()`.
+- `--sft_type`: Represents the fine-tuning method, default is `'lora'`. The possible values are: 'lora', 'full'. If you want to use lora or qlora, you need to select `--sft_type lora`. For qlora, an additional setting `--quantization_bit 4` is required. If you want to use full-parameter fine-tuning, you need to select `--sft_type full`.
+- `--tuner_backend`: Represents the backend support for lora and qlora, default is `'swift'`. The possible values are: 'swift', 'peft'.
+- `--template_type`: Represents the type of dialogue template used, default is `None`, which means it retrieves the template based on `model_type` from `MODEL_MAPPING`. Available `template_type` can be checked using `TEMPLATE_MAPPING.keys()` in `utils/preprocess.py`. By modifying it, you can support pretrain, text-generation-style SFT, and various chat-type SFT.
+- `--output_dir`: Represents the directory for storing checkpoints, default is `'output'`. We append `model_type` and the fine-tuning version number to this directory, which allows users to perform multiple comparative experiments on different models without changing the `output_dir` command-line argument.
+- `--ddp_backend`: Represents the backend support for distributed training, default is `'nccl'`. The possible values are: 'nccl', 'gloo', 'mpi', 'ccl'.
+- `--seed`: Global seed value, default is 42. In distributed training, to avoid each process using the same dropout mask, etc., we set `seed = seed + rank`.
+- `--resume_from_ckpt`: Used for resuming training from a checkpoint, default is `None`. You can set it to the path of the checkpoint, for example: `'output/qwen-7b-chat/vx_xxx/checkpoint-xxx'`, to resume training from that checkpoint.
+- `--dtype`: torch_dtype when loading the base model, default is `'bf16'`. The possible values are: 'bf16', 'fp16', 'fp32'.
+- `--ignore_args_error`: Whether to ignore errors raised by command-line argument mismatch, default is `False`. If you need to copy the code to a notebook for execution, you should set it to True.
+- `--dataset`: Used to select the training dataset, default is `'advertise-gen'`. Available datasets can be checked using `DATASET_MAPPING.keys()`. If you want to use multiple datasets for training, you can separate them using commas, for example: `alpaca-en,alpaca-zh`.
+- `--dataset_split_seed`: Specifies the seed for splitting the sub-dataset into training and validation sets, default is `42`. This parameter is ignored if the sub-dataset has already been split into training and validation sets. When multiple sub-datasets are specified in `dataset` and the function for retrieving the sub-dataset does not perform the split (i.e., returns `HfDataset` instead of `Tuple[HfDataset, HfDataset]`), we need to split the sub-dataset. Finally, we concatenate the training and validation parts of these sub-datasets to generate the training and validation sets for the complete fine-tuning dataset.
+- `--dataset_test_ratio`: Specifies the ratio for splitting the sub-dataset into training and validation sets, default is `0.01`. This parameter is ignored if the sub-dataset has already been split into training and validation sets. For more information, refer to the section on `dataset_split_seed`.
+- `--train_dataset_sample`: Samples from the complete training dataset, default is `20000`, to speed up training. This parameter is used to avoid the issue of training time being too long for a single epoch when the dataset is large. LoRA convergence is usually fast and does not require a large number of data samples for fine-tuning. If you specify `-1`, the full training dataset will be used for training, which is typically used in the setting of full-parameter fine-tuning.
+- `--system`: The system used in the dialogue template, default is `'you are a helpful assistant!'`.
+- `--max_length`: Maximum token length, default is `2048`. This helps to avoid out-of-memory (OOM) issues caused by individual samples that are too long. If a data sample exceeds the `max_length`, the frontmost tokens will be truncated: `input_ids[-max_length:]`.
+- `--quantization_bit`: Specifies whether to perform quantization and the number of quantization bits, default is `0`, which means no quantization. Quantization is only supported for the lora fine-tuning method and not for full-parameter fine-tuning. A sketch of how the 4-bit flags below fit together is given after this argument list.
+- `--bnb_4bit_comp_dtype`: When performing 4-bit quantization, we need to dequantize it during the model's forward and backward passes. This parameter specifies the torch_dtype after dequantization. Default is `None`, which means it remains consistent with `dtype`. The possible values are: 'fp16', 'bf16', 'fp32'. This parameter is ignored when `quantization_bit` is 0.
+- `--bnb_4bit_quant_type`: The quantization type for 4-bit quantization, default is `'nf4'`. The possible values are: 'nf4', 'fp4'. This parameter is ignored when `quantization_bit` is 0.
+- `--bnb_4bit_use_double_quant`: Whether to enable double quantization during 4-bit quantization, default is `True`. This parameter is ignored when `quantization_bit` is 0.
+- `--lora_target_modules`: Specifies the LoRA modules, default is `None`, which means it searches for the `lora_TM` (default is qkv) in `MODEL_MAPPING` based on `model_type`. If `ALL` is passed, all linear layers (excluding the head) will be designated as LoRA modules. This parameter only takes effect when `sft_type` is set to `'lora'`.
+- `--lora_rank`: Default is `8`. This parameter only takes effect when `sft_type` is set to `'lora'`.
+- `--lora_alpha`: Default is `32`. This parameter only takes effect when `sft_type` is set to `'lora'`.
+- `--lora_dropout_p`: Default is `0.0`. This parameter only takes effect when `sft_type` is set to `'lora'`.
+- `--gradient_checkpointing`: Whether to enable gradient checkpointing, default is `False`. This parameter can be used to save GPU memory, although it slightly slows down the training speed. This parameter is particularly effective when `max_length` and `batch_size` are large.
+- `--batch_size`: Batch size during training, default is `1`. Increasing the batch size can improve GPU utilization but may not necessarily speed up training because within a batch, padding is applied to shorter sentences based on the length of the longest sentence in the batch, introducing unnecessary computations.
+- `--eval_batch_size`: Batch size during evaluation, default is `None`. If `predict_with_generate` is set to `True`, it is set to `1`; if `predict_with_generate` is `False`, it is set to `batch_size`.
+- `--num_train_epochs`: Number of training epochs, default is `1`. If `max_steps >= 0`, it overrides `num_train_epochs`.
+- `--max_steps`: Maximum number of training steps, default is `-1`. If `max_steps >= 0`, it overrides `num_train_epochs`.
+- `--optim`: Default is `'adamw_torch'`.
+- `--learning_rate`: Default is `None`. If `sft_type` is `'lora'`, it is set to `1e-4`; if `sft_type` is `'full'`, it is set to `2e-5`.
+- `--weight_decay`: Default is `0.01`.
+- `--gradient_accumulation_steps`: Gradient accumulation, default is `16`. `total_batch_size = batch_size * gradient_accumulation_steps * world_size`.
+- `--max_grad_norm`: Gradient clipping, default is `1`.
+- `--predict_with_generate`: Whether to use a generative approach during evaluation, default is `False`. If set to `False`, it uses `loss` for evaluation. If set to `True`, it uses metrics such as `ROUGE-L` for evaluation. Note that using the generative approach for evaluation is time-consuming, so use it with caution.
+- `--lr_scheduler_type`: Default is `'cosine'`.
+- `--warmup_ratio`: Ratio of warmup steps to the total training steps, default is `0.05`.
+- `--eval_steps`: Perform evaluation every specified number of steps, default is `50`.
+- `--save_steps`: Save the model every specified number of steps, default is `None`, which sets it to `eval_steps`.
+- `--only_save_model`: Whether to only save the model parameters, without storing the intermediate states required for resuming training. The default value is `None`: if `sft_type` is 'lora', it is set to False; if `sft_type` is 'full', it is set to True.
+- `--save_total_limit`: The number of checkpoints to save. The default value is `2`, which saves the best and last checkpoints. If set to -1, it saves all checkpoints.
+- `--logging_steps`: Print training information (e.g., loss, learning rate) every specified number of steps. The default value is `5`.
+- `--dataloader_num_workers`: The number of worker processes to use for data loading. The default value is `1`.
+- `--push_to_hub`: Whether to synchronize the training checkpoints to the ModelScope Hub. The default value is `False`.
+- `--hub_model_id`: The model id of the ModelScope Hub to push to. The default value is `None`, which is set to `f'{model_type}-{sft_type}'`. You can set it to a specific model id or repository name. The user name will be inferred from the `hub_token`. If the remote repository does not exist, a new repository will be created. If it exists, the previous repository will be reused. This parameter only takes effect when `push_to_hub` is set to True.
+- `--hub_private_repo`: Whether to set the permission of the model repository in the ModelScope Hub to private. The default value is `True`. This parameter only takes effect when `push_to_hub` is set to True.
+- `--push_hub_strategy`: The push strategy. The default value is `'push_best'`. Available options are: 'end', 'push_best', 'push_last', 'checkpoint', 'all_checkpoints'. 'push_best' means that the best model will be pushed and overwrite the previous weights every time the weights are saved. 'push_last' means that the last weights will be pushed and overwrite the previous weights every time the weights are saved. This parameter only takes effect when `push_to_hub` is set to True.
+- `--hub_token`: The SDK token required for pushing to the ModelScope Hub. You can obtain it from https://modelscope.cn/my/myaccesstoken. The default value is `None`, which retrieves the token from the environment variable `MODELSCOPE_API_TOKEN`. This parameter only takes effect when `push_to_hub` is set to True.
+- `--test_oom_error`: Used to check if training will encounter an out-of-memory (OOM) error. The default value is `False`. If set to True, the training set will be sorted in reverse order of `max_length` to facilitate OOM testing. This parameter is generally used for testing, so please use it with caution.
+- `--use_flash_attn`: Whether to use flash attention. The default value is `None`, which means 'auto'. This parameter only takes effect when `model_type.startswith('qwen')` is True. For installation steps of flash attention, please refer to https://github.com/Dao-AILab/flash-attention.
+- `--max_new_tokens`: The maximum number of new tokens to generate. The default value is `1024`. This parameter only takes effect when `predict_with_generate` is set to True.
+- `--do_sample`: Whether to use sampling during generation. The default value is `True`. This parameter only takes effect when `predict_with_generate` is set to True.
+- `--temperature`: The temperature value for sampling during generation. The default value is `0.9`. This parameter only takes effect when `predict_with_generate` is set to True.
+- `--top_k`: The value of k for top-k sampling during generation. The default value is `20`. This parameter only takes effect when `predict_with_generate` is set to True.
+- `--top_p`: The cumulative probability threshold for top-p sampling during generation. The default value is `0.9`. This parameter only takes effect when `predict_with_generate` is set to True.
+- `--repetition_penalty`: The repetition penalty applied during generation. The default value is `1.0`. This parameter only takes effect when `predict_with_generate` is set to True.
+
+
+### infer.sh Command Line Arguments
+- `--model_type`: Default value is `'qwen-7b-chat'`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
+- `--sft_type`: Default value is `'lora'`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
+- `--template_type`: Default value is `None`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
+- `--ckpt_dir`: Required field, value is the checkpoint path saved during the SFT phase, e.g., `'/path/to/your/vx_xxx/checkpoint-xxx'`.
+- `--eval_human`: Whether to evaluate manually instead of using the validation split of the dataset. Default value is `False`. Manual evaluation gives an intuitive sense of the model's performance after fine-tuning.
+- `--seed`: Default value is `42`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
+- `--dtype`: Default value is `'bf16'`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
+- `--ignore_args_error`: Default value is `False`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
+- `--dataset`: Default value is `'advertise-gen'`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. This parameter only takes effect when `eval_human` is set to False.
+- `--dataset_split_seed`: Default value is `42`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. This parameter only takes effect when `eval_human` is set to False.
+- `--dataset_test_ratio`: Default value is `0.01`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. This parameter only takes effect when `eval_human` is set to False.
+- `--show_dataset_sample`: Indicates the number of samples from the validation set to evaluate and display. Default value is `20`. This parameter only takes effect when `eval_human` is set to False.
+- `--system`: Default value is `'you are a helpful assistant!'`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
+- `--max_length`: Default value is `2048`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
+- `--quantization_bit`: Default value is `0`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
+- `--bnb_4bit_comp_dtype`: Default value is `None`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. This parameter is not effective if `quantization_bit` is set to 0.
+- `--bnb_4bit_quant_type`: Default value is `'nf4'`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. This parameter is not effective if `quantization_bit` is set to 0.
+- `--bnb_4bit_use_double_quant`: Default value is `True`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. This parameter is not effective if `quantization_bit` is set to 0.
+- `--max_new_tokens`: Maximum number of new tokens to generate. Default value is `1024`.
+- `--do_sample`: Whether to use sampling (`True`) or greedy decoding (`False`) during generation. Default value is `True`.
+- `--temperature`: Default value is `0.9`. This parameter only takes effect when `do_sample` is set to True.
+- `--top_k`: Default value is `20`. This parameter only takes effect when `do_sample` is set to True.
+- `--top_p`: Default value is `0.9`. This parameter only takes effect when `do_sample` is set to True.
+- `--repetition_penalty`: Default value is `1.0`.
+- `--use_flash_attn`: Default value is `None`, which means 'auto'. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
+- `--use_streamer`: Whether to use streaming output. Default value is `True`.
+- `--merge_lora_and_save`: Whether to merge the lora weights into the base model and save the complete weights. Default value is `False`. The weights will be saved in a directory named `checkpoint-xxx-merged` at the same level as `ckpt_dir`, e.g., `'/path/to/your/vx_xxx/checkpoint-xxx-merged'`.
+- `--save_generation_config`: Whether to save the generation_config used for evaluation as a `generation_config.json` file. Default value is `True`.
diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
index f6a4b0d9df..554dcb344a 100644
--- a/examples/pytorch/llm/README_CN.md
+++ b/examples/pytorch/llm/README_CN.md
@@ -1,13 +1,13 @@
-
Examples of Large Model Fine-tuning
-
+
+
ModelScope Community
@@ -16,24 +16,48 @@
## Features
-1. Supported SFT methods: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), full parameter fine-tuning
-2. Supported models:
- 1. qwen series: [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)
- 2. qwen-vl series: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)
- 3. baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)
- 4. chatglm2 series: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)
- 5. llama series: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
- 6. openbuddy-llama series: [openbuddy-llama2-13b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)
- 7. internlm series: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
- 8. xverse series: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)
- 9. other: [polylm-13b](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary), [seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)
-3. Supported features: model quantization, DDP, model parallelism (device_map), gradient checkpointing, gradient accumulation, pushing to the ModelScope Hub, custom datasets, multimodal and Agent SFT, multi-round chat, ...
-4. Supported datasets:
- 1. NLP: [alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), [alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), [finance-en](https://www.modelscope.cn/datasets/wyj123456/finance_en/summary), [multi-alpaca-all](https://www.modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary), [code-en](https://www.modelscope.cn/datasets/wyj123456/code_alpaca_en/summary), [instinwild-en](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [instinwild-zh](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [cot-en](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [cot-zh](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [firefly-all-zh](https://www.modelscope.cn/datasets/wyj123456/firefly/summary), [poetry-zh](https://www.modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary), [instruct-en](https://www.modelscope.cn/datasets/wyj123456/instruct/summary), [gpt4all-en](https://www.modelscope.cn/datasets/wyj123456/GPT4all/summary), [cmnli-zh](https://www.modelscope.cn/datasets/modelscope/clue/summary), [jd-zh](https://modelscope.cn/datasets/DAMO_NLP/jd/summary), [dureader-robust-zh](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary), [medical-en](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-mini-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [sharegpt-en](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [sharegpt-zh](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [code-python-zh](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), [advertise-gen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)
- 2. Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), [damo-agent-mini-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)
- 3. Multimodal: [coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
- 4. Other: [cls-fudan-news-zh](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/files), [ner-jave-zh](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)
-5. Supported chat templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, default-generation
+- Supported SFT methods: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), full parameter fine-tuning
+- Supported features: model quantization, DDP, model parallelism, gradient checkpointing, gradient accumulation, pushing to the ModelScope Hub, custom datasets, multimodal and Agent SFT, multi-round chat, ...
+- Supported models:
+  - qwen series: [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)
+  - qwen-vl series: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)
+  - baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)
+  - chatglm2 series: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)
+  - llama series: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
+  - openbuddy-llama series: [openbuddy-llama2-13b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)
+  - internlm series: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
+  - xverse series: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)
+ - other: [polylm-13b](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary), [seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)
+- Supported datasets:
+  - NLP:
+    - General: [alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), [alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), [multi-alpaca-all](https://www.modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary), [instinwild-en](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [instinwild-zh](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [cot-en](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [cot-zh](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [firefly-all-zh](https://www.modelscope.cn/datasets/wyj123456/firefly/summary), [instruct-en](https://www.modelscope.cn/datasets/wyj123456/instruct/summary), [gpt4all-en](https://www.modelscope.cn/datasets/wyj123456/GPT4all/summary), [sharegpt-en](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [sharegpt-zh](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary)
+    - Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), [damo-agent-mini-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)
+    - Code: [code-en](https://www.modelscope.cn/datasets/wyj123456/code_alpaca_en/summary), [code-python-zh](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), [leetcode-python-en](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)
+    - Medical: [medical-en](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-mini-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary)
+    - Law: [lawyer-llama-zh](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary), [tigerbot-law-zh](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)
+    - Math: [blossom-math-zh](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2/summary), [school-math-zh](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M/summary)
+    - SQL: [text2sql-en](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2/summary), [sql-create-context-en](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context/summary)
+    - Text generation: [advertise-gen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary), [dureader-robust-zh](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary)
+    - Classification: [cmnli-zh](https://www.modelscope.cn/datasets/modelscope/clue/summary), [jd-zh](https://modelscope.cn/datasets/DAMO_NLP/jd/summary)
+    - Other: [finance-en](https://www.modelscope.cn/datasets/wyj123456/finance_en/summary), [poetry-zh](https://www.modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary), [cls-fudan-news-zh](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/files), [ner-jave-zh](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)
+  - Multimodal: [coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
+  - Custom datasets
+- Supported chat templates:
+  - Text generation: default-generation, chatglm2-generation
+  - Chat: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, internlm, xverse
+
+
+## News
+- 2023.10.4: Added more datasets in the math, law, SQL, and code domains: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en.
+- 2023.9.26: Added support for the xverse series models: xverse-7b, xverse-7b-chat, xverse-13b, xverse-13b-chat.
+- 2023.9.25: Added support for the **qwen-14b** series models: qwen-14b, qwen-14b-chat.
+- 2023.9.20: After training with LoRA or QLoRA, the incremental weights can now be merged into the base model weights and saved as complete model weights, which is convenient for deployment.
+- 2023.9.18: Added support for the internlm-20b series models: internlm-20b, internlm-20b-chat.
+- 2023.9.12: Added support for MP+DDP training to speed up full parameter fine-tuning; see `scripts/qwen_7b_chat/full_mp_ddp/sft.sh` for the corresponding sh script.
+- 2023.9.5: Added support for saving only the model weights during training, without the optimizer weights and other intermediate states needed for resuming training, which avoids the long save time and large disk usage of full parameter checkpoints.
+- 2023.9.5: Added support for the openbuddy-llama2-70b model.
+- 2023.9.3: Added support for the baichuan-13b series models: baichuan-13b, baichuan-13b-chat.
+
## Preparing the Experimental Environment
Experimental environment: V100, A10, 3090, and A100 all work.
@@ -47,7 +71,7 @@ sh Miniconda3-latest-Linux-x86_64.sh
conda create --name ms-sft python=3.10
conda activate ms-sft
-# Set a global pip mirror and install the related python packages
+# Set a global pip mirror and install the related python packages (pay attention to matching the cuda version)
pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
@@ -58,19 +82,20 @@ cd examples/pytorch/llm
pip install -r requirements.txt -U
```
+
## Fine-tuning and Inference
Performance: full (best) > lora > qlora
Training GPU memory: qlora (low, 3090) > lora > full (2*A100)
-Note:
-1. You can save GPU memory by setting `--gradient_checkpointing true`, but this slightly slows down training.
-2. If you want to push the weights to the modelscope hub during training, you need to set `--push_to_hub true`.
-3. If you want to merge the LoRA weights and save them at inference time, you need to set `--merge_lora_and_save true`.
-4. If you want to use quantization, you need to install bnb first: `pip install bitsandbytes -U`.
-5. If you are using an older GPU such as the V100, you need to set `--dtype fp16`, because it does not support bf16.
-6. For qwen we recommend installing [flash-attn](https://github.com/Dao-AILab/flash-attention), which speeds up training and inference and reduces GPU memory usage (machines such as V100, 3090, and A10 do not support flash-attn).
-7. The following provides runnable sh scripts for `qwen_7b_chat` (you only need to specify `ckpt_dir` at inference time to run them). For the scripts of more models, see the `scripts` folder. If you want to customize a sh script, we recommend referring to the scripts in `scripts/qwen_7b_chat`.
+Tips:
+- You can save GPU memory during training by setting `--gradient_checkpointing true`, but this slightly slows down training.
+- If you want to push the weights to the ModelScope Hub during training, you need to set `--push_to_hub true`.
+- If you want to merge the LoRA weights and save them at inference time, you need to set `--merge_lora_and_save true`.
+- If you want to use quantization, you need to install bnb first: `pip install bitsandbytes -U`.
+- If you are using an older GPU such as the V100, you need to set `--dtype fp16`, because it does not support bf16.
+- If your machine has high-performance GPUs such as the A100 and you are using a qwen series model, we recommend installing [flash-attn](https://github.com/Dao-AILab/flash-attention), which speeds up training and inference and reduces GPU memory usage (GPUs such as V100, 3090, and A10 do not support training with flash-attn).
+- The following provides runnable sh scripts for `qwen_7b_chat` (you only need to specify `ckpt_dir` at inference time to run them). For the scripts of more models, see the `scripts` folder. If you want to customize a sh script, we recommend referring to the scripts in `scripts/qwen_7b_chat`.
```bash
# Fine-tuning (lora) + inference for qwen-7b-chat; requires 38GB GPU memory.
# Recommended experimental environment: A100
@@ -108,7 +133,119 @@ bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh
bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh
```
-## Extending Datasets
-1. If you want to extend the datasets, you can modify `DATASET_MAPPING` in the `utils/dataset.py` file and add a mapping, where the key is the name of the dataset and the value is a function that fetches the dataset and must return an `HfDataset`. For instruction fine-tuning (single-round dialogue), it needs to contain the `query` and `response` fields, representing the user query and the AI assistant's answer respectively; see the `alpaca-zh` dataset for details. For multi-round dialogue, an additional `history` field is required; see the `damo-agent-mini-zh` dataset. If each dataset sample has a different `system`, an additional system field is required.
-2. If you want to extend the models, you can modify `MODEL_MAPPING` in the `utils/model.py` file. `model_id` can be specified as a local path, in which case the `revision` parameter has no effect.
-3. If you want to extend the templates, you can modify `TEMPLATE_MAPPING` in the `utils/preprocess.py` file.
+
+## Usage Documentation
+### Introduction to MODEL_MAPPING (Model Extension)
+`MODEL_MAPPING` is defined in `utils/model.py` and is used to load the various base models. If you need to **extend the models**, you can add entries to it. The key is the unique id of the model, and the value is the model's configuration. The configuration fields are listed below, followed by a sketch of a new entry.
+
+- `model_id`: Required. The model_id of the model on the ModelScope Hub, or a local model directory.
+- `revision`: Specifies the version of the model. If model_id is a local model directory, this parameter has no effect; otherwise it is required.
+- `get_function`: The function that fetches the model and tokenizer. By default `get_model_tokenizer_from_repo` is used, which returns the model and tokenizer. If you need to set flash_attn or patch the model code, you can implement this through a custom function.
+- `lora_TM`: The lora_target_modules used by default; in our setup this is set to qkv.
+- `template`: The chat template used by default, e.g. chatml, baichuan. If not set, the `default` chat template is used.
+- `ignore_file_pattern`: The file patterns to ignore during download; this parameter is passed to `snapshot_download`. For example `r'.+\.bin$'`, `r'.+\.safetensors$'`.
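+
+For example, a minimal sketch of registering a new model might look as follows. The id `my-model-7b`, its `model_id`, and the chosen field values are hypothetical placeholders, not real entries:
+
+```python
+# In examples/pytorch/llm/src/utils/model.py (sketch; names are hypothetical).
+MODEL_MAPPING['my-model-7b'] = {
+    'model_id': 'my-org/my-model-7b',  # ModelScope model_id, or a local directory
+    'revision': 'v1.0.0',  # required unless model_id is a local path
+    'get_function': get_model_tokenizer_from_repo,  # the default loader
+    'lora_TM': LoRATM.qwen,  # default lora_target_modules (qkv)
+    'template': 'chatml',  # falls back to 'default' if omitted
+    'ignore_file_pattern': [r'.+\.bin$'],  # files to skip in snapshot_download
+}
+```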
+
+
+### Introduction to DATASET_MAPPING (Dataset Extension)
+`DATASET_MAPPING` is defined in `utils/dataset.py` and is used to load all kinds of data, e.g. single-round instruction fine-tuning datasets, multi-round chat datasets, multimodal datasets, etc. If you need to **extend the datasets**, you can add entries here. The key is the unique id of the dataset, e.g. alpaca-en, alpaca-zh. The value is a function that fetches the dataset; it takes no arguments and must return an `HfDataset` or a `Tuple[HfDataset, HfDataset]`. In the first case, the dataset processing function splits off part of the dataset as the validation set (according to the command line hyperparameter `dataset_test_ratio`); in the second case, the two returned datasets are used as the training set and validation set respectively. Fine-tuning with multiple datasets is supported: the training and validation parts of the sub-datasets are concatenated separately, and the merged training set and validation set are returned.
+
+The `HfDataset` returned by the function must follow certain conventions. For instruction fine-tuning (single-round dialogue), it must contain the `query` and `response` fields, representing the user query and the AI assistant's answer respectively; see the `alpaca-zh` dataset for details. For multi-round dialogue, an additional `history` field representing the dialogue history is required; see the `damo-agent-mini-zh` dataset. If each dataset sample has a different `system`, an additional system field is required; for this you can also refer to the `damo-agent-mini-zh` dataset. The loss is computed and optimized only on the `response` part. A sketch of a new entry follows.
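+
+As an illustration, a minimal sketch of registering a new single-round dataset might look as follows. The id `my-qa-en`, the function name, and the inline data are hypothetical placeholders:
+
+```python
+# In examples/pytorch/llm/src/utils/dataset.py (sketch; names are hypothetical).
+from datasets import Dataset as HfDataset
+
+def get_my_qa_dataset() -> HfDataset:
+    # Any loading logic works here, as long as the result has
+    # 'query' and 'response' columns ('history' for multi-round data).
+    data = {
+        'query': ['What is 1 + 1?', 'Name a prime number.'],
+        'response': ['1 + 1 = 2.', '2 is a prime number.'],
+    }
+    return HfDataset.from_dict(data)
+
+# key: unique dataset id; value: a zero-argument function returning an
+# HfDataset, or a Tuple[HfDataset, HfDataset] of train/validation splits.
+DATASET_MAPPING['my-qa-en'] = get_my_qa_dataset
+```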
+
+
+### Introduction to TEMPLATE_MAPPING (Chat Template Extension)
+`TEMPLATE_MAPPING` is defined in `utils/preprocess.py` and is used to preprocess text into a token list. If you need to **extend the chat templates**, you can add entries here. The key is the unique id of the chat template, e.g. 'default', 'chatml'. The value is the template's configuration, consisting of 'prefix', 'prompt', 'chat_sep', and 'suffix'. From these four parts the module builds the complete chat template, supporting pre-training, text-generation-style SFT, and the various chat-style SFT setups. The meaning of the four fields is as follows, with a sketch after the list.
+
+- `prefix`: The prefix part of the chat template, typically the system part with its formatting, prefix tokens, the bos token, etc. We use `{{SYSTEM}}` as the placeholder for the system part.
+- `prompt`: One round of dialogue in the chat template. We use `{{QUERY}}` as the placeholder for the human query in each round, and `{{ROUND}}` as the placeholder for the round number, counted from 1. The AI assistant's reply is concatenated directly after `prompt`, so it has no placeholder of its own.
+- `chat_sep`: For multi-round dialogue, `chat_sep` is the separator between rounds, e.g. a newline.
+- `suffix`: The suffix part of the chat template, typically the eos token, appended after the last round. Only the response part of the last round and `suffix` contribute to the loss that is computed and optimized; no loss is computed on the remaining parts.
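+
+For example, a hypothetical template entry might look as follows. The id `my-template` and the concrete strings are placeholders, and the exact value structure should follow the existing entries in `utils/preprocess.py` (here assumed to be a dict of string fragments):
+
+```python
+# In examples/pytorch/llm/src/utils/preprocess.py (sketch; names are hypothetical).
+TEMPLATE_MAPPING['my-template'] = {
+    # system part and any prefix/bos tokens; {{SYSTEM}} is replaced by the system text
+    'prefix': ['{{SYSTEM}}\n\n'],
+    # one round of dialogue; {{ROUND}} counts from 1, {{QUERY}} is the user query;
+    # the assistant reply is concatenated directly after this prompt
+    'prompt': ['Round {{ROUND}}:\nHuman: {{QUERY}}\nAssistant: '],
+    # separator between rounds in multi-round dialogue
+    'chat_sep': ['\n'],
+    # appended after the last round; usually the eos token
+    'suffix': ['</s>'],
+}
+```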
+
+
+### sft.sh Command Line Arguments
+- `--model_type`: The type of model to use, default is `'qwen-7b-chat'`. The available values of `model_type` can be found in `MODEL_MAPPING.keys()`.
+- `--sft_type`: The fine-tuning method, default is `'lora'`. The available values are: 'lora', 'full'. If you want to use lora or qlora, select `--sft_type lora`; qlora additionally requires `--quantization_bit 4`. If you want to use full parameter fine-tuning, select `--sft_type full`.
+- `--tuner_bankend`: The backend support for lora and qlora, default is `'swift'`. The available values are: 'swift', 'peft'.
+- `--template_type`: The type of chat template to use, default is `None`, which means the `template` in `MODEL_MAPPING` is looked up based on `model_type`. The available values of `template_type` can be found in `TEMPLATE_MAPPING.keys()` in `utils/preprocess.py`. By changing it, you can support pre-training, text-generation-style SFT, and the various chat-style SFT setups.
+- `--output_dir`: The directory for storing checkpoints, default is `'output'`. The `model_type` and a fine-tuning version number are appended to this directory, making it convenient to run repeated comparison experiments on different models without changing the `output_dir` command line argument.
+- `--ddp_backend`: The distributed backend, default is `'nccl'`. The available values are: 'nccl', 'gloo', 'mpi', 'ccl'.
+- `--seed`: The global seed, default is 42. In distributed training, to avoid situations such as every process using the same dropout, we set `seed=seed+rank`.
+- `--resume_from_ckpt`: Used for resuming training from a checkpoint, default is `None`. You can set it to a checkpoint path, e.g. `'output/qwen-7b-chat/vx_xxx/checkpoint-xxx'`, to resume training.
+- `--dtype`: The torch_dtype used when loading the base model, default is `'bf16'`. The available values are: 'bf16', 'fp16', 'fp32'.
+- `--ignore_args_error`: Whether to ignore the Error raised on command line argument mistakes, default is `False`. Set it to True if you need to copy the code into a notebook to run it.
+- `--dataset`: The dataset(s) used for training, default is `'advertise-gen'`. The available datasets can be found in `DATASET_MAPPING.keys()`. To train with multiple datasets, separate them with `,`, e.g. `alpaca-en,alpaca-zh`.
+- `--dataset_split_seed`: The seed for splitting a sub-dataset into training and validation sets, default is `42`. This parameter has no effect if the sub-dataset is already split. When `dataset` specifies multiple sub-datasets and the function fetching a sub-dataset performs no train/validation split (i.e. it returns an `HfDataset` rather than a `Tuple[HfDataset, HfDataset]`), we need to split that sub-dataset. Finally, the training and validation parts of the sub-datasets are concatenated separately to produce the training and validation sets of the complete fine-tuning dataset.
+- `--dataset_test_ratio`: The ratio for splitting a sub-dataset into training and validation sets, default is `0.01`. This parameter has no effect if the sub-dataset is already split. See the `dataset_split_seed` section for more details.
+- `--train_dataset_sample`: Samples the complete training set, default is `20000`, to speed up training. This parameter exists to avoid an overly long single epoch on a very large dataset. LoRA usually converges quickly and does not need that many samples for fine-tuning. Setting it to `-1` uses the complete training set, which typically occurs in the full parameter fine-tuning setting.
+- `--system`: The system used in the chat template, default is `'you are a helpful assistant!'`.
+- `--max_length`: The maximum token length, default is `2048`. This avoids OOM problems caused by individual overly long samples. If a sample exceeds max_length, the front-most tokens are cut off: `input_ids[-max_length:]`.
+- `--quantization_bit`: Specifies whether to quantize and with how many bits, default is `0`, i.e. no quantization. With quantization, only the lora fine-tuning method is supported, not full parameter fine-tuning.
+- `--bnb_4bit_comp_dtype`: When performing 4-bit quantization, the weights are dequantized during the model's forward and backward passes. This parameter specifies the torch_dtype after dequantization. Default is `None`, i.e. consistent with `dtype`. The available values are: 'fp16', 'bf16', 'fp32'. This parameter has no effect when quantization_bit is 0.
+- `--bnb_4bit_quant_type`: The quantization type for 4-bit quantization, default is `'nf4'`. The available values are: 'nf4', 'fp4'. This parameter has no effect when quantization_bit is 0.
+- `--bnb_4bit_use_double_quant`: Whether to enable double quantization for 4-bit quantization, default is `True`. This parameter has no effect when quantization_bit is 0.
+- `--lora_target_modules`: Specifies the lora modules, default is `None`, which means the `lora_TM` entry (qkv by default) is looked up in `MODEL_MAPPING` based on `model_type`. If `ALL` is passed, all Linear layers (excluding the head) are designated as lora modules. This parameter only takes effect when `sft_type` is 'lora'.
+- `--lora_rank`: Default is `8`. Only takes effect when `sft_type` is 'lora'.
+- `--lora_alpha`: Default is `32`. Only takes effect when `sft_type` is 'lora'.
+- `--lora_dropout_p`: Default is `0.`. Only takes effect when `sft_type` is 'lora'.
+- `--gradient_checkpointing`: Whether to enable gradient checkpointing, default is `False`. This parameter can be used to save GPU memory, although it slightly slows down training. It is particularly effective when max_length and batch_size are large.
+- `--batch_size`: The batch_size during training, default is `1`. Increasing the batch_size improves GPU utilization but does not necessarily speed up training, because within a batch the shorter sentences are padded to the length of the longest sentence in that batch, introducing wasted computation.
+- `--eval_batch_size`: The batch_size during evaluation, default is `None`: it is set to 1 when `predict_with_generate` is True, and to `batch_size` when it is False.
+- `--num_train_epochs`: The number of training epochs, default is `1`. If `max_steps >= 0`, it overrides `num_train_epochs`.
+- `--max_steps`: The max_steps of training, default is `-1`. If `max_steps >= 0`, it overrides `num_train_epochs`.
+- `--optim`: Default is `'adamw_torch'`.
+- `--learning_rate`: Default is `None`: it is set to 1e-4 if `sft_type` is lora, and to 2e-5 if `sft_type` is full.
+- `--weight_decay`: Default is `0.01`.
+- `--gradient_accumulation_steps`: Gradient accumulation, default is `16`. `total_batch_size = batch_size * gradient_accumulation_steps * world_size`.
+- `--max_grad_norm`: Gradient clipping, default is `1`.
+- `--predict_with_generate`: Whether to use a generative approach during evaluation, default is `False`. If set to False, `loss` is used for evaluation. If set to True, metrics such as `ROUGE-L` are used. Generative evaluation is very time-consuming, so choose it carefully.
+- `--lr_scheduler_type`: Default is `'cosine'`.
+- `--warmup_ratio`: The ratio of warmup steps to the total training steps, default is `0.05`.
+- `--eval_steps`: Evaluate every this many training steps, default is `50`.
+- `--save_steps`: Save every this many training steps, default is `None`, i.e. set to `eval_steps`.
+- `--only_save_model`: Whether to save only the model parameters, without the intermediate states needed for resuming training, default is `None`: it is set to False if `sft_type` is 'lora', and to True if `sft_type` is 'full'.
+- `--save_total_limit`: The number of checkpoints to keep, default is `2`, i.e. the best and the last checkpoint. If set to -1, all checkpoints are kept.
+- `--logging_steps`: Print training information (e.g. loss, learning_rate) every this many steps, default is `5`.
+- `--dataloader_num_workers`: Default is `1`.
+- `--push_to_hub`: Whether to synchronize the training checkpoints to the ModelScope Hub, default is `False`.
+- `--hub_model_id`: The model_id on the ModelScope Hub to push to, default is `None`, i.e. set to `f'{model_type}-{sft_type}'`. You can set it to a model_id or a repo_name. The user_name is inferred from the hub_token. If the target remote repository does not exist, a new one is created; if it exists, it is reused. This parameter only takes effect when `push_to_hub` is True.
+- `--hub_private_repo`: Whether the pushed model repository on the ModelScope Hub is private, default is `True`. Only takes effect when `push_to_hub` is True.
+- `--push_hub_strategy`: The push strategy, default is `'push_best'`. The available values are: 'end', 'push_best', 'push_last', 'checkpoint', 'all_checkpoints'. 'push_best' pushes the best model and overwrites the previous weights every time the weights are saved; 'push_last' pushes the last weights and overwrites the previous weights every time the weights are saved. Only takes effect when `push_to_hub` is True.
+- `--hub_token`: The SDK token needed for pushing. It can be obtained from [https://modelscope.cn/my/myaccesstoken](https://modelscope.cn/my/myaccesstoken), default is `None`, i.e. read from the environment variable `MODELSCOPE_API_TOKEN`. Only takes effect when `push_to_hub` is True.
+- `--test_oom_error`: Used to check whether training will run out of memory, default is `False`. If set to True, the training set is sorted in descending order of max_length to facilitate OOM testing. This parameter is generally used for testing, so set it carefully.
+- `--use_flash_attn`: Whether to use flash attn, default is `None`, i.e. 'auto'. Only takes effect when `model_type.startswith('qwen')`. See [https://github.com/Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention) for the installation steps of flash_attn.
+- `--max_new_tokens`: Default is `1024`. Only takes effect when `predict_with_generate` is True.
+- `--do_sample`: Default is `True`. Only takes effect when `predict_with_generate` is True.
+- `--temperature`: Default is `0.9`. Only takes effect when `predict_with_generate` is True.
+- `--top_k`: Default is `20`. Only takes effect when `predict_with_generate` is True.
+- `--top_p`: Default is `0.9`. Only takes effect when `predict_with_generate` is True.
+- `--repetition_penalty`: Default is `1.`. Only takes effect when `predict_with_generate` is True.
+
+
+### infer.sh Command Line Arguments
+- `--model_type`: Default is `'qwen-7b-chat'`. See the `sft.sh Command Line Arguments` for details.
+- `--sft_type`: Default is `'lora'`. See the `sft.sh Command Line Arguments` for details.
+- `--template_type`: Default is `None`. See the `sft.sh Command Line Arguments` for details.
+- `--ckpt_dir`: Required. The checkpoint path saved in the SFT phase, e.g. `'/path/to/your/vx_xxx/checkpoint-xxx'`.
+- `--eval_human`: Whether to evaluate manually instead of using the validation split of the dataset, default is `False`. Manual evaluation gives an intuitive sense of the fine-tuned model's performance.
+- `--seed`: Default is `42`. See the `sft.sh Command Line Arguments` for details.
+- `--dtype`: Default is `'bf16'`. See the `sft.sh Command Line Arguments` for details.
+- `--ignore_args_error`: Default is `False`. See the `sft.sh Command Line Arguments` for details.
+- `--dataset`: Default is `'advertise-gen'`. See the `sft.sh Command Line Arguments` for details. Only takes effect when `eval_human` is False.
+- `--dataset_split_seed`: Default is `42`. See the `sft.sh Command Line Arguments` for details. Only takes effect when `eval_human` is False.
+- `--dataset_test_ratio`: Default is `0.01`. See the `sft.sh Command Line Arguments` for details. Only takes effect when `eval_human` is False.
+- `--show_dataset_sample`: The number of validation-set samples to evaluate and display, default is `20`. Only takes effect when `eval_human` is False.
+- `--system`: Default is `'you are a helpful assistant!'`. See the `sft.sh Command Line Arguments` for details.
+- `--max_length`: Default is `2048`. See the `sft.sh Command Line Arguments` for details.
+- `--quantization_bit`: Default is `0`. See the `sft.sh Command Line Arguments` for details.
+- `--bnb_4bit_comp_dtype`: Default is `None`. See the `sft.sh Command Line Arguments` for details. Has no effect if `quantization_bit` is 0.
+- `--bnb_4bit_quant_type`: Default is `'nf4'`. See the `sft.sh Command Line Arguments` for details. Has no effect if `quantization_bit` is 0.
+- `--bnb_4bit_use_double_quant`: Default is `True`. See the `sft.sh Command Line Arguments` for details. Has no effect if `quantization_bit` is 0.
+- `--max_new_tokens`: The maximum number of new tokens to generate, default is `1024`.
+- `--do_sample`: Whether to use sampling (`True`) or greedy decoding (`False`) for generation, default is `True`.
+- `--temperature`: Default is `0.9`. Only takes effect when `do_sample` is True.
+- `--top_k`: Default is `20`. Only takes effect when `do_sample` is True.
+- `--top_p`: Default is `0.9`. Only takes effect when `do_sample` is True.
+- `--repetition_penalty`: Default is `1.`.
+- `--use_flash_attn`: Default is `None`, i.e. 'auto'. See the `sft.sh Command Line Arguments` for details.
+- `--use_streamer`: Whether to use streaming output, default is `True`.
+- `--merge_lora_and_save`: Whether to merge the lora weights into the base model and save the complete weights, default is `False`. The weights are saved in a `checkpoint-xxx-merged` directory at the same level as `ckpt_dir`, e.g. `'/path/to/your/vx_xxx/checkpoint-xxx-merged'`.
+- `--save_generation_config`: Whether to save the generation_config used for evaluation as a `generation_config.json` file, default is `True`.
diff --git a/examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/infer.sh b/examples/pytorch/llm/scripts/openbuddy_llama2_70b/qlora/infer.sh
similarity index 100%
rename from examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/infer.sh
rename to examples/pytorch/llm/scripts/openbuddy_llama2_70b/qlora/infer.sh
diff --git a/examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/sft.sh b/examples/pytorch/llm/scripts/openbuddy_llama2_70b/qlora/sft.sh
similarity index 100%
rename from examples/pytorch/llm/scripts/openbuddy-llama2-70b/qlora/sft.sh
rename to examples/pytorch/llm/scripts/openbuddy_llama2_70b/qlora/sft.sh
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh
new file mode 100644
index 0000000000..f874a0c199
--- /dev/null
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh
@@ -0,0 +1,21 @@
+# If you want to merge LoRA weight and save it, you need to set `--merge_lora_and_save true`.
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python src/llm_infer.py \
+ --model_type qwen-14b-chat \
+ --sft_type lora \
+ --template_type chatml \
+ --dtype bf16 \
+ --ckpt_dir "output/qwen-14b-chat/vx_xxx/checkpoint-xxx" \
+ --eval_human false \
+ --dataset blossom-math-zh \
+ --max_length 2048 \
+ --quantization_bit 4 \
+ --bnb_4bit_comp_dtype bf16 \
+ --use_flash_attn false \
+ --max_new_tokens 1024 \
+ --temperature 0.9 \
+ --top_k 20 \
+ --top_p 0.9 \
+ --do_sample true \
+ --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/sft.sh
new file mode 100644
index 0000000000..12d203fc6c
--- /dev/null
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/sft.sh
@@ -0,0 +1,36 @@
+# Experimental environment: A10 or 3090
+# 16GB GPU memory
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python src/llm_sft.py \
+ --model_type qwen-14b-chat \
+ --sft_type lora \
+ --template_type chatml \
+ --dtype bf16 \
+ --output_dir output \
+ --dataset blossom-math-zh \
+ --train_dataset_sample -1 \
+ --num_train_epochs 1 \
+ --max_length 2048 \
+ --quantization_bit 4 \
+ --bnb_4bit_comp_dtype bf16 \
+ --lora_rank 8 \
+ --lora_alpha 32 \
+ --lora_dropout_p 0. \
+ --lora_target_modules ALL \
+ --gradient_checkpointing true \
+ --batch_size 1 \
+ --weight_decay 0. \
+ --learning_rate 1e-4 \
+ --gradient_accumulation_steps 16 \
+ --max_grad_norm 0.5 \
+ --warmup_ratio 0.03 \
+ --eval_steps 100 \
+ --save_steps 100 \
+ --save_total_limit 2 \
+ --logging_steps 10 \
+ --use_flash_attn false \
+ --push_to_hub false \
+ --hub_model_id qwen-14b-chat-qlora \
+ --hub_private_repo true \
+ --hub_token 'your-sdk-token' \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh
index 48225aaf5c..a93f46b9ac 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh
@@ -8,12 +8,12 @@ python src/llm_infer.py \
--dtype bf16 \
--ckpt_dir "output/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
--eval_human false \
- --dataset advertise-gen \
- --max_length 2048 \
+ --dataset leetcode-python-en \
+ --max_length 4096 \
--quantization_bit 4 \
--bnb_4bit_comp_dtype bf16 \
--use_flash_attn false \
- --max_new_tokens 1024 \
+ --max_new_tokens 2048 \
--temperature 0.9 \
--top_k 20 \
--top_p 0.9 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh
index 7bbdf7d9bb..11d3172dd7 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/sft.sh
@@ -1,5 +1,5 @@
-# Experimental environment: 3090
-# 10GB GPU memory
+# Experimental environment: A10 or 3090
+# 20GB GPU memory
PYTHONPATH=../../.. \
CUDA_VISIBLE_DEVICES=0 \
python src/llm_sft.py \
@@ -8,10 +8,10 @@ python src/llm_sft.py \
--template_type chatml \
--dtype bf16 \
--output_dir output \
- --dataset advertise-gen \
+ --dataset leetcode-python-en \
--train_dataset_sample -1 \
--num_train_epochs 1 \
- --max_length 2048 \
+ --max_length 4096 \
--quantization_bit 4 \
--bnb_4bit_comp_dtype bf16 \
--lora_rank 8 \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh
new file mode 100644
index 0000000000..5360f1197a
--- /dev/null
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh
@@ -0,0 +1,21 @@
+# If you want to merge LoRA weight and save it, you need to set `--merge_lora_and_save true`.
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python src/llm_infer.py \
+ --model_type qwen-vl-chat \
+ --sft_type lora \
+ --template_type chatml \
+ --dtype bf16 \
+ --ckpt_dir "output/qwen-vl-chat/vx_xxx/checkpoint-xxx" \
+ --eval_human false \
+ --dataset coco-en \
+ --max_length 2048 \
+ --quantization_bit 4 \
+ --bnb_4bit_comp_dtype bf16 \
+ --use_flash_attn false \
+ --max_new_tokens 1024 \
+ --temperature 0.9 \
+ --top_k 20 \
+ --top_p 0.9 \
+ --do_sample true \
+ --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh
new file mode 100644
index 0000000000..b37e9799c2
--- /dev/null
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh
@@ -0,0 +1,36 @@
+# Experimental environment: A10
+# 10GB GPU memory
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python src/llm_sft.py \
+ --model_type qwen-vl-chat \
+ --sft_type lora \
+ --template_type chatml \
+ --dtype bf16 \
+ --output_dir output \
+ --dataset coco-en \
+ --train_dataset_sample 20000 \
+ --num_train_epochs 1 \
+ --max_length 2048 \
+ --quantization_bit 4 \
+ --bnb_4bit_comp_dtype bf16 \
+ --lora_rank 8 \
+ --lora_alpha 32 \
+ --lora_dropout_p 0. \
+ --lora_target_modules c_attn attn.c_proj \
+ --gradient_checkpointing true \
+ --batch_size 1 \
+ --weight_decay 0. \
+ --learning_rate 1e-4 \
+ --gradient_accumulation_steps 16 \
+ --max_grad_norm 0.5 \
+ --warmup_ratio 0.03 \
+ --eval_steps 100 \
+ --save_steps 100 \
+ --save_total_limit 2 \
+ --logging_steps 10 \
+ --use_flash_attn false \
+ --push_to_hub false \
+ --hub_model_id qwen-vl-chat-qlora \
+ --hub_private_repo true \
+ --hub_token 'your-sdk-token' \
diff --git a/examples/pytorch/llm/src/utils/argument.py b/examples/pytorch/llm/src/utils/argument.py
index def36c0042..03f528144d 100644
--- a/examples/pytorch/llm/src/utils/argument.py
+++ b/examples/pytorch/llm/src/utils/argument.py
@@ -80,7 +80,7 @@ class SftArguments:
eval_steps: int = 50
save_steps: Optional[int] = None
only_save_model: Optional[bool] = None
- save_total_limit: int = 2 # save last and best
+ save_total_limit: int = 2 # save last and best. -1: all checkpoints
logging_steps: int = 5
dataloader_num_workers: int = 1
@@ -193,6 +193,8 @@ def init_argument(self):
self.eval_batch_size = 1
else:
self.eval_batch_size = self.batch_size
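+        # A value of -1 is mapped to None: the HF Trainer keeps all checkpoints when save_total_limit is None.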
+ if self.save_total_limit == -1:
+ self.save_total_limit = None
@dataclass
diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py
index 6804a9dca4..96a20168dd 100644
--- a/examples/pytorch/llm/src/utils/dataset.py
+++ b/examples/pytorch/llm/src/utils/dataset.py
@@ -22,7 +22,7 @@ def _preprocess_alpaca_dataset(
preprocess_input: Optional[Callable[[str], str]] = None) -> HfDataset:
query: List[str] = []
response = []
- for d in dataset:
+ for d in tqdm(dataset):
inst, inp, output = d['instruction'], d['input'], d['output']
if output is None:
continue
@@ -493,19 +493,17 @@ def get_sharegpt_all_en_dataset():
def get_cls_fudan_news_zh() -> HfDataset:
"""Sequence Classification """
dataset = MsDataset.load('damo/zh_cls_fudan-news').to_hf_dataset()
- return HfDataset.from_dict({
- 'query': dataset['prompt'],
- 'response': dataset['answer']
- })
+ dataset = dataset.rename_column('prompt', 'query')
+ dataset = dataset.rename_column('answer', 'response')
+ return dataset
def get_ner_jave_zh() -> HfDataset:
"""Named Entity Recognition"""
dataset = MsDataset.load('damo/zh_ner-JAVE').to_hf_dataset()
- return HfDataset.from_dict({
- 'query': dataset['prompt'],
- 'response': dataset['answer']
- })
+ dataset = dataset.rename_column('prompt', 'query')
+ dataset = dataset.rename_column('answer', 'response')
+ return dataset
def _preprocess_code_python_dataset(dataset: HfDataset) -> HfDataset:
@@ -525,6 +523,93 @@ def get_code_python_zh_dataset() -> HfDataset:
return _preprocess_code_python_dataset(dataset)
+def get_blossom_math_v2_dataset() -> HfDataset:
+ dataset = MsDataset.load('AI-ModelScope/blossom-math-v2').to_hf_dataset()
+ query = []
+ response = []
+    for d in dataset:
+ query.append(d['input'])
+ output, answer = d['output'], d['answer']
+ response.append(f'{output}\n\nAnswer: {answer}')
+ return HfDataset.from_dict({'query': query, 'response': response})
+
+
+def get_school_math_dataset() -> HfDataset:
+ dataset = MsDataset.load('AI-ModelScope/school_math_0.25M').to_hf_dataset()
+ return _preprocess_alpaca_dataset(dataset)
+
+
+def get_text2sql_v2_en_dataset() -> HfDataset:
+ dataset = MsDataset.load(
+ 'AI-ModelScope/texttosqlv2_25000_v2').to_hf_dataset()
+ return _preprocess_alpaca_dataset(dataset)
+
+
+def get_sql_create_context_dataset() -> HfDataset:
+ dataset = MsDataset.load(
+ 'AI-ModelScope/sql-create-context').to_hf_dataset()
+ dataset = dataset.rename_column('question', 'instruction')
+ dataset = dataset.rename_column('context', 'input')
+ dataset = dataset.rename_column('answer', 'output')
+ return _preprocess_alpaca_dataset(dataset)
+
+
+def get_lawyer_llama_dataset() -> HfDataset:
+ dataset = MsDataset.load('AI-ModelScope/lawyer_llama_data').to_hf_dataset()
+ query = []
+ response = []
+ for d in tqdm(dataset):
+ h = d['history']
+ h = ast.literal_eval(h)
+ if len(h) > 0:
+ continue # ignore dirty data
+ query.append(d['instruction'])
+ response.append(d['output'])
+ return HfDataset.from_dict({'query': query, 'response': response})
+
+
+def get_tigerbot_law_plugin() -> HfDataset:
+ """Pretrain Fromat"""
+ dataset = MsDataset.load(
+ 'AI-ModelScope/tigerbot-law-plugin').to_hf_dataset()
+ prompt = """Type: {type}
+Title: {title}
+"""
+ response = []
+ for d in tqdm(dataset):
+ cur_prompt = prompt.format(type=d['type'], title=d['title'])
+ for i in range(1, 4):
+ chapter = d[f'chapter{i}']
+ if chapter is not None:
+ cur_prompt += f'Chapter{i}: {chapter}'
+ cur_prompt += f'Content: {d["content"]}'
+ response.append(cur_prompt)
+ return HfDataset.from_dict({
+ 'query': [''] * len(response),
+ 'response': response,
+ })
+
+
+def get_leetcode_python_dataset() -> HfDataset:
+ dataset = MsDataset.load(
+ 'AI-ModelScope/leetcode-solutions-python').to_hf_dataset()
+ query = []
+ response = []
+ for d in dataset:
+ code_with_problem = d['code_with_problem']
+ idx = code_with_problem.find('```python')
+ idx2 = code_with_problem.rfind('```python')
+ assert idx == idx2
+ problem = code_with_problem[:idx]
+ if problem.startswith('# '):
+ problem = problem[2:]
+ code = code_with_problem[idx:].strip()
+ explanation = d['explanation_only']
+ query.append(problem)
+ response.append(f'{code}\n\n{explanation}')
+ return HfDataset.from_dict({'query': query, 'response': response})
+
+
DATASET_MAPPING = {
# nlp chat
'alpaca-en':
@@ -562,6 +647,20 @@ def get_code_python_zh_dataset() -> HfDataset:
get_medical_dataset, subset_name='zh', train_dataset_sample=100000),
'code-python-zh':
get_code_python_zh_dataset,
+ 'blossom-math-zh':
+ get_blossom_math_v2_dataset,
+ 'school-math-zh':
+ get_school_math_dataset,
+ 'text2sql-en':
+ get_text2sql_v2_en_dataset,
+ 'sql-create-context-en':
+ get_sql_create_context_dataset,
+ 'lawyer-llama-zh':
+ get_lawyer_llama_dataset,
+ 'tigerbot-law-zh':
+ get_tigerbot_law_plugin,
+ 'leetcode-python-en':
+ get_leetcode_python_dataset,
# multi-round chat
'damo-agent-mini-zh':
diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py
index 194bddf18e..5123aac1fa 100644
--- a/examples/pytorch/llm/src/utils/model.py
+++ b/examples/pytorch/llm/src/utils/model.py
@@ -209,8 +209,15 @@ def get_model_tokenizer_qwen_vl(model_dir: str,
kwargs['quantization_config'].llm_int8_skip_modules = [
'lm_head', 'attn_pool.attn'
]
- return get_model_tokenizer_qwen(model_dir, torch_dtype, load_model,
- **kwargs)
+ model, tokenizer = get_model_tokenizer_qwen(model_dir, torch_dtype,
+ load_model, **kwargs)
+ first_drop = model.transformer.drop
+ if first_drop.p == 0.:
+ # fix gradient_checkpointing bug
+ _old_forward = first_drop.forward
+ first_drop.forward = lambda *args, **kwargs: _old_forward(
+ *args, **kwargs).clone()
+ return model, tokenizer
class LoRATM(NamedTuple):
@@ -279,26 +286,26 @@ class ResTunerTM(NamedTuple):
# qwen series
'qwen-7b': {
'model_id': 'qwen/Qwen-7B',
- 'revision': 'v1.1',
+ 'revision': 'v1.1.4',
'get_function': get_model_tokenizer_qwen,
'lora_TM': LoRATM.qwen,
},
'qwen-7b-chat': {
'model_id': 'qwen/Qwen-7B-Chat',
- 'revision': 'v1.1',
+ 'revision': 'v1.1.4',
'get_function': get_model_tokenizer_qwen,
'template': 'chatml',
'lora_TM': LoRATM.qwen,
},
'qwen-14b': {
'model_id': 'qwen/Qwen-14B',
- 'revision': 'v1.0.0',
+ 'revision': 'v1.0.4',
'get_function': get_model_tokenizer_qwen,
'lora_TM': LoRATM.qwen,
},
'qwen-14b-chat': {
'model_id': 'qwen/Qwen-14B-Chat',
- 'revision': 'v1.0.0',
+ 'revision': 'v1.0.4',
'get_function': get_model_tokenizer_qwen,
'template': 'chatml',
'lora_TM': LoRATM.qwen,
@@ -312,7 +319,7 @@ class ResTunerTM(NamedTuple):
},
'qwen-vl-chat': {
'model_id': 'qwen/Qwen-VL-Chat',
- 'revision': 'v1.0.3',
+ 'revision': 'v1.1.0',
'get_function': get_model_tokenizer_qwen_vl,
'template': 'chatml',
'lora_TM': LoRATM.qwen,
diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py
index 746bb098e6..f9bac30f73 100644
--- a/swift/trainers/trainers.py
+++ b/swift/trainers/trainers.py
@@ -104,6 +104,9 @@ def prediction_step(
gen_kwargs['pad_token_id'] = self.tokenizer.pad_token_id
gen_kwargs['eos_token_id'] = self.tokenizer.eos_token_id
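+        # Avoid passing max_length=None to model.generate when max_new_tokens is also provided.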
+ if ('max_length' in gen_kwargs and 'max_new_tokens' in gen_kwargs
+ and gen_kwargs['max_length'] is None):
+ gen_kwargs.pop('max_length')
gen_time = time.time()
generated_tokens = self.model.generate(**inputs, **gen_kwargs)
gen_time = time.time() - gen_time