From 0495c4c708035371f95ed36a00c4e8a0835dc7b8 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 10 Jan 2024 18:45:35 +0800
Subject: [PATCH 1/6] fix

---
 .../llm/scripts/dpo/{lora => lora_ddp_mp}/dpo.sh   | 14 ++++++++++----
 .../llm/scripts/dpo/{lora => lora_ddp_mp}/infer.sh |  0
 swift/llm/dpo.py                                   |  4 +++-
 swift/llm/tuner.py                                 |  2 +-
 swift/tuners/base.py                               |  4 ++--
 swift/tuners/neftune.py                            |  2 +-
 6 files changed, 17 insertions(+), 9 deletions(-)
 rename examples/pytorch/llm/scripts/dpo/{lora => lora_ddp_mp}/dpo.sh (76%)
 rename examples/pytorch/llm/scripts/dpo/{lora => lora_ddp_mp}/infer.sh (100%)

diff --git a/examples/pytorch/llm/scripts/dpo/lora/dpo.sh b/examples/pytorch/llm/scripts/dpo/lora_ddp_mp/dpo.sh
similarity index 76%
rename from examples/pytorch/llm/scripts/dpo/lora/dpo.sh
rename to examples/pytorch/llm/scripts/dpo/lora_ddp_mp/dpo.sh
index a1949b6ae3..c667dff744 100644
--- a/examples/pytorch/llm/scripts/dpo/lora/dpo.sh
+++ b/examples/pytorch/llm/scripts/dpo/lora_ddp_mp/dpo.sh
@@ -1,7 +1,13 @@
-# Experimental environment: 8*A100
-# Memory usage: 8 * 50G
+# Experimental environment: 4*A100
+# Memory usage: 4 * 20G
+nproc_per_node=2
+
 PYTHONPATH=../../.. \
-accelerate launch llm_dpo.py \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+torchrun \
+    --nproc_per_node=$nproc_per_node \
+    --master_port 29500 \
+    llm_dpo.py \
     --model_type  mistral-7b \
     --ref_model_type  mistral-7b \
     --model_revision  master \
@@ -25,7 +31,7 @@ accelerate launch llm_dpo.py \
     --batch_size  1  \
     --weight_decay  0.01  \
     --learning_rate  5e-5  \
-    --gradient_accumulation_steps  16  \
+    --gradient_accumulation_steps  $(expr 16 / $nproc_per_node)  \
     --max_grad_norm  1.0  \
     --warmup_ratio  0.03  \
     --eval_steps  2000  \
diff --git a/examples/pytorch/llm/scripts/dpo/lora/infer.sh b/examples/pytorch/llm/scripts/dpo/lora_ddp_mp/infer.sh
similarity index 100%
rename from examples/pytorch/llm/scripts/dpo/lora/infer.sh
rename to examples/pytorch/llm/scripts/dpo/lora_ddp_mp/infer.sh
diff --git a/swift/llm/dpo.py b/swift/llm/dpo.py
index 580e15ed85..7c88ce3649 100644
--- a/swift/llm/dpo.py
+++ b/swift/llm/dpo.py
@@ -31,7 +31,7 @@ def llm_dpo(args: DPOArguments) -> str:
 
     # Loading Model and Tokenizer
     model_kwargs = {'low_cpu_mem_usage': True}
-    if (is_dist() and not is_ddp_plus_mp()) or 'HF_ACCELERATOR' in os.environ:
+    if is_dist() and not is_ddp_plus_mp():
         model_kwargs['device_map'] = {'': local_rank}
     else:
         model_kwargs['device_map'] = 'auto'
@@ -61,6 +61,8 @@ def llm_dpo(args: DPOArguments) -> str:
         ref_model = deepcopy(model)
 
     logger.info(f'model_config: {model.config}')
+    if hasattr(model, 'hf_device_map'):
+        logger.info(f'model device_map {model.hf_device_map}')
     generation_config = GenerationConfig(
         max_new_tokens=args.max_new_tokens,
         temperature=args.temperature,
diff --git a/swift/llm/tuner.py b/swift/llm/tuner.py
index 7bd93beb0e..95989c8812 100644
--- a/swift/llm/tuner.py
+++ b/swift/llm/tuner.py
@@ -85,7 +85,7 @@ def prepare_model(model, args: SftArguments):
 
     if args.neftune_alpha > 0.001:
         neftune_config = NEFTuneConfig(noise_alpha=args.neftune_alpha)
-        model = Swift.prepare_model(model, neftune_config)
+        model = Swift.prepare_model(model, {'neftune': neftune_config})
         logger.info(f'neftune_config: {neftune_config}')
 
     class TrainerAdapterCallback(TrainerCallback):
diff --git a/swift/tuners/base.py b/swift/tuners/base.py
index d6810445a5..6e1f0edab7 100644
--- a/swift/tuners/base.py
+++ b/swift/tuners/base.py
@@ -56,7 +56,7 @@ def __init__(self,
                 new_adapters.append(DEFAULT_ADAPTER)
             else:
                 logger.warn(
-                    f'Adater {DEFAULT_ADAPTER} has been patched, skip.')
+                    f'Adapter {DEFAULT_ADAPTER} has been patched, skip.')
         elif isinstance(config, dict):
             assert (all(isinstance(c, SwiftConfig) for c in config.values()))
             for adapter_name, _config in config.items():
@@ -66,7 +66,7 @@ def __init__(self,
                     new_adapters.append(adapter_name)
                 else:
                     logger.warn(
-                        f'Adater {adapter_name} has been patched, skip.')
+                        f'Adapter {adapter_name} has been patched, skip.')
         self.model = model
 
         self.extra_state_keys = extra_state_keys or []
diff --git a/swift/tuners/neftune.py b/swift/tuners/neftune.py
index 300f6646e2..e49924e53a 100644
--- a/swift/tuners/neftune.py
+++ b/swift/tuners/neftune.py
@@ -55,7 +55,7 @@ def neftune_hook(module, args, output):
                 sub_module.nef_activated = True
 
         def state_dict_callback(state_dict, adapter_name):
-            return state_dict
+            return {}
 
         def mark_trainable_callback(model):
             return

From 1652466d59a89aacf98a39456de9b87da57dde35 Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Wed, 10 Jan 2024 19:36:13 +0800
Subject: [PATCH 2/6] add docs

---
 README.md                                     |  4 +-
 README_CN.md                                  |  4 +-
 ...55\347\273\203\346\216\250\347\220\206.md" |  5 ++
 ...56\350\260\203\346\226\207\346\241\243.md" | 85 ++++++++++++++++++-
 ...50\347\220\206\346\226\207\346\241\243.md" | 23 ++++-
 ...44\350\241\214\345\217\202\346\225\260.md" |  7 ++
 ...11\344\270\216\346\213\223\345\261\225.md" |  9 ++
 7 files changed, 129 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 7a9ef05053..4561df6bbe 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用
 - 2023.1.4: Support for **VLLM deployment**, compatible with the **OpenAI API** style. For more details, please refer to [VLLM Inference Acceleration and Deployment](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#部署)
 - 2023.1.4: Update [Benchmark](https://github.com/modelscope/swift/blob/main/docs/source/LLM/Benchmark.md) to facilitate viewing the training speed and GPU memory required for different models.
 - 🔥 2023.12.29: Support web-ui for training and inference, use `swift web-ui` after the installation of ms-swift.
-- 🔥 2023.12.29: Support DPO RLHF(Reinforcement Learning from Human Feedback) and two datasets: AI-ModelScope/stack-exchange-paired and AI-ModelScope/hh-rlhf for this task. Use [this script](https://github.com/modelscope/swift/blob/v1.5.0/examples/pytorch/llm/scripts/dpo/lora/dpo.sh) to start training!
+- 🔥 2023.12.29: Support DPO RLHF(Reinforcement Learning from Human Feedback) and two datasets: AI-ModelScope/stack-exchange-paired and AI-ModelScope/hh-rlhf for this task. Use [this script](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/dpo/lora_ddp_mp/dpo.sh) to start training!
 - 🔥 2023.12.28: Support SCEdit! This framework can easily reduce memory usage in training and inference, and replace ControlNet for controllable image generating scenarios, view the following chapter for details.
 - 2023.12.23: Support [codegeex2-6b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/codegeex2_6b).
 - 2023.12.19: Support [phi2-3b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/phi2_3b).
@@ -113,7 +113,7 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用
 - Quickly perform **inference** on LLM and build a **Web-UI**, see the [LLM Inference Documentation](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM推理文档.md).
 - Rapidly **fine-tune** and perform inference on LLM, and build a Web-UI, see the [LLM Fine-tuning Documentation](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM微调文档.md).
 - Using **interface** to fine-tuning and perform inference, see the [WEB-UI Documentation](https://github.com/modelscope/swift/blob/main/docs/source/GetStarted/%E7%95%8C%E9%9D%A2%E8%AE%AD%E7%BB%83%E6%8E%A8%E7%90%86.md).
-- **DPO training** supported, start by using [this script](https://github.com/modelscope/swift/blob/v1.5.0/examples/pytorch/llm/scripts/dpo/lora/dpo.sh).
+- **DPO training** supported, start by using [this script](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/dpo/lora_ddp_mp/dpo.sh).
 - Utilize VLLM for **inference acceleration** and **deployment(OpenAI API)**. Please refer to [VLLM Inference Acceleration and Deployment](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md) for more information.
 - View the models and datasets supported by Swift. You can check [supported models and datasets](https://github.com/modelscope/swift/blob/main/docs/source/LLM/支持的模型和数据集.md).
 - Expand and customize models, datasets, and dialogue templates in Swift, see [Customization and Expansion](https://github.com/modelscope/swift/blob/main/docs/source/LLM/自定义与拓展.md).
diff --git a/README_CN.md b/README_CN.md
index fe3bd5691f..d2bf279788 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -63,7 +63,7 @@ SWIFT（Scalable lightWeight Infrastructure for Fine-Tuning）是一个可扩展
 - 2023.1.4: 支持**VLLM部署**, 兼容**OpenAI API**样式, 具体可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#部署).
 - 2023.1.4: 更新[Benchmark](https://github.com/modelscope/swift/blob/main/docs/source/LLM/Benchmark.md), 方便查看不同模型训练的速度和所需显存.
 - 🔥 2023.12.29: 支持web-ui进行sft训练和推理，安装ms-swift后使用`swift web-ui`开启
-- 🔥 2023.12.29: 支持 DPO RLHF(Reinforcement Learning from Human Feedback) 和两个用于此任务的数据集: AI-ModelScope/stack-exchange-paired 以及 AI-ModelScope/hh-rlhf. 使用[这个脚本](https://github.com/modelscope/swift/blob/v1.5.0/examples/pytorch/llm/scripts/dpo/lora/dpo.sh)开启训练！
+- 🔥 2023.12.29: 支持 DPO RLHF(Reinforcement Learning from Human Feedback) 和两个用于此任务的数据集: AI-ModelScope/stack-exchange-paired 以及 AI-ModelScope/hh-rlhf. 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/dpo/lora_ddp_mp/dpo.sh)开启训练！
 - 🔥 2023.12.28: 支持SCEdit! 该tuner可显著降低U-Net中的显存占用，并支持低显存可控图像生成（取代ControlNet），阅读下面的章节来了解详细信息
 - 2023.12.23: 支持[codegeex2-6b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/codegeex2_6b).
 - 2023.12.19: 支持[phi2-3b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/phi2_3b).
@@ -111,7 +111,7 @@ SWIFT（Scalable lightWeight Infrastructure for Fine-Tuning）是一个可扩展
 - 快速对LLM进行**推理**, 搭建**Web-UI**, 可以查看[LLM推理文档](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM推理文档.md).
 - 快速对LLM进行**微调**, 推理并搭建Web-UI, 可以查看[LLM微调文档](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM微调文档.md).
 - 使用**界面**方式进行微调和推理, 可以查看[WEB-UI文档](https://github.com/modelscope/swift/blob/main/docs/source/GetStarted/%E7%95%8C%E9%9D%A2%E8%AE%AD%E7%BB%83%E6%8E%A8%E7%90%86.md).
-- 支持**DPO训练**, 使用[这个脚本](https://github.com/modelscope/swift/blob/v1.5.0/examples/pytorch/llm/scripts/dpo/lora/dpo.sh)开启训练
+- 支持**DPO训练**, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/dpo/lora_ddp_mp/dpo.sh)开启训练
 - 使用VLLM进行**推理加速**和**部署(OpenAI API)**. 可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md).
 - 查看swift支持的模型和数据集. 可以查看[支持的模型和数据集](https://github.com/modelscope/swift/blob/main/docs/source/LLM/支持的模型和数据集.md).
 - 对swift中的模型, 数据集, 对话模板进行**拓展**, 可以查看[自定义与拓展](https://github.com/modelscope/swift/blob/main/docs/source/LLM/自定义与拓展.md).
diff --git "a/docs/source/GetStarted/\347\225\214\351\235\242\350\256\255\347\273\203\346\216\250\347\220\206.md" "b/docs/source/GetStarted/\347\225\214\351\235\242\350\256\255\347\273\203\346\216\250\347\220\206.md"
index d7b14889dc..8ce33f7713 100644
--- "a/docs/source/GetStarted/\347\225\214\351\235\242\350\256\255\347\273\203\346\216\250\347\220\206.md"
+++ "b/docs/source/GetStarted/\347\225\214\351\235\242\350\256\255\347\273\203\346\216\250\347\220\206.md"
@@ -5,3 +5,8 @@ swift web-ui
 ```
 
 开启界面训练和推理。
+
+web-ui没有传入参数，所有可控部分都在界面中。但是有几个环境变量可以使用：
+
+> - `WEBUI_SHARE=1`: 控制gradio是否是share状态
+> - `SWIFT_UI_LANG=en/zh`: 控制web-ui界面语言
diff --git "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
index 87819b912e..4c9566bb4b 100644
--- "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
+++ "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
@@ -68,7 +68,63 @@ torch.cuda.empty_cache()
 app_ui_main(infer_args)
 ```
 
+## DPO（人类对齐训练）
+
+下面的shell脚本运行了一个人类对齐训练。首先需要切换到运行目录：
+
+```shell
+cd examples/pytorch/llm
+```
+
+运行下面的命令：
+
+```shell
+# Experimental environment: 4*A100
+# Memory usage: 4 * 20G
+nproc_per_node=2
+
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+torchrun \
+    --nproc_per_node=$nproc_per_node \
+    --master_port 29500 \
+    llm_dpo.py \
+    --model_type  mistral-7b \
+    --ref_model_type  mistral-7b \
+    --model_revision  master \
+    --sft_type  lora \
+    --tuner_backend  swift \
+    --dtype  AUTO  \
+    --output_dir  output  \
+    --dataset  hh-rlhf  \
+    --train_dataset_sample  -1  \
+    --truncation_strategy  truncation_left  \
+    --val_dataset_sample  2000  \
+    --num_train_epochs  3  \
+    --max_length  1024  \
+    --max_prompt_length  512  \
+    --check_dataset_strategy  none  \
+    --lora_rank  8  \
+    --lora_alpha  32  \
+    --lora_dropout_p  0.05  \
+    --lora_target_modules  ALL  \
+    --gradient_checkpointing  true  \
+    --batch_size  1  \
+    --weight_decay  0.01  \
+    --learning_rate  5e-5  \
+    --gradient_accumulation_steps  $(expr 16 / $nproc_per_node)  \
+    --max_grad_norm  1.0  \
+    --warmup_ratio  0.03  \
+    --eval_steps  2000  \
+    --save_steps  2000  \
+    --save_total_limit  2  \
+    --logging_steps  10 \
+```
+
+DPO训练需要在一张显卡上加载两个模型，因此推荐显存至少24G以上。DPO训练后的模型推理和SFT的推理流程相同。
+
 ### 使用CLI
+
 ```bash
 # Experimental environment: A10, 3090, V100, ...
 # 20GB GPU memory
@@ -307,13 +363,36 @@ swift merge-lora --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx-merged'
 ```
 
-## Web-UI
+## 界面
+
+目前界面化展示分为两个部分，分别是：
+
+```shell
+swift web-ui
+swift app-ui
+```
+
+其中，web-ui用于构建训练参数和训练后本地推理实验，app-ui用于将训练后模型发布创空间等。
+
+### web-ui
+
+web-ui没有传入参数，所有可控部分都在界面中。但是有几个环境变量可以使用：
+
+```text
+WEBUI_SHARE=1 控制gradio是否是share状态
+SWIFT_UI_LANG=en/zh 控制web-ui界面语言
+```
+
+### app-ui
+
 如果你要使用VLLM进行部署并提供**API**接口, 可以查看[VLLM推理加速与部署](./VLLM推理加速与部署.md#部署)
 
-### 原始模型
+#### 原始模型
+
 使用原始模型的web-ui可以查看[LLM推理文档](./LLM推理文档.md#-Web-UI)
 
-### 微调后模型
+#### 微调后模型
+
 ```bash
 # 直接使用app-ui
 CUDA_VISIBLE_DEVICES=0 swift app-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
diff --git "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"
index e5c212169b..c88fb76de6 100644
--- "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"
+++ "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"
@@ -401,8 +401,29 @@ CUDA_VISIBLE_DEVICES=0 swift infer --model_type yi-6b-chat
 如果你要使用微调后模型进行推理, 可以查看[LLM微调文档](./LLM微调文档.md#微调后模型)
 
 
-## Web-UI
+## 界面
+目前界面化展示分为两个部分，分别是：
+
+```shell
+swift web-ui
+swift app-ui
+```
+
+其中，web-ui用于构建训练参数和训练后本地推理实验，app-ui用于将训练后模型发布创空间等。
+
+## web-ui
+
+web-ui没有传入参数，所有可控部分都在界面中。但是有几个环境变量可以使用：
+
+```text
+WEBUI_SHARE=1 控制gradio是否是share状态
+SWIFT_UI_LANG=en/zh 控制web-ui界面语言
+```
+
+## app-ui
+
 ### qwen-7b-chat
+
 使用CLI:
 ```bash
 CUDA_VISIBLE_DEVICES=0 swift app-ui --model_type qwen-7b-chat
diff --git "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index a24e7541e1..f28e7bb83f 100644
--- "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -92,6 +92,13 @@
 - `--repetition_penalty`: 默认为`1.05`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
 - `--num_beams`: 默认为`1`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
 
+## DPO参数
+
+DPO参数继承了上面的SFT参数，除此之外增加了以下参数：
+
+- `--ref_model_type`: 对比模型(reference model)的类型, 可以选择的`model_type`可以查看`MODEL_MAPPING.keys()`.
+- `--max_prompt_length`: 最大的提示长度, 默认值为`1024`. 该参数会传入DPOTrainer中, 使prompt长度不超过该值.
+
 
 ## merge-lora infer app-ui 命令行参数
 - `--model_type`: 默认值为`None`, 具体的参数介绍可以在`sft.sh命令行参数`中查看.
diff --git "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md"
index 2a7dea3ca2..65dee97901 100644
--- "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md"
+++ "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md"
@@ -99,7 +99,16 @@ AAAAA,BBBBB,CCCCC
 {"messages": [{"role": "user", "content": "AAAAA"}, {"role": "assistant", "content": "BBBBB"}, {"role": "user", "content": "CCCCC"}, {"role": "assistant", "content": "DDDDD"}]}
 ```
 
+**强化学习（DPO）**
+
+```jsonl
+{"query": "11111", "response": "22222", "rejected_response": "33333"}
+{"query": "aaaaa", "response": "bbbbb", "rejected_response": "ccccc"}
+{"query": "AAAAA", "response": "BBBBB", "rejected_response": "CCCCC"}
+```
+
 ### 注册数据集的方式
+
 以下是一个**注册数据集**的案例. 完整的py文件可以查看[custom.py](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/custom.py), sh脚本可以查看[custom](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/custom).
 
 ```python

From 7e7b6270ee443fb35528c318be966a0c876b529e Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Wed, 10 Jan 2024 20:30:40 +0800
Subject: [PATCH 3/6] fix

---
 ...55\347\273\203\346\226\207\346\241\243.md" | 98 +++++++++++++++++++
 ...56\350\260\203\346\226\207\346\241\243.md" | 55 -----------
 examples/pytorch/llm/scripts/dpo/lora/dpo.sh  | 34 +++++++
 .../pytorch/llm/scripts/dpo/lora/infer.sh     | 14 +++
 4 files changed, 146 insertions(+), 55 deletions(-)
 create mode 100644 "docs/source/LLM/LLM\344\272\272\347\261\273\345\257\271\351\275\220\350\256\255\347\273\203\346\226\207\346\241\243.md"
 create mode 100644 examples/pytorch/llm/scripts/dpo/lora/dpo.sh
 create mode 100644 examples/pytorch/llm/scripts/dpo/lora/infer.sh

diff --git "a/docs/source/LLM/LLM\344\272\272\347\261\273\345\257\271\351\275\220\350\256\255\347\273\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\344\272\272\347\261\273\345\257\271\351\275\220\350\256\255\347\273\203\346\226\207\346\241\243.md"
new file mode 100644
index 0000000000..41c34bec4a
--- /dev/null
+++ "b/docs/source/LLM/LLM\344\272\272\347\261\273\345\257\271\351\275\220\350\256\255\347\273\203\346\226\207\346\241\243.md"
@@ -0,0 +1,98 @@
+# LLM人类对齐训练文档
+## 目录
+- [环境准备](#环境准备)
+- [人类对齐训练](#人类对齐训练)
+
+## 环境准备
+GPU设备: A10, 3090, V100, A100均可，如果是显存<=24G的GPU最少需要双卡环境。由于人类对齐训练在一张卡上加载两个模型，因此比微调的显存多占用一个推理模型的显存使用量。
+```bash
+# 设置pip全局镜像
+pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
+# 安装ms-swift
+git clone https://github.com/modelscope/swift.git
+cd swift
+pip install -e .[llm]
+
+# 环境对齐 (如果你运行错误, 可以跑下面的代码, 仓库使用最新环境测试)
+pip install -r requirements/framework.txt  -U
+pip install -r requirements/llm.txt  -U
+```
+
+## 人类对齐训练
+下面的shell脚本运行了一个人类对齐训练。首先需要切换到运行目录：
+
+```shell
+cd examples/pytorch/llm
+```
+
+运行下面的命令：
+
+```shell
+# Experimental environment: 4*A100
+# Memory usage: 4 * 20G (2个DDP进程, 每个进程通过device_map将模型切分到2张卡上)
+nproc_per_node=2
+
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+torchrun \
+    --nproc_per_node=$nproc_per_node \
+    --master_port 29500 \
+    llm_dpo.py \
+    --model_type  mistral-7b \
+    --ref_model_type  mistral-7b \
+    --model_revision  master \
+    --sft_type  lora \
+    --tuner_backend  swift \
+    --dtype  AUTO  \
+    --output_dir  output  \
+    --dataset  hh-rlhf  \
+    --train_dataset_sample  -1  \
+    --truncation_strategy  truncation_left  \
+    --val_dataset_sample  2000  \
+    --num_train_epochs  3  \
+    --max_length  1024  \
+    --max_prompt_length  512  \
+    --check_dataset_strategy  none  \
+    --lora_rank  8  \
+    --lora_alpha  32  \
+    --lora_dropout_p  0.05  \
+    --lora_target_modules  ALL  \
+    --gradient_checkpointing  true  \
+    --batch_size  1  \
+    --weight_decay  0.01  \
+    --learning_rate  5e-5  \
+    --gradient_accumulation_steps  $(expr 16 / $nproc_per_node)  \
+    --max_grad_norm  1.0  \
+    --warmup_ratio  0.03  \
+    --eval_steps  2000  \
+    --save_steps  2000  \
+    --save_total_limit  2  \
+    --logging_steps  10
+```
+
+### sh脚本
+
+sh脚本可以查看[这里](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/dpo)。
+
+```bash
+# 下面的脚本需要在此目录下执行
+cd examples/pytorch/llm
+```
+
+**提示**:
+
+- 我们默认在训练时设置`--gradient_checkpointing true`来**节约显存**, 这会略微降低训练速度.
+- 如果你使用的是**V100**等较老的GPU, 你需要设置`--dtype AUTO`或者`--dtype fp16`, 因为其不支持bf16.
+- 如果你的机器是A100等高性能显卡, 且使用的是qwen系列模型, 推荐你安装[**flash-attn**](https://github.com/Dao-AILab/flash-attention), 这将会加快训练和推理的速度并降低显存占用(A10, 3090, V100等显卡不支持flash-attn进行训练). 支持flash-attn的模型可以查看[LLM支持的模型](./支持的模型和数据集.md#模型).
+- 如果你需要断网进行训练, 请使用`--model_cache_dir`和设置`--check_model_is_latest false`. 具体参数含义请查看[命令行参数](./命令行参数.md).
+- 如果你想在训练时, 将权重push到ModelScope Hub中, 你需要设置`--push_to_hub true`.
+
+```bash
+# dpo训练 mistral-7b max_length=1024，bs=1
+# 推荐的实验环境: V100, A10, 3090，2卡4卡或8卡
+bash scripts/dpo/lora_ddp_mp/dpo.sh
+bash scripts/dpo/lora_ddp_mp/infer.sh
+```
+
+由于DPO训练后会得到一个完整模型或者adapter的weights，因此LoRA合并、推理的步骤和微调步骤相同，因此请参考[微调文档](./LLM微调文档#Merge LoRA)对应的步骤。
+
diff --git "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
index 4c9566bb4b..e88d953aa2 100644
--- "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
+++ "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
@@ -68,61 +68,6 @@ torch.cuda.empty_cache()
 app_ui_main(infer_args)
 ```
 
-## DPO（人类对齐训练）
-
-下面的shell脚本运行了一个人类对齐训练。首先需要切换到运行目录：
-
-```shell
-cd examples/pytorch/llm
-```
-
-运行下面的命令：
-
-```shell
-# Experimental environment: 4*A100
-# Memory usage: 4 * 20G
-nproc_per_node=2
-
-PYTHONPATH=../../.. \
-CUDA_VISIBLE_DEVICES=0,1,2,3 \
-torchrun \
-    --nproc_per_node=$nproc_per_node \
-    --master_port 29500 \
-    llm_dpo.py \
-    --model_type  mistral-7b \
-    --ref_model_type  mistral-7b \
-    --model_revision  master \
-    --sft_type  lora \
-    --tuner_backend  swift \
-    --dtype  AUTO  \
-    --output_dir  output  \
-    --dataset  hh-rlhf  \
-    --train_dataset_sample  -1  \
-    --truncation_strategy  truncation_left  \
-    --val_dataset_sample  2000  \
-    --num_train_epochs  3  \
-    --max_length  1024  \
-    --max_prompt_length  512  \
-    --check_dataset_strategy  none  \
-    --lora_rank  8  \
-    --lora_alpha  32  \
-    --lora_dropout_p  0.05  \
-    --lora_target_modules  ALL  \
-    --gradient_checkpointing  true  \
-    --batch_size  1  \
-    --weight_decay  0.01  \
-    --learning_rate  5e-5  \
-    --gradient_accumulation_steps  $(expr 16 / $nproc_per_node)  \
-    --max_grad_norm  1.0  \
-    --warmup_ratio  0.03  \
-    --eval_steps  2000  \
-    --save_steps  2000  \
-    --save_total_limit  2  \
-    --logging_steps  10 \
-```
-
-DPO训练需要在一张显卡上加载两个模型，因此推荐显存至少24G以上。DPO训练后的模型推理和SFT的推理流程相同。
-
 ### 使用CLI
 
 ```bash
diff --git a/examples/pytorch/llm/scripts/dpo/lora/dpo.sh b/examples/pytorch/llm/scripts/dpo/lora/dpo.sh
new file mode 100644
index 0000000000..22ff30b05e
--- /dev/null
+++ b/examples/pytorch/llm/scripts/dpo/lora/dpo.sh
@@ -0,0 +1,34 @@
+# Experimental environment: 4*A100
+# Memory usage: 4 * 20G
+PYTHONPATH=../../.. \
+python llm_dpo.py \
+    --model_type  mistral-7b \
+    --ref_model_type  mistral-7b \
+    --model_revision  master \
+    --sft_type  lora \
+    --tuner_backend  swift \
+    --dtype  AUTO  \
+    --output_dir  output  \
+    --dataset  hh-rlhf  \
+    --train_dataset_sample  -1  \
+    --truncation_strategy  truncation_left  \
+    --val_dataset_sample  2000  \
+    --num_train_epochs  3  \
+    --max_length  1024  \
+    --max_prompt_length  512  \
+    --check_dataset_strategy  none  \
+    --lora_rank  8  \
+    --lora_alpha  32  \
+    --lora_dropout_p  0.05  \
+    --lora_target_modules  ALL  \
+    --gradient_checkpointing  true  \
+    --batch_size  1  \
+    --weight_decay  0.01  \
+    --learning_rate  5e-5  \
+    --gradient_accumulation_steps  16  \
+    --max_grad_norm  1.0  \
+    --warmup_ratio  0.03  \
+    --eval_steps  2000  \
+    --save_steps  2000  \
+    --save_total_limit  2  \
+    --logging_steps  10 \
diff --git a/examples/pytorch/llm/scripts/dpo/lora/infer.sh b/examples/pytorch/llm/scripts/dpo/lora/infer.sh
new file mode 100644
index 0000000000..8ed9b69b6e
--- /dev/null
+++ b/examples/pytorch/llm/scripts/dpo/lora/infer.sh
@@ -0,0 +1,14 @@
+# Experimental environment: A10, 3090
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python llm_infer.py \
+    --ckpt_dir output/mistral-7b/vx-xxx-xxx/checkpoint-xxx \
+    --load_dataset_config true \
+    --eval_human true \
+    --use_flash_attn false \
+    --max_new_tokens 1024 \
+    --temperature 0.3 \
+    --top_p 0.7 \
+    --repetition_penalty 1.05 \
+    --do_sample true \
+    --merge_lora_and_save false \

From b636e83e12e82545176002d3a3f7964b4dbf2a19 Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Wed, 10 Jan 2024 20:34:17 +0800
Subject: [PATCH 4/6] pre-commit passed

---
 ...75\220\350\256\255\347\273\203\346\226\207\346\241\243.md" | 1 -
 examples/pytorch/llm/scripts/dpo/lora/dpo.sh                  | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git "a/docs/source/LLM/LLM\344\272\272\347\261\273\345\257\271\351\275\220\350\256\255\347\273\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\344\272\272\347\261\273\345\257\271\351\275\220\350\256\255\347\273\203\346\226\207\346\241\243.md"
index 41c34bec4a..6a9b57441a 100644
--- "a/docs/source/LLM/LLM\344\272\272\347\261\273\345\257\271\351\275\220\350\256\255\347\273\203\346\226\207\346\241\243.md"
+++ "b/docs/source/LLM/LLM\344\272\272\347\261\273\345\257\271\351\275\220\350\256\255\347\273\203\346\226\207\346\241\243.md"
@@ -95,4 +95,3 @@ bash scripts/dpo/lora_ddp_mp/infer.sh
 ```
 
 由于DPO训练后会得到一个完整模型或者adapter的weights，因此LoRA合并、推理的步骤和微调步骤相同，因此请参考[微调文档](./LLM微调文档#Merge LoRA)对应的步骤。
-
diff --git a/examples/pytorch/llm/scripts/dpo/lora/dpo.sh b/examples/pytorch/llm/scripts/dpo/lora/dpo.sh
index 22ff30b05e..815acd0cbb 100644
--- a/examples/pytorch/llm/scripts/dpo/lora/dpo.sh
+++ b/examples/pytorch/llm/scripts/dpo/lora/dpo.sh
@@ -1,5 +1,5 @@
-# Experimental environment: 4*A100
-# Memory usage: 4 * 20G
+# Experimental environment: 2*A100
+# Memory usage: 2 * 20G
 PYTHONPATH=../../.. \
 python llm_dpo.py \
     --model_type  mistral-7b \

From 37278c6f39e49773de1549a4132ffc937aabec8e Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Wed, 10 Jan 2024 20:37:49 +0800
Subject: [PATCH 5/6] fix index

---
 .../LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"   | 4 ++--
 .../LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
index e88d953aa2..b338679070 100644
--- "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
+++ "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
@@ -4,7 +4,7 @@
 - [微调](#微调)
 - [Merge LoRA](#merge-lora)
 - [推理](#推理)
-- [Web-UI](#web-ui)
+- [界面运行](#界面运行)
 
 ## 环境准备
 GPU设备: A10, 3090, V100, A100均可.
@@ -308,7 +308,7 @@ swift merge-lora --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx-merged'
 ```
 
-## 界面
+## 界面运行
 
 目前界面化展示分为两个部分，分别是：
 
diff --git "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"
index c88fb76de6..5d7c3b843a 100644
--- "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"
+++ "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"
@@ -4,7 +4,7 @@
 ## 目录
 - [环境准备](#环境准备)
 - [推理](#推理)
-- [Web-UI](#web-ui)
+- [界面推理](#界面推理)
 
 ## 环境准备
 GPU设备: A10, 3090, V100, A100均可.
@@ -401,7 +401,7 @@ CUDA_VISIBLE_DEVICES=0 swift infer --model_type yi-6b-chat
 如果你要使用微调后模型进行推理, 可以查看[LLM微调文档](./LLM微调文档.md#微调后模型)
 
 
-## 界面
+## 界面推理
 目前界面化展示分为两个部分，分别是：
 
 ```shell

From 70f4262ff454f647cf9602894939eefd552f9f4b Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Wed, 10 Jan 2024 20:49:28 +0800
Subject: [PATCH 6/6] revert files

---
 ...56\350\260\203\346\226\207\346\241\243.md" | 32 +++----------------
 ...50\347\220\206\346\226\207\346\241\243.md" | 25 ++-------------
 2 files changed, 6 insertions(+), 51 deletions(-)

diff --git "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
index b338679070..87819b912e 100644
--- "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
+++ "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
@@ -4,7 +4,7 @@
 - [微调](#微调)
 - [Merge LoRA](#merge-lora)
 - [推理](#推理)
-- [界面运行](#界面运行)
+- [Web-UI](#web-ui)
 
 ## 环境准备
 GPU设备: A10, 3090, V100, A100均可.
@@ -69,7 +69,6 @@ app_ui_main(infer_args)
 ```
 
 ### 使用CLI
-
 ```bash
 # Experimental environment: A10, 3090, V100, ...
 # 20GB GPU memory
@@ -308,36 +307,13 @@ swift merge-lora --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx-merged'
 ```
 
-## 界面运行
-
-目前界面化展示分为两个部分，分别是：
-
-```shell
-swift web-ui
-swift app-ui
-```
-
-其中，web-ui用于构建训练参数和训练后本地推理实验，app-ui用于将训练后模型发布创空间等。
-
-### web-ui
-
-web-ui没有传入参数，所有可控部分都在界面中。但是有几个环境变量可以使用：
-
-```text
-WEBUI_SHARE=1 控制gradio是否是share状态
-SWIFT_UI_LANG=en/zh 控制web-ui界面语言
-```
-
-### app-ui
-
+## Web-UI
 如果你要使用VLLM进行部署并提供**API**接口, 可以查看[VLLM推理加速与部署](./VLLM推理加速与部署.md#部署)
 
-#### 原始模型
-
+### 原始模型
 使用原始模型的web-ui可以查看[LLM推理文档](./LLM推理文档.md#-Web-UI)
 
-#### 微调后模型
-
+### 微调后模型
 ```bash
 # 直接使用app-ui
 CUDA_VISIBLE_DEVICES=0 swift app-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
diff --git "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"
index 5d7c3b843a..e5c212169b 100644
--- "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"
+++ "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"
@@ -4,7 +4,7 @@
 ## 目录
 - [环境准备](#环境准备)
 - [推理](#推理)
-- [界面推理](#界面推理)
+- [Web-UI](#web-ui)
 
 ## 环境准备
 GPU设备: A10, 3090, V100, A100均可.
@@ -401,29 +401,8 @@ CUDA_VISIBLE_DEVICES=0 swift infer --model_type yi-6b-chat
 如果你要使用微调后模型进行推理, 可以查看[LLM微调文档](./LLM微调文档.md#微调后模型)
 
 
-## 界面推理
-目前界面化展示分为两个部分，分别是：
-
-```shell
-swift web-ui
-swift app-ui
-```
-
-其中，web-ui用于构建训练参数和训练后本地推理实验，app-ui用于将训练后模型发布创空间等。
-
-## web-ui
-
-web-ui没有传入参数，所有可控部分都在界面中。但是有几个环境变量可以使用：
-
-```text
-WEBUI_SHARE=1 控制gradio是否是share状态
-SWIFT_UI_LANG=en/zh 控制web-ui界面语言
-```
-
-## app-ui
-
+## Web-UI
 ### qwen-7b-chat
-
 使用CLI:
 ```bash
 CUDA_VISIBLE_DEVICES=0 swift app-ui --model_type qwen-7b-chat