From ad3f9b1c618793eed93e0910e4ec57e622cf1237 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 27 Dec 2024 10:13:38 +0800 Subject: [PATCH 01/47] support ppo --- swift/llm/argument/rlhf_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 8d54f9eb78..68dbd9f7d6 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -48,7 +48,7 @@ def __post_init__(self): self._set_default() super().__post_init__() - if self.rlhf_type not in ['cpo', 'orpo', 'rm'] and (self.train_type == 'full' or self.rlhf_type == 'ppo'): + if self.rlhf_type in ['dpo', 'kto'] and self.train_type == 'full' or self.rlhf_type == 'ppo': self.ref_model = self.ref_model or self.model self.ref_model_type = self.ref_model_type or self.model_type self.ref_model_revision = self.ref_model_revision or self.model_revision From 1e0e17ddac7feebea3f187f726e24bdbb0e07e66 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 27 Dec 2024 10:33:35 +0800 Subject: [PATCH 02/47] update --- README.md | 4 ++-- README_CN.md | 4 ++-- ...71\211\346\225\260\346\215\256\351\233\206.md" | 2 +- ...77\253\351\200\237\345\274\200\345\247\213.md" | 2 +- docs/source/Instruction/ReleaseNote3.0.md | 7 +++---- ...73\244\350\241\214\345\217\202\346\225\260.md" | 2 +- docs/source_en/Customization/Custom-dataset.md | 2 +- docs/source_en/GetStarted/Quick-start.md | 2 +- .../Instruction/Command-line-parameters.md | 2 +- docs/source_en/Instruction/ReleaseNote3.0.md | 7 +++---- swift/llm/argument/rlhf_args.py | 15 ++++++++++++++- 11 files changed, 30 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index a572beaed2..a3ed32bb47 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ You can contact us and communicate with us by adding our group: - 🍊 **Lightweight Training**: Supports lightweight fine-tuning methods like LoRA, QLoRA, DoRA, LoRA+, ReFT, RS-LoRA, LLaMAPro, Adapter, GaLore, Q-Galore, LISA, UnSloth, Liger-Kernel. - **Distributed Training**: Supports distributed data parallel (DDP), device_map simple model parallelism, DeepSpeed ZeRO2/ZeRO3, FSDP, and other distributed training techniques. - **Quantization Training**: Supports training quantized models like BNB, AWQ, GPTQ, AQLM, HQQ, EETQ. -- **RLHF Training**: Supports human alignment training methods such as DPO, CPO, SimPO, ORPO, KTO, RM for both pure text and multi-modal large models. +- **RLHF Training**: Supports human alignment training methods such as DPO, CPO, SimPO, ORPO, KTO, RM, PPO for both pure text and multi-modal large models. - 🍓 **Multi-Modal Training**: Supports training on different modalities like images, videos, and audio, for tasks like VQA, captioning, OCR, and grounding. - **Interface Training**: Provides capabilities for training, inference, evaluation, quantization through an interface, completing the whole large model pipeline. - **Plugin and Extension**: Supports custom model and dataset extensions, as well as customization of components like loss, metric, trainer, loss-scale, callback, optimizer. @@ -83,7 +83,7 @@ You can contact us and communicate with us by adding our group: - 🎉 2024.08.12: The SWIFT paper has been published on arXiv, and you can read it [here](https://arxiv.org/abs/2408.05517). - 🔥 2024.08.05: Support for using [evalscope](https://github.com/modelscope/evalscope/) as a backend for evaluating large models and multimodal models. 
- 🔥 2024.07.29: Support for using [vllm](https://github.com/vllm-project/vllm) and [lmdeploy](https://github.com/InternLM/lmdeploy) to accelerate inference for large models and multimodal models. When performing infer/deploy/eval, you can specify `--infer_backend vllm/lmdeploy`. -- 🔥 2024.07.24: Support for human preference alignment training for multimodal large models, including DPO/ORPO/SimPO/CPO/KTO/RM. +- 🔥 2024.07.24: Support for human preference alignment training for multimodal large models, including DPO/ORPO/SimPO/CPO/KTO/RM/PPO. - 🔥 2024.02.01: Support for Agent training! The training algorithm is derived from [this paper](https://arxiv.org/pdf/2309.00986.pdf). diff --git a/README_CN.md b/README_CN.md index c3796e0a5f..fdfe9d5ed0 100644 --- a/README_CN.md +++ b/README_CN.md @@ -64,7 +64,7 @@ - 🍊 **轻量训练**:支持了LoRA、QLoRA、DoRA、LoRA+、ReFT、RS-LoRA、LLaMAPro、Adapter、GaLore、Q-Galore、LISA、UnSloth、Liger-Kernel等轻量微调方式。 - **分布式训练**:支持分布式数据并行(DDP)、device_map简易模型并行、DeepSpeed ZeRO2 ZeRO3、FSDP等分布式训练技术。 - **量化训练**:支持对BNB、AWQ、GPTQ、AQLM、HQQ、EETQ量化模型进行训练。 -- **RLHF训练**:支持纯文本大模型和多模态大模型的DPO、CPO、SimPO、ORPO、KTO、RM等人类对齐训练方法。 +- **RLHF训练**:支持纯文本大模型和多模态大模型的DPO、CPO、SimPO、ORPO、KTO、RM、PPO等人类对齐训练方法。 - 🍓 **多模态训练**:支持对图像、视频和语音不同模态模型进行训练,支持VQA、Caption、OCR、Grounding任务的训练。 - **界面训练**:以界面的方式提供训练、推理、评测、量化的能力,完成大模型的全链路。 - **插件化与拓展**:支持自定义模型和数据集拓展,支持对loss、metric、trainer、loss-scale、callback、optimizer等组件进行自定义。 @@ -78,7 +78,7 @@ - 🎉 2024.08.12: SWIFT论文已经发布到arXiv上,可以点击[这里](https://arxiv.org/abs/2408.05517)阅读。 - 🔥 2024.08.05: 支持使用[evalscope](https://github.com/modelscope/evalscope/)作为后端进行大模型和多模态模型的评测。 - 🔥 2024.07.29: 支持使用[vllm](https://github.com/vllm-project/vllm), [lmdeploy](https://github.com/InternLM/lmdeploy)对大模型和多模态大模型进行推理加速,在infer/deploy/eval时额外指定`--infer_backend vllm/lmdeploy`即可。 -- 🔥 2024.07.24: 支持对多模态大模型进行人类偏好对齐训练,包括DPO/ORPO/SimPO/CPO/KTO/RM。 +- 🔥 2024.07.24: 支持对多模态大模型进行人类偏好对齐训练,包括DPO/ORPO/SimPO/CPO/KTO/RM/PPO。 - 🔥 2024.02.01: 支持Agent训练!训练算法源自这篇[论文](https://arxiv.org/pdf/2309.00986.pdf)。 ## 🛠️ 安装 diff --git "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" index 16b54234be..df19d99391 100644 --- "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" @@ -53,7 +53,7 @@ query-response格式: ### RLHF -#### DPO/ORPO/CPO/SimPO/RM +#### DPO/ORPO/CPO/SimPO/RM/PPO ```jsonl {"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "告诉我明天的天气"}, {"role": "assistant", "content": "明天天气晴朗"}], "rejected_response": "我不知道"} diff --git "a/docs/source/GetStarted/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source/GetStarted/\345\277\253\351\200\237\345\274\200\345\247\213.md" index c69597316e..9715f248c3 100644 --- "a/docs/source/GetStarted/\345\277\253\351\200\237\345\274\200\345\247\213.md" +++ "b/docs/source/GetStarted/\345\277\253\351\200\237\345\274\200\345\247\213.md" @@ -8,7 +8,7 @@ ms-swift是魔搭社区提供的大模型与多模态大模型训练部署框架 - 🍊 轻量训练:支持了LoRA、QLoRA、DoRA、LoRA+、ReFT、RS-LoRA、LLaMAPro、Adapter、GaLore、Q-Galore、LISA、UnSloth、Liger-Kernel等轻量微调方式。 - 分布式训练:支持分布式数据并行(DDP)、device_map简易模型并行、DeepSpeed ZeRO2 ZeRO3、FSDP等分布式训练技术。 - 量化训练:支持对BNB、AWQ、GPTQ、AQLM、HQQ、EETQ量化模型进行训练。 -- RLHF训练:支持纯文本大模型和多模态大模型的DPO、CPO、SimPO、ORPO、KTO、RM等人类对齐训练方法。 +- 
RLHF训练:支持纯文本大模型和多模态大模型的DPO、CPO、SimPO、ORPO、KTO、RM、PPO等人类对齐训练方法。 - 🍓 多模态训练:支持对图像、视频和语音不同模态模型进行训练,支持VQA、Caption、OCR、Grounding任务的训练。 - 界面训练:以界面的方式提供训练、推理、评测、量化的能力,完成大模型的全链路。 - 插件化与拓展:支持自定义模型和数据集拓展,支持对loss、metric、trainer、loss-scale、callback、optimizer等组件进行自定义。 diff --git a/docs/source/Instruction/ReleaseNote3.0.md b/docs/source/Instruction/ReleaseNote3.0.md index feda24658a..85a8f7e8ad 100644 --- a/docs/source/Instruction/ReleaseNote3.0.md +++ b/docs/source/Instruction/ReleaseNote3.0.md @@ -81,7 +81,6 @@ ## 待完成 -1. RM/PPO能力3.0版本尚不支持,请使用2.6.1版本 -2. 自定义数据集评测3.0版本尚不支持,请使用2.6.1版本 -3. Megatron预训练能力3.0版本尚不支持,请使用2.6.1版本 -4. 文档和README,尤其是英文部分暂时未更新完整 +1. 自定义数据集评测3.0版本尚不支持,请使用2.6.1版本 +2. Megatron预训练能力3.0版本尚不支持,请使用2.6.1版本 +3. 文档和README暂时未更新完整 diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index db8dd42017..1c652751ff 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -304,7 +304,7 @@ Vera使用`target_modules`, `target_regex`, `modules_to_save`三个参数. ### RLHF参数 RLHF参数继承于[训练参数](#训练参数) -- 🔥rlhf_type: 对齐算法类型,支持`dpo`, `orpo`, `simpo`, `kto`, `cpo` +- 🔥rlhf_type: 对齐算法类型,支持`dpo`, `orpo`, `simpo`, `kto`, `cpo`, `rm`, `ppo` - ref_model: DPO等算法中的原始对比模型 - ref_model_type: 同model_type - ref_model_revision: 同model_revision diff --git a/docs/source_en/Customization/Custom-dataset.md b/docs/source_en/Customization/Custom-dataset.md index f7b883f46a..3bb38cfe3c 100644 --- a/docs/source_en/Customization/Custom-dataset.md +++ b/docs/source_en/Customization/Custom-dataset.md @@ -52,7 +52,7 @@ The following provides the recommended dataset format for ms-swift, where the sy ### RLHF -#### DPO/ORPO/CPO/SimPO/RM +#### DPO/ORPO/CPO/SimPO/RM/PPO ```jsonl {"messages": [{"role": "system", "content": "You are a useful and harmless assistant"}, {"role": "user", "content": "Tell me tomorrow's weather"}, {"role": "assistant", "content": "Tomorrow's weather will be sunny"}], "rejected_response": "I don't know"} diff --git a/docs/source_en/GetStarted/Quick-start.md b/docs/source_en/GetStarted/Quick-start.md index c410e4484e..38e69b32e9 100644 --- a/docs/source_en/GetStarted/Quick-start.md +++ b/docs/source_en/GetStarted/Quick-start.md @@ -8,7 +8,7 @@ ms-swift is a comprehensive training and deployment framework for large language - 🍊 Lightweight Training: Supports lightweight fine-tuning methods like LoRA, QLoRA, DoRA, LoRA+, ReFT, RS-LoRA, LLaMAPro, Adapter, GaLore, Q-Galore, LISA, UnSloth, Liger-Kernel, and more. - Distributed Training: Supports distributed data parallel (DDP), simple model parallelism via device_map, DeepSpeed ZeRO2 ZeRO3, FSDP, and other distributed training technologies. - Quantization Training: Provides training for quantized models like BNB, AWQ, GPTQ, AQLM, HQQ, EETQ. -- RLHF Training: Supports human alignment training methods like DPO, CPO, SimPO, ORPO, KTO, RM for both text-based and multimodal large models. +- RLHF Training: Supports human alignment training methods like DPO, CPO, SimPO, ORPO, KTO, RM, PPO for both text-based and multimodal large models. - 🍓 Multimodal Training: Capable of training models for different modalities such as images, videos, and audios; supports tasks like VQA (Visual Question Answering), Captioning, OCR (Optical Character Recognition), and Grounding. 
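A rough launch sketch for the newly listed `ppo` option, shown below for reference. The model, reward model and dataset IDs are illustrative choices only (the reward model ID is one of those registered later in this series), and `--kl_coef`/`--cliprange` simply restate their defaults:

```shell
# PPO sketch: the policy and the reward model share the internlm2 tokenizer.
swift rlhf \
    --rlhf_type ppo \
    --model Shanghai_AI_Laboratory/internlm2-chat-1_8b \
    --reward_model Shanghai_AI_Laboratory/internlm2-1_8b-reward \
    --dataset 'AI-ModelScope/alpaca-gpt4-data-en#1000' \
    --kl_coef 0.05 \
    --cliprange 0.2 \
    --num_train_epochs 1
```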
- Interface-driven Training: Offers training, inference, evaluation, and quantization capabilities through an interface, enabling a complete workflow for large models. - Plugins and Extensions: Allows customization and extension of models and datasets, and supports customizations for components like loss, metric, trainer, loss-scale, callback, optimizer, etc. diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 28be6ae393..d163b10a61 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -308,7 +308,7 @@ Training arguments include the [base arguments](#base-arguments), [Seq2SeqTraine RLHF arguments inherit from the [training arguments](#training-arguments). -- 🔥rlhf_type: Alignment algorithm type, supports `dpo`, `orpo`, `simpo`, `kto`, `cpo`. +- 🔥rlhf_type: Alignment algorithm type, supports `dpo`, `orpo`, `simpo`, `kto`, `cpo`, `rm`, `ppo`. - ref_model: Original comparison model in algorithms like DPO. - ref_model_type: Same as model_type. - ref_model_revision: Same as model_revision. diff --git a/docs/source_en/Instruction/ReleaseNote3.0.md b/docs/source_en/Instruction/ReleaseNote3.0.md index c6c1c9cec8..f46728886c 100644 --- a/docs/source_en/Instruction/ReleaseNote3.0.md +++ b/docs/source_en/Instruction/ReleaseNote3.0.md @@ -94,7 +94,6 @@ The parameters marked as compatible in version 2.0 have been entirely removed. ## Pending Tasks -1. RM/PPO capabilities are not supported in version 3.0. Please use version 2.6.1. -2. Custom dataset evaluation is not supported in version 3.0. Please use version 2.6.1. -3. Megatron pre-training capabilities are not supported in version 3.0. Please use version 2.6.1. -4. Documentation and README, especially the English portions, are temporarily incomplete and will be updated. +1. Custom dataset evaluation is not supported in version 3.0. Please use version 2.6.1. +2. Megatron pre-training capabilities are not supported in version 3.0. Please use version 2.6.1. +3. Documentation and README are temporarily incomplete and will be updated. diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 68dbd9f7d6..45924c3531 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -25,7 +25,7 @@ class RLHFArguments(TrainArguments): desirable_weight (float): Weight for desirable outcomes in KTO. Default is 1.0. undesirable_weight (float): Weight for undesirable outcomes in KTO. Default is 1.0. 
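        reward_model (Optional[str]): Reward model id or path used by PPO. Default is None.
        reward_model_type (Optional[str]): model_type of the reward model. Default is None.
        reward_model_revision (Optional[str]): Revision of the reward model. Default is None.
        local_rollout_forward_batch_size (int): Per-device forward batch size in the PPO rollout phase. Default is 64.
        kl_coef (float): Coefficient of the KL penalty against the reference model in PPO. Default is 0.05.
        cliprange (float): Clip range of the PPO policy loss. Default is 0.2.
        vf_coef (float): Coefficient of the value-function loss in PPO. Default is 0.1.
        cliprange_value (float): Clip range for value-function updates in PPO. Default is 0.2.
        gamma (float): Discount factor used in advantage estimation for PPO. Default is 1.0.
        lam (float): Lambda for generalized advantage estimation (GAE) in PPO. Default is 0.95.
        num_sample_generations (int): Number of sample generations logged during PPO training. Default is 10.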
""" - rlhf_type: Literal['dpo', 'orpo', 'simpo', 'kto', 'cpo', 'rm'] = 'dpo' + rlhf_type: Literal['dpo', 'orpo', 'simpo', 'kto', 'cpo', 'rm', 'ppo'] = 'dpo' ref_model: Optional[str] = None ref_model_type: Optional[str] = field( default=None, metadata={'help': f'model_type choices: {list(MODEL_MAPPING.keys())}'}) @@ -42,6 +42,19 @@ class RLHFArguments(TrainArguments): # KTO desirable_weight: float = 1.0 undesirable_weight: float = 1.0 + # PPO + reward_model: Optional[str] = None + reward_model_type: Optional[str] = field( + default=None, metadata={'help': f'model_type choices: {list(MODEL_MAPPING.keys())}'}) + reward_model_revision: Optional[str] = None + local_rollout_forward_batch_size: int = 64 + kl_coef: float = 0.05 + cliprange: float = 0.2 + cliprange_value: float = 0.2 + vf_coef: float = 0.1 + gamma: float = 1.0 + lam: float = 0.95 + num_sample_generations: int = 10 def __post_init__(self): self._init_simpo() From 29cd2d05dccae3869da5d5165866108344486c4b Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 27 Dec 2024 10:56:18 +0800 Subject: [PATCH 03/47] update --- requirements/framework.txt | 2 +- swift/llm/argument/rlhf_args.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/requirements/framework.txt b/requirements/framework.txt index ee068f99a5..aa45abbf71 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -30,5 +30,5 @@ tiktoken tqdm transformers>=4.33,<4.49 transformers_stream_generator -trl>=0.11,<0.12 +trl>=0.13,<0.14 uvicorn diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 45924c3531..edd6170843 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -50,14 +50,15 @@ class RLHFArguments(TrainArguments): local_rollout_forward_batch_size: int = 64 kl_coef: float = 0.05 cliprange: float = 0.2 - cliprange_value: float = 0.2 vf_coef: float = 0.1 + cliprange_value: float = 0.2 gamma: float = 1.0 lam: float = 0.95 num_sample_generations: int = 10 def __post_init__(self): self._init_simpo() + self._init_ppo() self._set_default() super().__post_init__() @@ -68,6 +69,10 @@ def __post_init__(self): elif self.ref_model is not None: raise ValueError('CPO/ORPO or LoRA training does not require a ref_model to be passed in.') + def _init_ppo(self): + pass + + def _init_simpo(self): if self.rlhf_type != 'simpo': return From 0492943fe04c49cddca97b68b5330ad5d6e9abfa Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 27 Dec 2024 14:52:40 +0800 Subject: [PATCH 04/47] update --- .../notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb | 2 +- swift/llm/argument/base_args/base_args.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb b/examples/notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb index 1ecd96cad3..fee5144b2f 100644 --- a/examples/notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb +++ b/examples/notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb @@ -10,7 +10,7 @@ "\n", "Are you ready? 
Let's begin the journey...\n", "\n", - "中文版:https://modelscope.cn/notebook/share/ipynb/4340fdeb/self-cognition-sft.ipynb" + "中文版:https://modelscope.cn/notebook/share/ipynb/313f6116/self-cognition-sft.ipynb" ] }, { diff --git a/swift/llm/argument/base_args/base_args.py b/swift/llm/argument/base_args/base_args.py index 5a2187dbe9..7e82845303 100644 --- a/swift/llm/argument/base_args/base_args.py +++ b/swift/llm/argument/base_args/base_args.py @@ -109,7 +109,8 @@ def _init_custom_register(self) -> None: folder, fname = os.path.split(path) sys.path.append(folder) __import__(fname.rstrip('.py')) - logger.info(f'Successfully registered `{self.custom_register_path}`') + if self.custom_register_path: + logger.info(f'Successfully registered `{self.custom_register_path}`') def _init_adapters(self): if isinstance(self.adapters, str): From 0214e497dd62ba231a1bd1c8201a3c3fe6e8d05e Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 27 Dec 2024 15:30:32 +0800 Subject: [PATCH 05/47] fix --- requirements/framework.txt | 2 +- swift/llm/argument/rlhf_args.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements/framework.txt b/requirements/framework.txt index aa45abbf71..ee068f99a5 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -30,5 +30,5 @@ tiktoken tqdm transformers>=4.33,<4.49 transformers_stream_generator -trl>=0.13,<0.14 +trl>=0.11,<0.12 uvicorn diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index edd6170843..fdc1862954 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -72,7 +72,6 @@ def __post_init__(self): def _init_ppo(self): pass - def _init_simpo(self): if self.rlhf_type != 'simpo': return From df183a24ea9f4d24ab75a948b02808f597a87ef2 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 27 Dec 2024 16:37:10 +0800 Subject: [PATCH 06/47] update --- swift/llm/argument/rlhf_args.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index fdc1862954..48fba539f4 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -70,7 +70,9 @@ def __post_init__(self): raise ValueError('CPO/ORPO or LoRA training does not require a ref_model to be passed in.') def _init_ppo(self): - pass + self.response_length = self.max_new_tokens + self.num_ppo_epochs = self.num_train_epochs + # TODO: streaming, MLLM def _init_simpo(self): if self.rlhf_type != 'simpo': From 0a54bb8fa20941c2b6cc036fd79011c86b277ee1 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sat, 28 Dec 2024 10:14:17 +0800 Subject: [PATCH 07/47] update --- swift/llm/model/__init__.py | 2 +- swift/llm/model/constant.py | 4 +- swift/llm/model/model/__init__.py | 2 +- swift/llm/model/model/reward_model.py | 33 ++++++++++++++++ swift/llm/model/register.py | 56 --------------------------- swift/llm/train/rlhf.py | 11 ------ 6 files changed, 38 insertions(+), 70 deletions(-) create mode 100644 swift/llm/model/model/reward_model.py diff --git a/swift/llm/model/__init__.py b/swift/llm/model/__init__.py index 754d715207..939db750a3 100644 --- a/swift/llm/model/__init__.py +++ b/swift/llm/model/__init__.py @@ -4,6 +4,6 @@ from .model_arch import MODEL_ARCH_MAPPING, ModelArch, ModelKeys, MultiModelKeys, get_model_arch, register_model_arch from .register import (MODEL_MAPPING, Model, ModelGroup, ModelMeta, fix_do_sample_warning, get_default_device_map, get_default_torch_dtype, get_model_info_meta, get_model_name, get_model_tokenizer, 
- get_model_tokenizer_multimodal, get_model_tokenizer_with_flash_attn, get_model_with_value_head, + get_model_tokenizer_multimodal, get_model_tokenizer_with_flash_attn, load_by_unsloth, register_model) from .utils import HfConfigFactory, ModelInfo, git_clone_github, safe_snapshot_download diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py index a87f901c76..82a89bd036 100644 --- a/swift/llm/model/constant.py +++ b/swift/llm/model/constant.py @@ -93,9 +93,11 @@ class LLMModelType: mamba = 'mamba' polylm = 'polylm' aya = 'aya' - + # bert modern_bert = 'modern_bert' bert = 'bert' + # reward model + reward_model = 'reward_model' class MLLMModelType: diff --git a/swift/llm/model/model/__init__.py b/swift/llm/model/model/__init__.py index a972ec64ef..82ebf432f9 100644 --- a/swift/llm/model/model/__init__.py +++ b/swift/llm/model/model/__init__.py @@ -1,2 +1,2 @@ from . import (baai, baichuan, bert, codefuse, deepseek, gemma, glm, internlm, llama, llava, llm, mamba, microsoft, - minicpm, mistral, mllm, mplug, openbuddy, qwen, telechat, yi) + minicpm, mistral, mllm, mplug, openbuddy, qwen, telechat, yi, reward_model) diff --git a/swift/llm/model/model/reward_model.py b/swift/llm/model/model/reward_model.py new file mode 100644 index 0000000000..63b0bb0c90 --- /dev/null +++ b/swift/llm/model/model/reward_model.py @@ -0,0 +1,33 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from transformers import AutoConfig +from transformers import AutoModel +from swift.utils import get_logger +from ..constant import LLMModelType +from ..register import Model, ModelGroup, ModelMeta, get_model_tokenizer_from_local, register_model + +logger = get_logger() + + +def get_model_tokenizer_reward_model(model_dir, *args, **kwargs): + model_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) + if 'AutoModel' in (getattr(model_config, 'auto_map', None) or {}): + kwargs['automodel_class'] = AutoModel + return get_model_tokenizer_from_local(model_dir, *args, **kwargs) + + +register_model( + ModelMeta( + LLMModelType.reward_model, [ + ModelGroup([ + Model('Qwen/Qwen2.5-Math-RM-72B', 'Qwen/Qwen2.5-Math-RM-72B'), + Model('Qwen/Qwen2-Math-RM-72B', 'Qwen/Qwen2-Math-RM-72B'), + ]), + ModelGroup([ + Model('Shanghai_AI_Laboratory/internlm2-1_8b-reward', 'internlm/internlm2-1_8b-reward'), + Model('Shanghai_AI_Laboratory/internlm2-7b-reward', 'internlm/internlm2-7b-reward'), + Model('Shanghai_AI_Laboratory/internlm2-20b-reward', 'internlm/internlm2-20b-reward'), + ]), + ], + None, + get_model_tokenizer_reward_model, + tags=['reward_model'])) diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py index a98406eb80..8cf83a5547 100644 --- a/swift/llm/model/register.py +++ b/swift/llm/model/register.py @@ -196,62 +196,6 @@ def get_model_tokenizer_from_local(model_dir: str, return model, tokenizer -def get_model_with_value_head(model) -> 'AutoModelForCausalLMWithValueHead': - from trl import AutoModelForCausalLMWithValueHead - lm_head_namings = ['lm_head', 'embed_out'] - if not any(hasattr(model, attribute) for attribute in lm_head_namings): - setattr(model, 'lm_head', None) # avoid ValueError - - model = AutoModelForCausalLMWithValueHead.from_pretrained(model) - - def patch_valuehead_model(model): - attr_list = [ - 'get_input_embeddings', 'vis_processor', 'extract_feature', 'get_rope_index', 'model', 'vision_tower', - 'img2emb', '_encode_image', '_merge_input_ids_with_image_features', 'prepare_inputs_embeds', - 'build_conversation_input_ids', 'config', 
'get_slice_image_placeholder', 'transform', 'get_vllm_embedding', - 'forward_image', 'dtype', 'base_model_prefix', 'device', 'visual' - ] - for attr in attr_list: - if hasattr(model.pretrained_model, attr) and not hasattr(model, attr): - setattr(model, attr, getattr(model.pretrained_model, attr)) - - # PPO compatible - if not hasattr(model, 'score'): - setattr(model, 'score', model.v_head) - if model.base_model_prefix == '' and hasattr(model.pretrained_model, 'language_model'): - model.base_model_prefix = model.pretrained_model.language_model.base_model_prefix - - base_model_prefix = model.pretrained_model.base_model_prefix - if hasattr(model.pretrained_model, base_model_prefix): - setattr(model, base_model_prefix, getattr(model.pretrained_model, base_model_prefix)) - - patch_valuehead_model(model) - - # try to load local vhead weights - vhead_params = None - try: - from safetensors import safe_open - vhead_file = os.path.join(model.pretrained_model.model_dir, 'value_head.safetensors') - with safe_open(vhead_file, framework='pt', device='cpu') as f: - vhead_params = {key: f.get_tensor(key) for key in f.keys()} - except Exception: - pass - - try: - vhead_file = os.path.join(model.pretrained_model.model_dir, 'value_head.bin') - vhead_params = torch.load(vhead_file, map_location='cpu') - except Exception: - pass - - if vhead_params is not None: - model.load_state_dict(vhead_params, strict=False) - logger.info(f'Loading value head weights from {vhead_file}') - else: - logger.info('The local value head weight file was not detected.' - 'Ignore it if this is during the reward modeling phase,') - return model - - def get_model_tokenizer_with_flash_attn(model_dir: str, model_info: ModelInfo, model_kwargs: Dict[str, Any], diff --git a/swift/llm/train/rlhf.py b/swift/llm/train/rlhf.py index 906a3e1166..3ec858b1b3 100644 --- a/swift/llm/train/rlhf.py +++ b/swift/llm/train/rlhf.py @@ -32,17 +32,6 @@ def _prepare_template(self) -> None: # Avoid padding labels during the model's forward pass in multimodal models. 
self.template.loss_scale = 'last_round' - @classmethod - def prepare_model(cls, args, model, *_args, **kwargs): - model = super().prepare_model(args, model, *_args, **kwargs) - if args.rlhf_type == 'rm': - from trl import AutoModelForCausalLMWithValueHead - lm_head_namings = ['lm_head', 'embed_out'] - if not any(hasattr(model, attribute) for attribute in lm_head_namings): - model.lm_head = None # avoid error - model = AutoModelForCausalLMWithValueHead.from_pretrained(model) - patch_getattr(AutoModelForCausalLMWithValueHead, 'pretrained_model') - return model def _get_dataset(self): args = self.args From 8e90c4ea03d8d3c32611d65f285a67fc74eefebb Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 30 Dec 2024 10:52:14 +0800 Subject: [PATCH 08/47] update --- swift/llm/__init__.py | 4 ++-- swift/ui/base.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/swift/llm/__init__.py b/swift/llm/__init__.py index d098e849a8..8397076413 100644 --- a/swift/llm/__init__.py +++ b/swift/llm/__init__.py @@ -20,7 +20,7 @@ HfConfigFactory, ModelInfo, ModelMeta, ModelKeys, register_model_arch, MultiModelKeys, ModelArch, get_model_arch, MODEL_ARCH_MAPPING, get_model_info_meta, get_model_name, ModelGroup, Model, get_model_tokenizer_with_flash_attn, get_model_tokenizer_multimodal, load_by_unsloth, - git_clone_github) + git_clone_github, get_matched_model_meta) from .dataset import (AlpacaPreprocessor, ResponsePreprocessor, MessagesPreprocessor, AutoPreprocessor, DATASET_MAPPING, MediaResource, register_dataset, register_dataset_info, EncodePreprocessor, LazyLLMDataset, ConstantLengthDataset, standard_keys, load_dataset, DATASET_TYPE, @@ -54,7 +54,7 @@ 'ModelInfo', 'ModelMeta', 'ModelKeys', 'register_model_arch', 'MultiModelKeys', 'ModelArch', 'MODEL_ARCH_MAPPING', 'get_model_arch', 'get_model_info_meta', 'get_model_name', 'register_model', 'ModelGroup', 'Model', 'get_model_tokenizer_with_flash_attn', 'get_model_tokenizer_multimodal', - 'load_by_unsloth', 'git_clone_github' + 'load_by_unsloth', 'git_clone_github', 'get_matched_model_meta' ], 'dataset': [ 'AlpacaPreprocessor', 'ClsPreprocessor', 'ComposePreprocessor', 'MessagesPreprocessor', 'DATASET_MAPPING', diff --git a/swift/ui/base.py b/swift/ui/base.py index 508b61f4b4..6e1c847137 100644 --- a/swift/ui/base.py +++ b/swift/ui/base.py @@ -15,8 +15,7 @@ from gradio import Accordion, Audio, Button, Checkbox, Dropdown, File, Image, Slider, Tab, TabItem, Textbox, Video from modelscope.hub.utils.utils import get_cache_dir -from swift.llm import TEMPLATE_MAPPING, BaseArguments -from swift.llm.model.register import get_matched_model_meta +from swift.llm import TEMPLATE_MAPPING, BaseArguments, get_matched_model_meta all_langs = ['zh', 'en'] builder: Type['BaseUI'] = None From 1bdec8ab77f9bbd0cc03ba91fd3aad37d77056a6 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 30 Dec 2024 14:43:55 +0800 Subject: [PATCH 09/47] update --- .gitignore | 1 + docs/source/Instruction/ReleaseNote3.0.md | 2 +- ...44\350\241\214\345\217\202\346\225\260.md" | 2 +- .../Instruction/Command-line-parameters.md | 2 +- docs/source_en/Instruction/ReleaseNote3.0.md | 2 +- swift/llm/argument/base_args/base_args.py | 10 +++---- swift/llm/argument/base_args/data_args.py | 6 ++--- swift/llm/argument/train_args.py | 2 +- swift/llm/dataset/loader.py | 26 +++++++++---------- swift/llm/dataset/preprocessor/core.py | 15 ++++++----- swift/llm/infer/infer.py | 6 ++--- swift/llm/model/__init__.py | 2 +- swift/llm/model/model/glm.py | 1 + swift/llm/model/register.py | 2 +- 
swift/llm/template/template/glm.py | 1 + swift/llm/train/sft.py | 6 ++--- swift/utils/torchacc_utils.py | 4 +-- 17 files changed, 46 insertions(+), 44 deletions(-) diff --git a/.gitignore b/.gitignore index ab9f4f2efa..7d7f5ab085 100644 --- a/.gitignore +++ b/.gitignore @@ -139,6 +139,7 @@ my_model/ /data result/ images +/custom # Pytorch *.pth diff --git a/docs/source/Instruction/ReleaseNote3.0.md b/docs/source/Instruction/ReleaseNote3.0.md index 2730e45c34..4afd54d6bf 100644 --- a/docs/source/Instruction/ReleaseNote3.0.md +++ b/docs/source/Instruction/ReleaseNote3.0.md @@ -6,7 +6,7 @@ 1. 数据集模块重构。数据集加载速度提升2-20倍,encode速度提升2-4倍,支持streaming模式 - 移除了dataset_name机制,采用dataset_id、dataset_dir、dataset_path方式指定数据集 - - 使用`--dataset_num_proc`支持多进程加速处理、使用`--load_from_cache_file true`支持使用数据前处理缓存 + - 使用`--dataset_num_proc`支持多进程加速处理、使用`--enable_cache true`支持使用数据前处理缓存 - 使用`--streaming`支持流式加载hub端和本地数据集 - 支持`--packing`命令以获得更稳定的训练效率 - 指定`--dataset `支持本地加载开源数据集 diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 91c2b6d567..4470c545cb 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -34,7 +34,7 @@ - data_seed: 数据集随机种子,默认为42 - 🔥dataset_num_proc: 数据集预处理的进程数,默认为1 - 🔥streaming: 流式读取并处理数据集,默认False -- load_from_cache_file: 数据集预处理使用cache,默认False +- enable_cache: 数据集预处理使用cache,默认False - 注意: 如果改为True,在数据集有更改时可能无法生效,如果修改本参数发现训练不正常请考虑设置为False - download_mode: 数据集下载模式,包含`reuse_dataset_if_exists`和`force_redownload`,默认为reuse_dataset_if_exists - strict: 如果为True,则数据集只要某行有问题直接抛错,否则会丢弃出错行。默认False diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 7975cb7eed..af33a1d6c1 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -34,7 +34,7 @@ The introduction to command line parameters will cover base arguments, atomic ar - data_seed: Random seed for the dataset, default is 42. - 🔥dataset_num_proc: Number of processes for dataset preprocessing, default is 1. - 🔥streaming: Stream read and process the dataset, default is False. -- load_from_cache_file: Use cache for dataset preprocessing, default is False. +- enable_cache: Use cache for dataset preprocessing, default is False. - Note: If set to True, it may not take effect if the dataset changes. If modifying this parameter leads to issues during training, consider setting it to False. - download_mode: Dataset download mode, including `reuse_dataset_if_exists` and `force_redownload`, default is reuse_dataset_if_exists. - strict: If True, the dataset will throw an error if any row has a problem; otherwise, it will discard the erroneous row. Default is False. diff --git a/docs/source_en/Instruction/ReleaseNote3.0.md b/docs/source_en/Instruction/ReleaseNote3.0.md index 0211a97eb3..037137af34 100644 --- a/docs/source_en/Instruction/ReleaseNote3.0.md +++ b/docs/source_en/Instruction/ReleaseNote3.0.md @@ -6,7 +6,7 @@ 1. Dataset module refactoring. The dataset loading speed has improved by 2-20 times, and encoding speed has improved by 2-4 times, with support for streaming mode. - Removed the dataset_name mechanism; now use dataset_id, dataset_dir, or dataset_path to specify the dataset. 
- - Use `--dataset_num_proc` to support multi-process acceleration and `--load_from_cache_file true` to support cache processing before using the data. + - Use `--dataset_num_proc` to support multi-process acceleration and `--enable_cache true` to support cache processing before using the data. - Use `--streaming` to support streaming loading of hub and local datasets. - Support `--packing` command for more stable training efficiency. - Use `--dataset ` to support local loading of open-source datasets. diff --git a/swift/llm/argument/base_args/base_args.py b/swift/llm/argument/base_args/base_args.py index a2f88deaa9..00e5cb908d 100644 --- a/swift/llm/argument/base_args/base_args.py +++ b/swift/llm/argument/base_args/base_args.py @@ -130,14 +130,10 @@ def __post_init__(self): self._init_ckpt_dir() self._init_custom_register() self._init_model_kwargs() - self.rank, self.local_rank, world_size, self.local_world_size = get_dist_setting() - # The Seq2SeqTrainingArguments has a property called world_size, which cannot be assigned a value. - try: - self.world_size = world_size - except AttributeError: - pass + self.rank, self.local_rank, self.global_world_size, self.local_world_size = get_dist_setting() + # The Seq2SeqTrainingArguments has a property called world_size logger.info(f'rank: {self.rank}, local_rank: {self.local_rank}, ' - f'world_size: {world_size}, local_world_size: {self.local_world_size}') + f'world_size: {self.global_world_size}, local_world_size: {self.local_world_size}') assert len(self.adapters) <= 1, f'args.adapters: {self.adapters}' ModelArguments.__post_init__(self) QuantizeArguments.__post_init__(self) diff --git a/swift/llm/argument/base_args/data_args.py b/swift/llm/argument/base_args/data_args.py index 3afb3683d5..0ab70a8f33 100644 --- a/swift/llm/argument/base_args/data_args.py +++ b/swift/llm/argument/base_args/data_args.py @@ -20,7 +20,7 @@ class DataArguments: data_seed (Optional[int]): Seed for dataset shuffling. Default is None. dataset_num_proc (int): Number of processes to use for data loading and preprocessing. Default is 1. streaming (bool): Flag to enable streaming of datasets. Default is False. - load_from_cache_file (bool): Flag to load dataset from cache file. Default is False. + enable_cache (bool): Flag to load dataset from cache file. Default is False. download_mode (Literal): Mode for downloading datasets. Default is 'reuse_dataset_if_exists'. model_name (List[str]): List containing Chinese and English names of the model. Default is [None, None]. model_author (List[str]): List containing Chinese and English names of the model author. 
@@ -38,7 +38,7 @@ class DataArguments: dataset_num_proc: int = 1 streaming: bool = False - load_from_cache_file: bool = False + enable_cache: bool = False download_mode: Literal['force_redownload', 'reuse_dataset_if_exists'] = 'reuse_dataset_if_exists' strict: bool = False # Chinese name and English name @@ -74,7 +74,7 @@ def get_dataset_kwargs(self): 'streaming': self.streaming, 'use_hf': self.use_hf, 'hub_token': self.hub_token, - 'load_from_cache_file': self.load_from_cache_file, + 'enable_cache': self.enable_cache, 'download_mode': self.download_mode, 'strict': self.strict, 'model_name': self.model_name, diff --git a/swift/llm/argument/train_args.py b/swift/llm/argument/train_args.py index f99fbe0621..5d3d52b4a7 100644 --- a/swift/llm/argument/train_args.py +++ b/swift/llm/argument/train_args.py @@ -57,7 +57,7 @@ def _init_eval_strategy(self): def __post_init__(self): self._init_output_dir() if self.average_tokens_across_devices is None: - self.average_tokens_across_devices = self.world_size > 1 + self.average_tokens_across_devices = self.global_world_size > 1 if self.metric_for_best_model is None: self.metric_for_best_model = 'rouge-l' if self.predict_with_generate else 'loss' if self.greater_is_better is None: diff --git a/swift/llm/dataset/loader.py b/swift/llm/dataset/loader.py index 7aaa9f9e0b..6e0de90348 100644 --- a/swift/llm/dataset/loader.py +++ b/swift/llm/dataset/loader.py @@ -165,7 +165,7 @@ def _load_dataset_path(dataset_meta: DatasetMeta, *, num_proc: int = 1, strict: bool = False, - load_from_cache_file: bool = False, + enable_cache: bool = False, streaming: bool = False) -> HfDataset: dataset_path = dataset_meta.dataset_path @@ -177,7 +177,7 @@ def _load_dataset_path(dataset_meta: DatasetMeta, dataset = hf_load_dataset(file_type, data_files=dataset_path, **kwargs) dataset = dataset_meta.preprocess_func( - dataset, num_proc=num_proc, strict=strict, load_from_cache_file=load_from_cache_file) + dataset, num_proc=num_proc, strict=strict, enable_cache=enable_cache) dataset = DatasetLoader._remove_useless_columns(dataset) return dataset @@ -191,7 +191,7 @@ def _load_repo_dataset( use_hf: Optional[bool] = None, hub_token: Optional[str] = None, strict: bool = False, - load_from_cache_file: bool = False, + enable_cache: bool = False, revision: Optional[str] = None, download_mode: Literal['force_redownload', 'reuse_dataset_if_exists'] = 'reuse_dataset_if_exists', ) -> HfDataset: @@ -245,7 +245,7 @@ def _load_repo_dataset( if streaming and isinstance(dataset, HfDataset): dataset = dataset.to_iterable_dataset() dataset = subset.preprocess_func( - dataset, num_proc=num_proc, strict=strict, load_from_cache_file=load_from_cache_file) + dataset, num_proc=num_proc, strict=strict, enable_cache=enable_cache) dataset = DatasetLoader._remove_useless_columns(dataset) datasets.append(dataset) return DatasetLoader._concat_datasets(datasets, streaming) @@ -278,7 +278,7 @@ def post_process( split_dataset_ratio: float = 0., streaming: bool = False, random_state: Optional[np.random.RandomState] = None, - load_from_cache_file: bool = False, + enable_cache: bool = False, ) -> Tuple[DATASET_TYPE, Optional[DATASET_TYPE]]: """Split into train/val datasets and perform dataset sampling.""" assert dataset_sample is None or dataset_sample > 0 @@ -319,7 +319,7 @@ def post_process( assert train_sample > 0 train_dataset, val_dataset = train_dataset.train_test_split( test_size=val_sample, seed=get_seed(random_state), - load_from_cache_file=load_from_cache_file).values() + enable_cache=enable_cache).values() 
train_dataset = sample_dataset(train_dataset, train_sample, random_state) return train_dataset, val_dataset @@ -342,7 +342,7 @@ def load( use_hf: Optional[bool] = None, hub_token: Optional[str] = None, strict: bool = False, - load_from_cache_file: bool = False, + enable_cache: bool = False, download_mode: Literal['force_redownload', 'reuse_dataset_if_exists'] = 'reuse_dataset_if_exists', ) -> HfDataset: @@ -351,7 +351,7 @@ def load( dataset_meta=dataset_meta, num_proc=num_proc, strict=strict, - load_from_cache_file=load_from_cache_file, + enable_cache=enable_cache, streaming=streaming, ) else: @@ -373,7 +373,7 @@ def load( hub_token=hub_token, num_proc=num_proc, strict=strict, - load_from_cache_file=load_from_cache_file, + enable_cache=enable_cache, revision=revision, streaming=streaming, download_mode=download_mode) @@ -407,7 +407,7 @@ def load_dataset( use_hf: Optional[bool] = None, hub_token: Optional[str] = None, strict: bool = False, - load_from_cache_file: bool = False, + enable_cache: bool = False, download_mode: Literal['force_redownload', 'reuse_dataset_if_exists'] = 'reuse_dataset_if_exists', # self-cognition model_name: Union[Tuple[str, str], List[str], None] = None, # zh, en @@ -417,7 +417,7 @@ def load_dataset( Args: download_mode: Download mode, default is `reuse_dataset_if_exists`. - load_from_cache_file: Use cache file or not, Default False. + enable_cache: Use cache file or not, Default False. strict: Raise if any row is not correct. hub_token: The token of the hub. use_hf: Use hf dataset or ms dataset. @@ -444,7 +444,7 @@ def load_dataset( 'num_proc': num_proc, 'use_hf': use_hf, 'strict': strict, - 'load_from_cache_file': load_from_cache_file, + 'enable_cache': enable_cache, 'download_mode': download_mode, 'streaming': streaming, 'hub_token': hub_token @@ -461,7 +461,7 @@ def load_dataset( split_dataset_ratio=split_dataset_ratio, random_state=seed, streaming=streaming, - load_from_cache_file=load_from_cache_file) + enable_cache=enable_cache) if train_dataset is not None: train_datasets.append(train_dataset) if val_dataset is not None: diff --git a/swift/llm/dataset/preprocessor/core.py b/swift/llm/dataset/preprocessor/core.py index c3d692f4c3..9b54df91ee 100644 --- a/swift/llm/dataset/preprocessor/core.py +++ b/swift/llm/dataset/preprocessor/core.py @@ -6,9 +6,8 @@ import numpy as np from datasets import Dataset as HfDataset -from datasets import Image +from datasets import Image, Value, enable_caching, disable_caching from datasets import IterableDataset as HfIterableDataset -from datasets import Value from swift.llm import history_to_messages from swift.utils import get_logger @@ -246,19 +245,23 @@ def __call__( *, num_proc: int = 1, strict: bool = False, - load_from_cache_file: bool = False, + enable_cache: bool = False, batch_size: int = 1000, ) -> DATASET_TYPE: from ..utils import sample_dataset if self.dataset_sample is not None: dataset = sample_dataset(dataset, self.dataset_sample, self.random_state) + if enable_cache: + enable_caching() + else: + disable_caching() dataset = self._rename_columns(dataset) dataset = self.prepare_dataset(dataset) dataset = self._cast_pil_image(dataset) map_kwargs = {} if isinstance(dataset, HfDataset): - map_kwargs.update({'num_proc': num_proc, 'load_from_cache_file': load_from_cache_file}) + map_kwargs.update({'num_proc': num_proc}) with self._patch_arrow_writer(): try: dataset_mapped = dataset.map( @@ -462,9 +465,9 @@ def __call__( *, num_proc: int = 1, strict: bool = False, - load_from_cache_file: bool = False, + enable_cache: 
bool = False, ) -> DATASET_TYPE: dataset = get_features_dataset(dataset) dataset = dataset.rename_columns(self.columns_mapping) preprocessor = self._get_preprocessor(dataset) - return preprocessor(dataset, num_proc=num_proc, load_from_cache_file=load_from_cache_file, strict=strict) + return preprocessor(dataset, num_proc=num_proc, enable_cache=enable_cache, strict=strict) diff --git a/swift/llm/infer/infer.py b/swift/llm/infer/infer.py index 1802ec402d..9741c042bf 100644 --- a/swift/llm/infer/infer.py +++ b/swift/llm/infer/infer.py @@ -182,9 +182,9 @@ def infer_dataset(self) -> List[Dict[str, Any]]: if self.jsonl_writer: self.jsonl_writer.append(data) else: - is_dist = args.world_size > 1 and dist.is_initialized() + is_dist = args.global_world_size > 1 and dist.is_initialized() if is_dist: - val_dataset = val_dataset.shard(args.world_size, args.rank, contiguous=True) + val_dataset = val_dataset.shard(args.global_world_size, args.rank, contiguous=True) val_dataset = list(val_dataset) labels_list = [InferRequest.remove_response(data['messages']) for data in val_dataset] @@ -197,7 +197,7 @@ def infer_dataset(self) -> List[Dict[str, Any]]: data = {'response': response, 'logprobs': resp.choices[0].logprobs, **data} result_list.append(data) if is_dist: - total_result_list = [None for _ in range(args.world_size)] if args.rank == 0 else None + total_result_list = [None for _ in range(args.global_world_size)] if args.rank == 0 else None dist.gather_object(result_list, total_result_list) result_list = total_result_list and list(chain.from_iterable(total_result_list)) diff --git a/swift/llm/model/__init__.py b/swift/llm/model/__init__.py index 58ba7500c7..5e6a49cbbc 100644 --- a/swift/llm/model/__init__.py +++ b/swift/llm/model/__init__.py @@ -5,5 +5,5 @@ from .register import (MODEL_MAPPING, Model, ModelGroup, ModelMeta, fix_do_sample_warning, get_default_device_map, get_default_torch_dtype, get_matched_model_meta, get_model_info_meta, get_model_name, get_model_tokenizer, get_model_tokenizer_multimodal, get_model_tokenizer_with_flash_attn, - get_model_with_value_head, load_by_unsloth, register_model) + load_by_unsloth, register_model) from .utils import HfConfigFactory, ModelInfo, git_clone_github, safe_snapshot_download diff --git a/swift/llm/model/model/glm.py b/swift/llm/model/model/glm.py index bb22dfc793..3b9a699769 100644 --- a/swift/llm/model/model/glm.py +++ b/swift/llm/model/model/glm.py @@ -180,6 +180,7 @@ def get_model_tokenizer_glm4v(model_dir: str, MLLMModelType.glm4v, [ModelGroup([ Model('ZhipuAI/glm-4v-9b', 'THUDM/glm-4v-9b'), + Model('ZhipuAI/cogagent-9b-20241220', 'THUDM/cogagent-9b-20241220'), ])], TemplateType.glm4v, get_model_tokenizer_glm4v, diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py index 8cf83a5547..8e3428cc6a 100644 --- a/swift/llm/model/register.py +++ b/swift/llm/model/register.py @@ -374,7 +374,7 @@ def get_model_info_meta( if model_meta is None and model_type is not None: model_meta = MODEL_MAPPING[model_type] if model_meta is None: - model_meta = ModelMeta('', [], 'dummy', get_model_tokenizer_from_local, model_arch=None) + model_meta = ModelMeta(None, [], 'dummy', get_model_tokenizer_from_local, model_arch=None) logger.info(f'Temporarily create model_meta: {model_meta}') if torch_dtype is None: diff --git a/swift/llm/template/template/glm.py b/swift/llm/template/template/glm.py index 46b73043dc..e9ee3d008a 100644 --- a/swift/llm/template/template/glm.py +++ b/swift/llm/template/template/glm.py @@ -64,6 +64,7 @@ def _encode(self, inputs: 
StdTemplateInputs) -> Dict[str, Any]: encoded['images'] = inputs2['images'] encoded['input_ids'] = input_ids encoded['labels'] = labels + encoded['position_ids'] = list(range(len(input_ids))) return encoded def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]: diff --git a/swift/llm/train/sft.py b/swift/llm/train/sft.py index 56ce3d87cb..c8ffefa33d 100644 --- a/swift/llm/train/sft.py +++ b/swift/llm/train/sft.py @@ -225,7 +225,7 @@ def _prepare_callbacks(self): def _stat_dataset(self, dataset: HfDataset): args = self.args dataset = GetLengthPreprocessor()( - dataset, num_proc=args.dataset_num_proc, load_from_cache_file=args.load_from_cache_file) + dataset, num_proc=args.dataset_num_proc, enable_cache=args.enable_cache) _, stat_str = stat_array(dataset['length']) logger.info(f'Dataset Token Length: {stat_str}') return stat_str @@ -247,13 +247,13 @@ def _encode_dataset(self, train_dataset, val_dataset): train_dataset, num_proc=args.dataset_num_proc, strict=args.strict, - load_from_cache_file=args.load_from_cache_file) + enable_cache=args.enable_cache) if val_dataset is not None and not args.predict_with_generate: val_dataset = preprocessor( val_dataset, num_proc=args.dataset_num_proc, strict=args.strict, - load_from_cache_file=args.load_from_cache_file) + enable_cache=args.enable_cache) inputs = train_dataset[0] if hasattr(train_dataset, '__len__') else next(iter(train_dataset)) template.print_inputs(inputs, tokenizer_kwargs=inputs.pop('tokenizer_kwargs', None) or {}) diff --git a/swift/utils/torchacc_utils.py b/swift/utils/torchacc_utils.py index d009ce65da..cd21f4b7bc 100644 --- a/swift/utils/torchacc_utils.py +++ b/swift/utils/torchacc_utils.py @@ -254,9 +254,9 @@ def save_ta_fsdp_checkpoint(self_model, tokenizer, args, output_dir): 'shard_metadata': self_model._get_underlay_model().get_shard_metadata(), } if isinstance(model, PeftModel): - ckpt_path = os.path.join(output_dir, f'rank{args.process_index}-of-{args.world_size}-adapter_model.bin') + ckpt_path = os.path.join(output_dir, f'rank{args.process_index}-of-{args.global_world_size}-adapter_model.bin') else: - ckpt_path = os.path.join(output_dir, f'rank{args.process_index}-of-{args.world_size}-pytorch_model.bin') + ckpt_path = os.path.join(output_dir, f'rank{args.process_index}-of-{args.global_world_size}-pytorch_model.bin') xm.save(ckpt, ckpt_path, master_only=False) # Make sure all ranks have saved checkpoints xm.rendezvous('save_full_checkpoints') From e19bffce3e22e5c617291caaafc643eea7f43af3 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 31 Dec 2024 17:50:58 +0800 Subject: [PATCH 10/47] update --- swift/llm/dataset/preprocessor/core.py | 7 ++----- swift/llm/model/model/__init__.py | 2 +- swift/llm/model/model/reward_model.py | 4 ++-- swift/llm/train/rlhf.py | 1 - 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/swift/llm/dataset/preprocessor/core.py b/swift/llm/dataset/preprocessor/core.py index d4c01416ab..1f42f5f4a1 100644 --- a/swift/llm/dataset/preprocessor/core.py +++ b/swift/llm/dataset/preprocessor/core.py @@ -6,8 +6,9 @@ import numpy as np from datasets import Dataset as HfDataset -from datasets import Image, Value, enable_caching, disable_caching +from datasets import Image from datasets import IterableDataset as HfIterableDataset +from datasets import Value from swift.llm import history_to_messages from swift.utils import get_logger @@ -251,10 +252,6 @@ def __call__( if self.dataset_sample is not None: dataset = sample_dataset(dataset, 
self.dataset_sample, self.random_state) - if enable_cache: - enable_caching() - else: - disable_caching() dataset = self._rename_columns(dataset) dataset = self.prepare_dataset(dataset) dataset = self._cast_pil_image(dataset) diff --git a/swift/llm/model/model/__init__.py b/swift/llm/model/model/__init__.py index 82ebf432f9..3f190926e4 100644 --- a/swift/llm/model/model/__init__.py +++ b/swift/llm/model/model/__init__.py @@ -1,2 +1,2 @@ from . import (baai, baichuan, bert, codefuse, deepseek, gemma, glm, internlm, llama, llava, llm, mamba, microsoft, - minicpm, mistral, mllm, mplug, openbuddy, qwen, telechat, yi, reward_model) + minicpm, mistral, mllm, mplug, openbuddy, qwen, reward_model, telechat, yi) diff --git a/swift/llm/model/model/reward_model.py b/swift/llm/model/model/reward_model.py index 63b0bb0c90..522b6a9ac3 100644 --- a/swift/llm/model/model/reward_model.py +++ b/swift/llm/model/model/reward_model.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from transformers import AutoConfig -from transformers import AutoModel +from transformers import AutoConfig, AutoModel + from swift.utils import get_logger from ..constant import LLMModelType from ..register import Model, ModelGroup, ModelMeta, get_model_tokenizer_from_local, register_model diff --git a/swift/llm/train/rlhf.py b/swift/llm/train/rlhf.py index 3ec858b1b3..2e5bff8910 100644 --- a/swift/llm/train/rlhf.py +++ b/swift/llm/train/rlhf.py @@ -32,7 +32,6 @@ def _prepare_template(self) -> None: # Avoid padding labels during the model's forward pass in multimodal models. self.template.loss_scale = 'last_round' - def _get_dataset(self): args = self.args train_dataset, val_dataset = super()._get_dataset() From b7c28aa8ced422688c795f640cd23588dc37a1d9 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 13:33:52 +0800 Subject: [PATCH 11/47] update --- ...\275\344\273\244\350\241\214\345\217\202\346\225\260.md" | 6 +++--- docs/source_en/Instruction/Command-line-parameters.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 1f8804dfd5..f26a169257 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -45,7 +45,7 @@ ### 模板参数 - 🔥template: 对话模板类型,默认使用model对应的template类型。`swift pt`会将对话模版转为生成模板使用 - 🔥system: 自定义system字段,默认为None,使用template的默认system -- 🔥max_length: 单样本的tokens最大长度,默认为None,不做限制 +- 🔥max_length: 单样本的tokens最大长度。默认为None,设置为模型支持的tokens最大长度(max_model_len) - truncation_strategy: 如果超长如何处理,支持`delete`, `left`和`right`,代表删除、左侧裁剪和右侧裁剪,默认为'delete' - 🔥max_pixels: 多模态模型图片前处理的最大像素数(H\*W),默认不缩放。 - tools_prompt: 智能体训练时的工具列表转为system的格式,请参考[智能体训练](./智能体的支持.md),默认为'react_en' @@ -96,7 +96,7 @@ - lr_scheduler_type: lr_scheduler类型,默认为cosine - lr_scheduler_kwargs: lr_scheduler其他参数 - 🔥gradient_checkpointing_kwargs: 传入`torch.utils.checkpoint`中的参数. 例如设置为`--gradient_checkpointing_kwargs '{"use_reentrant": false}'` -- report_to: 默认值为`tensorboard` +- report_to: 默认值为`tensorboard`。你也可以指定`--report_to tensorboard wandb` - remove_unused_columns: 默认值False - logging_first_step: 是否记录第一个step的打印,默认值True - logging_steps: 日志打印间隔,默认值5 @@ -139,7 +139,7 @@ #### 全参 - freeze_parameters: 被冻结参数的前缀, 默认为`[]` - freeze_parameters_ratio: 从下往上冻结的参数比例, 默认为0. 
可设置为1将所有参数冻结, 结合`trainable_parameters`设置可训练参数. -- trainable_parameters: 可训练参数的前缀, 默认为`[]` +- trainable_parameters: 可训练参数的前缀, 默认为`[]`. `trainable_parameters`的优先级高于`freeze_parameters`和`freeze_parameters_ratio` #### LoRA - 🔥lora_rank: 默认为`8` diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index f6f572230f..2325e4aeae 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -45,7 +45,7 @@ The introduction to command line parameters will cover base arguments, atomic ar ### Template Arguments - 🔥template: Type of dialogue template, which defaults to the template type corresponding to the model. `swift pt` will convert the dialogue template into a generation template for use. - 🔥system: Custom system field, default is None, uses the default system of the template. -- 🔥max_length: Maximum length of tokens for a single sample, default is None (no limit). +- 🔥max_length: The maximum length of tokens for a single sample. Defaults to None, set to the maximum length of tokens supported by the model (max_model_len). - truncation_strategy: How to handle overly long tokens, supports `delete`, `left`, `right`, representing deletion, left trimming, and right trimming, default is 'delete'. - 🔥max_pixels: Maximum pixel count for pre-processing images in multimodal models (H*W), default is no scaling. - tools_prompt: The list of tools for agent training converted to system format, refer to [Agent Training](./Agent-support.md), default is 'react_en'. @@ -97,7 +97,7 @@ This parameter list inherits from transformers `Seq2SeqTrainingArguments`, with - lr_scheduler_type: LR scheduler type, default is cosine. - lr_scheduler_kwargs: Other parameters for the LR scheduler. - 🔥gradient_checkpointing_kwargs: Parameters passed to `torch.utils.checkpoint`. For example, set to `--gradient_checkpointing_kwargs '{"use_reentrant": false}'`. -- report_to: Default is `tensorboard`. +- report_to: Default is `tensorboard`. You can also specify `--report_to tensorboard wandb`. - remove_unused_columns: Default is False. - logging_first_step: Whether to log the first step print, default is True. - logging_steps: Interval for logging prints, default is 5. @@ -141,7 +141,7 @@ Other important parameters: - freeze_parameters: Prefix of parameters to be frozen, default is `[]`. - freeze_parameters_ratio: Ratio of parameters to freeze from the bottom up, default is 0. Setting it to 1 will freeze all parameters. Combine with `trainable_parameters` to set trainable parameters. -- trainable_parameters: Prefix of trainable parameters, default is `[]`. +- trainable_parameters: Prefix of trainable parameters, default is `[]`. The priority of `trainable_parameters` is higher than that of `freeze_parameters` and `freeze_parameters_ratio`. 
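A minimal full-parameter sketch combining the two options (the model and dataset below are illustrative only):

```shell
# Freeze the lower 50% of parameters from the bottom up, then explicitly keep lm_head trainable
# (trainable_parameters takes priority over freeze_parameters_ratio).
swift sft \
    --model Qwen/Qwen2-7B-Instruct \
    --dataset 'AI-ModelScope/alpaca-gpt4-data-en#500' \
    --train_type full \
    --freeze_parameters_ratio 0.5 \
    --trainable_parameters lm_head
```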
#### LoRA From d8b210532f236dfcabfe4ad6a8fc214d5a3d1cce Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 14:04:34 +0800 Subject: [PATCH 12/47] fix --- examples/train/multimodal/caption.sh | 2 +- examples/train/multimodal/infer.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/train/multimodal/caption.sh b/examples/train/multimodal/caption.sh index e75ce7f78a..c9d496fc0f 100644 --- a/examples/train/multimodal/caption.sh +++ b/examples/train/multimodal/caption.sh @@ -5,7 +5,7 @@ CUDA_VISIBLE_DEVICES=0 \ MAX_PIXELS=1003520 \ swift sft \ --model Qwen/Qwen2-VL-7B-Instruct \ - --dataset 'modelscope/coco_2014_caption#20000' \ + --dataset 'modelscope/coco_2014_caption:validation#20000' \ --train_type lora \ --torch_dtype bfloat16 \ --num_train_epochs 1 \ diff --git a/examples/train/multimodal/infer.sh b/examples/train/multimodal/infer.sh index 2e8627319f..699ede32d7 100644 --- a/examples/train/multimodal/infer.sh +++ b/examples/train/multimodal/infer.sh @@ -1,5 +1,6 @@ # Perform inference using the validation set from the training phase. CUDA_VISIBLE_DEVICES=0 \ +MAX_PIXELS=1003520 \ swift infer \ --adapters output/vx-xxx/checkpoint-xxx \ --stream true \ From f726d0adcc35b50cc991c8d8078515d683dc20a5 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 15:16:44 +0800 Subject: [PATCH 13/47] update --- ...44\350\241\214\345\217\202\346\225\260.md" | 2 +- .../Instruction/Command-line-parameters.md | 2 +- swift/llm/infer/infer_engine/pt_engine.py | 20 ++++++++++++----- swift/llm/model/constant.py | 10 ++++----- swift/llm/model/model/__init__.py | 2 +- swift/llm/model/model/bert.py | 6 ++--- swift/llm/model/model/internlm.py | 22 +++++++++++++++++-- swift/llm/model/model/reward_model.py | 15 ++++++++----- swift/llm/model/register.py | 7 +++++- swift/llm/template/constant.py | 6 ++++- swift/llm/template/template/internlm.py | 3 ++- 11 files changed, 68 insertions(+), 27 deletions(-) diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index f26a169257..e7fa6af9d5 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -19,7 +19,7 @@ - 🔥model: 模型id或模型本地路径。如果是自定义模型请配合`model_type`和`template`使用,具体可以参考[自定义模型](../Customization/自定义模型.md) - model_type: 模型类型。相同的模型架构、template、模型加载过程被定义为一个model_type - model_revision: 模型版本 -- task_type: 默认为'causal_lm'. 可选为'causal_lm', 'seq_cls'. 例子可以查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls). +- task_type: 默认为'causal_lm'. 可选为'causal_lm', 'seq_cls'. seq_cls的例子可以查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls). - 🔥torch_dtype: 模型权重的数据类型,支持`float16`,`bfloat16`,`float32`,默认从config文件中读取 - attn_impl: attention类型,支持`flash_attn`, `sdpa`, `eager`,默认使用sdpa - num_labels: 分类模型需要指定。代表标签数量,默认为None diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 2325e4aeae..d3a3cc1599 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -20,7 +20,7 @@ The introduction to command line parameters will cover base arguments, atomic ar - model_type: Model type. The same model architecture, template, and loading process define a model_type. 
- model_revision: Model version. - 🔥torch_dtype: Data type for model weights, supports `float16`, `bfloat16`, `float32`, default is read from the config file. -- task_type: Defaults to 'causal_lm'. Options include 'causal_lm' and 'seq_cls'. You can view examples [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls). +- task_type: Defaults to 'causal_lm'. Options include 'causal_lm' and 'seq_cls'. You can view examples of seq_cls [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls). - attn_impl: Attention type, supports `flash_attn`, `sdpa`, `eager`, default is sdpa. - num_labels: To be specified for classification models, representing the number of labels, default is None. - rope_scaling: Rope type, supports `linear` and `dynamic`, to be used with `max_length`. diff --git a/swift/llm/infer/infer_engine/pt_engine.py b/swift/llm/infer/infer_engine/pt_engine.py index f68f32da86..63317a8d20 100644 --- a/swift/llm/infer/infer_engine/pt_engine.py +++ b/swift/llm/infer/infer_engine/pt_engine.py @@ -159,11 +159,13 @@ def _infer_stream(self, raise ValueError(error_msg) streamer = TokensIteratorStreamer() generate_kwargs = { - 'adapter_names': self._get_adapter_names(adapter_request), 'generation_config': generation_config, 'streamer': streamer, **inputs, } + adapter_names = self._get_adapter_names(adapter_request) + if adapter_names is not None: + generate_kwargs['adapter_names'] = adapter_names num_prompt_tokens = self._get_num_tokens(inputs) logits_streamer = None @@ -272,12 +274,18 @@ def _infer_seq_cls(self, inputs: Dict[str, Any], adapter_request: Optional[AdapterRequest] = None, **kwargs): - call_kwargs = {'adapter_names': self._get_adapter_names(adapter_request)} + call_kwargs = {} + adapter_names = self._get_adapter_names(adapter_request) + if adapter_names is not None: + call_kwargs['adapter_names'] = adapter_names num_prompt_tokens = self._get_num_tokens(inputs) inputs.pop('labels') logits = self.model(**inputs, **call_kwargs).logits - logprobs = torch.log_softmax(logits, -1) - preds = torch.argmax(logits, dim=-1).tolist() + if logits.shape[-1] > 1: + logprobs = torch.log_softmax(logits, -1) + preds = torch.argmax(logits, dim=-1).tolist() + else: + preds = logits.squeeze(dim=-1).tolist() res = [] for i, pred in enumerate(preds): usage_info = self._get_usage_info(num_prompt_tokens, 1) @@ -300,10 +308,12 @@ def _infer_full(self, template_inputs=None) -> Union[List[ChatCompletionResponse]]: # bos_token TODO: encoder-decoder generate_kwargs = { - 'adapter_names': self._get_adapter_names(adapter_request), 'generation_config': generation_config, **inputs } + adapter_names = self._get_adapter_names(adapter_request) + if adapter_names is not None: + generate_kwargs['adapter_names'] = adapter_names num_prompt_tokens = self._get_num_tokens(inputs) generate_kwargs = template.prepare_generate_kwargs(generate_kwargs, model=self.model) diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py index 82a89bd036..63c0a82f90 100644 --- a/swift/llm/model/constant.py +++ b/swift/llm/model/constant.py @@ -2,7 +2,6 @@ # Classification criteria for model_type: same model architecture, tokenizer (get function), template. 
from typing import List - class LLMModelType: qwen = 'qwen' qwen2 = 'qwen2' @@ -93,12 +92,13 @@ class LLMModelType: mamba = 'mamba' polylm = 'polylm' aya = 'aya' - # bert + +class BertModelType: modern_bert = 'modern_bert' bert = 'bert' - # reward model - reward_model = 'reward_model' +class RMModelType: + internlm2_reward = 'internlm2_reward' class MLLMModelType: qwen_vl = 'qwen_vl' @@ -176,7 +176,7 @@ class MLLMModelType: megrez_omni = 'megrez_omni' -class ModelType(LLMModelType, MLLMModelType): +class ModelType(LLMModelType, MLLMModelType, BertModelType, RMModelType): @classmethod def get_model_name_list(cls) -> List[str]: diff --git a/swift/llm/model/model/__init__.py b/swift/llm/model/model/__init__.py index 3f190926e4..a972ec64ef 100644 --- a/swift/llm/model/model/__init__.py +++ b/swift/llm/model/model/__init__.py @@ -1,2 +1,2 @@ from . import (baai, baichuan, bert, codefuse, deepseek, gemma, glm, internlm, llama, llava, llm, mamba, microsoft, - minicpm, mistral, mllm, mplug, openbuddy, qwen, reward_model, telechat, yi) + minicpm, mistral, mllm, mplug, openbuddy, qwen, telechat, yi) diff --git a/swift/llm/model/model/bert.py b/swift/llm/model/model/bert.py index f83aef3536..785a3fa137 100644 --- a/swift/llm/model/model/bert.py +++ b/swift/llm/model/model/bert.py @@ -2,7 +2,7 @@ from transformers import AutoConfig from swift.utils import get_logger -from ..constant import LLMModelType +from ..constant import BertModelType from ..register import Model, ModelGroup, ModelMeta, get_model_tokenizer_from_local, register_model logger = get_logger() @@ -17,7 +17,7 @@ def get_model_tokenizer_modern_bert(model_dir, *args, **kwargs): register_model( ModelMeta( - LLMModelType.modern_bert, [ + BertModelType.modern_bert, [ ModelGroup([ Model('answerdotai/ModernBERT-base', 'answerdotai/ModernBERT-base'), Model('answerdotai/ModernBERT-large', 'answerdotai/ModernBERT-large'), @@ -30,7 +30,7 @@ def get_model_tokenizer_modern_bert(model_dir, *args, **kwargs): register_model( ModelMeta( - LLMModelType.bert, [ModelGroup([ + BertModelType.bert, [ModelGroup([ Model('iic/nlp_structbert_backbone_base_std'), ])], None, diff --git a/swift/llm/model/model/internlm.py b/swift/llm/model/model/internlm.py index 590744455f..45b65f03a9 100644 --- a/swift/llm/model/model/internlm.py +++ b/swift/llm/model/model/internlm.py @@ -5,10 +5,12 @@ from transformers.dynamic_module_utils import get_class_from_dynamic_module from swift.llm import TemplateType -from ..constant import LLMModelType, MLLMModelType +from ..constant import LLMModelType, MLLMModelType, RMModelType from ..model_arch import ModelArch from ..patcher import patch_output_clone, patch_output_to_input_device -from ..register import Model, ModelGroup, ModelMeta, get_model_tokenizer_with_flash_attn, register_model +from ..register import ( + Model, ModelGroup, ModelMeta, get_model_tokenizer_with_flash_attn, register_model, get_model_tokenizer_reward_model +) from ..utils import ModelInfo, safe_snapshot_download, use_submodel_func register_model( @@ -332,3 +334,19 @@ def get_model_tokenizer_xcomposer_ol(model_dir, *args, **kwargs): model_arch=ModelArch.qwen2_audio, tags=['audio'], )) + + +register_model( + ModelMeta( + RMModelType.internlm2_reward, [ + ModelGroup([ + Model('Shanghai_AI_Laboratory/internlm2-1_8b-reward', 'internlm/internlm2-1_8b-reward'), + Model('Shanghai_AI_Laboratory/internlm2-7b-reward', 'internlm/internlm2-7b-reward'), + Model('Shanghai_AI_Laboratory/internlm2-20b-reward', 'internlm/internlm2-20b-reward'), + ]), + ], + 
TemplateType.internlm2_reward, + get_model_tokenizer_reward_model, + requires=['transformers>=4.38'], + architectures=['InternLM2ForRewardModel'], + tags=['reward_model'])) diff --git a/swift/llm/model/model/reward_model.py b/swift/llm/model/model/reward_model.py index 522b6a9ac3..85435d3ea1 100644 --- a/swift/llm/model/model/reward_model.py +++ b/swift/llm/model/model/reward_model.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from transformers import AutoConfig, AutoModel +from swift.llm import TemplateType from swift.utils import get_logger from ..constant import LLMModelType from ..register import Model, ModelGroup, ModelMeta, get_model_tokenizer_from_local, register_model @@ -17,17 +18,19 @@ def get_model_tokenizer_reward_model(model_dir, *args, **kwargs): register_model( ModelMeta( - LLMModelType.reward_model, [ - ModelGroup([ - Model('Qwen/Qwen2.5-Math-RM-72B', 'Qwen/Qwen2.5-Math-RM-72B'), - Model('Qwen/Qwen2-Math-RM-72B', 'Qwen/Qwen2-Math-RM-72B'), - ]), + LLMModelType.internlm2_reward, [ ModelGroup([ Model('Shanghai_AI_Laboratory/internlm2-1_8b-reward', 'internlm/internlm2-1_8b-reward'), Model('Shanghai_AI_Laboratory/internlm2-7b-reward', 'internlm/internlm2-7b-reward'), Model('Shanghai_AI_Laboratory/internlm2-20b-reward', 'internlm/internlm2-20b-reward'), ]), ], - None, + TemplateType.internlm2_reward, get_model_tokenizer_reward_model, tags=['reward_model'])) + + + # ModelGroup([ + # Model('Qwen/Qwen2.5-Math-RM-72B', 'Qwen/Qwen2.5-Math-RM-72B'), + # Model('Qwen/Qwen2-Math-RM-72B', 'Qwen/Qwen2-Math-RM-72B'), + # ]), \ No newline at end of file diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py index 8e3428cc6a..6fed4f7073 100644 --- a/swift/llm/model/register.py +++ b/swift/llm/model/register.py @@ -9,7 +9,7 @@ import torch from peft import PeftModel from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, PretrainedConfig, - PreTrainedModel, PreTrainedTokenizerBase) + PreTrainedModel, PreTrainedTokenizerBase, AutoModel) from transformers.integrations import is_deepspeed_zero3_enabled from transformers.utils import is_torch_bf16_gpu_available, is_torch_cuda_available, is_torch_npu_available, strtobool from transformers.utils.versions import require_version @@ -216,6 +216,11 @@ def get_model_tokenizer_multimodal(model_dir: str, *args, **kwargs): model, _ = get_model_tokenizer_with_flash_attn(model_dir, *args, **kwargs) return model, processor +def get_model_tokenizer_reward_model(model_dir, *args, **kwargs): + model_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) + if 'AutoModel' in (getattr(model_config, 'auto_map', None) or {}): + kwargs['automodel_class'] = AutoModel + return get_model_tokenizer_with_flash_attn(model_dir, *args, **kwargs) def fix_do_sample_warning(generation_config: GenerationConfig) -> None: # Use the default values of temperature/top_p/top_k in generation_config. 
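With the `internlm2_reward` model type, its template and the `get_model_tokenizer_reward_model` loader registered above, the reward model can be exercised through the CLI. A rough sketch, not verified against this commit (the model id is the one listed in the `register_model` entry; a reward model returns a scalar score rather than generated text):

```shell
# Sketch only: load the newly registered internlm2 reward model via the PT backend and score inputs.
CUDA_VISIBLE_DEVICES=0 \
swift infer \
    --model Shanghai_AI_Laboratory/internlm2-1_8b-reward \
    --infer_backend pt
```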
diff --git a/swift/llm/template/constant.py b/swift/llm/template/constant.py index 963ad75de3..6b106fb081 100644 --- a/swift/llm/template/constant.py +++ b/swift/llm/template/constant.py @@ -70,6 +70,10 @@ class LLMTemplateType: dbrx = 'dbrx' +class RMTemplateType: + internlm2_reward = 'internlm2_reward' + + class MLLMTemplateType: qwen_vl = 'qwen_vl' qwen_audio = 'qwen_audio' @@ -144,7 +148,7 @@ class MLLMTemplateType: megrez_omni = 'megrez_omni' -class TemplateType(LLMTemplateType, MLLMTemplateType): +class TemplateType(LLMTemplateType, MLLMTemplateType, RMTemplateType): @classmethod def get_template_name_list(cls) -> List[str]: diff --git a/swift/llm/template/template/internlm.py b/swift/llm/template/template/internlm.py index 9d07844089..ba917f766c 100644 --- a/swift/llm/template/template/internlm.py +++ b/swift/llm/template/template/internlm.py @@ -8,7 +8,7 @@ from swift.utils import get_env_args from ..base import Template -from ..constant import LLMTemplateType, MLLMTemplateType +from ..constant import LLMTemplateType, MLLMTemplateType, RMTemplateType from ..register import TemplateMeta, register_template from ..template_inputs import StdTemplateInputs from ..utils import Context, Prompt, Word @@ -34,6 +34,7 @@ register_template(ChatmlTemplateMeta(LLMTemplateType.internlm2, default_system=INTERNLM_SYSTEM)) +register_template(ChatmlTemplateMeta(RMTemplateType.internlm2_reward, default_system=INTERNLM_SYSTEM)) class InternLMXComposer2Template(Template): image_placeholder = [''] From d5dfcabc671e86c593c7ac33a5dd2025a6082346 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 15:16:59 +0800 Subject: [PATCH 14/47] update --- swift/llm/infer/infer_engine/pt_engine.py | 6 ++++-- swift/llm/model/model/reward_model.py | 20 +------------------- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/swift/llm/infer/infer_engine/pt_engine.py b/swift/llm/infer/infer_engine/pt_engine.py index 63317a8d20..b355abccd7 100644 --- a/swift/llm/infer/infer_engine/pt_engine.py +++ b/swift/llm/infer/infer_engine/pt_engine.py @@ -282,10 +282,12 @@ def _infer_seq_cls(self, inputs.pop('labels') logits = self.model(**inputs, **call_kwargs).logits if logits.shape[-1] > 1: - logprobs = torch.log_softmax(logits, -1) preds = torch.argmax(logits, dim=-1).tolist() + logprobs = torch.log_softmax(logits, -1) + logprobs = [self._get_seq_cls_logprobs(logprobs[i]) for i in range(preds)] else: preds = logits.squeeze(dim=-1).tolist() + logprobs = [None] * len(preds) res = [] for i, pred in enumerate(preds): usage_info = self._get_usage_info(num_prompt_tokens, 1) @@ -294,7 +296,7 @@ def _infer_seq_cls(self, index=0, message=ChatMessage(role='assistant', content=str(pred), tool_calls=None), finish_reason='stop', - logprobs=self._get_seq_cls_logprobs(logprobs[i])) + logprobs=logprobs[i]) ] res.append(ChatCompletionResponse(model=self.model_name, choices=choices, usage=usage_info)) return res diff --git a/swift/llm/model/model/reward_model.py b/swift/llm/model/model/reward_model.py index 85435d3ea1..b5a72c12db 100644 --- a/swift/llm/model/model/reward_model.py +++ b/swift/llm/model/model/reward_model.py @@ -3,32 +3,14 @@ from swift.llm import TemplateType from swift.utils import get_logger -from ..constant import LLMModelType +from ..constant import RMModelType from ..register import Model, ModelGroup, ModelMeta, get_model_tokenizer_from_local, register_model logger = get_logger() -def get_model_tokenizer_reward_model(model_dir, *args, **kwargs): - model_config = AutoConfig.from_pretrained(model_dir, 
trust_remote_code=True) - if 'AutoModel' in (getattr(model_config, 'auto_map', None) or {}): - kwargs['automodel_class'] = AutoModel - return get_model_tokenizer_from_local(model_dir, *args, **kwargs) -register_model( - ModelMeta( - LLMModelType.internlm2_reward, [ - ModelGroup([ - Model('Shanghai_AI_Laboratory/internlm2-1_8b-reward', 'internlm/internlm2-1_8b-reward'), - Model('Shanghai_AI_Laboratory/internlm2-7b-reward', 'internlm/internlm2-7b-reward'), - Model('Shanghai_AI_Laboratory/internlm2-20b-reward', 'internlm/internlm2-20b-reward'), - ]), - ], - TemplateType.internlm2_reward, - get_model_tokenizer_reward_model, - tags=['reward_model'])) - # ModelGroup([ # Model('Qwen/Qwen2.5-Math-RM-72B', 'Qwen/Qwen2.5-Math-RM-72B'), From d6097529eb7db87a2f24e70459a327124e74a710 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 15:17:44 +0800 Subject: [PATCH 15/47] update --- ...5\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" | 1 + docs/source_en/Instruction/Command-line-parameters.md | 1 + 2 files changed, 2 insertions(+) diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 5197429f21..4ed3931f77 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -419,6 +419,7 @@ App参数继承于[部署参数](#部署参数), [Web-UI参数](#Web-UI参数) - INPUT_SIZE: 默认为448 ### internvl2, internvl2_phi3, internvl2_5 +参数含义可以查看[这里](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B) - MAX_NUM: 默认为12 - INPUT_SIZE: 默认为448 - VIDEO_MAX_NUM: 默认为1。视频的MAX_NUM diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index a0eab37fc4..c4345b6405 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -419,6 +419,7 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m - INPUT_SIZE: Default is 448 ### internvl2, internvl2_phi3, internvl2_5 +For the meaning of the arguments, please refer to [here](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B) - MAX_NUM: Default is 12 - INPUT_SIZE: Default is 448 - VIDEO_MAX_NUM: Default is 1, which is the MAX_NUM for videos From 80157a61516d0c3f900a8621861c01a517b98776 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 15:22:29 +0800 Subject: [PATCH 16/47] fix bugs --- ...44\350\241\214\345\217\202\346\225\260.md" | 10 +++---- .../Instruction/Command-line-parameters.md | 10 +++---- examples/train/multimodal/caption.sh | 2 +- examples/train/multimodal/infer.sh | 1 + swift/llm/infer/infer_engine/pt_engine.py | 29 ++++++++++++------- swift/llm/model/constant.py | 6 ++-- swift/llm/model/model/bert.py | 6 ++-- 7 files changed, 38 insertions(+), 26 deletions(-) diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 4ed3931f77..4af7091fee 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -19,7 +19,7 @@ - 🔥model: 模型id或模型本地路径。如果是自定义模型请配合`model_type`和`template`使用,具体可以参考[自定义模型](../Customization/自定义模型.md) - model_type: 
模型类型。相同的模型架构、template、模型加载过程被定义为一个model_type - model_revision: 模型版本 -- task_type: 默认为'causal_lm'. 可选为'causal_lm', 'seq_cls'. 例子可以查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls). +- task_type: 默认为'causal_lm'. 可选为'causal_lm', 'seq_cls'. seq_cls的例子可以查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls). - 🔥torch_dtype: 模型权重的数据类型,支持`float16`,`bfloat16`,`float32`,默认从config文件中读取 - attn_impl: attention类型,支持`flash_attn`, `sdpa`, `eager`,默认使用sdpa - num_labels: 分类模型需要指定。代表标签数量,默认为None @@ -45,7 +45,7 @@ ### 模板参数 - 🔥template: 对话模板类型,默认使用model对应的template类型。`swift pt`会将对话模版转为生成模板使用 - 🔥system: 自定义system字段,默认为None,使用template的默认system -- 🔥max_length: 单样本的tokens最大长度,默认为None,不做限制 +- 🔥max_length: 单样本的tokens最大长度。默认为None,设置为模型支持的tokens最大长度(max_model_len) - truncation_strategy: 如果超长如何处理,支持`delete`, `left`和`right`,代表删除、左侧裁剪和右侧裁剪,默认为'delete' - 🔥max_pixels: 多模态模型图片前处理的最大像素数(H\*W),默认不缩放。 - tools_prompt: 智能体训练时的工具列表转为system的格式,请参考[智能体训练](./智能体的支持.md),默认为'react_en' @@ -96,7 +96,7 @@ - lr_scheduler_type: lr_scheduler类型,默认为cosine - lr_scheduler_kwargs: lr_scheduler其他参数 - 🔥gradient_checkpointing_kwargs: 传入`torch.utils.checkpoint`中的参数. 例如设置为`--gradient_checkpointing_kwargs '{"use_reentrant": false}'` -- report_to: 默认值为`tensorboard` +- report_to: 默认值为`tensorboard`。你也可以指定`--report_to tensorboard wandb`, `--report_to all` - remove_unused_columns: 默认值False - logging_first_step: 是否记录第一个step的打印,默认值True - logging_steps: 日志打印间隔,默认值5 @@ -139,7 +139,7 @@ #### 全参 - freeze_parameters: 被冻结参数的前缀, 默认为`[]` - freeze_parameters_ratio: 从下往上冻结的参数比例, 默认为0. 可设置为1将所有参数冻结, 结合`trainable_parameters`设置可训练参数. -- trainable_parameters: 可训练参数的前缀, 默认为`[]` +- trainable_parameters: 可训练参数的前缀, 默认为`[]`. `trainable_parameters`的优先级高于`freeze_parameters`和`freeze_parameters_ratio` #### LoRA - 🔥lora_rank: 默认为`8` @@ -306,7 +306,7 @@ Vera使用`target_modules`, `target_regex`, `modules_to_save`三个参数. ### RLHF参数 RLHF参数继承于[训练参数](#训练参数) -- 🔥rlhf_type: 对齐算法类型,支持`dpo`, `orpo`, `simpo`, `kto`, `cpo` +- 🔥rlhf_type: 对齐算法类型,支持`dpo`, `orpo`, `simpo`, `kto`, `cpo`, `rm` - ref_model: DPO等算法中的原始对比模型 - ref_model_type: 同model_type - ref_model_revision: 同model_revision diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index c4345b6405..5fba09ac7b 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -20,7 +20,7 @@ The introduction to command line parameters will cover base arguments, atomic ar - model_type: Model type. The same model architecture, template, and loading process define a model_type. - model_revision: Model version. - 🔥torch_dtype: Data type for model weights, supports `float16`, `bfloat16`, `float32`, default is read from the config file. -- task_type: Defaults to 'causal_lm'. Options include 'causal_lm' and 'seq_cls'. You can view examples [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls). +- task_type: Defaults to 'causal_lm'. Options include 'causal_lm' and 'seq_cls'. You can view examples of seq_cls [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls). - attn_impl: Attention type, supports `flash_attn`, `sdpa`, `eager`, default is sdpa. - num_labels: To be specified for classification models, representing the number of labels, default is None. - rope_scaling: Rope type, supports `linear` and `dynamic`, to be used with `max_length`. 
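The `task_type` / `num_labels` pair documented above corresponds to the sequence-classification example touched later in this patch series (`examples/train/seq_cls`). A minimal sketch, assuming, as that example's comment states, that passing `--num_labels` is what switches the run to classification (model and dataset are taken from that example):

```shell
# Sketch only: LoRA fine-tuning of a binary sentiment classifier (seq_cls).
CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model Qwen/Qwen2.5-0.5B \
    --dataset 'DAMO_NLP/jd:cls#2000' \
    --train_type lora \
    --torch_dtype bfloat16 \
    --num_labels 2
```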
@@ -45,7 +45,7 @@ The introduction to command line parameters will cover base arguments, atomic ar ### Template Arguments - 🔥template: Type of dialogue template, which defaults to the template type corresponding to the model. `swift pt` will convert the dialogue template into a generation template for use. - 🔥system: Custom system field, default is None, uses the default system of the template. -- 🔥max_length: Maximum length of tokens for a single sample, default is None (no limit). +- 🔥max_length: The maximum length of tokens for a single sample. Defaults to None, set to the maximum length of tokens supported by the model (max_model_len). - truncation_strategy: How to handle overly long tokens, supports `delete`, `left`, `right`, representing deletion, left trimming, and right trimming, default is 'delete'. - 🔥max_pixels: Maximum pixel count for pre-processing images in multimodal models (H*W), default is no scaling. - tools_prompt: The list of tools for agent training converted to system format, refer to [Agent Training](./Agent-support.md), default is 'react_en'. @@ -97,7 +97,7 @@ This parameter list inherits from transformers `Seq2SeqTrainingArguments`, with - lr_scheduler_type: LR scheduler type, default is cosine. - lr_scheduler_kwargs: Other parameters for the LR scheduler. - 🔥gradient_checkpointing_kwargs: Parameters passed to `torch.utils.checkpoint`. For example, set to `--gradient_checkpointing_kwargs '{"use_reentrant": false}'`. -- report_to: Default is `tensorboard`. +- report_to: Default is `tensorboard`. You can also specify `--report_to tensorboard wandb`, `--report_to all`. - remove_unused_columns: Default is False. - logging_first_step: Whether to log the first step print, default is True. - logging_steps: Interval for logging prints, default is 5. @@ -141,7 +141,7 @@ Other important parameters: - freeze_parameters: Prefix of parameters to be frozen, default is `[]`. - freeze_parameters_ratio: Ratio of parameters to freeze from the bottom up, default is 0. Setting it to 1 will freeze all parameters. Combine with `trainable_parameters` to set trainable parameters. -- trainable_parameters: Prefix of trainable parameters, default is `[]`. +- trainable_parameters: Prefix of trainable parameters, default is `[]`. The priority of `trainable_parameters` is higher than that of `freeze_parameters` and `freeze_parameters_ratio`. #### LoRA @@ -310,7 +310,7 @@ Training arguments include the [base arguments](#base-arguments), [Seq2SeqTraine RLHF arguments inherit from the [training arguments](#training-arguments). -- 🔥rlhf_type: Alignment algorithm type, supports `dpo`, `orpo`, `simpo`, `kto`, `cpo`. +- 🔥rlhf_type: Alignment algorithm type, supports `dpo`, `orpo`, `simpo`, `kto`, `cpo`, `rm`. - ref_model: Original comparison model in algorithms like DPO. - ref_model_type: Same as model_type. - ref_model_revision: Same as model_revision. 
diff --git a/examples/train/multimodal/caption.sh b/examples/train/multimodal/caption.sh index e75ce7f78a..c9d496fc0f 100644 --- a/examples/train/multimodal/caption.sh +++ b/examples/train/multimodal/caption.sh @@ -5,7 +5,7 @@ CUDA_VISIBLE_DEVICES=0 \ MAX_PIXELS=1003520 \ swift sft \ --model Qwen/Qwen2-VL-7B-Instruct \ - --dataset 'modelscope/coco_2014_caption#20000' \ + --dataset 'modelscope/coco_2014_caption:validation#20000' \ --train_type lora \ --torch_dtype bfloat16 \ --num_train_epochs 1 \ diff --git a/examples/train/multimodal/infer.sh b/examples/train/multimodal/infer.sh index 2e8627319f..699ede32d7 100644 --- a/examples/train/multimodal/infer.sh +++ b/examples/train/multimodal/infer.sh @@ -1,5 +1,6 @@ # Perform inference using the validation set from the training phase. CUDA_VISIBLE_DEVICES=0 \ +MAX_PIXELS=1003520 \ swift infer \ --adapters output/vx-xxx/checkpoint-xxx \ --stream true \ diff --git a/swift/llm/infer/infer_engine/pt_engine.py b/swift/llm/infer/infer_engine/pt_engine.py index f68f32da86..fdf2063a52 100644 --- a/swift/llm/infer/infer_engine/pt_engine.py +++ b/swift/llm/infer/infer_engine/pt_engine.py @@ -159,11 +159,13 @@ def _infer_stream(self, raise ValueError(error_msg) streamer = TokensIteratorStreamer() generate_kwargs = { - 'adapter_names': self._get_adapter_names(adapter_request), 'generation_config': generation_config, 'streamer': streamer, **inputs, } + adapter_names = self._get_adapter_names(adapter_request) + if adapter_names is not None: + generate_kwargs['adapter_names'] = adapter_names num_prompt_tokens = self._get_num_tokens(inputs) logits_streamer = None @@ -272,12 +274,20 @@ def _infer_seq_cls(self, inputs: Dict[str, Any], adapter_request: Optional[AdapterRequest] = None, **kwargs): - call_kwargs = {'adapter_names': self._get_adapter_names(adapter_request)} + call_kwargs = {} + adapter_names = self._get_adapter_names(adapter_request) + if adapter_names is not None: + call_kwargs['adapter_names'] = adapter_names num_prompt_tokens = self._get_num_tokens(inputs) inputs.pop('labels') logits = self.model(**inputs, **call_kwargs).logits - logprobs = torch.log_softmax(logits, -1) - preds = torch.argmax(logits, dim=-1).tolist() + if logits.shape[-1] > 1: + preds = torch.argmax(logits, dim=-1).tolist() + logprobs = torch.log_softmax(logits, -1) + logprobs = [self._get_seq_cls_logprobs(logprobs[i]) for i in range(preds)] + else: + preds = logits.squeeze(dim=-1).tolist() + logprobs = [None] * len(preds) res = [] for i, pred in enumerate(preds): usage_info = self._get_usage_info(num_prompt_tokens, 1) @@ -286,7 +296,7 @@ def _infer_seq_cls(self, index=0, message=ChatMessage(role='assistant', content=str(pred), tool_calls=None), finish_reason='stop', - logprobs=self._get_seq_cls_logprobs(logprobs[i])) + logprobs=logprobs[i]) ] res.append(ChatCompletionResponse(model=self.model_name, choices=choices, usage=usage_info)) return res @@ -299,11 +309,10 @@ def _infer_full(self, adapter_request: Optional[AdapterRequest] = None, template_inputs=None) -> Union[List[ChatCompletionResponse]]: # bos_token TODO: encoder-decoder - generate_kwargs = { - 'adapter_names': self._get_adapter_names(adapter_request), - 'generation_config': generation_config, - **inputs - } + generate_kwargs = {'generation_config': generation_config, **inputs} + adapter_names = self._get_adapter_names(adapter_request) + if adapter_names is not None: + generate_kwargs['adapter_names'] = adapter_names num_prompt_tokens = self._get_num_tokens(inputs) generate_kwargs = 
template.prepare_generate_kwargs(generate_kwargs, model=self.model) diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py index c7b6dad4c1..ed07cf875b 100644 --- a/swift/llm/model/constant.py +++ b/swift/llm/model/constant.py @@ -93,7 +93,9 @@ class LLMModelType: mamba = 'mamba' polylm = 'polylm' aya = 'aya' - # bert + + +class BertModelType: modern_bert = 'modern_bert' bert = 'bert' @@ -174,7 +176,7 @@ class MLLMModelType: megrez_omni = 'megrez_omni' -class ModelType(LLMModelType, MLLMModelType): +class ModelType(LLMModelType, MLLMModelType, BertModelType): @classmethod def get_model_name_list(cls) -> List[str]: diff --git a/swift/llm/model/model/bert.py b/swift/llm/model/model/bert.py index f83aef3536..785a3fa137 100644 --- a/swift/llm/model/model/bert.py +++ b/swift/llm/model/model/bert.py @@ -2,7 +2,7 @@ from transformers import AutoConfig from swift.utils import get_logger -from ..constant import LLMModelType +from ..constant import BertModelType from ..register import Model, ModelGroup, ModelMeta, get_model_tokenizer_from_local, register_model logger = get_logger() @@ -17,7 +17,7 @@ def get_model_tokenizer_modern_bert(model_dir, *args, **kwargs): register_model( ModelMeta( - LLMModelType.modern_bert, [ + BertModelType.modern_bert, [ ModelGroup([ Model('answerdotai/ModernBERT-base', 'answerdotai/ModernBERT-base'), Model('answerdotai/ModernBERT-large', 'answerdotai/ModernBERT-large'), @@ -30,7 +30,7 @@ def get_model_tokenizer_modern_bert(model_dir, *args, **kwargs): register_model( ModelMeta( - LLMModelType.bert, [ModelGroup([ + BertModelType.bert, [ModelGroup([ Model('iic/nlp_structbert_backbone_base_std'), ])], None, From 106f588a12acdaa8ff0ff6b78b1b444c5047cf99 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 15:26:47 +0800 Subject: [PATCH 17/47] fix --- swift/llm/infer/infer_engine/pt_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/infer/infer_engine/pt_engine.py b/swift/llm/infer/infer_engine/pt_engine.py index fdf2063a52..227fd13ca9 100644 --- a/swift/llm/infer/infer_engine/pt_engine.py +++ b/swift/llm/infer/infer_engine/pt_engine.py @@ -284,7 +284,7 @@ def _infer_seq_cls(self, if logits.shape[-1] > 1: preds = torch.argmax(logits, dim=-1).tolist() logprobs = torch.log_softmax(logits, -1) - logprobs = [self._get_seq_cls_logprobs(logprobs[i]) for i in range(preds)] + logprobs = [self._get_seq_cls_logprobs(logprobs[i]) for i in range(len(preds))] else: preds = logits.squeeze(dim=-1).tolist() logprobs = [None] * len(preds) From 774b115bd23a89292e7769cbc40e923fa7da6fd4 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 15:51:02 +0800 Subject: [PATCH 18/47] update --- ...\275\344\273\244\350\241\214\345\217\202\346\225\260.md" | 6 +++--- docs/source_en/Instruction/Command-line-parameters.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 4af7091fee..ed0cc39fc1 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -403,15 +403,15 @@ App参数继承于[部署参数](#部署参数), [Web-UI参数](#Web-UI参数) - IMAGE_FACTOR: 默认为28 - MIN_PIXELS: 默认为`4 * 28 * 28` -- MAX_PIXELS: 默认为`16384 * 28 * 
28`,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/ocr.sh#L3) +- 🔥MAX_PIXELS: 默认为`16384 * 28 * 28`,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/ocr.sh#L3) - MAX_RATIO: 默认为200 - VIDEO_MIN_PIXELS: 默认为`128 * 28 * 28` -- VIDEO_MAX_PIXELS: 默认为`768 * 28 * 28`,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/video.sh#L7) +- 🔥VIDEO_MAX_PIXELS: 默认为`768 * 28 * 28`,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/video.sh#L7) - VIDEO_TOTAL_PIXELS: 默认为`24576 * 28 * 28` - FRAME_FACTOR: 默认为2 - FPS: 默认为2.0 - FPS_MIN_FRAMES: 默认为4 -- FPS_MAX_FRAMES: 默认为768,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/video.sh#L8) +- 🔥FPS_MAX_FRAMES: 默认为768,参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/video.sh#L8) ### internvl, internvl_phi3 参数含义可以查看[这里](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5) diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 5fba09ac7b..310f9b0406 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -403,15 +403,15 @@ For the meaning of the arguments, please refer to [here](https://github.com/Qwen - IMAGE_FACTOR: Default is 28 - MIN_PIXELS: Default is `4 * 28 * 28` -- MAX_PIXELS: Default is `16384 * 28 * 28`, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/ocr.sh#L3) +- 🔥MAX_PIXELS: Default is `16384 * 28 * 28`, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/ocr.sh#L3) - MAX_RATIO: Default is 200 - VIDEO_MIN_PIXELS: Default is `128 * 28 * 28` -- VIDEO_MAX_PIXELS: Default is `768 * 28 * 28`, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/video.sh#L7) +- 🔥VIDEO_MAX_PIXELS: Default is `768 * 28 * 28`, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/video.sh#L7) - VIDEO_TOTAL_PIXELS: Default is `24576 * 28 * 28` - FRAME_FACTOR: Default is 2 - FPS: Default is 2.0 - FPS_MIN_FRAMES: Default is 4 -- FPS_MAX_FRAMES: Default is 768, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/video.sh#L8) +- 🔥FPS_MAX_FRAMES: Default is 768, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/video.sh#L8) ### internvl, internvl_phi3 For the meaning of the arguments, please refer to [here](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5) From 8e00e42536f22e15d7777ec1c5ee8bb9113a332d Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 16:35:42 +0800 Subject: [PATCH 19/47] update --- swift/llm/model/constant.py | 4 ++++ swift/llm/model/model/internlm.py | 6 ++---- swift/llm/model/register.py | 6 ++++-- swift/llm/template/base.py | 3 ++- swift/llm/template/template/internlm.py | 5 ++++- tests/test_align/test_template/test_llm.py | 25 ++++++++++++++++++++-- 6 files changed, 39 insertions(+), 10 deletions(-) diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py index 63c0a82f90..83d438ec71 100644 --- a/swift/llm/model/constant.py +++ b/swift/llm/model/constant.py @@ -2,6 +2,7 @@ # Classification criteria for model_type: same model architecture, tokenizer (get function), template. 
from typing import List + class LLMModelType: qwen = 'qwen' qwen2 = 'qwen2' @@ -93,13 +94,16 @@ class LLMModelType: polylm = 'polylm' aya = 'aya' + class BertModelType: modern_bert = 'modern_bert' bert = 'bert' + class RMModelType: internlm2_reward = 'internlm2_reward' + class MLLMModelType: qwen_vl = 'qwen_vl' qwen_audio = 'qwen_audio' diff --git a/swift/llm/model/model/internlm.py b/swift/llm/model/model/internlm.py index 45b65f03a9..868fa25cb8 100644 --- a/swift/llm/model/model/internlm.py +++ b/swift/llm/model/model/internlm.py @@ -8,9 +8,8 @@ from ..constant import LLMModelType, MLLMModelType, RMModelType from ..model_arch import ModelArch from ..patcher import patch_output_clone, patch_output_to_input_device -from ..register import ( - Model, ModelGroup, ModelMeta, get_model_tokenizer_with_flash_attn, register_model, get_model_tokenizer_reward_model -) +from ..register import (Model, ModelGroup, ModelMeta, get_model_tokenizer_reward_model, + get_model_tokenizer_with_flash_attn, register_model) from ..utils import ModelInfo, safe_snapshot_download, use_submodel_func register_model( @@ -335,7 +334,6 @@ def get_model_tokenizer_xcomposer_ol(model_dir, *args, **kwargs): tags=['audio'], )) - register_model( ModelMeta( RMModelType.internlm2_reward, [ diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py index 6fed4f7073..46da0015d4 100644 --- a/swift/llm/model/register.py +++ b/swift/llm/model/register.py @@ -8,8 +8,8 @@ import torch from peft import PeftModel -from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, PretrainedConfig, - PreTrainedModel, PreTrainedTokenizerBase, AutoModel) +from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, + PretrainedConfig, PreTrainedModel, PreTrainedTokenizerBase) from transformers.integrations import is_deepspeed_zero3_enabled from transformers.utils import is_torch_bf16_gpu_available, is_torch_cuda_available, is_torch_npu_available, strtobool from transformers.utils.versions import require_version @@ -216,12 +216,14 @@ def get_model_tokenizer_multimodal(model_dir: str, *args, **kwargs): model, _ = get_model_tokenizer_with_flash_attn(model_dir, *args, **kwargs) return model, processor + def get_model_tokenizer_reward_model(model_dir, *args, **kwargs): model_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) if 'AutoModel' in (getattr(model_config, 'auto_map', None) or {}): kwargs['automodel_class'] = AutoModel return get_model_tokenizer_with_flash_attn(model_dir, *args, **kwargs) + def fix_do_sample_warning(generation_config: GenerationConfig) -> None: # Use the default values of temperature/top_p/top_k in generation_config. 
if generation_config.temperature == 0: diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py index d0ceec287d..ba41907066 100644 --- a/swift/llm/template/base.py +++ b/swift/llm/template/base.py @@ -541,7 +541,8 @@ def _jinja_encode(self, inputs: StdTemplateInputs): messages.insert(0, {'role': 'system', 'content': inputs.system}) if messages[-1]['content'] is None: messages.pop() - text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + add_generation_prompt = messages[-1]['role'] != 'assistant' + text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=add_generation_prompt) answer_len = 1 if self.is_training else 0 return [text], [1.], answer_len diff --git a/swift/llm/template/template/internlm.py b/swift/llm/template/template/internlm.py index ba917f766c..53095b565f 100644 --- a/swift/llm/template/template/internlm.py +++ b/swift/llm/template/template/internlm.py @@ -34,7 +34,10 @@ register_template(ChatmlTemplateMeta(LLMTemplateType.internlm2, default_system=INTERNLM_SYSTEM)) -register_template(ChatmlTemplateMeta(RMTemplateType.internlm2_reward, default_system=INTERNLM_SYSTEM)) +register_template( + ChatmlTemplateMeta( + RMTemplateType.internlm2_reward, default_system=INTERNLM_SYSTEM, suffix=['<|im_end|>\n<|reward|>'])) + class InternLMXComposer2Template(Template): image_placeholder = [''] diff --git a/tests/test_align/test_template/test_llm.py b/tests/test_align/test_template/test_llm.py index b997ec4fb6..3d98b2d20a 100644 --- a/tests/test_align/test_template/test_llm.py +++ b/tests/test_align/test_template/test_llm.py @@ -2,7 +2,7 @@ import torch -os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3' +os.environ['CUDA_VISIBLE_DEVICES'] = '0' os.environ['SWIFT_DEBUG'] = '1' @@ -17,6 +17,8 @@ def _infer_model(pt_engine, system=None, messages=None): resp = pt_engine.infer([{'messages': messages}], request_config=request_config) response = resp[0].choices[0].message.content messages += [{'role': 'assistant', 'content': response}, {'role': 'user', 'content': '这是什么'}] + else: + messages = messages.copy() resp = pt_engine.infer([{ 'messages': messages, }], request_config=request_config) @@ -61,6 +63,7 @@ def test_internlm(): def test_internlm2(): + # pt_engine = PtEngine('Shanghai_AI_Laboratory/internlm2-1_8b') pt_engine = PtEngine('Shanghai_AI_Laboratory/internlm2_5-1_8b-chat') _infer_model(pt_engine) pt_engine.default_template.template_backend = 'jinja' @@ -160,6 +163,23 @@ def test_skywork_o1(): '8 + 1 = 9\n \\]\n4. **Apples Split Equally') +def test_internlm2_reward(): + pt_engine = PtEngine('Shanghai_AI_Laboratory/internlm2-1_8b-reward') + messages = [{ + 'role': 'user', + 'content': "Hello! What's your name?" + }, { + 'role': 'assistant', + 'content': 'My name is InternLM2! A helpful AI assistant. What can I do for you?' 
+ }] + pt_engine.task_type = 'seq_cls' + res = _infer_model(pt_engine, messages=messages) + pt_engine.default_template.template_backend = 'jinja' + res2 = _infer_model(pt_engine, messages=messages) + assert res2 == '0.48681640625' + print() + + if __name__ == '__main__': from swift.llm import PtEngine, RequestConfig, get_template, get_model_tokenizer, VllmEngine from swift.utils import get_logger, seed_everything @@ -179,4 +199,5 @@ def test_skywork_o1(): # test_llama() # test_openbuddy() # test_megrez() - test_skywork_o1() + # test_skywork_o1() + test_internlm2_reward() From 1fd06c6b2c2509f328106860ee3de4b1b94f838f Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 16:37:27 +0800 Subject: [PATCH 20/47] update --- swift/llm/infer/infer_engine/pt_engine.py | 3 ++- swift/llm/model/model/reward_model.py | 12 ++++-------- swift/llm/template/base.py | 21 +++++++-------------- swift/llm/template/template/internlm.py | 3 +-- 4 files changed, 14 insertions(+), 25 deletions(-) diff --git a/swift/llm/infer/infer_engine/pt_engine.py b/swift/llm/infer/infer_engine/pt_engine.py index 227fd13ca9..070d8c85b1 100644 --- a/swift/llm/infer/infer_engine/pt_engine.py +++ b/swift/llm/infer/infer_engine/pt_engine.py @@ -314,12 +314,13 @@ def _infer_full(self, if adapter_names is not None: generate_kwargs['adapter_names'] = adapter_names num_prompt_tokens = self._get_num_tokens(inputs) - + template.debug_logger(inputs) # debug generate_kwargs = template.prepare_generate_kwargs(generate_kwargs, model=self.model) output = dict(template.generate(self.model, **generate_kwargs)) output.pop('past_key_values', None) batched_generate_ids = output['sequences'] batched_generate_ids = template.get_generate_ids(batched_generate_ids, num_prompt_tokens) + template.debug_logger({'generate_ids': batched_generate_ids}) # debug batched_logprobs = self.preprocess_logits( output.get('logits'), batched_generate_ids, generation_config.top_logprobs) diff --git a/swift/llm/model/model/reward_model.py b/swift/llm/model/model/reward_model.py index b5a72c12db..30a9d5660d 100644 --- a/swift/llm/model/model/reward_model.py +++ b/swift/llm/model/model/reward_model.py @@ -8,11 +8,7 @@ logger = get_logger() - - - - - # ModelGroup([ - # Model('Qwen/Qwen2.5-Math-RM-72B', 'Qwen/Qwen2.5-Math-RM-72B'), - # Model('Qwen/Qwen2-Math-RM-72B', 'Qwen/Qwen2-Math-RM-72B'), - # ]), \ No newline at end of file +# ModelGroup([ +# Model('Qwen/Qwen2.5-Math-RM-72B', 'Qwen/Qwen2.5-Math-RM-72B'), +# Model('Qwen/Qwen2-Math-RM-72B', 'Qwen/Qwen2-Math-RM-72B'), +# ]), diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py index ba41907066..940f3df642 100644 --- a/swift/llm/template/base.py +++ b/swift/llm/template/base.py @@ -674,23 +674,16 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: encoded[k] = None return encoded - def _debug_logger(self, generate_ids): - if isinstance(generate_ids, list) or isinstance(generate_ids, torch.Tensor) and generate_ids.ndim == 1: - generate_ids = [generate_ids] - for tokens in generate_ids: - if isinstance(tokens, torch.Tensor): - tokens = tokens.tolist() - logger.info(f'[GENERATE_IDS] {tokens}') - logger.info(f'[GENERATE] {self.safe_decode(tokens)}\n' + '-' * 50) + def debug_logger(self, inputs): + if not strtobool(os.getenv('SWIFT_DEBUG', 'false')): + return + self.print_inputs(inputs) def get_generate_ids(self, generate_ids: Union[torch.Tensor, List[int]], num_prompt_tokens: int) -> Union[torch.Tensor, List[int]]: - if strtobool(os.getenv('SWIFT_DEBUG', 'false')): - 
self._debug_logger(generate_ids) if self.skip_prompt: - return generate_ids[..., num_prompt_tokens:] - else: - return generate_ids + generate_ids = generate_ids[..., num_prompt_tokens:] + return generate_ids def post_process_generate_response(self, response: str, inputs: StdTemplateInputs) -> str: return response @@ -944,7 +937,7 @@ def _torchacc_xtuner_data_collator(self, res, padding_to, tokenizer, padding_sid def print_inputs(self, inputs: Dict[str, Any], tokenizer_kwargs: Optional[Dict[str, Any]] = None) -> None: if tokenizer_kwargs is None: tokenizer_kwargs = {} - for key in ['input', 'labels', 'chosen_input', 'chosen_labels', 'rejected_input', 'rejected_labels']: + for key in ['input', 'labels', 'generate_ids', 'chosen_input', 'chosen_labels', 'rejected_input', 'rejected_labels']: val = inputs.get(key) # fix val is a tensor if val is None: val = inputs.get(f'{key}_ids') diff --git a/swift/llm/template/template/internlm.py b/swift/llm/template/template/internlm.py index 53095b565f..4d596d8a14 100644 --- a/swift/llm/template/template/internlm.py +++ b/swift/llm/template/template/internlm.py @@ -35,8 +35,7 @@ register_template(ChatmlTemplateMeta(LLMTemplateType.internlm2, default_system=INTERNLM_SYSTEM)) register_template( - ChatmlTemplateMeta( - RMTemplateType.internlm2_reward, default_system=INTERNLM_SYSTEM, suffix=['<|im_end|>\n<|reward|>'])) + ChatmlTemplateMeta(RMTemplateType.internlm2_reward, suffix=['<|im_end|>\n<|reward|>'])) class InternLMXComposer2Template(Template): From 3735cbdad803e60fd509195d2c3874de4ac86e50 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 16:37:42 +0800 Subject: [PATCH 21/47] update --- tests/test_align/test_template/test_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_align/test_template/test_llm.py b/tests/test_align/test_template/test_llm.py index 3d98b2d20a..af6ba95ad2 100644 --- a/tests/test_align/test_template/test_llm.py +++ b/tests/test_align/test_template/test_llm.py @@ -176,7 +176,7 @@ def test_internlm2_reward(): res = _infer_model(pt_engine, messages=messages) pt_engine.default_template.template_backend = 'jinja' res2 = _infer_model(pt_engine, messages=messages) - assert res2 == '0.48681640625' + assert res == res2 == '0.48681640625' print() From c5b7022eb83a44e40f529ddfc2912e52f3fe8c02 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 20:16:34 +0800 Subject: [PATCH 22/47] update --- .../export/{merge_lora => }/merge_lora.sh | 0 examples/export/push_to_hub.sh | 5 ++ examples/infer/demo_rm.py | 33 ++++++++++ examples/train/seq_cls/bert/deploy.sh | 5 +- examples/train/seq_cls/bert/infer.sh | 3 +- examples/train/seq_cls/qwen2_5/deploy.sh | 2 +- examples/train/seq_cls/qwen2_5/sft.sh | 3 +- swift/llm/__init__.py | 7 +-- swift/llm/argument/base_args/base_args.py | 4 +- swift/llm/argument/base_args/model_args.py | 3 +- swift/llm/dataset/dataset/llm.py | 31 +++------- swift/llm/dataset/preprocessor/__init__.py | 6 +- swift/llm/dataset/preprocessor/core.py | 8 +++ swift/llm/dataset/preprocessor/extra.py | 2 +- swift/llm/infer/infer.py | 14 +++-- swift/llm/infer/infer_engine/pt_engine.py | 8 +-- swift/llm/model/model/internlm.py | 2 +- swift/llm/model/register.py | 15 +++-- swift/llm/template/base.py | 17 +++++- swift/llm/template/template/internlm.py | 3 +- swift/llm/template/template_inputs.py | 1 - tests/test_align/test_cls.py | 60 +++++++++++++++++++ tests/test_align/test_template/test_llm.py | 1 - 23 files changed, 173 insertions(+), 60 deletions(-) rename 
examples/export/{merge_lora => }/merge_lora.sh (100%) create mode 100644 examples/export/push_to_hub.sh create mode 100644 examples/infer/demo_rm.py create mode 100644 tests/test_align/test_cls.py diff --git a/examples/export/merge_lora/merge_lora.sh b/examples/export/merge_lora.sh similarity index 100% rename from examples/export/merge_lora/merge_lora.sh rename to examples/export/merge_lora.sh diff --git a/examples/export/push_to_hub.sh b/examples/export/push_to_hub.sh new file mode 100644 index 0000000000..c4bcef7421 --- /dev/null +++ b/examples/export/push_to_hub.sh @@ -0,0 +1,5 @@ +CUDA_VISIBLE_DEVICES=0 swift export \ + --adapters output/vx-xxx/checkpoint-xxx \ + --push_to_hub true \ + --hub_model_id '' \ + --hub_token '' diff --git a/examples/infer/demo_rm.py b/examples/infer/demo_rm.py new file mode 100644 index 0000000000..7e32932a7c --- /dev/null +++ b/examples/infer/demo_rm.py @@ -0,0 +1,33 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from typing import List + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + + +def infer_batch(engine: 'InferEngine', infer_requests: List['InferRequest']): + request_config = RequestConfig(max_tokens=512, temperature=0) + resp_list = engine.infer(infer_requests, request_config) + query0 = infer_requests[0].messages[0]['content'] + print(f'query0: {query0}') + print(f'response0: {resp_list[0].choices[0].message.content}') + + +if __name__ == '__main__': + from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, load_dataset + model = 'Shanghai_AI_Laboratory/internlm2-1_8b-reward' + engine = PtEngine(model, max_batch_size=64) + # Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset. + dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], seed=42)[0] + print(f'dataset: {dataset}') + infer_requests = [InferRequest(**data) for data in dataset] + infer_batch(engine, infer_requests) + + messages = [{ + 'role': 'user', + 'content': "Hello! What's your name?" + }, { + 'role': 'assistant', + 'content': 'My name is InternLM2! A helpful AI assistant. What can I do for you?' 
+ }] + infer_batch(engine, [InferRequest(messages=messages)]) diff --git a/examples/train/seq_cls/bert/deploy.sh b/examples/train/seq_cls/bert/deploy.sh index 13825d3491..c2102b6932 100644 --- a/examples/train/seq_cls/bert/deploy.sh +++ b/examples/train/seq_cls/bert/deploy.sh @@ -1,9 +1,10 @@ CUDA_VISIBLE_DEVICES=0 \ swift deploy \ --model output/vx-xxx/checkpoint-xxx \ - --served_model_name bert-base-chinese + --served_model_name bert-base-chinese \ + --truncation_strategy right # curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ # "model": "bert-base-chinese", -# "messages": [{"role": "user", "content": "Task: Sentiment Classification\nSentence: 包装差,容易被调包。\nCategory: negative, positive\nOutput:"}] +# "messages": [{"role": "user", "content": "包装差,容易被调包。"}] # }' diff --git a/examples/train/seq_cls/bert/infer.sh b/examples/train/seq_cls/bert/infer.sh index abd8f1f02a..13bc1a1fd3 100644 --- a/examples/train/seq_cls/bert/infer.sh +++ b/examples/train/seq_cls/bert/infer.sh @@ -2,4 +2,5 @@ CUDA_VISIBLE_DEVICES=0 \ swift infer \ --model output/vx-xxx/checkpoint-xxx \ --load_data_args true \ - --max_batch_size 16 + --max_batch_size 16 \ + --truncation_strategy right diff --git a/examples/train/seq_cls/qwen2_5/deploy.sh b/examples/train/seq_cls/qwen2_5/deploy.sh index 5476dae499..3bc08c297c 100644 --- a/examples/train/seq_cls/qwen2_5/deploy.sh +++ b/examples/train/seq_cls/qwen2_5/deploy.sh @@ -4,5 +4,5 @@ swift deploy \ # curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ # "model": "Qwen2.5-7B", -# "messages": [{"role": "user", "content": "Task: Sentiment Classification\nSentence: 包装差,容易被调包。\nCategory: negative, positive\nOutput:"}] +# "messages": [{"role": "user", "content": "包装差,容易被调包。"}] # }' diff --git a/examples/train/seq_cls/qwen2_5/sft.sh b/examples/train/seq_cls/qwen2_5/sft.sh index 067c6664eb..fe33ee3729 100644 --- a/examples/train/seq_cls/qwen2_5/sft.sh +++ b/examples/train/seq_cls/qwen2_5/sft.sh @@ -1,8 +1,9 @@ # If `num_labels` is provided, it will be considered a classification task, # and AutoModelForSequenceClassification will be used to load the model. +# You can also specify `--model Qwen/Qwen2.5-0.5B-Instruct --use_chat_template true`. 
CUDA_VISIBLE_DEVICES=0 \ swift sft \ - --model Qwen/Qwen2.5-7B \ + --model Qwen/Qwen2.5-0.5B \ --train_type lora \ --dataset 'DAMO_NLP/jd:cls#2000' \ --torch_dtype bfloat16 \ diff --git a/swift/llm/__init__.py b/swift/llm/__init__.py index 8397076413..8279dca957 100644 --- a/swift/llm/__init__.py +++ b/swift/llm/__init__.py @@ -57,10 +57,9 @@ 'load_by_unsloth', 'git_clone_github', 'get_matched_model_meta' ], 'dataset': [ - 'AlpacaPreprocessor', 'ClsPreprocessor', 'ComposePreprocessor', 'MessagesPreprocessor', 'DATASET_MAPPING', - 'MediaResource', 'register_dataset', 'register_dataset_info', 'EncodePreprocessor', 'LazyLLMDataset', - 'ConstantLengthDataset', 'standard_keys', 'load_dataset', 'DATASET_TYPE', 'sample_dataset', - 'RowPreprocessor', 'ResponsePreprocessor', 'DatasetMeta' + 'AlpacaPreprocessor', 'MessagesPreprocessor', 'DATASET_MAPPING', 'MediaResource', 'register_dataset', + 'register_dataset_info', 'EncodePreprocessor', 'LazyLLMDataset', 'ConstantLengthDataset', 'standard_keys', + 'load_dataset', 'DATASET_TYPE', 'sample_dataset', 'RowPreprocessor', 'ResponsePreprocessor', 'DatasetMeta' ], 'utils': [ 'deep_getattr', 'to_device', 'History', 'Messages', 'history_to_messages', 'messages_to_history', diff --git a/swift/llm/argument/base_args/base_args.py b/swift/llm/argument/base_args/base_args.py index 4ca469ec03..086f6349e6 100644 --- a/swift/llm/argument/base_args/base_args.py +++ b/swift/llm/argument/base_args/base_args.py @@ -252,6 +252,4 @@ def get_model_processor(self, *, model=None, model_type=None, model_revision=Non kwargs['model_type'] = model_type or self.model_type kwargs['model_revision'] = model_revision or self.model_revision - model, processor = get_model_tokenizer(**kwargs) - model.model_info.task_type = self.task_type - return model, processor + return get_model_tokenizer(**kwargs) diff --git a/swift/llm/argument/base_args/model_args.py b/swift/llm/argument/base_args/model_args.py index 2f60e86b2d..5cc0bbbbf3 100644 --- a/swift/llm/argument/base_args/model_args.py +++ b/swift/llm/argument/base_args/model_args.py @@ -154,7 +154,6 @@ def get_model_kwargs(self): 'rope_scaling': self.rope_scaling, } if self.task_type == 'seq_cls': - from transformers import AutoModelForSequenceClassification - kwargs['automodel_class'] = AutoModelForSequenceClassification + kwargs['task_type'] = self.task_type kwargs['model_kwargs'] = {'num_labels': self.num_labels} return kwargs diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py index 56af8a47e6..6b035ca7d9 100644 --- a/swift/llm/dataset/dataset/llm.py +++ b/swift/llm/dataset/dataset/llm.py @@ -4,8 +4,8 @@ from functools import partial from typing import Any, Dict, List, Optional, Tuple, Union -from ..preprocessor import (AlpacaPreprocessor, ClsPreprocessor, MessagesPreprocessor, ResponsePreprocessor, - RowPreprocessor, TextGenerationPreprocessor) +from ..preprocessor import (AlpacaPreprocessor, ClsGenerationPreprocessor, ClsPreprocessor, MessagesPreprocessor, + ResponsePreprocessor, RowPreprocessor, TextGenerationPreprocessor) from ..register import DatasetMeta, SubsetDataset, register_dataset @@ -165,24 +165,13 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: ms_dataset_id='modelscope/clue', hf_dataset_id='clue', subsets=['cmnli'], - preprocess_func=ClsPreprocessor(['neutral', 'entailment', 'contradiction'], - task='Natural Language Inference', - is_pair_seq=True), + preprocess_func=ClsGenerationPreprocessor(['neutral', 'entailment', 'contradiction'], + task='Natural Language 
Inference', + is_pair_seq=True), tags=['text-generation', 'classification'], split=['train', 'validation'], )) - -class JdClsPreprocessor(ClsPreprocessor): - - def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]: - label = int(row['label']) - res = super().preprocess(row) - res['messages'].pop() - res['label'] = label - return res - - register_dataset( DatasetMeta( ms_dataset_id='DAMO_NLP/jd', @@ -190,15 +179,13 @@ def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]: SubsetDataset( 'default', 'default', - preprocess_func=ClsPreprocessor(['negative', 'positive'], - task='Sentiment Classification', - is_pair_seq=False)), + preprocess_func=ClsGenerationPreprocessor(['negative', 'positive'], + task='Sentiment Classification', + is_pair_seq=False)), SubsetDataset( 'cls', 'default', - preprocess_func=JdClsPreprocessor(['negative', 'positive'], - task='Sentiment Classification', - is_pair_seq=False), + preprocess_func=ClsPreprocessor(columns_mapping={'sentence': 'query'}), ), ], tags=['text-generation', 'classification', '🔥'], diff --git a/swift/llm/dataset/preprocessor/__init__.py b/swift/llm/dataset/preprocessor/__init__.py index 61b2f8cbcc..f9c5587bfa 100644 --- a/swift/llm/dataset/preprocessor/__init__.py +++ b/swift/llm/dataset/preprocessor/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from .core import (DATASET_TYPE, AlpacaPreprocessor, AutoPreprocessor, MessagesPreprocessor, ResponsePreprocessor, - RowPreprocessor, get_features_dataset, standard_keys) -from .extra import ClsPreprocessor, GroundingMixin, TextGenerationPreprocessor +from .core import (DATASET_TYPE, AlpacaPreprocessor, AutoPreprocessor, ClsPreprocessor, MessagesPreprocessor, + ResponsePreprocessor, RowPreprocessor, get_features_dataset, standard_keys) +from .extra import ClsGenerationPreprocessor, GroundingMixin, TextGenerationPreprocessor diff --git a/swift/llm/dataset/preprocessor/core.py b/swift/llm/dataset/preprocessor/core.py index 1f42f5f4a1..1a47f44a88 100644 --- a/swift/llm/dataset/preprocessor/core.py +++ b/swift/llm/dataset/preprocessor/core.py @@ -440,6 +440,14 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: return row +class ClsPreprocessor(ResponsePreprocessor): + + def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: + res = super().preprocess(row) + res['label'] = int(res['label']) + return res + + class AutoPreprocessor: def __init__(self, *, columns_mapping: Optional[Dict[str, str]] = None, **kwargs) -> None: diff --git a/swift/llm/dataset/preprocessor/extra.py b/swift/llm/dataset/preprocessor/extra.py index 06f64d104f..aa7ec4f72f 100644 --- a/swift/llm/dataset/preprocessor/extra.py +++ b/swift/llm/dataset/preprocessor/extra.py @@ -70,7 +70,7 @@ def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]: return super().preprocess(row) -class ClsPreprocessor(ResponsePreprocessor): +class ClsGenerationPreprocessor(ResponsePreprocessor): def __init__(self, labels: List[str], diff --git a/swift/llm/infer/infer.py b/swift/llm/infer/infer.py index 431f4868a0..f4b2c035fc 100644 --- a/swift/llm/infer/infer.py +++ b/swift/llm/infer/infer.py @@ -185,15 +185,21 @@ def infer_dataset(self) -> List[Dict[str, Any]]: if is_dist: val_dataset = val_dataset.shard(args.global_world_size, args.rank, contiguous=True) val_dataset = list(val_dataset) - labels_list = [InferRequest.remove_response(data['messages']) for data in val_dataset] + labels_list = [] + for data in val_dataset: + if args.task_type == 'causal_lm': + labels = 
InferRequest.remove_response(data['messages']) + else: + labels = data.pop('label', None) + if labels is not None: + labels = str(int(labels)) + labels_list.append(labels) resp_list = self.infer( val_dataset, request_config, template=self.template, use_tqdm=True, **self.infer_kwargs) for data, resp, labels in zip(val_dataset, resp_list, labels_list): response = resp.choices[0].message.content - if labels: - data['labels'] = labels - data = {'response': response, 'logprobs': resp.choices[0].logprobs, **data} + data = {'response': response, 'labels': labels, 'logprobs': resp.choices[0].logprobs, **data} result_list.append(data) if is_dist: total_result_list = [None for _ in range(args.global_world_size)] if args.rank == 0 else None diff --git a/swift/llm/infer/infer_engine/pt_engine.py b/swift/llm/infer/infer_engine/pt_engine.py index 070d8c85b1..609be33429 100644 --- a/swift/llm/infer/infer_engine/pt_engine.py +++ b/swift/llm/infer/infer_engine/pt_engine.py @@ -79,7 +79,6 @@ def __init__( for adapter in self.adapters: self._add_adapter(safe_snapshot_download(adapter, use_hf=use_hf, hub_token=hub_token)) self._post_init() - self.task_type = 'causal_lm' def _post_init(self): super()._post_init() @@ -97,7 +96,6 @@ def from_model_template(cls, model, template=None, *, max_batch_size: int = 1): self.processor = template.processor self.max_batch_size = max_batch_size self._post_init() - self.task_type = self.model_info.task_type return self def _prepare_generation_config(self, request_config: RequestConfig) -> _GenerationConfig: @@ -279,7 +277,7 @@ def _infer_seq_cls(self, if adapter_names is not None: call_kwargs['adapter_names'] = adapter_names num_prompt_tokens = self._get_num_tokens(inputs) - inputs.pop('labels') + inputs.pop('labels', None) logits = self.model(**inputs, **call_kwargs).logits if logits.shape[-1] > 1: preds = torch.argmax(logits, dim=-1).tolist() @@ -397,7 +395,7 @@ def _infer( template.model = self.model generation_config = None - if self.task_type == 'seq_cls': + if self.model_info.task_type == 'seq_cls': template.set_mode('seq_cls') else: template.set_mode('pt') @@ -414,7 +412,7 @@ def _infer( inputs = to_device(template.data_collator(batched_inputs), self.model.device) if self.model.model_meta.is_multimodal: _, inputs = template.pre_forward_hook(self.model, None, inputs) - if self.task_type != 'seq_cls': + if self.model_info.task_type == 'causal_lm': self.set_default_max_tokens(request_config, inputs) generation_config = self._prepare_generation_config(request_config) self._add_stop_words(generation_config, request_config, template) diff --git a/swift/llm/model/model/internlm.py b/swift/llm/model/model/internlm.py index 868fa25cb8..63faa20b8f 100644 --- a/swift/llm/model/model/internlm.py +++ b/swift/llm/model/model/internlm.py @@ -347,4 +347,4 @@ def get_model_tokenizer_xcomposer_ol(model_dir, *args, **kwargs): get_model_tokenizer_reward_model, requires=['transformers>=4.38'], architectures=['InternLM2ForRewardModel'], - tags=['reward_model'])) + task_type='seq_cls')) diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py index 46da0015d4..f4e94b0291 100644 --- a/swift/llm/model/register.py +++ b/swift/llm/model/register.py @@ -8,8 +8,8 @@ import torch from peft import PeftModel -from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, - PretrainedConfig, PreTrainedModel, PreTrainedTokenizerBase) +from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, + 
AutoTokenizer, GenerationConfig, PretrainedConfig, PreTrainedModel, PreTrainedTokenizerBase) from transformers.integrations import is_deepspeed_zero3_enabled from transformers.utils import is_torch_bf16_gpu_available, is_torch_cuda_available, is_torch_npu_available, strtobool from transformers.utils.versions import require_version @@ -62,6 +62,7 @@ class ModelMeta: # Additional files that need to be saved for full parameter training/merge-lora. additional_saved_files: List[str] = field(default_factory=list) torch_dtype: Optional[torch.dtype] = None + task_type: Literal['causal_lm', 'seq_cls', None] = None # File patterns to ignore when downloading the model. ignore_patterns: List[str] = field(default_factory=list) @@ -158,7 +159,9 @@ def get_model_tokenizer_from_local(model_dir: str, automodel_class=None, **kwargs): """Load the model and tokenizer from the local model_dir.""" - automodel_class = automodel_class or AutoModelForCausalLM + automodel_class_mapping = {'seq_cls': AutoModelForSequenceClassification, 'causal_lm': AutoModelForCausalLM} + if automodel_class is None: + automodel_class = automodel_class_mapping[model_info.task_type] if model_config is None: model_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) # fix prediction_step (internvl2, ovis, ...) @@ -363,6 +366,7 @@ def get_model_info_meta( # model kwargs model_type: Optional[str] = None, quantization_config=None, + task_type=None, **kwargs) -> Tuple[ModelInfo, ModelMeta]: model_meta = get_matched_model_meta(model_id_or_path) model_dir = safe_snapshot_download( @@ -389,6 +393,7 @@ def get_model_info_meta( logger.info(f'Setting torch_dtype: {torch_dtype}') _check_torch_dtype(torch_dtype) model_info.torch_dtype = torch_dtype + model_info.task_type = task_type or model_meta.task_type model_meta.check_requires(model_info) return model_info, model_meta @@ -411,6 +416,7 @@ def get_model_tokenizer( attn_impl: Literal['flash_attn', 'sdpa', 'eager', None] = None, rope_scaling: Optional[Dict[str, Any]] = None, automodel_class=None, + task_type: Literal['causal_lm', 'seq_cls'] = 'causal_lm', model_kwargs: Optional[Dict[str, Any]] = None, **kwargs) -> Tuple[Optional[PreTrainedModel], PreTrainedTokenizerBase]: """ @@ -439,7 +445,8 @@ def get_model_tokenizer( revision=revision, download_model=download_model, model_type=model_type, - quantization_config=quantization_config) + quantization_config=quantization_config, + task_type=task_type) if not use_torchacc() and device_map is None: device_map = get_default_device_map() diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py index 940f3df642..d8c13e6ccb 100644 --- a/swift/llm/template/base.py +++ b/swift/llm/template/base.py @@ -185,6 +185,7 @@ def _kto_encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: def _seq_cls_encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: encoded = self._encode(inputs) + encoded.pop('labels', None) if inputs.label is not None: encoded['labels'] = int(inputs.label) return encoded @@ -205,6 +206,9 @@ def encode(self, elif isinstance(inputs, StdTemplateInputs): inputs = deepcopy(inputs) + if not self.is_training: + InferRequest.remove_response(inputs.messages) + assert isinstance(inputs, StdTemplateInputs) self._preprocess_inputs(inputs) if self.mode in {'vllm', 'lmdeploy'}: @@ -677,7 +681,14 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: def debug_logger(self, inputs): if not strtobool(os.getenv('SWIFT_DEBUG', 'false')): return - self.print_inputs(inputs) + if 'input_ids' in inputs: + k = 
'input_ids' + val = inputs['input_ids'] + else: + k = 'generate_ids' + val = inputs['generate_ids'] + for v in val: + self.print_inputs({k: v.tolist()}) def get_generate_ids(self, generate_ids: Union[torch.Tensor, List[int]], num_prompt_tokens: int) -> Union[torch.Tensor, List[int]]: @@ -937,7 +948,9 @@ def _torchacc_xtuner_data_collator(self, res, padding_to, tokenizer, padding_sid def print_inputs(self, inputs: Dict[str, Any], tokenizer_kwargs: Optional[Dict[str, Any]] = None) -> None: if tokenizer_kwargs is None: tokenizer_kwargs = {} - for key in ['input', 'labels', 'generate_ids', 'chosen_input', 'chosen_labels', 'rejected_input', 'rejected_labels']: + for key in [ + 'input', 'labels', 'generate', 'chosen_input', 'chosen_labels', 'rejected_input', 'rejected_labels' + ]: val = inputs.get(key) # fix val is a tensor if val is None: val = inputs.get(f'{key}_ids') diff --git a/swift/llm/template/template/internlm.py b/swift/llm/template/template/internlm.py index 4d596d8a14..fb4e9682fa 100644 --- a/swift/llm/template/template/internlm.py +++ b/swift/llm/template/template/internlm.py @@ -34,8 +34,7 @@ register_template(ChatmlTemplateMeta(LLMTemplateType.internlm2, default_system=INTERNLM_SYSTEM)) -register_template( - ChatmlTemplateMeta(RMTemplateType.internlm2_reward, suffix=['<|im_end|>\n<|reward|>'])) +register_template(ChatmlTemplateMeta(RMTemplateType.internlm2_reward, suffix=['<|im_end|>\n<|reward|>'])) class InternLMXComposer2Template(Template): diff --git a/swift/llm/template/template_inputs.py b/swift/llm/template/template_inputs.py index 1f6734dde4..81aba5716b 100644 --- a/swift/llm/template/template_inputs.py +++ b/swift/llm/template/template_inputs.py @@ -46,7 +46,6 @@ def __post_init__(self): if isinstance(val, str): setattr(self, key, [val]) assert isinstance(self.messages, list), f'messages: {self.messages}' - self.remove_response(self.messages) @staticmethod def remove_response(messages) -> Optional[str]: diff --git a/tests/test_align/test_cls.py b/tests/test_align/test_cls.py new file mode 100644 index 0000000000..6ed752d68a --- /dev/null +++ b/tests/test_align/test_cls.py @@ -0,0 +1,60 @@ +import os +from pprint import pprint + +import torch + +os.environ['CUDA_VISIBLE_DEVICES'] = '1' +kwargs = { + 'per_device_train_batch_size': 4, + 'per_device_eval_batch_size': 4, + 'gradient_accumulation_steps': 4, + 'num_train_epochs': 1, + 'save_steps': 100, + 'max_length': 512, + 'task_type': 'seq_cls', + 'num_labels': 2, +} + + +def calc_acc(infer_result): + n_correct = 0 + for res in infer_result: + if res['response'] == res['labels']: + n_correct += 1 + return f'acc: {n_correct/len(infer_result)}, n_correct: {n_correct}, len(res): {len(infer_result)}' + + +def test_llm(): + from swift.llm import sft_main, TrainArguments, infer_main, InferArguments, Template + res = [] + for model in ['Qwen/Qwen2.5-0.5B-Instruct', 'Qwen/Qwen2.5-0.5B', 'AI-ModelScope/bert-base-chinese']: + dataset = ['DAMO_NLP/jd:cls#2000'] + result = sft_main(TrainArguments(model=model, dataset=dataset, split_dataset_ratio=0.1, **kwargs)) + last_model_checkpoint = result['last_model_checkpoint'] + infer_result = infer_main( + InferArguments(ckpt_dir=last_model_checkpoint, load_data_args=True, truncation_strategy='right')) + res.append(calc_acc(infer_result)) + infer_result2 = infer_main( + InferArguments( + ckpt_dir=last_model_checkpoint, load_data_args=True, max_batch_size=16, truncation_strategy='right')) + res.append(calc_acc(infer_result2)) + + model = 'Qwen/Qwen2.5-0.5B-Instruct' + dataset = 
['DAMO_NLP/jd#2000'] + train_kwargs = kwargs.copy() + train_kwargs.pop('task_type') + train_kwargs.pop('num_labels') + result = sft_main(TrainArguments(model=model, dataset=dataset, split_dataset_ratio=0.1, **train_kwargs)) + last_model_checkpoint = result['last_model_checkpoint'] + infer_result = infer_main( + InferArguments(ckpt_dir=last_model_checkpoint, load_data_args=True, truncation_strategy='right')) + res.append(calc_acc(infer_result)) + infer_result2 = infer_main( + InferArguments( + ckpt_dir=last_model_checkpoint, load_data_args=True, max_batch_size=16, truncation_strategy='right')) + res.append(calc_acc(infer_result2)) + pprint(res) + + +if __name__ == '__main__': + test_llm() diff --git a/tests/test_align/test_template/test_llm.py b/tests/test_align/test_template/test_llm.py index af6ba95ad2..e8e23fa353 100644 --- a/tests/test_align/test_template/test_llm.py +++ b/tests/test_align/test_template/test_llm.py @@ -177,7 +177,6 @@ def test_internlm2_reward(): pt_engine.default_template.template_backend = 'jinja' res2 = _infer_model(pt_engine, messages=messages) assert res == res2 == '0.48681640625' - print() if __name__ == '__main__': From 16c8c00dedd43f2e3c802f7911d77cf7cb964623 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 2 Jan 2025 21:41:26 +0800 Subject: [PATCH 23/47] update --- examples/deploy/lora/client.py | 10 ++-- examples/deploy/lora/server.sh | 2 +- examples/export/push_to_hub.sh | 3 +- examples/infer/demo_bert.py | 49 +++++++++++++++++++ .../infer/{demo_lora.py => demo_multilora.py} | 8 +-- .../{demo_rm.py => demo_reward_model.py} | 0 examples/infer/pt/bert.sh | 6 +++ examples/infer/pt/reward_model.sh | 3 ++ examples/train/seq_cls/bert/deploy.sh | 2 +- examples/train/seq_cls/bert/infer.sh | 2 +- examples/train/seq_cls/bert/sft.sh | 6 ++- examples/train/seq_cls/qwen2_5/deploy.sh | 2 +- swift/hub/hub.py | 2 +- swift/llm/model/register.py | 2 +- 14 files changed, 81 insertions(+), 16 deletions(-) create mode 100644 examples/infer/demo_bert.py rename examples/infer/{demo_lora.py => demo_multilora.py} (90%) rename examples/infer/{demo_rm.py => demo_reward_model.py} (100%) create mode 100644 examples/infer/pt/bert.sh create mode 100644 examples/infer/pt/reward_model.sh diff --git a/examples/deploy/lora/client.py b/examples/deploy/lora/client.py index 37ce773cf1..e61caad8ae 100644 --- a/examples/deploy/lora/client.py +++ b/examples/deploy/lora/client.py @@ -7,18 +7,18 @@ def infer_multilora(engine: InferClient, infer_request: InferRequest): print(f'models: {models}') request_config = RequestConfig(max_tokens=512, temperature=0) - # use lora + # use lora1 resp_list = engine.infer([infer_request], request_config, model=models[1]) response = resp_list[0].choices[0].message.content - print(f'lora-response: {response}') + print(f'lora1-response: {response}') # origin model resp_list = engine.infer([infer_request], request_config, model=models[0]) response = resp_list[0].choices[0].message.content print(f'response: {response}') - # use lora - resp_list = engine.infer([infer_request], request_config, model=models[1]) + # use lora2 + resp_list = engine.infer([infer_request], request_config, model=models[2]) response = resp_list[0].choices[0].message.content - print(f'lora-response: {response}') + print(f'lora2-response: {response}') if __name__ == '__main__': diff --git a/examples/deploy/lora/server.sh b/examples/deploy/lora/server.sh index ff3605aead..e3e2d925b2 100644 --- a/examples/deploy/lora/server.sh +++ b/examples/deploy/lora/server.sh @@ -3,5 +3,5 @@ 
CUDA_VISIBLE_DEVICES=0 swift deploy \ --host 0.0.0.0 \ --port 8000 \ - --adapters swift-lora=swift/test_lora \ + --adapters lora1=swift/test_lora lora2=swift/test_lora2 \ --infer_backend vllm diff --git a/examples/export/push_to_hub.sh b/examples/export/push_to_hub.sh index c4bcef7421..9771dfe2fa 100644 --- a/examples/export/push_to_hub.sh +++ b/examples/export/push_to_hub.sh @@ -2,4 +2,5 @@ CUDA_VISIBLE_DEVICES=0 swift export \ --adapters output/vx-xxx/checkpoint-xxx \ --push_to_hub true \ --hub_model_id '' \ - --hub_token '' + --hub_token '' \ + --use_hf false diff --git a/examples/infer/demo_bert.py b/examples/infer/demo_bert.py new file mode 100644 index 0000000000..2f5881b0d2 --- /dev/null +++ b/examples/infer/demo_bert.py @@ -0,0 +1,49 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from typing import List + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + + +def infer_batch(engine: 'InferEngine', infer_requests: List['InferRequest']): + request_config = RequestConfig(max_tokens=512, temperature=0) + resp_list = engine.infer(infer_requests, request_config) + query0 = infer_requests[0].messages[0]['content'] + print(f'query0: {query0}') + print(f'response0: {resp_list[0].choices[0].message.content}') + print(f'query1: {query1}') + print(f'response1: {resp_list[1].choices[0].message.content}') + + +if __name__ == '__main__': + # This is an example of BERT with LoRA. + from swift.llm import (InferEngine, InferRequest, PtEngine, RequestConfig, load_dataset, safe_snapshot_download, + BaseArguments) + from swift.tuners import Swift + adapter_path = safe_snapshot_download('swift/test_bert') + args = BaseArguments.from_pretrained(adapter_path) + # method1 + model, processor = args.get_model_processor() + model = Swift.from_pretrained(model, adapter_path) + template = args.get_template(engine.processor) + engine = PtEngine.from_model_template(model, template, max_batch_size=64) + + # method2 + # engine = PtEngine(args.model, adapters=[adapter_path], max_batch_size=64, task_type=args.task_type) + # template = args.get_template(engine.processor) + # engine.default_template = template + + # Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset. 
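    # (Note: `load_dataset` here returns a (train_dataset, val_dataset) pair, so the `[0]` below keeps
    #  only the train split. As the comment above says, no dataset is actually required -- hand-built
    #  requests such as InferRequest(messages=[{'role': 'user', 'content': '今天天气真好呀'}]), used at
    #  the bottom of this file, work just as well.)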
+ dataset = load_dataset(['DAMO_NLP/jd:cls#1000'], seed=42)[0] + print(f'dataset: {dataset}') + infer_requests = [InferRequest(messages=data['messages']) for data in dataset] + infer_batch(engine, infer_requests) + + infer_batch(engine, + [InferRequest(messages=[{ + 'role': 'user', + 'content': '今天天气真好呀' + }, { + 'role': 'user', + 'content': '真倒霉' + }])]) diff --git a/examples/infer/demo_lora.py b/examples/infer/demo_multilora.py similarity index 90% rename from examples/infer/demo_lora.py rename to examples/infer/demo_multilora.py index 0004c8577c..f75ef3c0fb 100644 --- a/examples/infer/demo_lora.py +++ b/examples/infer/demo_multilora.py @@ -7,6 +7,7 @@ def infer_multilora(infer_request: 'InferRequest', infer_backend: Literal['vllm', 'pt']): # Dynamic LoRA adapter_path = safe_snapshot_download('swift/test_lora') + adapter_path2 = safe_snapshot_download('swift/test_lora2') args = BaseArguments.from_pretrained(adapter_path) if infer_backend == 'pt': engine = PtEngine(args.model) @@ -16,19 +17,20 @@ def infer_multilora(infer_request: 'InferRequest', infer_backend: Literal['vllm' template = get_template(args.template, engine.processor, args.system) request_config = RequestConfig(max_tokens=512, temperature=0) adapter_request = AdapterRequest('lora1', adapter_path) + adapter_request2 = AdapterRequest('lora2', adapter_path2) # use lora resp_list = engine.infer([infer_request], request_config, template=template, adapter_request=adapter_request) response = resp_list[0].choices[0].message.content - print(f'lora-response: {response}') + print(f'lora1-response: {response}') # origin model resp_list = engine.infer([infer_request], request_config) response = resp_list[0].choices[0].message.content print(f'response: {response}') # use lora - resp_list = engine.infer([infer_request], request_config, template=template, adapter_request=adapter_request) + resp_list = engine.infer([infer_request], request_config, template=template, adapter_request=adapter_request2) response = resp_list[0].choices[0].message.content - print(f'lora-response: {response}') + print(f'lora2-response: {response}') def infer_pt(infer_request: 'InferRequest'): diff --git a/examples/infer/demo_rm.py b/examples/infer/demo_reward_model.py similarity index 100% rename from examples/infer/demo_rm.py rename to examples/infer/demo_reward_model.py diff --git a/examples/infer/pt/bert.sh b/examples/infer/pt/bert.sh new file mode 100644 index 0000000000..914679246d --- /dev/null +++ b/examples/infer/pt/bert.sh @@ -0,0 +1,6 @@ +# Since `swift/test_lora` is trained by swift and contains an `args.json` file, +# there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read. +# To disable this behavior, please set `--load_args false`. 
+CUDA_VISIBLE_DEVICES=0 \ +swift infer \ + --adapters swift/test_bert diff --git a/examples/infer/pt/reward_model.sh b/examples/infer/pt/reward_model.sh new file mode 100644 index 0000000000..3ad2b3f56f --- /dev/null +++ b/examples/infer/pt/reward_model.sh @@ -0,0 +1,3 @@ +CUDA_VISIBLE_DEVICES=0 \ +swift infer \ + --model Shanghai_AI_Laboratory/internlm2-1_8b-reward diff --git a/examples/train/seq_cls/bert/deploy.sh b/examples/train/seq_cls/bert/deploy.sh index c2102b6932..68021a695d 100644 --- a/examples/train/seq_cls/bert/deploy.sh +++ b/examples/train/seq_cls/bert/deploy.sh @@ -1,6 +1,6 @@ CUDA_VISIBLE_DEVICES=0 \ swift deploy \ - --model output/vx-xxx/checkpoint-xxx \ + --adapters output/vx-xxx/checkpoint-xxx \ --served_model_name bert-base-chinese \ --truncation_strategy right diff --git a/examples/train/seq_cls/bert/infer.sh b/examples/train/seq_cls/bert/infer.sh index 13bc1a1fd3..e38f2955ef 100644 --- a/examples/train/seq_cls/bert/infer.sh +++ b/examples/train/seq_cls/bert/infer.sh @@ -1,6 +1,6 @@ CUDA_VISIBLE_DEVICES=0 \ swift infer \ - --model output/vx-xxx/checkpoint-xxx \ + --adapters output/vx-xxx/checkpoint-xxx \ --load_data_args true \ --max_batch_size 16 \ --truncation_strategy right diff --git a/examples/train/seq_cls/bert/sft.sh b/examples/train/seq_cls/bert/sft.sh index 35081e0afc..538e74337b 100644 --- a/examples/train/seq_cls/bert/sft.sh +++ b/examples/train/seq_cls/bert/sft.sh @@ -4,19 +4,23 @@ CUDA_VISIBLE_DEVICES=0 \ swift sft \ --model AI-ModelScope/bert-base-chinese \ - --train_type full \ + --train_type lora \ --dataset 'DAMO_NLP/jd:cls#2000' \ --torch_dtype bfloat16 \ --num_train_epochs 1 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ --learning_rate 1e-4 \ + --lora_rank 8 \ + --lora_alpha 32 \ + --target_modules all-linear \ --gradient_accumulation_steps 16 \ --eval_steps 50 \ --save_steps 50 \ --save_total_limit 2 \ --logging_steps 5 \ --max_length 512 \ + --truncation_strategy right \ --output_dir output \ --warmup_ratio 0.05 \ --dataloader_num_workers 4 \ diff --git a/examples/train/seq_cls/qwen2_5/deploy.sh b/examples/train/seq_cls/qwen2_5/deploy.sh index 3bc08c297c..71627c0ff8 100644 --- a/examples/train/seq_cls/qwen2_5/deploy.sh +++ b/examples/train/seq_cls/qwen2_5/deploy.sh @@ -3,6 +3,6 @@ swift deploy \ --adapters output/vx-xxx/checkpoint-xxx # curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ -# "model": "Qwen2.5-7B", +# "model": "Qwen2.5-0.5B", # "messages": [{"role": "user", "content": "包装差,容易被调包。"}] # }' diff --git a/swift/hub/hub.py b/swift/hub/hub.py index 704cfa39ed..0b2297d68c 100644 --- a/swift/hub/hub.py +++ b/swift/hub/hub.py @@ -273,7 +273,7 @@ def push_to_hub(cls, token or cls.ms_token, private, commit_message=commit_message, - ignore_patterns=ignore_patterns, + ignore_file_pattern=ignore_patterns, revision=revision, tag=path_in_repo) diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py index f4e94b0291..b48203329b 100644 --- a/swift/llm/model/register.py +++ b/swift/llm/model/register.py @@ -393,7 +393,7 @@ def get_model_info_meta( logger.info(f'Setting torch_dtype: {torch_dtype}') _check_torch_dtype(torch_dtype) model_info.torch_dtype = torch_dtype - model_info.task_type = task_type or model_meta.task_type + model_info.task_type = model_meta.task_type or task_type model_meta.check_requires(model_info) return model_info, model_meta From 368b2ef5e8b7cc7bade20c58205e372e93d66569 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 3 Jan 2025 11:37:00 +0800 
Subject: [PATCH 24/47] update --- examples/deploy/bert/client.py | 29 +++++++++++++++++++ examples/deploy/bert/server.sh | 11 +++++++ examples/deploy/reward_model/client.py | 18 ++++++++++++ examples/deploy/reward_model/server.sh | 5 ++++ examples/infer/demo_bert.py | 26 ++++++++++------- examples/infer/demo_reward_model.py | 6 ++-- examples/infer/pt/bert.sh | 4 ++- examples/infer/pt/reward_model.sh | 4 ++- examples/train/seq_cls/bert/deploy.sh | 3 +- examples/train/seq_cls/bert/infer.sh | 3 +- swift/llm/argument/base_args/model_args.py | 3 +- swift/llm/argument/base_args/template_args.py | 3 -- swift/llm/infer/infer.py | 16 +++++++--- swift/llm/model/utils.py | 2 +- swift/llm/template/base.py | 2 ++ 15 files changed, 107 insertions(+), 28 deletions(-) create mode 100644 examples/deploy/bert/client.py create mode 100644 examples/deploy/bert/server.sh create mode 100644 examples/deploy/reward_model/client.py create mode 100644 examples/deploy/reward_model/server.sh diff --git a/examples/deploy/bert/client.py b/examples/deploy/bert/client.py new file mode 100644 index 0000000000..0387b10233 --- /dev/null +++ b/examples/deploy/bert/client.py @@ -0,0 +1,29 @@ +from typing import List + +from swift.llm import InferClient, InferRequest, RequestConfig + + +def infer_batch(engine: 'InferEngine', infer_requests: List[InferRequest]): + resp_list = engine.infer(infer_requests) + query0 = infer_requests[0].messages[0]['content'] + query1 = infer_requests[1].messages[0]['content'] + print(f'query0: {query0}') + print(f'response0: {resp_list[0].choices[0].message.content}') + print(f'query1: {query1}') + print(f'response1: {resp_list[1].choices[0].message.content}') + + +if __name__ == '__main__': + engine = InferClient(host='127.0.0.1', port=8000) + models = engine.models + print(f'models: {models}') + infer_batch(engine, [ + InferRequest(messages=[{ + 'role': 'user', + 'content': '今天天气真好呀' + }]), + InferRequest(messages=[{ + 'role': 'user', + 'content': '真倒霉' + }]) + ]) diff --git a/examples/deploy/bert/server.sh b/examples/deploy/bert/server.sh new file mode 100644 index 0000000000..07208b50af --- /dev/null +++ b/examples/deploy/bert/server.sh @@ -0,0 +1,11 @@ +# Since `swift/test_lora` is trained by swift and contains an `args.json` file, +# there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read. +CUDA_VISIBLE_DEVICES=0 swift deploy \ + --host 0.0.0.0 \ + --port 8000 \ + --adapters swift/test_bert \ + --infer_backend pt \ + --truncation_strategy right \ + --max_length 512 \ + --served_model_name bert-base-chinese + diff --git a/examples/deploy/reward_model/client.py b/examples/deploy/reward_model/client.py new file mode 100644 index 0000000000..07f74b11b9 --- /dev/null +++ b/examples/deploy/reward_model/client.py @@ -0,0 +1,18 @@ +from typing import List + +from swift.llm import InferClient, InferRequest, RequestConfig + +if __name__ == '__main__': + engine = InferClient(host='127.0.0.1', port=8000) + models = engine.models + print(f'models: {models}') + messages = [{ + 'role': 'user', + 'content': "Hello! What's your name?" + }, { + 'role': 'assistant', + 'content': 'My name is InternLM2! A helpful AI assistant. What can I do for you?' 
+ }] + resp_list = engine.infer([InferRequest(messages=messages)]) + print(f'messages: {messages}') + print(f'response: {resp_list[0].choices[0].message.content}') diff --git a/examples/deploy/reward_model/server.sh b/examples/deploy/reward_model/server.sh new file mode 100644 index 0000000000..53f70e3b79 --- /dev/null +++ b/examples/deploy/reward_model/server.sh @@ -0,0 +1,5 @@ +CUDA_VISIBLE_DEVICES=0 swift deploy \ + --host 0.0.0.0 \ + --port 8000 \ + --model Shanghai_AI_Laboratory/internlm2-1_8b-reward \ + --infer_backend pt diff --git a/examples/infer/demo_bert.py b/examples/infer/demo_bert.py index 2f5881b0d2..3d28d0aa76 100644 --- a/examples/infer/demo_bert.py +++ b/examples/infer/demo_bert.py @@ -6,9 +6,9 @@ def infer_batch(engine: 'InferEngine', infer_requests: List['InferRequest']): - request_config = RequestConfig(max_tokens=512, temperature=0) - resp_list = engine.infer(infer_requests, request_config) + resp_list = engine.infer(infer_requests) query0 = infer_requests[0].messages[0]['content'] + query1 = infer_requests[1].messages[0]['content'] print(f'query0: {query0}') print(f'response0: {resp_list[0].choices[0].message.content}') print(f'query1: {query1}') @@ -22,10 +22,12 @@ def infer_batch(engine: 'InferEngine', infer_requests: List['InferRequest']): from swift.tuners import Swift adapter_path = safe_snapshot_download('swift/test_bert') args = BaseArguments.from_pretrained(adapter_path) + args.max_length = 512 + args.truncation_strategy = 'right' # method1 model, processor = args.get_model_processor() model = Swift.from_pretrained(model, adapter_path) - template = args.get_template(engine.processor) + template = args.get_template(processor) engine = PtEngine.from_model_template(model, template, max_batch_size=64) # method2 @@ -39,11 +41,13 @@ def infer_batch(engine: 'InferEngine', infer_requests: List['InferRequest']): infer_requests = [InferRequest(messages=data['messages']) for data in dataset] infer_batch(engine, infer_requests) - infer_batch(engine, - [InferRequest(messages=[{ - 'role': 'user', - 'content': '今天天气真好呀' - }, { - 'role': 'user', - 'content': '真倒霉' - }])]) + infer_batch(engine, [ + InferRequest(messages=[{ + 'role': 'user', + 'content': '今天天气真好呀' + }]), + InferRequest(messages=[{ + 'role': 'user', + 'content': '真倒霉' + }]) + ]) diff --git a/examples/infer/demo_reward_model.py b/examples/infer/demo_reward_model.py index 7e32932a7c..91e66bb8b5 100644 --- a/examples/infer/demo_reward_model.py +++ b/examples/infer/demo_reward_model.py @@ -6,10 +6,8 @@ def infer_batch(engine: 'InferEngine', infer_requests: List['InferRequest']): - request_config = RequestConfig(max_tokens=512, temperature=0) - resp_list = engine.infer(infer_requests, request_config) - query0 = infer_requests[0].messages[0]['content'] - print(f'query0: {query0}') + resp_list = engine.infer(infer_requests) + print(f'messages0: {infer_requests[0].messages}') print(f'response0: {resp_list[0].choices[0].message.content}') diff --git a/examples/infer/pt/bert.sh b/examples/infer/pt/bert.sh index 914679246d..28fbc566ae 100644 --- a/examples/infer/pt/bert.sh +++ b/examples/infer/pt/bert.sh @@ -3,4 +3,6 @@ # To disable this behavior, please set `--load_args false`. 
CUDA_VISIBLE_DEVICES=0 \ swift infer \ - --adapters swift/test_bert + --adapters swift/test_bert \ + --truncation_strategy right \ + --max_length 512 diff --git a/examples/infer/pt/reward_model.sh b/examples/infer/pt/reward_model.sh index 3ad2b3f56f..2d0b63d140 100644 --- a/examples/infer/pt/reward_model.sh +++ b/examples/infer/pt/reward_model.sh @@ -1,3 +1,5 @@ CUDA_VISIBLE_DEVICES=0 \ swift infer \ - --model Shanghai_AI_Laboratory/internlm2-1_8b-reward + --model Shanghai_AI_Laboratory/internlm2-1_8b-reward \ + --val_dataset AI-ModelScope/alpaca-gpt4-data-zh#1000 \ + --max_batch_size 64 diff --git a/examples/train/seq_cls/bert/deploy.sh b/examples/train/seq_cls/bert/deploy.sh index 68021a695d..58be2ee93e 100644 --- a/examples/train/seq_cls/bert/deploy.sh +++ b/examples/train/seq_cls/bert/deploy.sh @@ -2,7 +2,8 @@ CUDA_VISIBLE_DEVICES=0 \ swift deploy \ --adapters output/vx-xxx/checkpoint-xxx \ --served_model_name bert-base-chinese \ - --truncation_strategy right + --truncation_strategy right \ + --max_length 512 # curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ # "model": "bert-base-chinese", diff --git a/examples/train/seq_cls/bert/infer.sh b/examples/train/seq_cls/bert/infer.sh index e38f2955ef..c51124f1b0 100644 --- a/examples/train/seq_cls/bert/infer.sh +++ b/examples/train/seq_cls/bert/infer.sh @@ -3,4 +3,5 @@ swift infer \ --adapters output/vx-xxx/checkpoint-xxx \ --load_data_args true \ --max_batch_size 16 \ - --truncation_strategy right + --truncation_strategy right \ + --max_length 512 diff --git a/swift/llm/argument/base_args/model_args.py b/swift/llm/argument/base_args/model_args.py index 5cc0bbbbf3..924f32adcc 100644 --- a/swift/llm/argument/base_args/model_args.py +++ b/swift/llm/argument/base_args/model_args.py @@ -116,6 +116,7 @@ def _init_rope_scaling(self): def _init_model_info(self) -> torch.dtype: self.model_info, self.model_meta = get_model_info_meta(**self.get_model_kwargs()) + self.task_type = self.model_info.task_type self.model_dir = self.model_info.model_dir self.model_type = self.model_info.model_type if isinstance(self.rope_scaling, str): @@ -152,8 +153,8 @@ def get_model_kwargs(self): 'quantization_config': self.get_quantization_config(), 'attn_impl': self.attn_impl, 'rope_scaling': self.rope_scaling, + 'task_type': self.task_type, } if self.task_type == 'seq_cls': - kwargs['task_type'] = self.task_type kwargs['model_kwargs'] = {'num_labels': self.num_labels} return kwargs diff --git a/swift/llm/argument/base_args/template_args.py b/swift/llm/argument/base_args/template_args.py index 64ed6c1cd8..98c4d80a72 100644 --- a/swift/llm/argument/base_args/template_args.py +++ b/swift/llm/argument/base_args/template_args.py @@ -47,9 +47,6 @@ def __post_init__(self): if self.template is None and hasattr(self, 'model_meta'): self.template = self.model_meta.template - if self.max_length is None and hasattr(self, 'model_info'): - self.max_length = self.model_info.max_model_len - def get_template_kwargs(self): truncation_strategy = self.truncation_strategy if truncation_strategy == 'delete': diff --git a/swift/llm/infer/infer.py b/swift/llm/infer/infer.py index f4b2c035fc..271f025b17 100644 --- a/swift/llm/infer/infer.py +++ b/swift/llm/infer/infer.py @@ -138,10 +138,18 @@ def infer_cli(self) -> List[Dict[str, Any]]: infer_state.add_query(query) if args.model_meta.is_multimodal: infer_state.input_mm_data() - data = infer_state.to_dict() - response = self.infer_single(data, request_config) - infer_state.add_response(response) - data = 
{'response': response, **data} + if args.task_type == 'seq_cls' and args.num_labels in {None, 1}: + # reward model + response = infer_state.input_text() + infer_state.add_response(response) + data = infer_state.to_dict() + response = self.infer_single(data, request_config) + data = {'response': response, **data} + else: + data = infer_state.to_dict() + response = self.infer_single(data, request_config) + infer_state.add_response(response) + data = {'response': response, **data} result_list.append(data) if self.jsonl_writer: self.jsonl_writer.append(data) diff --git a/swift/llm/model/utils.py b/swift/llm/model/utils.py index efacbc91fb..21bf592c00 100644 --- a/swift/llm/model/utils.py +++ b/swift/llm/model/utils.py @@ -59,7 +59,7 @@ class ModelInfo: # extra config: Optional[PretrainedConfig] = None - task_type: Optional[str] = None + task_type: Literal['causal_lm', 'seq_cls', None] = None def __post_init__(self): from .register import get_model_name diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py index d8c13e6ccb..3648de64c4 100644 --- a/swift/llm/template/base.py +++ b/swift/llm/template/base.py @@ -79,6 +79,8 @@ def __init__( self.model_info = processor.model_info self.config = self.model_info.config self.model_meta = processor.model_meta + if max_length is None: + max_length = self.model_info.max_model_len tokenizer = self.tokenizer if not use_chat_template: From 9a783ddeab4ae293b1fd8946c86cddf9a7e404f5 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 3 Jan 2025 11:42:34 +0800 Subject: [PATCH 25/47] update --- tests/train/test_rm.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/train/test_rm.py b/tests/train/test_rm.py index 2d05c7e035..91a8fc6872 100644 --- a/tests/train/test_rm.py +++ b/tests/train/test_rm.py @@ -10,25 +10,26 @@ 'num_train_epochs': 1, } + def test_infer(): from swift.llm import infer_main, InferArguments - infer_main(InferArguments(model='Shanghai_AI_Laboratory/internlm2-1_8b-reward', - val_dataset='AI-ModelScope/alpaca-gpt4-data-zh#500')) + infer_main( + InferArguments( + model='Shanghai_AI_Laboratory/internlm2-1_8b-reward', val_dataset='AI-ModelScope/alpaca-gpt4-data-zh#500')) def test_llm(): from swift.llm import TrainArguments, sft_main, infer_main, InferArguments result = sft_main( TrainArguments( - model='Qwen/Qwen2.5-1.5B-Instruct', + model='Shanghai_AI_Laboratory/internlm2-1_8b-reward', train_type='lora', - num_labels=2, - dataset=['DAMO_NLP/jd:cls#2000'], + dataset=['hjh0119/shareAI-Llama3-DPO-zh-en-emoji:zh#100'], **kwargs)) last_model_checkpoint = result['last_model_checkpoint'] infer_main(InferArguments(adapters=last_model_checkpoint, load_data_args=True)) if __name__ == '__main__': - test_infer() - # test_llm() + # test_infer() + test_llm() From 121164026b4d612461743767a38c8d2584d41d48 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 3 Jan 2025 15:06:09 +0800 Subject: [PATCH 26/47] revert --- tests/test_align/test_template/test_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_align/test_template/test_llm.py b/tests/test_align/test_template/test_llm.py index e8e23fa353..46c1b4c90f 100644 --- a/tests/test_align/test_template/test_llm.py +++ b/tests/test_align/test_template/test_llm.py @@ -2,7 +2,7 @@ import torch -os.environ['CUDA_VISIBLE_DEVICES'] = '0' +os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3' os.environ['SWIFT_DEBUG'] = '1' From e7e7fd63eee27776eff4010baded1e5da6d4e162 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 5 Jan 2025 
22:21:35 +0800 Subject: [PATCH 27/47] update --- swift/llm/model/model/reward_model.py | 14 ----------- tests/train/test_rm.py | 35 --------------------------- 2 files changed, 49 deletions(-) delete mode 100644 swift/llm/model/model/reward_model.py delete mode 100644 tests/train/test_rm.py diff --git a/swift/llm/model/model/reward_model.py b/swift/llm/model/model/reward_model.py deleted file mode 100644 index 30a9d5660d..0000000000 --- a/swift/llm/model/model/reward_model.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from transformers import AutoConfig, AutoModel - -from swift.llm import TemplateType -from swift.utils import get_logger -from ..constant import RMModelType -from ..register import Model, ModelGroup, ModelMeta, get_model_tokenizer_from_local, register_model - -logger = get_logger() - -# ModelGroup([ -# Model('Qwen/Qwen2.5-Math-RM-72B', 'Qwen/Qwen2.5-Math-RM-72B'), -# Model('Qwen/Qwen2-Math-RM-72B', 'Qwen/Qwen2-Math-RM-72B'), -# ]), diff --git a/tests/train/test_rm.py b/tests/train/test_rm.py deleted file mode 100644 index 91a8fc6872..0000000000 --- a/tests/train/test_rm.py +++ /dev/null @@ -1,35 +0,0 @@ -import os - -os.environ['CUDA_VISIBLE_DEVICES'] = '0' - -kwargs = { - 'per_device_train_batch_size': 2, - 'per_device_eval_batch_size': 2, - 'save_steps': 50, - 'gradient_accumulation_steps': 4, - 'num_train_epochs': 1, -} - - -def test_infer(): - from swift.llm import infer_main, InferArguments - infer_main( - InferArguments( - model='Shanghai_AI_Laboratory/internlm2-1_8b-reward', val_dataset='AI-ModelScope/alpaca-gpt4-data-zh#500')) - - -def test_llm(): - from swift.llm import TrainArguments, sft_main, infer_main, InferArguments - result = sft_main( - TrainArguments( - model='Shanghai_AI_Laboratory/internlm2-1_8b-reward', - train_type='lora', - dataset=['hjh0119/shareAI-Llama3-DPO-zh-en-emoji:zh#100'], - **kwargs)) - last_model_checkpoint = result['last_model_checkpoint'] - infer_main(InferArguments(adapters=last_model_checkpoint, load_data_args=True)) - - -if __name__ == '__main__': - # test_infer() - test_llm() From 2b47806c7b4221b39fa6e7fa1f8016589e176e77 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 6 Jan 2025 00:06:19 +0800 Subject: [PATCH 28/47] update --- swift/llm/argument/rlhf_args.py | 11 +++--- swift/llm/infer/utils.py | 6 ++-- swift/llm/train/rlhf.py | 41 +++++++++++++++++----- swift/llm/train/tuner.py | 16 +++------ swift/trainers/rlhf_trainer/ppo_trainer.py | 26 +++++++------- 5 files changed, 61 insertions(+), 39 deletions(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 914aab2769..e9d3ec72d4 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from dataclasses import dataclass, field -from typing import Literal, Optional +from typing import List, Literal, Optional from swift.llm import MODEL_MAPPING from .train_args import TrainArguments @@ -44,6 +44,7 @@ class RLHFArguments(TrainArguments): undesirable_weight: float = 1.0 # PPO reward_model: Optional[str] = None + reward_adapters: List[str] = field(default_factory=list) reward_model_type: Optional[str] = field( default=None, metadata={'help': f'model_type choices: {list(MODEL_MAPPING.keys())}'}) reward_model_revision: Optional[str] = None @@ -71,9 +72,11 @@ def __post_init__(self): raise ValueError('CPO/ORPO or LoRA training does not require a ref_model to be passed in.') def _init_ppo(self): - self.response_length = self.max_new_tokens - self.num_ppo_epochs = self.num_train_epochs - # TODO: streaming, MLLM + if self.rlhf_type == 'ppo': + self.response_length = self.max_new_tokens + self.num_ppo_epochs = self.num_train_epochs + self.padding_side = 'left' + # TODO: streaming, MLLM def _init_simpo(self): if self.rlhf_type != 'simpo': diff --git a/swift/llm/infer/utils.py b/swift/llm/infer/utils.py index 67f1b02f01..6f004857dc 100644 --- a/swift/llm/infer/utils.py +++ b/swift/llm/infer/utils.py @@ -118,7 +118,7 @@ def check_query(self, query: str) -> Optional[str]: return query -def _prepare_adapter(args, model): +def prepare_adapter(args, model, adapters=None): if args.tuner_backend == 'unsloth': if args.model_meta.is_multimodal: from unsloth import FastVisionModel as UnslothModel @@ -131,7 +131,7 @@ def _prepare_adapter(args, model): else: tuner = Swift # compat deploy - for adapter in args.adapters: + for adapter in adapters: model = tuner.from_pretrained(model, adapter) if args.train_type == 'bone': # Bone has a problem of float32 matmul with bloat16 in `peft==0.14.0` @@ -141,6 +141,6 @@ def _prepare_adapter(args, model): def prepare_model_template(args, **kwargs): model, processor = args.get_model_processor(**kwargs) - model = _prepare_adapter(args, model) + model = prepare_adapter(args, model, args.adapters) template = args.get_template(processor) return model, template diff --git a/swift/llm/train/rlhf.py b/swift/llm/train/rlhf.py index 2e5bff8910..1f0bbea640 100644 --- a/swift/llm/train/rlhf.py +++ b/swift/llm/train/rlhf.py @@ -12,21 +12,41 @@ class SwiftRLHF(SwiftSft): args: args_class def _prepare_model_tokenizer(self): + from swift.llm.infer.utils import prepare_adapter args = self.args - self.ref_model = None - if args.ref_model: + for key in ['ref', 'reward', 'value']: + origin_key = key + setattr(self, f'{key}_model', None) + if key == 'value': + if args.rlhf_type == 'ppo': + key = 'reward' + else: + continue + model_id_or_path = getattr(args, f'{key}_model') + if model_id_or_path is None: + continue + model_type = getattr(args, f'{key}_model_type') + model_revision = getattr(args, f'{key}_model_revision') + adapters = args.adapters if key == 'ref' else args.reward_adapters + # Be aware of the unexpected behavior caused by double monkey patching. 
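        # (Summary of this loop: the 'ref' model keeps the policy's task_type and is frozen; the
        #  'reward' model is likewise loaded frozen; 'value' is only resolved for PPO, where it starts
        #  from the reward model weights and is wrapped by `prepare_model(..., task_type='seq_cls')`
        #  so that, unlike the other two, it remains trainable.)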
- self.ref_model, _ = args.get_model_processor( - model=args.ref_model, model_type=args.ref_model_type, model_revision=args.ref_model_revision) - self.ref_model.requires_grad_(False).eval() + model = args.get_model_processor( + model=model_id_or_path, model_type=model_type, model_revision=model_revision)[0] + + model = prepare_adapter(args, model, adapters) + if origin_key in {'ref', 'reward'}: + model.requires_grad_(False).eval() + else: + model = self.prepare_model(args, model, task_type='seq_cls') + setattr(self, f'{origin_key}_model', model) super()._prepare_model_tokenizer() def _prepare_template(self) -> None: args = self.args super()._prepare_template() - mode = 'kto' if args.rlhf_type == 'kto' else 'rlhf' - self.template.set_mode(mode) + model_mapping = {'kto': 'kto', 'ppo': 'pt'} + self.template.set_mode(model_mapping.get(args.rlhf_type, 'rlhf')) if args.rlhf_type != 'orpo' or args.model_meta.is_multimodal: # Avoid padding labels during the model's forward pass in multimodal models. @@ -41,8 +61,11 @@ def _get_dataset(self): def _get_trainer_kwargs(self): trainer_kwargs = {} - if self.ref_model: - trainer_kwargs['ref_model'] = self.ref_model + for key in ['ref', 'reward', 'value']: + key = f'{key}_model' + model = getattr(self, key) + if model: + trainer_kwargs[key] = model return trainer_kwargs diff --git a/swift/llm/train/tuner.py b/swift/llm/train/tuner.py index 4584bdccae..afa6fe7692 100644 --- a/swift/llm/train/tuner.py +++ b/swift/llm/train/tuner.py @@ -136,7 +136,7 @@ def get_vera_target_modules(model, config): return config -def prepare_adapter(args: TrainArguments, model, *, template=None, train_dataset=None): +def prepare_adapter(args: TrainArguments, model, *, template=None, train_dataset=None, task_type=None): from swift.tuners import (AdaLoraConfig, AdapterConfig, BOFTConfig, LLaMAProConfig, LongLoRAModelType, LoraConfig, LoRAConfig, ReftConfig, Swift, VeraConfig) target_modules = get_target_modules(args, model) @@ -153,7 +153,7 @@ def prepare_adapter(args: TrainArguments, model, *, template=None, train_dataset 'lorap_lr_ratio': args.lorap_lr_ratio, 'init_lora_weights': args.init_weights, } - task_type = args.task_type.upper() + task_type = (task_type or args.task_type).upper() if args.train_type in ('lora', 'longlora'): if args.use_swift_lora: lora_config = LoRAConfig(lora_dtype=args.lora_dtype, **lora_kwargs) @@ -329,14 +329,7 @@ def torchacc_resume_from_checkpoint(args, model): class TunerMixin: @classmethod - def prepare_model( - cls, - args, - model, - *, - template=None, - train_dataset=None, - ): + def prepare_model(cls, args, model, *, template=None, train_dataset=None, task_type=None): if args.use_liger: # Apply liger apply_liger(args.model_type) @@ -361,7 +354,8 @@ def prepare_model( tuner: Tuner = extra_tuners[args.train_type] model = tuner.prepare_model(args, model) else: - model = prepare_adapter(args, model, template=template, train_dataset=train_dataset) + model = prepare_adapter( + args, model, template=template, train_dataset=train_dataset, task_type=task_type) # fix bug: Attempting to unscale FP16 gradients. # peft: https://github.com/huggingface/peft/issues/1249 for p in model.parameters(): diff --git a/swift/trainers/rlhf_trainer/ppo_trainer.py b/swift/trainers/rlhf_trainer/ppo_trainer.py index bcdfbf6b27..29d11191d2 100644 --- a/swift/trainers/rlhf_trainer/ppo_trainer.py +++ b/swift/trainers/rlhf_trainer/ppo_trainer.py @@ -1,4 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
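# (The `_patch_dataloader` helper introduced below -- and filled in by a later commit in this
#  series -- temporarily overrides `DataLoader.__init__` so that the dataloaders created inside
#  `HFPPOv2Trainer.__init__` pick up the data collator swift passes in. A standalone sketch of that
#  pattern, with try/finally so the original __init__ is always restored; the helper name here is
#  illustrative only:)

from contextlib import contextmanager
from torch.utils.data import DataLoader


@contextmanager
def patch_dataloader_collate_fn(collate_fn):
    # Temporarily force every DataLoader created inside this context to use `collate_fn`.
    original_init = DataLoader.__init__

    def patched_init(self, *args, **kwargs):
        kwargs['collate_fn'] = collate_fn
        original_init(self, *args, **kwargs)

    DataLoader.__init__ = patched_init
    try:
        yield
    finally:
        DataLoader.__init__ = original_init

# usage sketch: with patch_dataloader_collate_fn(data_collator): trainer = HFPPOv2Trainer(...)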
+from contextlib import contextmanager + from torch.utils.data import DataLoader from transformers import PreTrainedModel from trl import PPOv2Trainer as HFPPOTrainer @@ -9,6 +11,18 @@ class PPOTrainer(RLHFTrainerMixin, SwiftMixin, HFPPOTrainer): + @contextmanager + def _patch_dataloader(): + print() + + @contextmanager + def _patch_init(): + kwargs_to_pop = ['model', 'model_init', 'compute_metrics', 'preprocess_logits_for_metrics'] + for kwarg in kwargs_to_pop: + kwargs.pop(kwarg, None) + kwargs['config'] = kwargs.pop('args') + HFPPOTrainer.__init__(self, **kwargs) + def __init__(self, model: PreTrainedModel, ref_model: PreTrainedModel, *_args, **kwargs): kwargs['policy'] = model kwargs['ref_policy'] = ref_model @@ -33,15 +47,3 @@ def __init__(self, model: PreTrainedModel, ref_model: PreTrainedModel, *_args, * def train(self, *args, **kwargs): # remove args that are not needed for the HFPPOTrainer HFPPOTrainer.train(self) - - -def patched_init(self, **kwargs): - kwargs_to_pop = ['model', 'model_init', 'compute_metrics', 'preprocess_logits_for_metrics'] - for kwarg in kwargs_to_pop: - kwargs.pop(kwarg, None) - kwargs['config'] = kwargs.pop('args') - original_init(self, **kwargs) - - -original_init = HFPPOTrainer.__init__ -HFPPOTrainer.__init__ = patched_init From ea082cdc4e3a4651e15d1bfeb40ac76345ecc03f Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 6 Jan 2025 10:34:33 +0800 Subject: [PATCH 29/47] update --- swift/llm/train/rlhf.py | 4 ++++ swift/llm/train/tuner.py | 8 +++++--- tests/train/test_ppo.py | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 tests/train/test_ppo.py diff --git a/swift/llm/train/rlhf.py b/swift/llm/train/rlhf.py index 1f0bbea640..9cb0eb124c 100644 --- a/swift/llm/train/rlhf.py +++ b/swift/llm/train/rlhf.py @@ -38,6 +38,10 @@ def _prepare_model_tokenizer(self): model.requires_grad_(False).eval() else: model = self.prepare_model(args, model, task_type='seq_cls') + logger.info(f'value_model: {model}') + model_parameter_info = get_model_parameter_info(model) + self.train_msg['value_model_parameter_info'] = model_parameter_info + logger.info(f'value_model_parameter_info: {model_parameter_info}') setattr(self, f'{origin_key}_model', model) super()._prepare_model_tokenizer() diff --git a/swift/llm/train/tuner.py b/swift/llm/train/tuner.py index afa6fe7692..c47212934d 100644 --- a/swift/llm/train/tuner.py +++ b/swift/llm/train/tuner.py @@ -105,7 +105,7 @@ def get_target_modules(args, model) -> Union[str, List[str]]: return target_modules -def get_modules_to_save(args, model): +def get_modules_to_save(args, model, task_type=None): modules_to_save = args.modules_to_save.copy() if 'all-embedding' in args.modules_to_save: modules_to_save.remove('all-embedding') @@ -113,6 +113,8 @@ def get_modules_to_save(args, model): if 'all-norm' in args.modules_to_save: modules_to_save.remove('all-norm') modules_to_save += find_norm(model) + if task_type and task_type.lower() == 'seq_cls': # reward_model + modules_to_save.append('v_head') return modules_to_save @@ -139,8 +141,9 @@ def get_vera_target_modules(model, config): def prepare_adapter(args: TrainArguments, model, *, template=None, train_dataset=None, task_type=None): from swift.tuners import (AdaLoraConfig, AdapterConfig, BOFTConfig, LLaMAProConfig, LongLoRAModelType, LoraConfig, LoRAConfig, ReftConfig, Swift, VeraConfig) + task_type = (task_type or args.task_type).upper() target_modules = get_target_modules(args, model) - modules_to_save = 
get_modules_to_save(args, model) + modules_to_save = get_modules_to_save(args, model, task_type) lora_kwargs = { 'r': args.lora_rank, 'target_modules': target_modules, @@ -153,7 +156,6 @@ def prepare_adapter(args: TrainArguments, model, *, template=None, train_dataset 'lorap_lr_ratio': args.lorap_lr_ratio, 'init_lora_weights': args.init_weights, } - task_type = (task_type or args.task_type).upper() if args.train_type in ('lora', 'longlora'): if args.use_swift_lora: lora_config = LoRAConfig(lora_dtype=args.lora_dtype, **lora_kwargs) diff --git a/tests/train/test_ppo.py b/tests/train/test_ppo.py new file mode 100644 index 0000000000..073701d3fe --- /dev/null +++ b/tests/train/test_ppo.py @@ -0,0 +1,40 @@ +import os + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +kwargs = { + 'per_device_train_batch_size': 2, + 'save_steps': 5, + 'gradient_accumulation_steps': 4, + 'num_train_epochs': 1, +} + + +def test_rm(): + from swift.llm import rlhf_main, RLHFArguments, infer_main, InferArguments + result = rlhf_main( + RLHFArguments( + rlhf_type='rm', + model='Shanghai_AI_Laboratory/internlm2-1_8b-reward', + dataset=['hjh0119/shareAI-Llama3-DPO-zh-en-emoji#100'], + **kwargs)) + last_model_checkpoint = result['last_model_checkpoint'] + infer_main(InferArguments(adapters=last_model_checkpoint, load_data_args=True, merge_lora=True)) + + +def test_ppo(): + from swift.llm import rlhf_main, RLHFArguments, infer_main, InferArguments + result = rlhf_main( + RLHFArguments( + rlhf_type='ppo', + model='Qwen/Qwen2.5-7B-Instruct', + reward_model='AI-ModelScope/Skywork-Reward-Llama-3.1-8B-v0.2', + dataset=['AI-ModelScope/alpaca-gpt4-data-zh#100', 'AI-ModelScope/alpaca-gpt4-data-en#100'], + **kwargs)) + last_model_checkpoint = result['last_model_checkpoint'] + infer_main(InferArguments(adapters=last_model_checkpoint, load_data_args=True, merge_lora=True)) + + +if __name__ == '__main__': + # test_rm() + test_ppo() From 2f045c97c4fb8a8c55dd660027995158768d7e30 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 6 Jan 2025 14:21:39 +0800 Subject: [PATCH 30/47] update --- swift/llm/train/rlhf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/swift/llm/train/rlhf.py b/swift/llm/train/rlhf.py index 9cb0eb124c..f2e715b4bb 100644 --- a/swift/llm/train/rlhf.py +++ b/swift/llm/train/rlhf.py @@ -1,11 +1,13 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import List, Union -from swift.utils import patch_getattr +from swift.utils import get_logger, get_model_parameter_info, patch_getattr from ..argument import RLHFArguments from .kto import prepare_kto_dataset from .sft import SwiftSft +logger = get_logger() + class SwiftRLHF(SwiftSft): args_class = RLHFArguments From 6c62557c43366e0ccdd6c819721a1d901510272c Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 6 Jan 2025 16:38:04 +0800 Subject: [PATCH 31/47] update --- swift/trainers/arguments.py | 4 +- swift/trainers/mixin.py | 29 ++++++------ swift/trainers/rlhf_trainer/ppo_trainer.py | 54 +++++++++------------- 3 files changed, 39 insertions(+), 48 deletions(-) diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index a0b78b947f..f107b41aff 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -80,7 +80,7 @@ class Seq2SeqTrainingArguments(SwiftArgumentsMixin, HfSeq2SeqTrainingArguments): try: from trl import (DPOConfig as HfDPOConfig, CPOConfig as HfCPOConfig, ORPOConfig as HfORPOConfig, KTOConfig as - HfKTOConfig, RewardConfig as HfRewardConfig, PPOv2Config as HfPPOConfig) + HfKTOConfig, RewardConfig as HfRewardConfig, PPOv2Config as HfPPOv2Config) @dataclass class DPOConfig(SwiftArgumentsMixin, HfDPOConfig): @@ -103,7 +103,7 @@ class RewardConfig(SwiftArgumentsMixin, HfRewardConfig): pass @dataclass - class PPOConfig(SwiftArgumentsMixin, HfPPOConfig): + class PPOConfig(SwiftArgumentsMixin, HfPPOv2Config): pass except (ImportError, RuntimeError): diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py index d0281bcdfc..5eb7a72dd3 100644 --- a/swift/trainers/mixin.py +++ b/swift/trainers/mixin.py @@ -44,21 +44,20 @@ class SwiftMixin: - def __init__( - self, - model: Union[PreTrainedModel, Module] = None, - args: TrainingArguments = None, - data_collator: Optional[DataCollator] = None, - train_dataset: Optional[HfDataset] = None, - eval_dataset: Optional[Union[HfDataset, Dict[str, HfDataset]]] = None, - template: Optional[Template] = None, - model_init: Optional[Callable[[], PreTrainedModel]] = None, - compute_loss_func: Optional[Callable] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), - preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], - torch.Tensor]] = None) -> None: + def __init__(self, + model: Union[PreTrainedModel, Module] = None, + args: TrainingArguments = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[HfDataset] = None, + eval_dataset: Optional[Union[HfDataset, Dict[str, HfDataset]]] = None, + template: Optional[Template] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + compute_loss_func: Optional[Callable] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + **kwargs) -> None: if args.check_model and hasattr(model, 'model_dir'): check_local_model_is_latest( model.model_dir, user_agent={ diff --git a/swift/trainers/rlhf_trainer/ppo_trainer.py b/swift/trainers/rlhf_trainer/ppo_trainer.py index 29d11191d2..a6a24a634c 100644 --- a/swift/trainers/rlhf_trainer/ppo_trainer.py +++ 
b/swift/trainers/rlhf_trainer/ppo_trainer.py @@ -3,47 +3,39 @@ from torch.utils.data import DataLoader from transformers import PreTrainedModel -from trl import PPOv2Trainer as HFPPOTrainer +from trl import PPOv2Trainer as HFPPOv2Trainer from ..mixin import SwiftMixin from .rlhf_mixin import RLHFTrainerMixin -class PPOTrainer(RLHFTrainerMixin, SwiftMixin, HFPPOTrainer): +class PPOTrainer(SwiftMixin, HFPPOv2Trainer): + ppo_trainer_init = HFPPOv2Trainer.__init__ + del HFPPOv2Trainer.__init__ + @staticmethod @contextmanager - def _patch_dataloader(): - print() + def _patch_dataloader(data_collator): + __init__ = DataLoader.__init__ - @contextmanager - def _patch_init(): - kwargs_to_pop = ['model', 'model_init', 'compute_metrics', 'preprocess_logits_for_metrics'] - for kwarg in kwargs_to_pop: - kwargs.pop(kwarg, None) - kwargs['config'] = kwargs.pop('args') - HFPPOTrainer.__init__(self, **kwargs) + def __new_init__(self, *args, **kwargs): + kwargs['data_collator'] = data_collator + __init__(self, *args, **kwargs) + + DataLoader.__init__ = __new_init__ + yield + DataLoader.__init__ = __init__ def __init__(self, model: PreTrainedModel, ref_model: PreTrainedModel, *_args, **kwargs): - kwargs['policy'] = model - kwargs['ref_policy'] = ref_model - super().__init__(model, ref_model, *_args, **kwargs) - # reset dataloader - self.dataloader = DataLoader( - self.train_dataset, - batch_size=self.local_dataloader_batch_size, - shuffle=True, - collate_fn=kwargs['data_collator'], - drop_last=True, # needed; otherwise the last batch will be of ragged shape - ) - self.accelerator.prepare(self.data_collator) - self.eval_dataloader = DataLoader( - self.eval_dataset, - batch_size=self.args.per_device_eval_batch_size, - collate_fn=kwargs['data_collator'], - drop_last=True, - ) # no need to shuffle eval dataset - self.eval_dataloader = self.accelerator.prepare(self.eval_dataloader) + super().__init__(model, *_args, **kwargs) + with self._patch_dataloader(kwargs['data_collator']): + new_kwargs = { + k: v + for k, v in kwargs.items() + if k in ['train_dataset', 'data_collator', 'reward_model', 'val_model', 'eval_dataset', 'tokenizer'] + } + self.ppo_trainer_init(config=kwargs['args'], policy=model, ref_policy=ref_model, **new_kwargs) def train(self, *args, **kwargs): # remove args that are not needed for the HFPPOTrainer - HFPPOTrainer.train(self) + super().train() From 585ad2397e13d644d535947c84b1a8f45eaa7d66 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 6 Jan 2025 16:39:05 +0800 Subject: [PATCH 32/47] fix --- swift/trainers/rlhf_trainer/ppo_trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/swift/trainers/rlhf_trainer/ppo_trainer.py b/swift/trainers/rlhf_trainer/ppo_trainer.py index a6a24a634c..c17833ee7b 100644 --- a/swift/trainers/rlhf_trainer/ppo_trainer.py +++ b/swift/trainers/rlhf_trainer/ppo_trainer.py @@ -6,7 +6,6 @@ from trl import PPOv2Trainer as HFPPOv2Trainer from ..mixin import SwiftMixin -from .rlhf_mixin import RLHFTrainerMixin class PPOTrainer(SwiftMixin, HFPPOv2Trainer): From 89dbe794dc111fa0f190b275e9b0c3510cb65fd8 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 6 Jan 2025 16:47:47 +0800 Subject: [PATCH 33/47] fix --- swift/trainers/rlhf_trainer/ppo_trainer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/swift/trainers/rlhf_trainer/ppo_trainer.py b/swift/trainers/rlhf_trainer/ppo_trainer.py index c17833ee7b..1d4666662b 100644 --- a/swift/trainers/rlhf_trainer/ppo_trainer.py +++ b/swift/trainers/rlhf_trainer/ppo_trainer.py @@ -5,7 +5,9 @@ 
from transformers import PreTrainedModel from trl import PPOv2Trainer as HFPPOv2Trainer +from swift.utils import patch_getattr from ..mixin import SwiftMixin +from .rlhf_mixin import RLHFTrainerMixin class PPOTrainer(SwiftMixin, HFPPOv2Trainer): @@ -14,11 +16,11 @@ class PPOTrainer(SwiftMixin, HFPPOv2Trainer): @staticmethod @contextmanager - def _patch_dataloader(data_collator): + def _patch_dataloader(collate_fn): __init__ = DataLoader.__init__ def __new_init__(self, *args, **kwargs): - kwargs['data_collator'] = data_collator + kwargs['collate_fn'] = collate_fn __init__(self, *args, **kwargs) DataLoader.__init__ = __new_init__ @@ -31,9 +33,11 @@ def __init__(self, model: PreTrainedModel, ref_model: PreTrainedModel, *_args, * new_kwargs = { k: v for k, v in kwargs.items() - if k in ['train_dataset', 'data_collator', 'reward_model', 'val_model', 'eval_dataset', 'tokenizer'] + if k in ['train_dataset', 'data_collator', 'reward_model', 'value_model', 'eval_dataset'] } - self.ppo_trainer_init(config=kwargs['args'], policy=model, ref_policy=ref_model, **new_kwargs) + self.ppo_trainer_init( + config=kwargs['args'], tokenizer=self.tokenizer, policy=model, ref_policy=ref_model, **new_kwargs) + patch_getattr(self.model.__class__, 'policy') def train(self, *args, **kwargs): # remove args that are not needed for the HFPPOTrainer From d8030db5322cb6639e62dd9a816719d43ae88225 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 6 Jan 2025 17:34:23 +0800 Subject: [PATCH 34/47] fix --- swift/llm/argument/base_args/base_args.py | 3 +- swift/llm/argument/rlhf_args.py | 46 ++++++++++++++--------- swift/llm/template/template_meta.py | 6 +++ swift/llm/train/rlhf.py | 9 +++-- 4 files changed, 42 insertions(+), 22 deletions(-) diff --git a/swift/llm/argument/base_args/base_args.py b/swift/llm/argument/base_args/base_args.py index 2851683db2..9d4d922a3c 100644 --- a/swift/llm/argument/base_args/base_args.py +++ b/swift/llm/argument/base_args/base_args.py @@ -242,7 +242,7 @@ def get_template(self, processor: 'Processor') -> 'Template': logger.info(f'default_system: {template.template_meta.default_system}') return template - def get_model_processor(self, *, model=None, model_type=None, model_revision=None, **kwargs): + def get_model_processor(self, *, model=None, model_type=None, model_revision=None, task_type=None, **kwargs): if self.tuner_backend == 'unsloth': return load_by_unsloth(self) kwargs.update(self.get_model_kwargs()) @@ -250,5 +250,6 @@ def get_model_processor(self, *, model=None, model_type=None, model_revision=Non kwargs['model_id_or_path'] = model or self.model kwargs['model_type'] = model_type or self.model_type kwargs['model_revision'] = model_revision or self.model_revision + kwargs['task_type'] = task_type or self.task_type return get_model_tokenizer(**kwargs) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index e9d3ec72d4..62658fa2ea 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -3,11 +3,37 @@ from typing import List, Literal, Optional from swift.llm import MODEL_MAPPING +from ..template import get_template_meta from .train_args import TrainArguments @dataclass -class RLHFArguments(TrainArguments): +class PPOArguments: + reward_model: Optional[str] = None + reward_adapters: List[str] = field(default_factory=list) + reward_model_type: Optional[str] = field( + default=None, metadata={'help': f'model_type choices: {list(MODEL_MAPPING.keys())}'}) + reward_model_revision: Optional[str] = None + + num_ppo_epochs: int = 4 + 
whiten_rewards: bool = False + kl_coef: float = 0.05 + cliprange: float = 0.2 + vf_coef: float = 0.1 + cliprange_value: float = 0.2 + gamma: float = 1.0 + lam: float = 0.95 + + num_mini_batches: int = 1 + local_rollout_forward_batch_size: int = 64 + num_sample_generations: int = 10 + response_length: int = 53 + temperature: float = 0.7 + missing_eos_penalty: Optional[float] = None + + +@dataclass +class RLHFArguments(PPOArguments, TrainArguments): """ RLHFArguments is a dataclass that holds arguments specific to the Reinforcement Learning with Human Feedback (RLHF) training backend. @@ -42,27 +68,13 @@ class RLHFArguments(TrainArguments): # KTO desirable_weight: float = 1.0 undesirable_weight: float = 1.0 - # PPO - reward_model: Optional[str] = None - reward_adapters: List[str] = field(default_factory=list) - reward_model_type: Optional[str] = field( - default=None, metadata={'help': f'model_type choices: {list(MODEL_MAPPING.keys())}'}) - reward_model_revision: Optional[str] = None - local_rollout_forward_batch_size: int = 64 - kl_coef: float = 0.05 - cliprange: float = 0.2 - vf_coef: float = 0.1 - cliprange_value: float = 0.2 - gamma: float = 1.0 - lam: float = 0.95 - num_sample_generations: int = 10 def __post_init__(self): self._init_rm() self._init_simpo() - self._init_ppo() self._set_default() super().__post_init__() + self._init_ppo() if self.rlhf_type in ['dpo', 'kto'] and self.train_type == 'full' or self.rlhf_type == 'ppo': self.ref_model = self.ref_model or self.model @@ -73,8 +85,6 @@ def __post_init__(self): def _init_ppo(self): if self.rlhf_type == 'ppo': - self.response_length = self.max_new_tokens - self.num_ppo_epochs = self.num_train_epochs self.padding_side = 'left' # TODO: streaming, MLLM diff --git a/swift/llm/template/template_meta.py b/swift/llm/template/template_meta.py index 98520516c2..82a07bdfae 100644 --- a/swift/llm/template/template_meta.py +++ b/swift/llm/template/template_meta.py @@ -128,6 +128,12 @@ def init(self, tokenizer: PreTrainedTokenizerBase) -> None: if tokenizer.eos_token not in self.stop_words: self.stop_words.append(tokenizer.eos_token) + self.stop_token_id = tokenizer.eos_token_id + if self.suffix: + stop_token_id = tokenizer.convert_tokens_to_ids(self.suffix[-1]) + if stop_token_id is not None: + self.stop_token_id = stop_token_id + def check_system(self, system: Optional[str]) -> None: if system is not None: assert self.support_system, ( diff --git a/swift/llm/train/rlhf.py b/swift/llm/train/rlhf.py index f2e715b4bb..4d1dddc030 100644 --- a/swift/llm/train/rlhf.py +++ b/swift/llm/train/rlhf.py @@ -30,16 +30,16 @@ def _prepare_model_tokenizer(self): model_type = getattr(args, f'{key}_model_type') model_revision = getattr(args, f'{key}_model_revision') adapters = args.adapters if key == 'ref' else args.reward_adapters - + task_type = args.task_type if origin_key == 'ref' else 'seq_cls' # Be aware of the unexpected behavior caused by double monkey patching. 
model = args.get_model_processor( - model=model_id_or_path, model_type=model_type, model_revision=model_revision)[0] + model=model_id_or_path, model_type=model_type, model_revision=model_revision, task_type=task_type)[0] model = prepare_adapter(args, model, adapters) if origin_key in {'ref', 'reward'}: model.requires_grad_(False).eval() else: - model = self.prepare_model(args, model, task_type='seq_cls') + model = self.prepare_model(args, model, task_type=task_type) logger.info(f'value_model: {model}') model_parameter_info = get_model_parameter_info(model) self.train_msg['value_model_parameter_info'] = model_parameter_info @@ -58,6 +58,9 @@ def _prepare_template(self) -> None: # Avoid padding labels during the model's forward pass in multimodal models. self.template.loss_scale = 'last_round' + if args.rlhf_type == 'ppo': + self.training_args.stop_token_id = self.template.template_meta.stop_token_id + def _get_dataset(self): args = self.args train_dataset, val_dataset = super()._get_dataset() From ac49ee642e664dfe902271129790d6b92cb9c333 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 6 Jan 2025 19:26:23 +0800 Subject: [PATCH 35/47] update --- ...06\345\222\214\351\203\250\347\275\262.md" | 2 +- .../Instruction/Inference-and-deployment.md | 2 +- swift/llm/argument/base_args/base_args.py | 6 ++--- swift/llm/argument/rlhf_args.py | 1 + swift/llm/infer/infer_engine/infer_engine.py | 2 +- swift/llm/train/rlhf.py | 10 +++++--- swift/trainers/rlhf_trainer/ppo_trainer.py | 24 +++++++++++++++++-- 7 files changed, 36 insertions(+), 11 deletions(-) diff --git "a/docs/source/Instruction/\346\216\250\347\220\206\345\222\214\351\203\250\347\275\262.md" "b/docs/source/Instruction/\346\216\250\347\220\206\345\222\214\351\203\250\347\275\262.md" index d6cb841f07..76a574b957 100644 --- "a/docs/source/Instruction/\346\216\250\347\220\206\345\222\214\351\203\250\347\275\262.md" +++ "b/docs/source/Instruction/\346\216\250\347\220\206\345\222\214\351\203\250\347\275\262.md" @@ -4,7 +4,7 @@ SWIFT支持以命令行、Python代码和界面方式进行推理和部署: - 使用`engine.infer`或者`engine.infer_async`进行python的方式推理. 参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo.py). - 使用`swift infer`使用命令行的方式进行推理. 参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/infer/cli_demo.sh). - 使用`swift deploy`进行服务部署,并使用openai API或者`client.infer`的方式推理. 服务端参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/server), 客户端参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/client). -- 使用`swift app`部署模型进行界面推理, 可以查看[这里](../GetStarted/界面使用.md) +- 使用`swift app`部署模型进行界面推理, 可以查看[这里](../GetStarted/Web-UI.md) ## 命令行推理指令 diff --git a/docs/source_en/Instruction/Inference-and-deployment.md b/docs/source_en/Instruction/Inference-and-deployment.md index e77b7a970d..1229ba3590 100644 --- a/docs/source_en/Instruction/Inference-and-deployment.md +++ b/docs/source_en/Instruction/Inference-and-deployment.md @@ -4,7 +4,7 @@ SWIFT supports inference and deployment through command line, Python code, and i - Use `engine.infer` or `engine.infer_async` for Python-based inference. See [here](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo.py) for reference. - Use `swift infer` for command-line-based inference. See [here](https://github.com/modelscope/ms-swift/blob/main/examples/infer/cli_demo.sh) for reference. - Use `swift deploy` for service deployment and perform inference using the OpenAI API or `client.infer`. 
Refer to the server guidelines [here](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/server) and the client guidelines [here](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/client). -- Deploy the model with `swift app` for web-based inference. You can check [here](../GetStarted/Interface-usage.md) for details. +- Deploy the model with `swift app` for web-based inference. You can check [here](../GetStarted/Web-UI.md) for details. ## Command Line Inference diff --git a/swift/llm/argument/base_args/base_args.py b/swift/llm/argument/base_args/base_args.py index 9d4d922a3c..22340beef7 100644 --- a/swift/llm/argument/base_args/base_args.py +++ b/swift/llm/argument/base_args/base_args.py @@ -43,7 +43,6 @@ def _handle_ckpt_dir(self: 'BaseArguments'): return self.adapters.insert(0, self.ckpt_dir) else: - assert self.model is None, f'self.model: {self.model}' self.model = self.ckpt_dir self.ckpt_dir = None logger.warning('The `--ckpt_dir` parameter will be removed in `ms-swift>=3.2`. ' @@ -236,9 +235,10 @@ def _init_device(self): else: torch.cuda.set_device(self.local_rank) - def get_template(self, processor: 'Processor') -> 'Template': + def get_template(self, processor: 'Processor', template_type=None) -> 'Template': template_kwargs = self.get_template_kwargs() - template = get_template(self.template, processor, **template_kwargs) + template_type = template_type or self.template + template = get_template(template_type, processor, **template_kwargs) logger.info(f'default_system: {template.template_meta.default_system}') return template diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 62658fa2ea..60c4511587 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -14,6 +14,7 @@ class PPOArguments: reward_model_type: Optional[str] = field( default=None, metadata={'help': f'model_type choices: {list(MODEL_MAPPING.keys())}'}) reward_model_revision: Optional[str] = None + reward_template: Optional[str] = None num_ppo_epochs: int = 4 whiten_rewards: bool = False diff --git a/swift/llm/infer/infer_engine/infer_engine.py b/swift/llm/infer/infer_engine/infer_engine.py index 80feb707b8..fe6057b383 100644 --- a/swift/llm/infer/infer_engine/infer_engine.py +++ b/swift/llm/infer/infer_engine/infer_engine.py @@ -174,7 +174,7 @@ def _get_num_tokens(inputs: Dict[str, Any]) -> int: else: return input_ids.shape[-1] elif 'inputs_embeds' in inputs: # 2d or 3d - return inputs['inputs_embeds'].shape[-1] + return inputs['inputs_embeds'].shape[-2] raise ValueError(f'Unable to retrieve input_ids and inputs_embeds. inputs: {inputs}') def set_default_max_tokens(self, request_config: RequestConfig, inputs: Dict[str, Any]) -> None: diff --git a/swift/llm/train/rlhf.py b/swift/llm/train/rlhf.py index 4d1dddc030..8d4ac35925 100644 --- a/swift/llm/train/rlhf.py +++ b/swift/llm/train/rlhf.py @@ -32,8 +32,10 @@ def _prepare_model_tokenizer(self): adapters = args.adapters if key == 'ref' else args.reward_adapters task_type = args.task_type if origin_key == 'ref' else 'seq_cls' # Be aware of the unexpected behavior caused by double monkey patching. 
- model = args.get_model_processor( - model=model_id_or_path, model_type=model_type, model_revision=model_revision, task_type=task_type)[0] + model, processor = args.get_model_processor( + model=model_id_or_path, model_type=model_type, model_revision=model_revision, task_type=task_type) + if origin_key == 'reward': + self.reward_template = args.get_template(processor, args.reward_template) model = prepare_adapter(args, model, adapters) if origin_key in {'ref', 'reward'}: @@ -59,7 +61,7 @@ def _prepare_template(self) -> None: self.template.loss_scale = 'last_round' if args.rlhf_type == 'ppo': - self.training_args.stop_token_id = self.template.template_meta.stop_token_id + args.training_args.stop_token_id = self.template.template_meta.stop_token_id def _get_dataset(self): args = self.args @@ -75,6 +77,8 @@ def _get_trainer_kwargs(self): model = getattr(self, key) if model: trainer_kwargs[key] = model + if self.args.rlhf_type == 'ppo': + trainer_kwargs['reward_template'] = self.reward_template return trainer_kwargs diff --git a/swift/trainers/rlhf_trainer/ppo_trainer.py b/swift/trainers/rlhf_trainer/ppo_trainer.py index 1d4666662b..3208342d42 100644 --- a/swift/trainers/rlhf_trainer/ppo_trainer.py +++ b/swift/trainers/rlhf_trainer/ppo_trainer.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from contextlib import contextmanager - +import torch from torch.utils.data import DataLoader from transformers import PreTrainedModel from trl import PPOv2Trainer as HFPPOv2Trainer @@ -29,6 +29,7 @@ def __new_init__(self, *args, **kwargs): def __init__(self, model: PreTrainedModel, ref_model: PreTrainedModel, *_args, **kwargs): super().__init__(model, *_args, **kwargs) + self.reward_template = kwargs['reward_template'] with self._patch_dataloader(kwargs['data_collator']): new_kwargs = { k: v @@ -39,6 +40,25 @@ def __init__(self, model: PreTrainedModel, ref_model: PreTrainedModel, *_args, * config=kwargs['args'], tokenizer=self.tokenizer, policy=model, ref_policy=ref_model, **new_kwargs) patch_getattr(self.model.__class__, 'policy') + @contextmanager + def patch_reward_model(self): + model_cls = self.reward_model.__class__ + forward = model_cls.forward + trainer = self + + def new_forward(self, input_ids, *args, **kwargs): + idx = (input_ids == 0).cumsum(dim=1)[:, -1] + trainer.template.tokenizer.batch_decode(input_ids) + + print(trainer) + return forward(self, input_ids, *args, **kwargs) + + model_cls.forward = new_forward + yield + model_cls.forward = forward + + def train(self, *args, **kwargs): # remove args that are not needed for the HFPPOTrainer - super().train() + with self.patch_reward_model(): + super().train() From 25d9d9deda6353e4756d4ffc67cc4d4848aceb56 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 7 Jan 2025 00:44:03 +0800 Subject: [PATCH 36/47] update --- examples/train/rlhf/ppo.sh | 28 ++++++++++++++++++++++ swift/llm/argument/rlhf_args.py | 3 ++- swift/llm/train/rlhf.py | 8 ++----- swift/trainers/mixin.py | 5 ++-- swift/trainers/rlhf_trainer/ppo_trainer.py | 26 ++++---------------- swift/trainers/trainer_factory.py | 3 +++ tests/train/test_ppo.py | 2 +- 7 files changed, 43 insertions(+), 32 deletions(-) create mode 100644 examples/train/rlhf/ppo.sh diff --git a/examples/train/rlhf/ppo.sh b/examples/train/rlhf/ppo.sh new file mode 100644 index 0000000000..a4341bea31 --- /dev/null +++ b/examples/train/rlhf/ppo.sh @@ -0,0 +1,28 @@ +nproc_per_node=2 + +CUDA_VISIBLE_DEVICES=0,1 \ +NPROC_PER_NODE=$nproc_per_node \ +swift rlhf \ + --rlhf_type ppo \ + --model 
LLM-Research/Meta-Llama-3.1-8B-Instruct \ + --reward_model 'AI-ModelScope/Skywork-Reward-Llama-3.1-8B-v0.2' \ + --train_type lora \ + --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' 'AI-ModelScope/alpaca-gpt4-data-en#20000' \ + --torch_dtype bfloat16 \ + --num_train_epochs 1 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --learning_rate 1e-4 \ + --lora_rank 8 \ + --lora_alpha 32 \ + --target_modules all-linear \ + --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ + --eval_steps 100 \ + --save_steps 100 \ + --save_total_limit 5 \ + --logging_steps 5 \ + --max_length 2048 \ + --output_dir output \ + --warmup_ratio 0.05 \ + --dataloader_num_workers 4 \ + --deepspeed zero2 diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 60c4511587..83620a9b3c 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -14,7 +14,6 @@ class PPOArguments: reward_model_type: Optional[str] = field( default=None, metadata={'help': f'model_type choices: {list(MODEL_MAPPING.keys())}'}) reward_model_revision: Optional[str] = None - reward_template: Optional[str] = None num_ppo_epochs: int = 4 whiten_rewards: bool = False @@ -87,6 +86,8 @@ def __post_init__(self): def _init_ppo(self): if self.rlhf_type == 'ppo': self.padding_side = 'left' + self.metric_for_best_model = None + self.training_args.metric_for_best_model = None # TODO: streaming, MLLM def _init_simpo(self): diff --git a/swift/llm/train/rlhf.py b/swift/llm/train/rlhf.py index 8d4ac35925..37e8d9903c 100644 --- a/swift/llm/train/rlhf.py +++ b/swift/llm/train/rlhf.py @@ -32,10 +32,8 @@ def _prepare_model_tokenizer(self): adapters = args.adapters if key == 'ref' else args.reward_adapters task_type = args.task_type if origin_key == 'ref' else 'seq_cls' # Be aware of the unexpected behavior caused by double monkey patching. 
- model, processor = args.get_model_processor( - model=model_id_or_path, model_type=model_type, model_revision=model_revision, task_type=task_type) - if origin_key == 'reward': - self.reward_template = args.get_template(processor, args.reward_template) + model = args.get_model_processor( + model=model_id_or_path, model_type=model_type, model_revision=model_revision, task_type=task_type)[0] model = prepare_adapter(args, model, adapters) if origin_key in {'ref', 'reward'}: @@ -77,8 +75,6 @@ def _get_trainer_kwargs(self): model = getattr(self, key) if model: trainer_kwargs[key] = model - if self.args.rlhf_type == 'ppo': - trainer_kwargs['reward_template'] = self.reward_template return trainer_kwargs diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py index 5eb7a72dd3..8c68e1a27d 100644 --- a/swift/trainers/mixin.py +++ b/swift/trainers/mixin.py @@ -72,6 +72,7 @@ def __init__(self, from swift.trainers.xtuner import init_sequence_parallel_xtuner init_sequence_parallel_xtuner(args.sequence_parallel_size) + self.model_meta = model.model_meta with self.hub.patch_hub(): super().__init__( model=model, @@ -216,7 +217,7 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): # tokenizer if not is_adapter: from swift.llm import save_checkpoint - additional_saved_files = self.model.model_meta.additional_saved_files + additional_saved_files = self.model_meta.additional_saved_files save_checkpoint(None, self.template.processor, output_dir, additional_saved_files=additional_saved_files) def _fix_zero3_gather_all_parameters(self) -> None: @@ -246,7 +247,7 @@ def _save_checkpoint(self, *args, **kwargs): return result def train(self, *args, **kwargs): - if self.model.model_meta.is_multimodal: + if self.model_meta.is_multimodal: models = list( set([ v for k, v in self.__dict__.items() diff --git a/swift/trainers/rlhf_trainer/ppo_trainer.py b/swift/trainers/rlhf_trainer/ppo_trainer.py index 3208342d42..19f8328b46 100644 --- a/swift/trainers/rlhf_trainer/ppo_trainer.py +++ b/swift/trainers/rlhf_trainer/ppo_trainer.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from contextlib import contextmanager + import torch from torch.utils.data import DataLoader from transformers import PreTrainedModel @@ -29,7 +30,6 @@ def __new_init__(self, *args, **kwargs): def __init__(self, model: PreTrainedModel, ref_model: PreTrainedModel, *_args, **kwargs): super().__init__(model, *_args, **kwargs) - self.reward_template = kwargs['reward_template'] with self._patch_dataloader(kwargs['data_collator']): new_kwargs = { k: v @@ -38,27 +38,9 @@ def __init__(self, model: PreTrainedModel, ref_model: PreTrainedModel, *_args, * } self.ppo_trainer_init( config=kwargs['args'], tokenizer=self.tokenizer, policy=model, ref_policy=ref_model, **new_kwargs) - patch_getattr(self.model.__class__, 'policy') - - @contextmanager - def patch_reward_model(self): - model_cls = self.reward_model.__class__ - forward = model_cls.forward - trainer = self - - def new_forward(self, input_ids, *args, **kwargs): - idx = (input_ids == 0).cumsum(dim=1)[:, -1] - trainer.template.tokenizer.batch_decode(input_ids) - - print(trainer) - return forward(self, input_ids, *args, **kwargs) - - model_cls.forward = new_forward - yield - model_cls.forward = forward - + unwrap_model = self.accelerator.unwrap_model(self.model) + patch_getattr(unwrap_model, 'policy') def train(self, *args, **kwargs): # remove args that are not needed for the HFPPOTrainer - with self.patch_reward_model(): - super().train() + super().train() diff --git a/swift/trainers/trainer_factory.py b/swift/trainers/trainer_factory.py index 480ca8287d..19c93a042b 100644 --- a/swift/trainers/trainer_factory.py +++ b/swift/trainers/trainer_factory.py @@ -56,4 +56,7 @@ def get_training_args(cls, args): if k not in parameters: args_dict.pop(k) + if 'ppo' in training_args_cls.__name__.lower(): + args_dict['world_size'] = args.global_world_size + return training_args_cls(**args_dict) diff --git a/tests/train/test_ppo.py b/tests/train/test_ppo.py index 073701d3fe..0a7c98022e 100644 --- a/tests/train/test_ppo.py +++ b/tests/train/test_ppo.py @@ -27,7 +27,7 @@ def test_ppo(): result = rlhf_main( RLHFArguments( rlhf_type='ppo', - model='Qwen/Qwen2.5-7B-Instruct', + model='LLM-Research/Meta-Llama-3.1-8B-Instruct', reward_model='AI-ModelScope/Skywork-Reward-Llama-3.1-8B-v0.2', dataset=['AI-ModelScope/alpaca-gpt4-data-zh#100', 'AI-ModelScope/alpaca-gpt4-data-en#100'], **kwargs)) From 102257bca9878b1d553379d0db770778bbc47acf Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 7 Jan 2025 10:59:47 +0800 Subject: [PATCH 37/47] update --- examples/deploy/lora/client.py | 2 +- examples/infer/demo_hf.py | 60 ++++++++++++++++++++++ examples/infer/demo_lora.py | 2 +- swift/llm/template/base.py | 2 +- tests/test_align/test_template/test_llm.py | 4 +- 5 files changed, 65 insertions(+), 5 deletions(-) create mode 100644 examples/infer/demo_hf.py diff --git a/examples/deploy/lora/client.py b/examples/deploy/lora/client.py index e61caad8ae..ae66b10df0 100644 --- a/examples/deploy/lora/client.py +++ b/examples/deploy/lora/client.py @@ -23,5 +23,5 @@ def infer_multilora(engine: InferClient, infer_request: InferRequest): if __name__ == '__main__': engine = InferClient(host='127.0.0.1', port=8000) - infer_request = InferRequest(messages=[{'role': 'user', 'content': '你是谁'}]) + infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}]) infer_multilora(engine, infer_request) diff --git a/examples/infer/demo_hf.py b/examples/infer/demo_hf.py new file mode 100644 index 0000000000..58959078f8 --- /dev/null +++ b/examples/infer/demo_hf.py @@ -0,0 +1,60 
@@ +def infer_hf(): + from transformers import AutoModelForCausalLM, AutoTokenizer + from peft import PeftModel + from modelscope import snapshot_download + model_dir = snapshot_download('Qwen/Qwen2.5-7B-Instruct') + adapter_dir = snapshot_download('swift/test_lora') + model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype='auto', device_map='auto') + model = PeftModel.from_pretrained(model, adapter_dir) + + tokenizer = AutoTokenizer.from_pretrained(model_dir) + + messages = [{ + 'role': 'system', + 'content': 'You are a helpful assistant.' + }, { + 'role': 'user', + 'content': 'who are you?' + }] + text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + model_inputs = tokenizer([text], return_tensors='pt').to(model.device) + + generated_ids = model.generate(**model_inputs, max_new_tokens=512, do_sample=False) + generated_ids = [ + output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) + ] + + response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + print(f'response: {response}') + return response + + +def infer_swift(): + from swift.llm import get_model_tokenizer, get_template, InferRequest, RequestConfig, PtEngine + from modelscope import snapshot_download + from swift.tuners import Swift + model_dir = snapshot_download('Qwen/Qwen2.5-7B-Instruct') + adapter_dir = snapshot_download('swift/test_lora') + model, tokenizer = get_model_tokenizer(model_dir, device_map='auto') + model = Swift.from_pretrained(model, adapter_dir) + template = get_template(model.model_meta.template, tokenizer) + engine = PtEngine.from_model_template(model, template) + + messages = [{ + 'role': 'system', + 'content': 'You are a helpful assistant.' + }, { + 'role': 'user', + 'content': 'who are you?' + }] + request_config = RequestConfig(max_tokens=512, temperature=0) + resp_list = engine.infer([InferRequest(messages=messages)], request_config=request_config) + response = resp_list[0].choices[0].message.content + print(f'response: {response}') + return response + + +if __name__ == '__main__': + response = infer_hf() + response2 = infer_swift() + assert response == response2 diff --git a/examples/infer/demo_lora.py b/examples/infer/demo_lora.py index 7489d1c38a..8d9396f135 100644 --- a/examples/infer/demo_lora.py +++ b/examples/infer/demo_lora.py @@ -63,6 +63,6 @@ def infer_lora(infer_request: 'InferRequest'): from swift.llm import (PtEngine, RequestConfig, AdapterRequest, get_template, BaseArguments, InferRequest, safe_snapshot_download, get_model_tokenizer) from swift.tuners import Swift - infer_request = InferRequest(messages=[{'role': 'user', 'content': '你是谁'}]) + infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}]) # infer_lora(infer_request) infer_multilora(infer_request, 'pt') diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py index a4b2aa7c1d..d2a2ae84a8 100644 --- a/swift/llm/template/base.py +++ b/swift/llm/template/base.py @@ -598,7 +598,7 @@ def _swift_encode(self, inputs: StdTemplateInputs): context_list = prompt.copy() extra_context_list = [] extra_context_type = None - if i < n_round - 1 or self.mode == 'seq_cls' and response is not None: + if i < n_round - 1: # Not the last round. 
context_list.append('{{RESPONSE}}') extra_context_list = template_meta.chat_sep diff --git a/tests/test_align/test_template/test_llm.py b/tests/test_align/test_template/test_llm.py index da54bbb017..0b29120dfe 100644 --- a/tests/test_align/test_template/test_llm.py +++ b/tests/test_align/test_template/test_llm.py @@ -215,7 +215,7 @@ def test_qwen2_reward(): res = _infer_model(pt_engine, messages=messages) pt_engine.default_template.template_backend = 'jinja' res2 = _infer_model(pt_engine, messages=messages) - assert res == res2 == '1.390625' + assert res == '1.84375' and res2 == '1.390625' # \n diff def test_qwen2_5_math(): @@ -239,7 +239,7 @@ def test_skywork_reward(): res = _infer_model(pt_engine, messages=messages) pt_engine.default_template.template_backend = 'jinja' res2 = _infer_model(pt_engine, messages=messages) - assert res == '14.1875' + assert res == '14.25' assert res2 == '13.8125' From db9bdc6ff1ef23d930ff84b1b3d0eedd64d875bc Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 7 Jan 2025 11:14:48 +0800 Subject: [PATCH 38/47] update --- examples/train/rlhf/ppo.sh | 5 +++-- swift/llm/argument/rlhf_args.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/train/rlhf/ppo.sh b/examples/train/rlhf/ppo.sh index a4341bea31..7983d2229b 100644 --- a/examples/train/rlhf/ppo.sh +++ b/examples/train/rlhf/ppo.sh @@ -12,7 +12,7 @@ swift rlhf \ --num_train_epochs 1 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ - --learning_rate 1e-4 \ + --learning_rate 1e-5 \ --lora_rank 8 \ --lora_alpha 32 \ --target_modules all-linear \ @@ -25,4 +25,5 @@ swift rlhf \ --output_dir output \ --warmup_ratio 0.05 \ --dataloader_num_workers 4 \ - --deepspeed zero2 + --deepspeed zero2 \ + --response_length 512 diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 83620a9b3c..d707dfdf7f 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -27,7 +27,7 @@ class PPOArguments: num_mini_batches: int = 1 local_rollout_forward_batch_size: int = 64 num_sample_generations: int = 10 - response_length: int = 53 + response_length: int = 512 temperature: float = 0.7 missing_eos_penalty: Optional[float] = None From ee202e1dc5f817087a2baa3416a81b8ed36746c5 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 7 Jan 2025 11:40:12 +0800 Subject: [PATCH 39/47] update --- ...344\271\211\346\225\260\346\215\256\351\233\206.md" | 10 +++++++++- docs/source_en/Customization/Custom-dataset.md | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" index df19d99391..9c63c6f080 100644 --- "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" @@ -53,7 +53,7 @@ query-response格式: ### RLHF -#### DPO/ORPO/CPO/SimPO/RM/PPO +#### DPO/ORPO/CPO/SimPO/RM ```jsonl {"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "告诉我明天的天气"}, {"role": "assistant", "content": "明天天气晴朗"}], "rejected_response": "我不知道"} @@ -67,6 +67,14 @@ query-response格式: {"messages": [{"role": "system", "content": "你是个有用无害的数学计算器"}, {"role": "user", "content": "1+1等于几"}, {"role": "assistant", "content": "等于2"}, {"role": "user", "content": "再加1呢"}, 
{"role": "assistant", "content": "等于3"}], "label": true} ``` +#### PPO + +```jsonl +{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "告诉我明天的天气"}]} +{"messages": [{"role": "system", "content": "你是个有用无害的数学计算器"}, {"role": "user", "content": "1+1等于几"}, {"role": "assistant", "content": "等于2"}, {"role": "user", "content": "再加1呢"}]} +{"messages": [{"role": "user", "content": "你的名字是什么"}]} +``` + ### 序列分类 ```jsonl {"messages": [{"role": "user", "content": "今天天气真好呀"}], "label": 1} diff --git a/docs/source_en/Customization/Custom-dataset.md b/docs/source_en/Customization/Custom-dataset.md index 3bb38cfe3c..268b06dbc0 100644 --- a/docs/source_en/Customization/Custom-dataset.md +++ b/docs/source_en/Customization/Custom-dataset.md @@ -52,7 +52,7 @@ The following provides the recommended dataset format for ms-swift, where the sy ### RLHF -#### DPO/ORPO/CPO/SimPO/RM/PPO +#### DPO/ORPO/CPO/SimPO/RM ```jsonl {"messages": [{"role": "system", "content": "You are a useful and harmless assistant"}, {"role": "user", "content": "Tell me tomorrow's weather"}, {"role": "assistant", "content": "Tomorrow's weather will be sunny"}], "rejected_response": "I don't know"} @@ -66,6 +66,14 @@ The following provides the recommended dataset format for ms-swift, where the sy {"messages": [{"role": "system", "content": "You are a useful and harmless math calculator"}, {"role": "user", "content": "What is 1 + 1?"}, {"role": "assistant", "content": "It equals 2"}, {"role": "user", "content": "What about adding 1?"}, {"role": "assistant", "content": "It equals 3"}], "label": true} ``` +#### PPO + +```jsonl +{"messages": [{"role": "system", "content": "You are a useful and harmless assistant"}, {"role": "user", "content": "Tell me tomorrow's weather"}]} +{"messages": [{"role": "system", "content": "You are a useful and harmless math calculator"}, {"role": "user", "content": "What is 1 + 1?"}, {"role": "assistant", "content": "It equals 2"}, {"role": "user", "content": "What about adding 1?"}]} +{"messages": [{"role": "user", "content": "What is your name?"}]} +``` + ### Sequence Classification ```jsonl {"messages": [{"role": "user", "content": "The weather is really nice today"}], "label": 1} From 455fbd5142ba5259eebbc68563ccdcfd768c4b6c Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 7 Jan 2025 11:44:51 +0800 Subject: [PATCH 40/47] fix --- swift/llm/argument/rlhf_args.py | 1 - 1 file changed, 1 deletion(-) diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index d707dfdf7f..8ddd396f53 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -3,7 +3,6 @@ from typing import List, Literal, Optional from swift.llm import MODEL_MAPPING -from ..template import get_template_meta from .train_args import TrainArguments From 5789bb920536e78b1bf7405c45e9b4cbb20c9e9b Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 7 Jan 2025 13:12:42 +0800 Subject: [PATCH 41/47] update --- ...44\350\241\214\345\217\202\346\225\260.md" | 23 ++++++++++++- .../Instruction/Command-line-parameters.md | 34 ++++++++++++++++--- examples/train/rlhf/ppo.sh | 4 +-- swift/plugin/loss_scale.py | 7 ++-- 4 files changed, 57 insertions(+), 11 deletions(-) diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 0946491270..e8769a1a6c 100644 --- 
"a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -50,7 +50,7 @@ - 🔥max_pixels: 多模态模型图片前处理的最大像素数(H\*W),默认不缩放。 - tools_prompt: 智能体训练时的工具列表转为system的格式,请参考[智能体训练](./智能体的支持.md),默认为'react_en' - padding_side: 当训练`batch_size>=2`时的padding_side,可选值为'left', 'right',默认为'right'。(`generate`的batch_size>=2时,只进行左padding) -- loss_scale: 如何针对训练添加token的loss权重。默认为`'default'`,代表所有response(含history)以1计算交叉熵损失。具体可以查看[插件化](../Customization/插件化.md)和[智能体训练](./智能体的支持.md) +- loss_scale: 如何针对训练添加token的loss权重。默认为`'default'`,代表所有response(含history)以1计算交叉熵损失。可选值为'default', 'last_round', 'all', 以及agent需要的loss_scale: 'react', 'agentflan', 'alpha_umi', 'qwen'。具体可以查看[插件化](../Customization/插件化.md)和[智能体训练](./智能体的支持.md) - sequence_parallel_size: 序列并行数量。参考[example](https://github.com/modelscope/ms-swift/tree/main/examples/train/sequence_parallel/train.sh) - use_chat_template: 使用chat模板或generation模板,默认为`True`。`swift pt`会自动设置为generation模板 - template_backend: 使用swift或jinja进行推理。如果使用jinja,则使用transformers的`apply_chat_template`。默认为swift @@ -324,6 +324,27 @@ RLHF参数继承于[训练参数](#训练参数) - desirable_weight: KTO算法中对desirable response的loss权重 $\lambda_D$ ,默认为`1.` - undesirable_weight: KTO论文中对undesirable response的loss权重 $\lambda_U$ , 默认为`1.` +#### PPO参数 +- reward_model: 默认为None +- reward_adapters: 默认为`[]` +- reward_model_type: 默认为None +- reward_model_revision: 默认为None + +以下参数含义可以参考[这里](https://huggingface.co/docs/trl/main/ppo_trainer) +- num_ppo_epochs: 默认为4 +- whiten_rewards: 默认为False +- kl_coef: 默认为0.05 +- cliprange: 默认为0.2 +- vf_coef: 默认为0.1 +- cliprange_value: 默认为0.2 +- gamma: 默认为1.0 +- lam: 默认为0.95 +- num_mini_batches: 默认为1 +- local_rollout_forward_batch_size: 默认为64 +- num_sample_generations: 默认为10 +- response_length: 默认为512 +- temperature: 默认为0.7 +- missing_eos_penalty: 默认为None ### 推理参数 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 454b9a0984..233a51861f 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -50,7 +50,7 @@ The introduction to command line parameters will cover base arguments, atomic ar - 🔥max_pixels: Maximum pixel count for pre-processing images in multimodal models (H*W), default is no scaling. - tools_prompt: The list of tools for agent training converted to system format, refer to [Agent Training](./Agent-support.md), default is 'react_en'. - padding_side: The padding_side used when training with `batch_size >= 2`, with optional values of 'left' and 'right', defaulting to 'right'. (When the batch_size in `generate` is >= 2, only left padding is applied.) -- loss_scale: How to add token loss weight during training. Default is `'default'`, meaning all responses (including history) are treated as 1 for cross-entropy loss. For specifics, see [Pluginization](../Customization/Pluginization.md) and [Agent Training](./Agent-support.md). +- loss_scale: How to add token loss weight during training. Default is `'default'`, meaning all responses (including history) are treated as 1 for cross-entropy loss. The optional values are 'default', 'last_round', 'all', and the loss scale required by the agent: 'react', 'agentflan', 'alpha_umi', 'qwen'. For specifics, see [Pluginization](../Customization/Pluginization.md) and [Agent Training](./Agent-support.md). - sequence_parallel_size: Number of sequence parallelism. 
Refer to [example](https://github.com/modelscope/ms-swift/tree/main/examples/train/sequence_parallel/train.sh). - use_chat_template: Use chat template or generation template, default is `True`. `swift pt` is automatically set to the generation template. - template_backend: Use swift or jinja for inference. If using jinja, it will utilize transformers' `apply_chat_template`. Default is swift. @@ -318,16 +318,40 @@ RLHF arguments inherit from the [training arguments](#training-arguments). - 🔥beta: KL regularization term coefficient, default is `None`, i.e., for `simpo` algorithm default is `2.`, for other algorithms default is `0.1`. Refer to the [documentation](./Human-alignment.md) for specifics. - label_smoothing: Whether to use DPO smoothing, default value is `0`, generally set between 0~0.5. -- + - 🔥rpo_alpha: Weight for adding sft_loss in DPO, default is `1`. The final loss is `KL_loss + rpo_alpha * sft_loss`. -- + - cpo_alpha: The coefficient of nll loss in CPO/SimPO loss, default is `1.`. -- + - simpo_gamma: Reward margin term in SimPO algorithm, recommended to set between 0.5-1.5 in the paper, default is `1.`. -- + - desirable_weight: Loss weight for desirable response in KTO algorithm $\lambda_D$, default is `1.`. - undesirable_weight: Loss weight for undesirable response in KTO paper $\lambda_U$, default is `1.`. +#### PPO Arguments + +- reward_model: Defaults to None +- reward_adapters: Defaults to `[]` +- reward_model_type: Defaults to None +- reward_model_revision: Defaults to None + +The meanings of the following parameters can be referenced [here](https://huggingface.co/docs/trl/main/ppo_trainer): + +- num_ppo_epochs: Defaults to 4 +- whiten_rewards: Defaults to False +- kl_coef: Defaults to 0.05 +- cliprange: Defaults to 0.2 +- vf_coef: Defaults to 0.1 +- cliprange_value: Defaults to 0.2 +- gamma: Defaults to 1.0 +- lam: Defaults to 0.95 +- num_mini_batches: Defaults to 1 +- local_rollout_forward_batch_size: Defaults to 64 +- num_sample_generations: Defaults to 10 +- response_length: Defaults to 512 +- temperature: Defaults to 0.7 +- missing_eos_penalty: Defaults to None + ### Inference Arguments Inference arguments include the [base arguments](#base-arguments), [merge arguments](#merge-arguments), [vLLM arguments](#vllm-arguments), [LMDeploy arguments](#LMDeploy-arguments), and also contain the following: diff --git a/examples/train/rlhf/ppo.sh b/examples/train/rlhf/ppo.sh index 7983d2229b..4410b1609e 100644 --- a/examples/train/rlhf/ppo.sh +++ b/examples/train/rlhf/ppo.sh @@ -1,6 +1,6 @@ -nproc_per_node=2 +nproc_per_node=4 -CUDA_VISIBLE_DEVICES=0,1 \ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ NPROC_PER_NODE=$nproc_per_node \ swift rlhf \ --rlhf_type ppo \ diff --git a/swift/plugin/loss_scale.py b/swift/plugin/loss_scale.py index 275d2e0e4b..21733dfabe 100644 --- a/swift/plugin/loss_scale.py +++ b/swift/plugin/loss_scale.py @@ -180,11 +180,12 @@ def get_loss_scale(self, context: str, context_type: ContextType, *args, **kwarg # Add your loss scale here, use --loss_scale xxx to train loss_scale_map = { + 'last_round': LastRoundLossScale(), + 'default': LossScale(), + 'all': TrainAllLossScale(), + # agent 'agentflan': AgentFlanLossScale(), 'react': REACTLossScale(), 'alpha_umi': AlphaUmiLossScale(), - 'default': LossScale(), - 'last_round': LastRoundLossScale(), 'qwen': QwenLossScale(), - 'all': TrainAllLossScale(), } From 813dadf163b258355c87a2681bb3b5a85c5741b8 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 7 Jan 2025 13:14:31 +0800 Subject: [PATCH 42/47] update --- 
examples/train/rlhf/ppo.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/train/rlhf/ppo.sh b/examples/train/rlhf/ppo.sh index 4410b1609e..86f93d9348 100644 --- a/examples/train/rlhf/ppo.sh +++ b/examples/train/rlhf/ppo.sh @@ -1,3 +1,4 @@ +# Currently, it only supports the case where the model and reward_model use the same template/tokenizer. nproc_per_node=4 CUDA_VISIBLE_DEVICES=0,1,2,3 \ From 828996b82d3813834543b9d27c57e75e10e7980a Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 7 Jan 2025 13:17:26 +0800 Subject: [PATCH 43/47] fix --- swift/llm/train/rlhf.py | 2 +- swift/trainers/rlhf_trainer/ppo_trainer.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/swift/llm/train/rlhf.py b/swift/llm/train/rlhf.py index 37e8d9903c..feffd4e65c 100644 --- a/swift/llm/train/rlhf.py +++ b/swift/llm/train/rlhf.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from typing import List, Union -from swift.utils import get_logger, get_model_parameter_info, patch_getattr +from swift.utils import get_logger, get_model_parameter_info from ..argument import RLHFArguments from .kto import prepare_kto_dataset from .sft import SwiftSft diff --git a/swift/trainers/rlhf_trainer/ppo_trainer.py b/swift/trainers/rlhf_trainer/ppo_trainer.py index 19f8328b46..c31ae5b7eb 100644 --- a/swift/trainers/rlhf_trainer/ppo_trainer.py +++ b/swift/trainers/rlhf_trainer/ppo_trainer.py @@ -1,19 +1,18 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from contextlib import contextmanager -import torch from torch.utils.data import DataLoader from transformers import PreTrainedModel from trl import PPOv2Trainer as HFPPOv2Trainer from swift.utils import patch_getattr from ..mixin import SwiftMixin -from .rlhf_mixin import RLHFTrainerMixin + +ppo_trainer_init = HFPPOv2Trainer.__init__ +del HFPPOv2Trainer.__init__ class PPOTrainer(SwiftMixin, HFPPOv2Trainer): - ppo_trainer_init = HFPPOv2Trainer.__init__ - del HFPPOv2Trainer.__init__ @staticmethod @contextmanager @@ -36,7 +35,7 @@ def __init__(self, model: PreTrainedModel, ref_model: PreTrainedModel, *_args, * for k, v in kwargs.items() if k in ['train_dataset', 'data_collator', 'reward_model', 'value_model', 'eval_dataset'] } - self.ppo_trainer_init( + ppo_trainer_init( config=kwargs['args'], tokenizer=self.tokenizer, policy=model, ref_policy=ref_model, **new_kwargs) unwrap_model = self.accelerator.unwrap_model(self.model) patch_getattr(unwrap_model, 'policy') From 0486eab8e5068140f1ac385e072cdc795647a19c Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 7 Jan 2025 13:24:48 +0800 Subject: [PATCH 44/47] update --- swift/trainers/rlhf_trainer/ppo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/trainers/rlhf_trainer/ppo_trainer.py b/swift/trainers/rlhf_trainer/ppo_trainer.py index c31ae5b7eb..1196d5b06c 100644 --- a/swift/trainers/rlhf_trainer/ppo_trainer.py +++ b/swift/trainers/rlhf_trainer/ppo_trainer.py @@ -36,7 +36,7 @@ def __init__(self, model: PreTrainedModel, ref_model: PreTrainedModel, *_args, * if k in ['train_dataset', 'data_collator', 'reward_model', 'value_model', 'eval_dataset'] } ppo_trainer_init( - config=kwargs['args'], tokenizer=self.tokenizer, policy=model, ref_policy=ref_model, **new_kwargs) + self, config=kwargs['args'], tokenizer=self.tokenizer, policy=model, ref_policy=ref_model, **new_kwargs) unwrap_model = self.accelerator.unwrap_model(self.model) patch_getattr(unwrap_model, 'policy') From 2e98d6aedc69c93e3b82b23ecc10ae9b847316ea Mon Sep 17 00:00:00 2001 From: 
Jintao Huang Date: Tue, 7 Jan 2025 14:12:35 +0800 Subject: [PATCH 45/47] fix --- swift/trainers/__init__.py | 10 ++++----- swift/trainers/arguments.py | 37 -------------------------------- swift/trainers/rlhf_arguments.py | 28 ++++++++++++++++++++++++ tests/train/test_ppo.py | 4 ++-- 4 files changed, 34 insertions(+), 45 deletions(-) create mode 100644 swift/trainers/rlhf_arguments.py diff --git a/swift/trainers/__init__.py b/swift/trainers/__init__.py index 2e57e64de2..da7ab951cc 100644 --- a/swift/trainers/__init__.py +++ b/swift/trainers/__init__.py @@ -15,10 +15,10 @@ ShardedDDPOption = None if TYPE_CHECKING: - from .arguments import (Seq2SeqTrainingArguments, TrainingArguments, DPOConfig, CPOConfig, KTOConfig, ORPOConfig, - PPOConfig, RewardConfig) + from .arguments import Seq2SeqTrainingArguments, TrainingArguments from .rlhf_trainer import (CPOTrainer, DPOTrainer, KTOTrainer, ORPOTrainer, RLHFTrainerMixin, PPOTrainer, RewardTrainer) + from .rlhf_arguments import DPOConfig, CPOConfig, KTOConfig, ORPOConfig, PPOConfig, RewardConfig from .trainer_factory import TrainerFactory from .trainers import Seq2SeqTrainer, Trainer from .mixin import SwiftMixin @@ -26,10 +26,8 @@ else: _extra_objects = {k: v for k, v in globals().items() if not k.startswith('_')} _import_structure = { - 'arguments': [ - 'Seq2SeqTrainingArguments', 'TrainingArguments', 'DPOConfig', 'CPOConfig', 'KTOConfig', 'ORPOConfig', - 'PPOConfig', 'RewardConfig' - ], + 'arguments': ['Seq2SeqTrainingArguments', 'TrainingArguments'], + 'rlhf_arguments': ['DPOConfig', 'CPOConfig', 'KTOConfig', 'ORPOConfig', 'PPOConfig', 'RewardConfig'], 'rlhf_trainer': ['CPOTrainer', 'DPOTrainer', 'KTOTrainer', 'ORPOTrainer', 'RLHFTrainerMixin', 'PPOTrainer', 'RewardTrainer'], 'trainer_factory': ['TrainerFactory'], diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index f107b41aff..809d42b85b 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -76,40 +76,3 @@ class TrainingArguments(SwiftArgumentsMixin, HfTrainingArguments): @dataclass class Seq2SeqTrainingArguments(SwiftArgumentsMixin, HfSeq2SeqTrainingArguments): pass - - -try: - from trl import (DPOConfig as HfDPOConfig, CPOConfig as HfCPOConfig, ORPOConfig as HfORPOConfig, KTOConfig as - HfKTOConfig, RewardConfig as HfRewardConfig, PPOv2Config as HfPPOv2Config) - - @dataclass - class DPOConfig(SwiftArgumentsMixin, HfDPOConfig): - pass - - @dataclass - class CPOConfig(SwiftArgumentsMixin, HfCPOConfig): - pass - - @dataclass - class ORPOConfig(SwiftArgumentsMixin, HfORPOConfig): - pass - - @dataclass - class KTOConfig(SwiftArgumentsMixin, HfKTOConfig): - pass - - @dataclass - class RewardConfig(SwiftArgumentsMixin, HfRewardConfig): - pass - - @dataclass - class PPOConfig(SwiftArgumentsMixin, HfPPOv2Config): - pass - -except (ImportError, RuntimeError): - DPOConfig = None - CPOConfig = None - ORPOConfig = None - KTOConfig = None - RewardConfig = None - PPOConfig = None diff --git a/swift/trainers/rlhf_arguments.py b/swift/trainers/rlhf_arguments.py new file mode 100644 index 0000000000..a9309bb201 --- /dev/null +++ b/swift/trainers/rlhf_arguments.py @@ -0,0 +1,28 @@ +from trl import (DPOConfig as HfDPOConfig, CPOConfig as HfCPOConfig, ORPOConfig as HfORPOConfig, KTOConfig as + HfKTOConfig, RewardConfig as HfRewardConfig, PPOv2Config as HfPPOv2Config) + +from .arguments import SwiftArgumentsMixin + +@dataclass +class DPOConfig(SwiftArgumentsMixin, HfDPOConfig): + pass + +@dataclass +class CPOConfig(SwiftArgumentsMixin, HfCPOConfig): + pass + 
+@dataclass +class ORPOConfig(SwiftArgumentsMixin, HfORPOConfig): + pass + +@dataclass +class KTOConfig(SwiftArgumentsMixin, HfKTOConfig): + pass + +@dataclass +class RewardConfig(SwiftArgumentsMixin, HfRewardConfig): + pass + +@dataclass +class PPOConfig(SwiftArgumentsMixin, HfPPOv2Config): + pass \ No newline at end of file diff --git a/tests/train/test_ppo.py b/tests/train/test_ppo.py index 0a7c98022e..4ad3180502 100644 --- a/tests/train/test_ppo.py +++ b/tests/train/test_ppo.py @@ -27,8 +27,8 @@ def test_ppo(): result = rlhf_main( RLHFArguments( rlhf_type='ppo', - model='LLM-Research/Meta-Llama-3.1-8B-Instruct', - reward_model='AI-ModelScope/Skywork-Reward-Llama-3.1-8B-v0.2', + model='LLM-Research/Llama-3.2-1B-Instruct', + reward_model='AI-ModelScope/GRM-Llama3.2-3B-rewardmodel-ft', dataset=['AI-ModelScope/alpaca-gpt4-data-zh#100', 'AI-ModelScope/alpaca-gpt4-data-en#100'], **kwargs)) last_model_checkpoint = result['last_model_checkpoint'] From 6c2b6826d7b777bc8168a26b90ff08419124c896 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 7 Jan 2025 14:16:52 +0800 Subject: [PATCH 46/47] fix --- swift/trainers/rlhf_arguments.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/swift/trainers/rlhf_arguments.py b/swift/trainers/rlhf_arguments.py index a9309bb201..9db0541522 100644 --- a/swift/trainers/rlhf_arguments.py +++ b/swift/trainers/rlhf_arguments.py @@ -1,28 +1,40 @@ -from trl import (DPOConfig as HfDPOConfig, CPOConfig as HfCPOConfig, ORPOConfig as HfORPOConfig, KTOConfig as - HfKTOConfig, RewardConfig as HfRewardConfig, PPOv2Config as HfPPOv2Config) +from dataclasses import dataclass + +from trl import CPOConfig as HfCPOConfig +from trl import DPOConfig as HfDPOConfig +from trl import KTOConfig as HfKTOConfig +from trl import ORPOConfig as HfORPOConfig +from trl import PPOv2Config as HfPPOv2Config +from trl import RewardConfig as HfRewardConfig from .arguments import SwiftArgumentsMixin + @dataclass class DPOConfig(SwiftArgumentsMixin, HfDPOConfig): pass + @dataclass class CPOConfig(SwiftArgumentsMixin, HfCPOConfig): pass + @dataclass class ORPOConfig(SwiftArgumentsMixin, HfORPOConfig): pass + @dataclass class KTOConfig(SwiftArgumentsMixin, HfKTOConfig): pass + @dataclass class RewardConfig(SwiftArgumentsMixin, HfRewardConfig): pass + @dataclass class PPOConfig(SwiftArgumentsMixin, HfPPOv2Config): - pass \ No newline at end of file + pass From c592ac0871022ef6f4f839ed002fa917fce9a6d4 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 7 Jan 2025 14:43:44 +0800 Subject: [PATCH 47/47] update --- swift/llm/template/template/mplug.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/swift/llm/template/template/mplug.py b/swift/llm/template/template/mplug.py index 4e25652257..9882cd3388 100644 --- a/swift/llm/template/template/mplug.py +++ b/swift/llm/template/template/mplug.py @@ -97,7 +97,7 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: if images: image_inputs = processor.image_processor(images, cut_enable=cut_enable, return_tensors='pt') added_tokens_len = 0 - cut_shapes = image_inputs['cut_shape'] or [None] * len(idx_list) + cut_shapes = image_inputs['cut_shape'] or [None] * 2 * len(idx_list) image_token_list = self.processor.encode('<|image|>', add_special_tokens=False) for idx, cut_shape in zip(idx_list, cut_shapes[::2]): if cut_shape: @@ -161,6 +161,8 @@ def _post_encode(self, model: nn.Module, inputs: Dict[str, Any]) -> Dict[str, An if 'pixel_values' in inputs: pixel_values = 
inputs.pop('pixel_values') inputs['image_embeds'] = torch.concat([model.forward_image(pv) for pv in pixel_values]) + else: + inputs['media_offset'] = [None] * inputs['input_ids'].shape[0] return inputs
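
For quick reference, a minimal way to exercise the PPO path introduced in this patch series, mirroring tests/train/test_ppo.py above; the model, reward_model, and dataset identifiers are the illustrative ones used in that test, not requirements, and extra training arguments (train_type, batch sizes, etc.) are omitted here:

```python
# Minimal sketch assuming ms-swift with this patch series applied.
# Mirrors tests/train/test_ppo.py; swap in your own model/reward_model/dataset.
from swift.llm import rlhf_main, RLHFArguments

result = rlhf_main(
    RLHFArguments(
        rlhf_type='ppo',
        model='LLM-Research/Llama-3.2-1B-Instruct',
        reward_model='AI-ModelScope/GRM-Llama3.2-3B-rewardmodel-ft',
        dataset=['AI-ModelScope/alpaca-gpt4-data-zh#100',
                 'AI-ModelScope/alpaca-gpt4-data-en#100'],
    ))
print(result['last_model_checkpoint'])  # path to the final saved checkpoint
```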