2 changes: 1 addition & 1 deletion README.md
@@ -254,7 +254,7 @@ RLHF:
CUDA_VISIBLE_DEVICES=0 swift rlhf \
--rlhf_type dpo \
--model Qwen/Qwen2.5-7B-Instruct \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji:en \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
--train_type lora \
--output_dir output \
...
2 changes: 1 addition & 1 deletion README_CN.md
@@ -246,7 +246,7 @@ RLHF:
CUDA_VISIBLE_DEVICES=0 swift rlhf \
--rlhf_type dpo \
--model Qwen/Qwen2.5-7B-Instruct \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji:zh \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
--train_type lora \
--output_dir output \
...
1 change: 1 addition & 0 deletions docs/source/Instruction/命令行参数.md
@@ -67,6 +67,7 @@
- num_beams: Number of beams for beam search, default is 1.
- 🔥stream: Stream output, default is `False`.
- stop_words: Additional stop words, default is `[]`.
- logprobs: Whether to output logprobs, default is `False`.

### Quantization Arguments
The following are the quantization arguments used when loading the model; see the [Quantization](https://huggingface.co/docs/transformers/main/en/main_classes/quantization) documentation for their exact meaning. They do not include the `gptq` and `awq` quantization arguments used by `swift export`.
1 change: 1 addition & 0 deletions docs/source_en/Instruction/Command-line-parameters.md
@@ -68,6 +68,7 @@ Refer to the [generation_config](https://huggingface.co/docs/transformers/main_c
- num_beams: Number of beams for beam search, default is 1.
- 🔥stream: Stream output, default is `False`.
- stop_words: Additional stop words, default is `[]`.
- logprobs: Whether to output logprobs, default is `False`.
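A minimal usage sketch for the new flag, assuming the standard `swift infer` entry point (the exact shape of the returned logprobs depends on the inference backend):

```shell
# Hypothetical invocation: request token log-probabilities at inference time.
# --logprobs is the new flag documented above; the other flags mirror existing examples.
CUDA_VISIBLE_DEVICES=0 swift infer \
    --model Qwen/Qwen2.5-7B-Instruct \
    --stream false \
    --logprobs true \
    --max_new_tokens 64
```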

### Quantization Arguments

@@ -1,15 +1,15 @@
# 4*32GiB
# 4*50GiB
# You can refer to `https://github.com/QwenLM/Qwen2-VL` for the meaning of the `MAX_PIXELS` parameter.
# --rlhf_type cpo/orpo/simpo/rm/kto are also supported
nproc_per_node=4
# --rlhf_type cpo/orpo/simpo are also supported
nproc_per_node=2

CUDA_VISIBLE_DEVICES=0,1,2,3 \
CUDA_VISIBLE_DEVICES=0,1 \
NPROC_PER_NODE=$nproc_per_node \
MAX_PIXELS=1003520 \
swift rlhf \
--rlhf_type dpo \
--model Qwen/Qwen2-VL-7B-Instruct \
--dataset swift/RLAIF-V-Dataset \
--dataset 'swift/RLAIF-V-Dataset#20000' \
--train_type lora \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
@@ -24,7 +24,7 @@ swift rlhf \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 5 \
--deepspeed zero3 \
--deepspeed zero2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
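Editor's note on two conventions this script relies on, as I read them (neither is introduced by this PR): the `#20000` suffix asks ms-swift to sample 20000 rows from the dataset rather than using it in full, and `MAX_PIXELS` caps the image resolution passed to Qwen2-VL to keep activation memory in check. A hedged sketch of a smaller debugging run using the same knobs, with illustrative values:

```shell
# Illustrative quick run: sample fewer rows and cap image pixels harder.
MAX_PIXELS=602112 \
CUDA_VISIBLE_DEVICES=0 \
swift rlhf \
    --rlhf_type dpo \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --dataset 'swift/RLAIF-V-Dataset#2000' \
    --train_type lora \
    --torch_dtype bfloat16 \
    --output_dir output
```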
31 changes: 31 additions & 0 deletions examples/train/multimodal/rlhf/kto.sh
@@ -0,0 +1,31 @@
# Since there is no open-source multimodal KTO dataset available,
# we use a pure-text KTO dataset as an example here.
nproc_per_node=2

CUDA_VISIBLE_DEVICES=0,1 \
NPROC_PER_NODE=$nproc_per_node \
MAX_PIXELS=1003520 \
swift rlhf \
--rlhf_type kto \
--model Qwen/Qwen2-VL-7B-Instruct \
--dataset 'AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto#10000' \
--train_type lora \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--freeze_vit true \
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 5 \
--deepspeed zero2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4
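For context, KTO trains on per-response desirability labels rather than chosen/rejected pairs, which is why this script points at a `-kto` variant of ultrafeedback. A rough sketch of what a custom sample could look like if you substituted your own data; the field names (`messages`, `label`) follow my reading of ms-swift's custom-dataset conventions and should be double-checked against the dataset documentation:

```shell
# Hypothetical custom KTO dataset: one JSON object per line, label marks desirability.
cat > my_kto_data.jsonl <<'EOF'
{"messages": [{"role": "user", "content": "What is 2+2?"}, {"role": "assistant", "content": "4"}], "label": true}
{"messages": [{"role": "user", "content": "What is 2+2?"}, {"role": "assistant", "content": "5"}], "label": false}
EOF
# Then point --dataset at the local file instead of the ModelScope id:
#   swift rlhf --rlhf_type kto --dataset my_kto_data.jsonl ...
```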
17 changes: 12 additions & 5 deletions examples/train/rlhf/cpo.sh
@@ -6,15 +6,22 @@ swift rlhf \
--rlhf_type cpo \
--model Qwen/Qwen2.5-7B-Instruct \
--train_type lora \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji:zh \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--weight_decay 0.1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
--gradient_checkpointing_kwargs '{"use_reentrant": false}' \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5
--save_total_limit 5 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--deepspeed zero2
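The `--gradient_accumulation_steps $(expr 16 / $nproc_per_node)` pattern used throughout these scripts keeps the effective global batch size fixed at 16 regardless of GPU count: global batch = per-device batch × number of processes × accumulation steps. A quick check of the arithmetic with the values in this script (illustrative only, not part of the PR):

```shell
nproc_per_node=2
per_device_batch=1
accum=$(expr 16 / $nproc_per_node)                                   # 8
global_batch=$(expr $per_device_batch \* $nproc_per_node \* $accum)  # 1 * 2 * 8 = 16
echo "accumulation steps: $accum, effective global batch size: $global_batch"
```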
16 changes: 12 additions & 4 deletions examples/train/rlhf/dpo.sh
@@ -6,14 +6,22 @@ swift rlhf \
--rlhf_type dpo \
--model Qwen/Qwen2.5-7B-Instruct \
--train_type lora \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji:zh \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
--gradient_checkpointing_kwargs '{"use_reentrant": false}' \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5
--save_total_limit 5 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--deepspeed zero2
19 changes: 13 additions & 6 deletions examples/train/rlhf/kto.sh
@@ -1,19 +1,26 @@
nproc_per_node=4
nproc_per_node=2

CUDA_VISIBLE_DEVICES=0,1,2,3 \
CUDA_VISIBLE_DEVICES=0,1 \
NPROC_PER_NODE=$nproc_per_node \
swift rlhf \
--rlhf_type kto \
--model Qwen/Qwen2.5-7B-Instruct \
--train_type lora \
--dataset 'AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto#10000' \
--num_train_epochs 2 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
--gradient_checkpointing_kwargs '{"use_reentrant": false}' \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5
--save_total_limit 5 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--deepspeed zero2
16 changes: 12 additions & 4 deletions examples/train/rlhf/orpo.sh
@@ -6,14 +6,22 @@ swift rlhf \
--rlhf_type orpo \
--model Qwen/Qwen2.5-7B-Instruct \
--train_type lora \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji:zh \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
--gradient_checkpointing_kwargs '{"use_reentrant": false}' \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5
--save_total_limit 5 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--deepspeed zero2
16 changes: 12 additions & 4 deletions examples/train/rlhf/rm.sh
@@ -6,14 +6,22 @@ swift rlhf \
--rlhf_type rm \
--model Qwen/Qwen2.5-7B-Instruct \
--train_type lora \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji:zh \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
--gradient_checkpointing_kwargs '{"use_reentrant": false}' \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5
--save_total_limit 5 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--deepspeed zero2
19 changes: 13 additions & 6 deletions examples/train/rlhf/simpo.sh
@@ -1,18 +1,25 @@
# 2*50GB
nproc_per_node=2

CUDA_VISIBLE_DEVICES=0,1 \
NPROC_PER_NODE=$nproc_per_node \
swift rlhf \
--rlhf_type simpo \
--model Qwen/Qwen2.5-7B-Instruct \
--model Qwen/Qwen2.5-3B-Instruct \
--train_type full \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji:zh \
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-5 \
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
--warmup_ratio 0.03 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--deepspeed zero3 \
--logging_steps 5
--save_total_limit 5 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--deepspeed zero2
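The move to a 3B model with `--train_type full` is consistent with the `# 2*50GB` note: full fine-tuning with AdamW in mixed precision typically costs on the order of 16 bytes per parameter (bf16 weights and gradients plus fp32 master weights and two Adam moments), i.e. roughly 45 GiB for 3B parameters before activations, which ZeRO-2 then eases by sharding optimizer states across the two GPUs. A back-of-the-envelope check using rule-of-thumb numbers, not a guarantee:

```shell
# Rough estimate for full fine-tuning a 3B-parameter model with AdamW in bf16:
# 2 (weights) + 2 (grads) + 4 (fp32 master weights) + 4 + 4 (Adam moments) = 16 bytes/param
params=3000000000
bytes_per_param=16
echo "approx $(expr $params \* $bytes_per_param / 1024 / 1024 / 1024) GiB before activations"  # ~44 GiB (integer division)
```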
2 changes: 1 addition & 1 deletion swift/hub/hub.py
@@ -291,7 +291,7 @@ def load_dataset(cls,
cls.try_login(token)
if revision is None or revision == 'main':
revision = 'master'
# noinspection PyTypeChecker

return MsDataset.load(
dataset_id,
subset_name=subset_name,
29 changes: 12 additions & 17 deletions swift/llm/argument/export_args.py
@@ -3,6 +3,8 @@
from dataclasses import dataclass
from typing import Literal, Optional

import torch

from swift.utils import get_logger
from .base_args import BaseArguments, to_abspath
from .merge_args import MergeArguments
@@ -51,14 +53,6 @@ class ExportArguments(MergeArguments, BaseArguments):
# compat
to_peft_format: bool = False

def _init_quant(self):

if self.quant_bits:
if self.quant_method is None:
raise ValueError('Please specify the quantization method using `--quant_method awq/gptq`.')
if len(self.dataset) == 0 and self.quant_method in {'gptq', 'awq'}:
raise ValueError(f'self.dataset: {self.dataset}, Please input the quant dataset.')

def _init_output_dir(self):
suffix = None
if self.output_dir is None:
@@ -68,7 +62,7 @@ def _init_output_dir(self):
suffix = 'peft'
elif self.merge_lora:
suffix = 'merged'
elif self.quant_bits:
elif self.quant_method:
suffix = f'{self.quant_method}-int{self.quant_bits}'
elif self.to_ollama:
suffix = 'ollama'
@@ -82,13 +76,14 @@ def _init_output_dir(self):
assert not os.path.exists(self.output_dir), f'args.output_dir: {self.output_dir} already exists.'

def __post_init__(self):
if self.quant_bits and self.quant_method is None:
raise ValueError('Please specify the quantization method using `--quant_method awq/gptq/bnb`.')
if self.quant_method and self.quant_bits is None:
raise ValueError('Please specify `--quant_bits`.')
if self.quant_method in {'gptq', 'awq'} and self.torch_dtype is None:
self.torch_dtype = torch.float16

BaseArguments.__post_init__(self)
self._init_output_dir()
if self.quant_bits:
self._init_quant()

def _init_torch_dtype(self) -> None:
if self.quant_bits and self.torch_dtype is None:
self.torch_dtype = 'float16'
logger.info(f'Setting args.torch_dtype: {self.torch_dtype}')
super()._init_torch_dtype()
if self.quant_method in {'gptq', 'awq'} and len(self.dataset) == 0:
raise ValueError(f'self.dataset: {self.dataset}, Please input the quant dataset.')
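With the validation consolidated into `__post_init__`, `swift export` now requires `--quant_method` and `--quant_bits` to be passed together, defaults `torch_dtype` to float16 for gptq/awq, and still demands a calibration dataset for those two methods. A hedged sketch of an invocation that satisfies the new checks; the model and dataset ids are placeholders taken from other examples in the repo:

```shell
# Hypothetical AWQ export: --quant_method and --quant_bits must both be given,
# and gptq/awq additionally need a calibration dataset.
CUDA_VISIBLE_DEVICES=0 swift export \
    --model Qwen/Qwen2.5-7B-Instruct \
    --quant_method awq \
    --quant_bits 4 \
    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#256' \
    --output_dir output/qwen2_5-7b-instruct-awq-int4
```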
6 changes: 6 additions & 0 deletions swift/llm/argument/rlhf_args.py
@@ -44,6 +44,7 @@ class RLHFArguments(TrainArguments):
undesirable_weight: float = 1.0

def __post_init__(self):
self._init_rm()
self._init_simpo()
self._set_default()
super().__post_init__()
@@ -65,6 +66,11 @@ def _init_simpo(self):
if self.beta is None:
self.beta = 2.

def _init_rm(self):
if self.rlhf_type == 'rm':
self.task_type = 'seq_cls'
self.num_labels = 1

def _set_default(self):
if self.beta is None:
self.beta = 0.1
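The new `_init_rm` hook means `--rlhf_type rm` alone switches the model into sequence-classification mode with a single scalar output; users no longer have to set `task_type` or `num_labels` themselves. A minimal CLI sketch of that behavior, essentially a trimmed-down version of the `rm.sh` example above rather than a new interface:

```shell
# task_type=seq_cls and num_labels=1 are now implied by --rlhf_type rm.
CUDA_VISIBLE_DEVICES=0 swift rlhf \
    --rlhf_type rm \
    --model Qwen/Qwen2.5-7B-Instruct \
    --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
    --train_type lora \
    --output_dir output
```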
18 changes: 4 additions & 14 deletions swift/llm/dataset/dataset/llm.py
@@ -537,20 +537,10 @@ def repair_conversations(s: Union[str, Any]) -> Any:
register_dataset(
DatasetMeta(
ms_dataset_id='hjh0119/shareAI-Llama3-DPO-zh-en-emoji',
subsets=[
SubsetDataset(
'zh',
preprocess_func=ResponsePreprocessor(columns_mapping={
'answer_zh': 'response',
'answer_en': 'rejected_response'
})),
SubsetDataset(
'en',
preprocess_func=ResponsePreprocessor(columns_mapping={
'answer_en': 'response',
'answer_zh': 'rejected_response'
}))
],
preprocess_func=ResponsePreprocessor(columns_mapping={
'answer_zh': 'response',
'answer_en': 'rejected_response'
}),
tags=['rlhf', 'dpo']))

register_dataset(
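With the `zh`/`en` subsets removed, this dataset is now always preprocessed with `answer_zh` as the chosen response and `answer_en` as the rejected one, and every `:zh`/`:en` reference elsewhere in this PR (READMEs and example scripts) drops the suffix accordingly. A before/after sketch of the CLI usage:

```shell
# Before this PR (subset-qualified):
#   --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji:zh
# After this PR (no subsets registered):
swift rlhf \
    --rlhf_type dpo \
    --model Qwen/Qwen2.5-7B-Instruct \
    --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
    --train_type lora
```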
2 changes: 1 addition & 1 deletion swift/llm/export/export.py
@@ -21,7 +21,7 @@ def run(self):
args.adapters[0] = swift_to_peft_format(args.adapters[0], args.output_dir)
elif args.merge_lora:
merge_lora(args)
elif args.quant_method is not None:
elif args.quant_method:
quantize_model(args)
elif args.to_ollama:
export_to_ollama(args)
1 change: 0 additions & 1 deletion swift/llm/infer/infer_engine/infer_engine.py
@@ -115,7 +115,6 @@ def _update_metrics(result, metrics: Optional[List[Metric]] = None):
metric.update(response)
return result_origin

@torch.inference_mode()
def infer(self,
infer_requests: List[InferRequest],
request_config: Optional[RequestConfig] = None,