diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 60474a3b78..4a8fd7eef3 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -37,7 +37,7 @@ - new_special_tokens: 需要新增的特殊tokens。默认为`[]`。例子参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/new_special_tokens)。 - 注意:你也可以传入以`.txt`结尾的文件路径,每行为一个special token。 - num_labels: 分类模型(即`--task_type seq_cls`)需要指定该参数。代表标签数量,默认为None。 -- problem_type: 分类模型(即`--task_type seq_cls`)需要指定该参数。可选为'regression', 'single_label_classification', 'multi_label_classification'。默认为'single_label_classification'。 +- problem_type: 分类模型(即`--task_type seq_cls`)需要指定该参数。可选为'regression', 'single_label_classification', 'multi_label_classification'。默认为None,若模型为 reward_model 或 num_labels 为1,该参数为'regression',其他情况,该参数为'single_label_classification'。 - rope_scaling: rope类型,支持`linear`和`dynamic`和`yarn`,或者直接传入一个json字符串:`"{\"factor\":2.0,\"type\":\"yarn\"}"`,请配合`max_model_len`共同使用。默认为None。 - max_model_len: 如果使用`rope_scaling`,可以设置`max_model_len`,该参数可以用来计算rope的`factor`倍数。最后的`max_position_embeddings`会设置为原值的`factor`倍。如果`rope_scaling`是json字符串,则本值不生效。 - device_map: 模型使用的device_map配置,例如:'auto'、'cpu'、json字符串、json文件路径。默认为None,根据设备和分布式训练情况自动设置。 diff --git "a/docs/source/Megatron-SWIFT/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Megatron-SWIFT/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 9aabef22c7..8331a216cd 100644 --- "a/docs/source/Megatron-SWIFT/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Megatron-SWIFT/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -268,7 +268,7 @@ Megatron训练参数继承自Megatron参数和基本参数(与ms-swift共用da - enable_channel_loss: 打开channel loss,默认为`False`。你需要在数据集中准备"channel"字段,ms-swift会根据该字段分组统计loss。数据集格式参考[channel loss](../Customization/自定义数据集.md#channel-loss)。 - 🔥task_type: 默认为'causal_lm'。可选为'causal_lm'、'seq_cls'。 - num_labels: 分类模型(即`--task_type seq_cls`)需要指定该参数。代表标签数量,默认为None。 -- problem_type: 分类模型(即`--task_type seq_cls`)需要指定该参数。可选为'regression', 'single_label_classification', 'multi_label_classification'。默认为'single_label_classification'。 +- problem_type: 分类模型(即`--task_type seq_cls`)需要指定该参数。可选为'regression', 'single_label_classification', 'multi_label_classification'。默认为None,若模型为 reward_model 或 num_labels 为1,该参数为'regression',其他情况,该参数为'single_label_classification'。 ## RLHF参数 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 122d993f2d..3f95ec0695 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -38,7 +38,7 @@ Hints: - new_special_tokens: The special tokens to be added. Default is `[]`. See the example [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/new_special_tokens). - Note: You can also pass a file path ending with `.txt`, where each line represents a special token. - num_labels: This parameter is required for classification models (i.e., `--task_type seq_cls`). It represents the number of labels, with a default value of None. -- problem_type: This parameter is required for classification models (i.e., `--task_type seq_cls`). 
The options are 'regression', 'single_label_classification', and 'multi_label_classification'. The default value is 'single_label_classification'. +- problem_type: This parameter is required for classification models (i.e., `--task_type seq_cls`). The options are 'regression', 'single_label_classification', and 'multi_label_classification'. Defaults to None. If the model is a reward_model or num_labels equals 1, this parameter is 'regression'; otherwise it is 'single_label_classification'. - rope_scaling: RoPE type, supports `linear`, `dynamic`, and `yarn`, or you can directly pass in a JSON string: `"{\"factor\":2.0,\"type\":\"yarn\"}"`. Please use in conjunction with `max_model_len`. Default is None. - max_model_len: If using `rope_scaling`, you can set `max_model_len`. This parameter can be used to calculate the RoPE `factor` multiplier. The final `max_position_embeddings` will be set to the original value multiplied by the `factor`. If `rope_scaling` is a JSON string, this value will not take effect. - device_map: Device map configuration used by the model, such as 'auto', 'cpu', JSON string, or the path of a JSON file. The default is None, automatically set based on the device and distributed training conditions. diff --git a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md index 4f72f6a52c..6de6f07a2f 100644 --- a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md +++ b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md @@ -283,7 +283,7 @@ Megatron training parameters are inherited from Megatron parameters and basic pa - enable_channel_loss: Enable channel loss, default is `false`. You need to prepare a "channel" field in your dataset; ms-swift will compute and aggregate the loss grouped by this field. For dataset format, please refer to [channel loss](../Customization/Custom-dataset.md#channel-loss). - 🔥task_type: Defaults to "causal_lm". Options: "causal_lm", "seq_cls". - num_labels: Required for classification models (i.e., `--task_type seq_cls`). Represents the number of labels; default is None. -- problem_type: Required for classification models (i.e., `--task_type seq_cls`). Options: "regression", "single_label_classification", "multi_label_classification". Default is "single_label_classification". +- problem_type: Required for classification models (i.e., `--task_type seq_cls`). Options: "regression", "single_label_classification", "multi_label_classification". Defaults to None. If the model is a reward_model or num_labels equals 1, this parameter is 'regression'; otherwise it is 'single_label_classification'. 
 
 ## RLHF Parameters
 
diff --git a/swift/llm/argument/base_args/model_args.py b/swift/llm/argument/base_args/model_args.py
index 0b1a57ae2e..e74a2d912e 100644
--- a/swift/llm/argument/base_args/model_args.py
+++ b/swift/llm/argument/base_args/model_args.py
@@ -46,8 +46,7 @@ class ModelArguments:
 
     new_special_tokens: List[str] = field(default_factory=list)
     num_labels: Optional[int] = None
-    problem_type: Literal['regression', 'single_label_classification',
-                          'multi_label_classification'] = 'single_label_classification'
+    problem_type: Literal['regression', 'single_label_classification', 'multi_label_classification'] = None
     rope_scaling: Optional[str] = None
     device_map: Optional[Union[dict, str]] = None
     max_memory: Optional[Union[dict, str]] = None
diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py
index 5efa78e03e..1c828c049e 100644
--- a/swift/llm/model/register.py
+++ b/swift/llm/model/register.py
@@ -232,6 +232,7 @@ def get_model_tokenizer_from_local(model_dir: str,
     rope_scaling = kwargs.get('rope_scaling')
     max_model_len = kwargs.get('max_model_len')
     return_dummy_model = kwargs.get('return_dummy_model')
+    model_meta = kwargs.get('model_meta')
     if rope_scaling:
         HfConfigFactory.set_config_attr(model_config, 'rope_scaling', rope_scaling)
     if max_model_len:
@@ -245,6 +246,15 @@
         model_info.num_labels = num_labels
         model_config.num_labels = num_labels
 
+    if model_info.task_type == 'seq_cls':
+        problem_type = kwargs.get('problem_type')
+        if problem_type is None:
+            if model_info.num_labels == 1 or model_meta.is_reward:
+                problem_type = 'regression'
+            else:
+                problem_type = 'single_label_classification'
+        model_config.problem_type = problem_type
+
     if model_info.quant_method == 'fp8':
         torch_dtype = 'auto'
         model = None
@@ -260,7 +270,6 @@
             model = None
 
     automodel_class = automodel_class or AutoModelForCausalLM
-    model_meta = kwargs['model_meta']
     context_kwargs = {
         'model_info': model_info,
         'model_meta': model_meta,
@@ -715,12 +724,6 @@
 
             # fix transformers==4.52.4 qwen2.5-vl
             HfConfigFactory.set_config_attr(llm_model.config, 'vocab_size', vocab_size)
-    if task_type == 'seq_cls':
-        problem_type = kwargs.get('problem_type')
-        if problem_type is None and model_info.num_labels == 1:
-            problem_type = 'regression'
-        if problem_type is not None:
-            model_info.config.problem_type = problem_type
 
     tokenizer.model_info = model_info
     tokenizer.model_meta = model_meta
diff --git a/swift/megatron/argument/megatron_args.py b/swift/megatron/argument/megatron_args.py
index 88b886ee1c..a9847db55e 100644
--- a/swift/megatron/argument/megatron_args.py
+++ b/swift/megatron/argument/megatron_args.py
@@ -102,8 +102,7 @@ class ExtraMegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
     enable_channel_loss: bool = False
     task_type: Literal['causal_lm', 'seq_cls'] = None
     num_labels: Optional[int] = None
-    problem_type: Literal['regression', 'single_label_classification',
-                          'multi_label_classification'] = 'single_label_classification'
+    problem_type: Literal['regression', 'single_label_classification', 'multi_label_classification'] = None
     original_max_position_embeddings: Optional[int] = None
     partial_rotary_factor: Optional[float] = None
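
For reference, the default-resolution rule this patch centralizes in `get_model_tokenizer_from_local` boils down to a small pure function. The sketch below is illustrative, not ms-swift source: `resolve_problem_type` and its `is_reward` parameter are hypothetical names standing in for the patched logic around `kwargs.get('problem_type')`, `model_info.num_labels`, and `model_meta.is_reward`.

```python
# Minimal sketch of the resolution rule introduced by this patch.
# In the real code, the result is written to model_config.problem_type.
from typing import Optional


def resolve_problem_type(problem_type: Optional[str], num_labels: Optional[int], is_reward: bool) -> str:
    """Resolve the seq_cls problem_type when the user left it as None."""
    if problem_type is not None:
        return problem_type  # an explicit --problem_type always wins
    if num_labels == 1 or is_reward:
        return 'regression'  # reward models and single-output heads produce a scalar score
    return 'single_label_classification'


# The three cases described by the updated docs:
assert resolve_problem_type(None, 1, False) == 'regression'
assert resolve_problem_type(None, 5, True) == 'regression'
assert resolve_problem_type(None, 5, False) == 'single_label_classification'
```

Because the rule now runs inside `get_model_tokenizer_from_local`, `model_config.problem_type` is populated before the model is instantiated, and the late fixup previously done in `get_model_tokenizer` (which did not consider `model_meta.is_reward`) can be deleted.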
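
As background on why the 'regression' default matters: Hugging Face transformers sequence-classification heads select their training loss based on `config.problem_type`. The snippet below is a hedged illustration of that dispatch, not ms-swift or transformers source; the `loss_for` helper is hypothetical. It shows why a reward model with `num_labels=1` wants 'regression' (MSE against a scalar score) rather than 'single_label_classification'.

```python
# Illustration of how transformers-style classification heads typically
# dispatch the loss on problem_type; loss_for is a hypothetical helper.
import torch
import torch.nn as nn


def loss_for(problem_type: str, logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    if problem_type == 'regression':
        # num_labels == 1: compare the scalar score directly to the target
        return nn.MSELoss()(logits.squeeze(-1), labels.float())
    if problem_type == 'single_label_classification':
        # logits over num_labels classes vs. integer class ids
        return nn.CrossEntropyLoss()(logits, labels.long())
    if problem_type == 'multi_label_classification':
        # independent sigmoid per label vs. multi-hot targets
        return nn.BCEWithLogitsLoss()(logits, labels.float())
    raise ValueError(f'unknown problem_type: {problem_type!r}')


scores = torch.randn(4, 1)   # reward-model style: one scalar per sample
targets = torch.rand(4)      # continuous reward targets
print(loss_for('regression', scores, targets))
```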