2 changes: 1 addition & 1 deletion .dev_scripts/ci_container_test.sh
@@ -22,7 +22,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
 
 pip install -r requirements/framework.txt -U -i https://mirrors.aliyun.com/pypi/simple/
 pip install -r requirements/llm.txt -U -i https://mirrors.aliyun.com/pypi/simple/
-pip install -r requirements/aigc.txt -U -i https://mirrors.aliyun.com/pypi/simple/
+pip install diffusers decord einops -U -i https://mirrors.aliyun.com/pypi/simple/
 
 # test with install
 pip install .
2 changes: 1 addition & 1 deletion docs/source/LLM/命令行参数.md
@@ -24,7 +24,7 @@
 - `--train_dataset_sample`: samples the training set; default is `20000`, which speeds up training. This parameter avoids a single epoch taking too long on an overly large dataset. LoRA usually converges quickly and does not need many samples for fine-tuning. If you specify `-1`, the full training set is used, which typically occurs under full-parameter fine-tuning.
 - `--val_dataset_sample`: samples the validation set; default is `None`. If you specify `-1`, the full validation set is used for validation.
 - `--system`: the system prompt used in the dialogue template; default is `None`, i.e. the model's default system is used.
-- `--max_length`: maximum token length; default is `2048`. This avoids OOM caused by individual overly long data samples. If a data sample is longer than max_length, the leading tokens are cut off: `input_ids[-max_length:]`. If set to -1, there is no limit.
+- `--max_length`: maximum token length; default is `2048`. This avoids OOM caused by individual overly long data samples. When `--truncation_strategy delete` is specified, data samples longer than max_length are deleted. When `--truncation_strategy truncation_left` is specified, the leading tokens are cut off: `input_ids[-max_length:]`. If set to -1, there is no limit.
 - `--truncation_strategy`: default is `'delete'`, which removes samples exceeding max_length from the dataset. `'truncation_left'` cuts off the left side of the over-long text; this may cut into special tokens and hurt performance, so it is not recommended. (A sketch of both strategies follows this file's diff.)
 - `--check_dataset_strategy`: default is `'none'`, i.e. no checking. If the model you are training is an LLM, `'warning'` is the recommended data-check strategy. If your training objective is a task such as sentence classification, setting it to `'none'` is recommended.
 - `--custom_train_dataset_path`: default is `None`. See [Customization and Extension](./自定义与拓展.md) for details.
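The two truncation strategies documented above are easy to state in code. The following is a minimal standalone sketch of the documented semantics, not swift's implementation; the function name `apply_truncation` is hypothetical:

```python
from typing import List, Optional

def apply_truncation(input_ids: List[int], max_length: int,
                     truncation_strategy: str = 'delete') -> Optional[List[int]]:
    # max_length == -1 means unlimited; short samples pass through unchanged.
    if max_length == -1 or len(input_ids) <= max_length:
        return input_ids
    if truncation_strategy == 'delete':
        # Over-long samples are dropped from the dataset by the caller.
        return None
    if truncation_strategy == 'truncation_left':
        # Keep the last max_length tokens; the cut may remove special
        # tokens at the front, which is why the docs discourage it.
        return input_ids[-max_length:]
    raise ValueError(f'unknown truncation_strategy: {truncation_strategy!r}')
```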
2 changes: 1 addition & 1 deletion examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh
@@ -32,6 +32,6 @@ python llm_sft.py \
     --logging_steps 10 \
     --use_flash_attn false \
     --push_to_hub false \
-    --hub_model_id qwen-7b-chat-lora \
+    --hub_model_id qwen-vl-chat-lora \
     --hub_private_repo true \
     --hub_token 'your-sdk-token' \
3 changes: 2 additions & 1 deletion swift/llm/utils/argument.py
@@ -587,7 +587,8 @@ def handle_compatibility(args: Union[SftArguments, InferArguments]) -> None:
 
 
 def set_model_type(args: Union[SftArguments, InferArguments]) -> None:
-    assert args.model_type is None or args.model_id_or_path is None
+    assert args.model_type is None or args.model_id_or_path is None, (
+        'you can only specify one of `model_type` and `model_id_or_path`.')
     if args.model_id_or_path is not None:
         model_mapping_reversed = {
             v['model_id_or_path'].lower(): k
56 changes: 20 additions & 36 deletions swift/llm/utils/model.py
@@ -298,7 +298,7 @@ def _register_model(
     requires=['transformers<4.34'],
     support_vllm=True)
 def get_model_tokenizer_from_repo(model_dir: str,
-                                  torch_dtype: Dtype,
+                                  torch_dtype: Optional[Dtype],
                                   model_kwargs: Dict[str, Any],
                                   load_model: bool = True,
                                   model_config=None,
@@ -309,7 +309,8 @@ def get_model_tokenizer_from_repo(model_dir: str,
     if model_config is None:
         model_config = AutoConfig.from_pretrained(
             model_dir, trust_remote_code=True)
-    model_config.torch_dtype = torch_dtype
+    if torch_dtype is not None:
+        model_config.torch_dtype = torch_dtype
     if tokenizer is None:
         tokenizer = AutoTokenizer.from_pretrained(
             model_dir, trust_remote_code=True)
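The two hunks above make `torch_dtype` optional: when `None` is passed, the `torch_dtype` recorded in the repo's own config is left in place instead of being overwritten. A minimal sketch of the guard's effect, with a hypothetical `DummyConfig` standing in for the `AutoConfig` object:

```python
import torch

class DummyConfig:
    torch_dtype = torch.float16  # dtype recorded in the repo's config.json

def apply_dtype(model_config, torch_dtype):
    # Mirrors the guard introduced above.
    if torch_dtype is not None:
        model_config.torch_dtype = torch_dtype
    return model_config

assert apply_dtype(DummyConfig(), None).torch_dtype == torch.float16
assert apply_dtype(DummyConfig(), torch.bfloat16).torch_dtype == torch.bfloat16
```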
@@ -332,45 +333,28 @@
     'ZhipuAI/cogagent-chat',
     LoRATM.cogagent,
     TemplateType.cogagent,
-    requires=['transformers>=4.36'],
-    support_vllm=False)
+    support_gradient_checkpointing=False)
 @register_model(
     ModelType.cogagent_vqa,
     'ZhipuAI/cogagent-vqa',
     LoRATM.cogagent,
     TemplateType.cogagent,
-    requires=['transformers>=4.36'],
-    support_vllm=False)
-def get_model_tokenizer_from_repo_cogagent(
-        model_dir: str,
-        torch_dtype: Dtype,
-        model_kwargs: Dict[str, Any],
-        load_model: bool = True,
-        model_config=None,
-        tokenizer=None,
-        automodel_class=AutoModelForCausalLM,
-        **kwargs):
-    """load from an independent repository"""
-    if model_config is None:
-        model_config = AutoConfig.from_pretrained(
-            model_dir, trust_remote_code=True)
-    model_config.torch_dtype = torch_dtype
-    if tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(
-            'AI-ModelScope/vicuna-7b-v1.5',
-            trust_remote_code=True,
-            padding_side='left')
-    eos_token = kwargs.get('eos_token')
-    if eos_token is not None:
-        tokenizer.eos_token = eos_token
-    model = None
-    if load_model:
-        model = automodel_class.from_pretrained(
-            model_dir,
-            config=model_config,
-            torch_dtype=torch_dtype,
-            trust_remote_code=True,
-            **model_kwargs)
+    support_gradient_checkpointing=False)
+def get_model_tokenizer_cogagent(model_dir: str,
+                                 torch_dtype: Dtype,
+                                 model_kwargs: Dict[str, Any],
+                                 load_model: bool = True,
+                                 **kwargs):
+    tokenizer = AutoTokenizer.from_pretrained(
+        'AI-ModelScope/vicuna-7b-v1.5', trust_remote_code=True)
+    model, tokenizer = get_model_tokenizer_from_repo(
+        model_dir,
+        torch_dtype,
+        model_kwargs,
+        load_model,
+        tokenizer=tokenizer,
+        **kwargs)
     if model is not None:
         logger.info(
             'CogAgent with FusedLayerNorm will cause a training loss of NaN; '
             'to avoid this, please uninstall apex.')
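The rewrite above replaces a hand-rolled CogAgent loader with a thin wrapper: only the tokenizer source (`AI-ModelScope/vicuna-7b-v1.5`) is special-cased, and everything else is delegated to the shared `get_model_tokenizer_from_repo`. Callers keep using the registry as usual; a hedged usage sketch, assuming swift's public `get_model_tokenizer` entry point dispatches on the model types registered above:

```python
import torch
from swift.llm import ModelType, get_model_tokenizer

# Resolves ModelType.cogagent_chat through the @register_model registry,
# which routes to get_model_tokenizer_cogagent defined above.
model, tokenizer = get_model_tokenizer(
    ModelType.cogagent_chat, torch.float16, {'device_map': 'auto'})
```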
23 changes: 12 additions & 11 deletions swift/llm/utils/utils.py
@@ -447,17 +447,15 @@ def _is_chinese_char(cp):
     return False
 
 
-def inference_stream(
-        model: PreTrainedModel,
-        template: Template,
-        query: str,
-        history: Optional[History] = None,
-        system: Optional[str] = None,
-        image: Optional['Image'] = None,
-        *,
-        generation_config: Optional[GenerationConfig] = None,
-        stop_words: Optional[List[StopWords]] = None,
-) -> Iterator[Tuple[str, History]]:
+def inference_stream(model: PreTrainedModel,
+                     template: Template,
+                     query: str,
+                     history: Optional[History] = None,
+                     system: Optional[str] = None,
+                     *,
+                     generation_config: Optional[GenerationConfig] = None,
+                     stop_words: Optional[List[StopWords]] = None,
+                     **kwargs) -> Iterator[Tuple[str, History]]:
     """
     generation_config: Priority: generation_config > model.generation_config.
     """
@@ -468,6 +466,7 @@ def inference_stream(
     else:
         history = deepcopy(history)
     example = {'query': query, 'history': history, 'system': system}
+    image = kwargs.pop('image', None)
     if image is not None:
         example['image'] = image
     inputs = template.encode(example)
@@ -500,6 +499,7 @@
         stop_words.append(template.suffix[-1])
     decode_kwargs = {}
     model_kwargs = {}
+    # Compatible with cogagent
     if 'token_type_ids' in inputs:
         model_kwargs['token_type_ids'] = inputs['token_type_ids'].to(device)
     if 'images' in inputs:
@@ -510,6 +510,7 @@
         model_kwargs['cross_images'] = [[
             inputs['cross_images'][0][0].to(device).to(torch.float16)
         ]]
+    # Compatible with qwen-audio
     if audio_info is not None:
         audio_info = get_audio_info(tokenizer, audio_info=audio_info)
         decode_kwargs['audio_info'] = audio_info
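The `inference_stream` changes move the optional `image` argument out of the signature and into `**kwargs`, so text-only callers are untouched while multimodal callers keep working. A minimal standalone sketch of this `kwargs.pop` compatibility pattern (hypothetical `stream` function, not swift's):

```python
from typing import Iterator

def stream(query: str, **kwargs) -> Iterator[str]:
    # Optional multimodal input arrives via kwargs, keeping the
    # positional signature stable for text-only callers.
    image = kwargs.pop('image', None)
    example = {'query': query}
    if image is not None:
        example['image'] = image
    yield f'encoded: {example}'

print(next(stream('hi')))                     # text-only call
print(next(stream('describe', image='img')))  # multimodal call
```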
4 changes: 2 additions & 2 deletions tests/llm/test_run.py
@@ -97,7 +97,7 @@ def test_loss_matching(self):
                 '--verbose',
                 str(not bool_var), '--merge_lora_and_save',
                 str(bool_var), '--load_dataset_config',
-                str(bool_var) or NO_EVAL_HUMAN
+                str(bool_var or NO_EVAL_HUMAN)
             ])
             loss = output['log_history'][-1]['train_loss']
             losses.append(loss)
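This fix matters because `str(False)` is the non-empty, and therefore truthy, string `'False'`, so the old expression could never fall back to `NO_EVAL_HUMAN`. A quick demonstration, assuming `NO_EVAL_HUMAN = True` as a stand-in value:

```python
NO_EVAL_HUMAN = True  # assumed value for illustration

bool_var = False
# Old, buggy: str(False) == 'False' is truthy, so `or` never falls through.
assert (str(bool_var) or NO_EVAL_HUMAN) == 'False'
# Fixed: `or` is evaluated on the booleans before str() is applied.
assert str(bool_var or NO_EVAL_HUMAN) == 'True'
```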
@@ -169,7 +169,7 @@ def test_custom_dataset(self):
         infer_args = InferArguments(
             ckpt_dir=best_model_checkpoint,
             load_args_from_ckpt_dir=load_args_from_ckpt_dir,
-            load_dataset_config=load_args_from_ckpt_dir or NO_EVAL_HUMAN,
+            load_dataset_config=load_args_from_ckpt_dir and NO_EVAL_HUMAN,
             merge_lora_and_save=load_args_from_ckpt_dir,
             val_dataset_sample=-1,
             custom_val_dataset_path=[
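The second test fix swaps `or` for `and`, which changes the fallback direction: with `or`, `load_dataset_config` was truthy even when `load_args_from_ckpt_dir` was `False`; with `and`, it is only enabled when args really are loaded from the checkpoint. A compact check, again assuming `NO_EVAL_HUMAN = True`:

```python
NO_EVAL_HUMAN = True  # assumed value for illustration

for load_args_from_ckpt_dir in (True, False):
    old = load_args_from_ckpt_dir or NO_EVAL_HUMAN   # always truthy
    new = load_args_from_ckpt_dir and NO_EVAL_HUMAN  # tracks the flag
    print(load_args_from_ckpt_dir, old, new)
# Prints: True True True / False True False
```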