From e00d9ccd5d146e9f8c6cc104f728e11e7c03d80d Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Thu, 28 Dec 2023 11:21:35 +0800
Subject: [PATCH 1/3] update 1228

---
 ...44\350\241\214\345\217\202\346\225\260.md" |  2 +-
 .../llm/scripts/qwen_vl_chat/lora/sft.sh      |  2 +-
 swift/llm/utils/argument.py                   |  3 +-
 swift/llm/utils/model.py                      | 56 +++++++------------
 swift/llm/utils/utils.py                      | 23 ++++----
 5 files changed, 36 insertions(+), 50 deletions(-)

diff --git "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 32666f3ce2..2f25db79ac 100644
--- "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -24,7 +24,7 @@
 - `--train_dataset_sample`: Samples the training set to speed up training; the default is `20000`. This parameter exists to avoid a single epoch taking too long on an overly large dataset. LoRA usually converges quickly and does not need many samples for fine-tuning. If you set this to `-1`, the full training set is used for training, which typically occurs in the full-parameter fine-tuning setting.
 - `--val_dataset_sample`: Samples the validation set; the default is `None`. If you set this to `-1`, the full validation set is used for validation.
 - `--system`: The system prompt used in the chat template; the default is `None`, i.e. the model's default system prompt is used.
-- `--max_length`: Maximum token length; the default is `2048`. This avoids OOM problems caused by individual overly long samples. If a sample is longer than max_length, the leading tokens are cut off: `input_ids[-max_length:]`. If set to -1, there is no limit.
+- `--max_length`: Maximum token length; the default is `2048`. This avoids OOM problems caused by individual overly long samples. When `--truncation_strategy delete` is specified, if a sample is longer than max_length we delete the dataset. When `--truncation_strategy truncation_left` is specified, we cut off the leading tokens: `input_ids[-max_length:]`. If set to -1, there is no limit.
 - `--truncation_strategy`: The default is `'delete'`, which removes sentences exceeding max_length from the dataset. `'truncation_left'` cuts off the left side of the overlong text; this may cut through special tokens and hurt performance, so it is not recommended.
 - `--check_dataset_strategy`: The default is `'none'`, i.e. no checking. If the model you are training is an LLM, `'warning'` is recommended as the dataset-checking strategy. If your training objective is a task such as sentence classification, setting this to `'none'` is recommended.
 - `--custom_train_dataset_path`: The default is `None`. See [Customization and Extension](./自定义与拓展.md) for details.
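The `--truncation_strategy` semantics documented in the hunk above reduce to a few lines of preprocessing logic. A minimal sketch of the idea (illustrative only; the function name and return convention are hypothetical, not swift's actual implementation):

from typing import List, Optional

def apply_truncation_strategy(input_ids: List[int],
                              max_length: int,
                              strategy: str = 'delete') -> Optional[List[int]]:
    # max_length == -1 means no limit.
    if max_length == -1 or len(input_ids) <= max_length:
        return input_ids
    if strategy == 'delete':
        # Drop the over-long sample from the dataset entirely.
        return None
    if strategy == 'truncation_left':
        # Keep only the last max_length tokens; this can cut through
        # special tokens at the start of the prompt, which is why the
        # docs discourage it.
        return input_ids[-max_length:]
    raise ValueError(f'unknown truncation_strategy: {strategy}')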
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh
index c1b19cf8a5..fdd602298a 100644
--- a/examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh
@@ -32,6 +32,6 @@ python llm_sft.py \
     --logging_steps 10 \
     --use_flash_attn false \
     --push_to_hub false \
-    --hub_model_id qwen-7b-chat-lora \
+    --hub_model_id qwen-vl-chat-lora \
     --hub_private_repo true \
     --hub_token 'your-sdk-token' \
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index f08596d9f8..9357f72857 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -587,7 +587,8 @@ def handle_compatibility(args: Union[SftArguments, InferArguments]) -> None:
 
 
 def set_model_type(args: Union[SftArguments, InferArguments]) -> None:
-    assert args.model_type is None or args.model_id_or_path is None
+    assert args.model_type is None or args.model_id_or_path is None, (
+        'Only one of `model_type` and `model_id_or_path` may be specified.')
     if args.model_id_or_path is not None:
         model_mapping_reversed = {
             v['model_id_or_path'].lower(): k
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 19f4b900ea..37927f6ac6 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -298,7 +298,7 @@ def _register_model(
     requires=['transformers<4.34'],
     support_vllm=True)
 def get_model_tokenizer_from_repo(model_dir: str,
-                                  torch_dtype: Dtype,
+                                  torch_dtype: Optional[Dtype],
                                   model_kwargs: Dict[str, Any],
                                   load_model: bool = True,
                                   model_config=None,
@@ -309,7 +309,8 @@ def get_model_tokenizer_from_repo(model_dir: str,
     if model_config is None:
         model_config = AutoConfig.from_pretrained(
             model_dir, trust_remote_code=True)
-    model_config.torch_dtype = torch_dtype
+    if torch_dtype is not None:
+        model_config.torch_dtype = torch_dtype
     if tokenizer is None:
         tokenizer = AutoTokenizer.from_pretrained(
             model_dir, trust_remote_code=True)
@@ -332,45 +333,28 @@ def get_model_tokenizer_from_repo(model_dir: str,
     'ZhipuAI/cogagent-chat',
     LoRATM.cogagent,
     TemplateType.cogagent,
-    requires=['transformers>=4.36'],
-    support_vllm=False)
+    support_gradient_checkpointing=False)
 @register_model(
     ModelType.cogagent_vqa,
     'ZhipuAI/cogagent-vqa',
     LoRATM.cogagent,
     TemplateType.cogagent,
-    requires=['transformers>=4.36'],
-    support_vllm=False)
-def get_model_tokenizer_from_repo_cogagent(
-        model_dir: str,
-        torch_dtype: Dtype,
-        model_kwargs: Dict[str, Any],
-        load_model: bool = True,
-        model_config=None,
-        tokenizer=None,
-        automodel_class=AutoModelForCausalLM,
-        **kwargs):
-    """load from an independent repository"""
-    if model_config is None:
-        model_config = AutoConfig.from_pretrained(
-            model_dir, trust_remote_code=True)
-    model_config.torch_dtype = torch_dtype
-    if tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(
-            'AI-ModelScope/vicuna-7b-v1.5',
-            trust_remote_code=True,
-            padding_side='left')
-    eos_token = kwargs.get('eos_token')
-    if eos_token is not None:
-        tokenizer.eos_token = eos_token
-    model = None
-    if load_model:
-        model = automodel_class.from_pretrained(
-            model_dir,
-            config=model_config,
-            torch_dtype=torch_dtype,
-            trust_remote_code=True,
-            **model_kwargs)
+    support_gradient_checkpointing=False)
+def get_model_tokenizer_cogagent(model_dir: str,
+                                 torch_dtype: Dtype,
+                                 model_kwargs: Dict[str, Any],
+                                 load_model: bool = True,
+                                 **kwargs):
+    tokenizer = AutoTokenizer.from_pretrained(
+        'AI-ModelScope/vicuna-7b-v1.5', trust_remote_code=True)
+    model, tokenizer = get_model_tokenizer_from_repo(
+        model_dir,
+        torch_dtype,
+        model_kwargs,
+        load_model,
+        tokenizer=tokenizer,
+        **kwargs)
+    if model is not None:
         logger.info(
             'CogAgent with FusedLayerNorm will cause a training loss of NaN; '
             'to avoid this, please uninstall apex.')
diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py
index f13283b60e..36697a3a1c 100644
--- a/swift/llm/utils/utils.py
+++ b/swift/llm/utils/utils.py
@@ -447,17 +447,15 @@ def _is_chinese_char(cp):
     return False
 
 
-def inference_stream(
-        model: PreTrainedModel,
-        template: Template,
-        query: str,
-        history: Optional[History] = None,
-        system: Optional[str] = None,
-        image: Optional['Image'] = None,
-        *,
-        generation_config: Optional[GenerationConfig] = None,
-        stop_words: Optional[List[StopWords]] = None,
-) -> Iterator[Tuple[str, History]]:
+def inference_stream(model: PreTrainedModel,
+                     template: Template,
+                     query: str,
+                     history: Optional[History] = None,
+                     system: Optional[str] = None,
+                     *,
+                     generation_config: Optional[GenerationConfig] = None,
+                     stop_words: Optional[List[StopWords]] = None,
+                     **kwargs) -> Iterator[Tuple[str, History]]:
     """
     generation_config: Priority: generation_config > model.generation_config.
     """
@@ -468,6 +466,7 @@ def inference_stream(
     else:
         history = deepcopy(history)
     example = {'query': query, 'history': history, 'system': system}
+    image = kwargs.pop('image', None)
     if image is not None:
         example['image'] = image
     inputs = template.encode(example)
@@ -500,6 +499,7 @@ def inference_stream(
         stop_words.append(template.suffix[-1])
     decode_kwargs = {}
     model_kwargs = {}
+    # Compatible with cogagent
     if 'token_type_ids' in inputs:
         model_kwargs['token_type_ids'] = inputs['token_type_ids'].to(device)
     if 'images' in inputs:
@@ -510,6 +510,7 @@ def inference_stream(
         model_kwargs['cross_images'] = [[
             inputs['cross_images'][0][0].to(device).to(torch.float16)
         ]]
+    # Compatible with qwen-audio
     if audio_info is not None:
         audio_info = get_audio_info(tokenizer, audio_info=audio_info)
         decode_kwargs['audio_info'] = audio_info

From 25840d5a98d69c321455101178491437c4c2a0b3 Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Thu, 28 Dec 2023 11:26:24 +0800
Subject: [PATCH 2/3] update

---
 ...\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 2f25db79ac..36d75d60a6 100644
--- "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -24,7 +24,7 @@
 - `--train_dataset_sample`: Samples the training set to speed up training; the default is `20000`. This parameter exists to avoid a single epoch taking too long on an overly large dataset. LoRA usually converges quickly and does not need many samples for fine-tuning. If you set this to `-1`, the full training set is used for training, which typically occurs in the full-parameter fine-tuning setting.
 - `--val_dataset_sample`: Samples the validation set; the default is `None`. If you set this to `-1`, the full validation set is used for validation.
 - `--system`: The system prompt used in the chat template; the default is `None`, i.e. the model's default system prompt is used.
-- `--max_length`: Maximum token length; the default is `2048`. This avoids OOM problems caused by individual overly long samples. When `--truncation_strategy delete` is specified, if a sample is longer than max_length we delete the dataset. When `--truncation_strategy truncation_left` is specified, we cut off the leading tokens: `input_ids[-max_length:]`. If set to -1, there is no limit.
+- `--max_length`: Maximum token length; the default is `2048`. This avoids OOM problems caused by individual overly long samples. When `--truncation_strategy delete` is specified, if a sample is longer than max_length we delete that data sample. When `--truncation_strategy truncation_left` is specified, we cut off the leading tokens: `input_ids[-max_length:]`. If set to -1, there is no limit.
 - `--truncation_strategy`: The default is `'delete'`, which removes sentences exceeding max_length from the dataset. `'truncation_left'` cuts off the left side of the overlong text; this may cut through special tokens and hurt performance, so it is not recommended.
 - `--check_dataset_strategy`: The default is `'none'`, i.e. no checking. If the model you are training is an LLM, `'warning'` is recommended as the dataset-checking strategy. If your training objective is a task such as sentence classification, setting this to `'none'` is recommended.
 - `--custom_train_dataset_path`: The default is `None`. See [Customization and Extension](./自定义与拓展.md) for details.

From 587971eb8093f1ad7791c65478970693cc8e4e45 Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Thu, 28 Dec 2023 13:56:50 +0800
Subject: [PATCH 3/3] update

---
 .dev_scripts/ci_container_test.sh | 2 +-
 tests/llm/test_run.py             | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
index 485ca995a1..8627bb775a 100644
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -22,7 +22,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
 
     pip install -r requirements/framework.txt -U -i https://mirrors.aliyun.com/pypi/simple/
     pip install -r requirements/llm.txt -U -i https://mirrors.aliyun.com/pypi/simple/
-    pip install -r requirements/aigc.txt -U -i https://mirrors.aliyun.com/pypi/simple/
+    pip install diffusers decord einops -U -i https://mirrors.aliyun.com/pypi/simple/
 
     # test with install
     pip install .
diff --git a/tests/llm/test_run.py b/tests/llm/test_run.py
index fda1bb828e..2fa393c79c 100644
--- a/tests/llm/test_run.py
+++ b/tests/llm/test_run.py
@@ -97,7 +97,7 @@ def test_loss_matching(self):
                 '--verbose', str(not bool_var), '--merge_lora_and_save',
                 str(bool_var), '--load_dataset_config',
-                str(bool_var) or NO_EVAL_HUMAN
+                str(bool_var or NO_EVAL_HUMAN)
             ])
             loss = output['log_history'][-1]['train_loss']
             losses.append(loss)
@@ -169,7 +169,7 @@ def test_custom_dataset(self):
             infer_args = InferArguments(
                 ckpt_dir=best_model_checkpoint,
                 load_args_from_ckpt_dir=load_args_from_ckpt_dir,
-                load_dataset_config=load_args_from_ckpt_dir or NO_EVAL_HUMAN,
+                load_dataset_config=load_args_from_ckpt_dir and NO_EVAL_HUMAN,
                 merge_lora_and_save=load_args_from_ckpt_dir,
                 val_dataset_sample=-1,
                 custom_val_dataset_path=[
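The test_run.py hunks in PATCH 3/3 fix a truthiness bug: `str(False)` is the non-empty string `'False'`, which is always truthy, so `str(bool_var) or NO_EVAL_HUMAN` can never fall through to `NO_EVAL_HUMAN`. A minimal illustration of the difference (the values below are stand-ins, not the test's actual configuration):

bool_var, NO_EVAL_HUMAN = False, True

# Before the fix: str() runs first, 'False' is truthy, `or` never fires.
assert (str(bool_var) or NO_EVAL_HUMAN) == 'False'

# After the fix: the booleans combine first, then the result is stringified.
assert str(bool_var or NO_EVAL_HUMAN) == 'True'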