From a9618198d004f05299f75c2f1b8bebf947495a29 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 19 Dec 2023 16:54:29 +0800 Subject: [PATCH 01/10] unfinished work --- .../llm/scripts/cogagent_chat/lora/infer.sh | 15 ++++++++ .../llm/scripts/cogagent_chat/lora/sft.sh | 34 +++++++++++++++++++ swift/llm/utils/model.py | 20 +++++++++++ swift/llm/utils/template.py | 17 +++++++++- 4 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh create mode 100644 examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh diff --git a/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh b/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh new file mode 100644 index 0000000000..5edb32c604 --- /dev/null +++ b/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh @@ -0,0 +1,15 @@ +# Experimental environment: V100, A10, 3090 +PYTHONPATH=../../.. \ +CUDA_VISIBLE_DEVICES=0 \ +python llm_infer.py \ + --ckpt_dir "output/codefuse-codellama-34b-chat/vx_xxx/checkpoint-xxx" \ + --load_args_from_ckpt_dir true \ + --eval_human false \ + --max_length 4096 \ + --use_flash_attn true \ + --max_new_tokens 2048 \ + --temperature 0.3 \ + --top_p 0.7 \ + --repetition_penalty 1.05 \ + --do_sample true \ + --merge_lora_and_save false \ diff --git a/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh b/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh new file mode 100644 index 0000000000..c23bbb8749 --- /dev/null +++ b/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh @@ -0,0 +1,34 @@ +# Experimental environment: V100, A10, 3090 +# 18GB GPU memory +PYTHONPATH=../../.. \ +CUDA_VISIBLE_DEVICES=0 \ +python llm_sft.py \ + --model_type cogagent-chat \ + --sft_type lora \ + --tuner_backend swift \ + --dtype fp16 \ + --output_dir output \ + --custom_train_dataset_path xxx.jsonl \ + --custom_val_dataset_path yyy.jsonl \ + --train_dataset_sample -1 \ + --num_train_epochs 1 \ + --max_length 4096 \ + --check_dataset_strategy warning \ + --lora_rank 8 \ + --lora_alpha 32 \ + --lora_dropout_p 0.05 \ + --gradient_checkpointing true \ + --batch_size 1 \ + --weight_decay 0.01 \ + --learning_rate 1e-4 \ + --gradient_accumulation_steps 16 \ + --max_grad_norm 0.5 \ + --warmup_ratio 0.03 \ + --eval_steps 100 \ + --save_steps 100 \ + --save_total_limit 2 \ + --logging_steps 10 \ + --push_to_hub false \ + --hub_model_id cogagent-chat-lora \ + --hub_private_repo true \ + --hub_token 'your-sdk-token' \ diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 4aeb013db3..370eee36d9 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -152,6 +152,9 @@ class ModelType: deepseek_coder_33b = 'deepseek-coder-33b' deepseek_coder_33b_chat = 'deepseek-coder-33b-chat' + cogagent_chat = 'cogagent-chat' + cogagent_vqa = 'cogagent-vqa' + @classmethod def get_model_name_list(cls) -> List[str]: res = [] @@ -170,6 +173,9 @@ class LoRATM(NamedTuple): qwen = ['c_attn'] polylm = ['c_attn'] bloom = ['query_key_value'] + cogagent = ['vision_expert_query_key_value', 'vision_expert_dense', + 'language_expert_query_key_value', 'language_expert_dense', + 'query', 'key_value', 'dense'] GetModelTokenizerFunction = Callable[..., Tuple[Optional[PreTrainedModel], @@ -285,6 +291,20 @@ def _register_model( TemplateType.default_generation, requires=['transformers<4.34'], support_vllm=True) +@register_model( + ModelType.cogagent_chat, + 'ZhipuAI/cogagent-chat', + LoRATM.cogagent_chat, + TemplateType.llama, + requires=['transformers>=4.36'], + support_vllm=False) +@register_model( + ModelType.cogagent_vqa, + 'ZhipuAI/cogagent-vqa', + LoRATM.cogagent, + TemplateType.cogagent_vqa, + requires=['transformers>=4.36'], + support_vllm=False) def get_model_tokenizer_from_repo(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index fb46b7ad24..9c357228d9 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -35,6 +35,7 @@ class TemplateType: deepseek = 'deepseek' codefuse_codellama = 'codefuse-codellama' deepseek_coder = 'deepseek-coder' + cogagent = 'cogagent' @classmethod def get_template_name_list(cls) -> List[str]: @@ -125,7 +126,8 @@ def _concat_context_list( def _encode_context_list( tokenizer: PreTrainedTokenizerBase, context_list: List[Context], - compute_loss_idx: Optional[List[int]] = None + compute_loss_idx: Optional[List[int]] = None, + **args, ) -> Tuple[List[int], Optional[List[int]], Dict[str, Any]]: input_ids: List[int] = [] labels: List[int] = [] @@ -154,6 +156,7 @@ def _encode_context_list( [old_audio_info[k], audio_info[k]], dim=0) for k in ['audio_span_tokens', 'audio_urls']: old_audio_info[k] = old_audio_info[k] + audio_info[k] + token_list = tokenizer( context, return_attention_mask=False, @@ -330,6 +333,14 @@ def encode(self, example: Dict[str, self.truncation_strategy) +class CogAgentTemplate(Template): + + def encode(self, example: Dict[str, + Any], model) -> Dict[str, Optional[List[int]]]: + image = Image.open(context).convert('RGB') + return model.build_conversation_input_ids(self.tokenizer, query=example['query'], + history=example['history'], images=[example['image']]) + TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} @@ -484,6 +495,10 @@ def register_template(template_type: str, Template(['{{SYSTEM}}'], ['### Human: {{QUERY}}\n\n### Assistant: '], ['<|endoftext|>'], ['<|endoftext|>'], '')) +register_template( + TemplateType.cogagent, + CogAgentTemplate([], [], [], [], None, [])) + def get_template( template_type: str, From 175e20893da086ee468ee3ed8666f7e5debafee3 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Tue, 19 Dec 2023 20:17:19 +0800 Subject: [PATCH 02/10] fix --- swift/llm/sft.py | 2 +- swift/llm/utils/dataset.py | 24 ++++++++++++++++++++++++ swift/llm/utils/model.py | 6 +++--- swift/llm/utils/template.py | 26 +++++++++++++++++++------- 4 files changed, 47 insertions(+), 11 deletions(-) diff --git a/swift/llm/sft.py b/swift/llm/sft.py index 96854339da..b81998ded7 100644 --- a/swift/llm/sft.py +++ b/swift/llm/sft.py @@ -174,7 +174,7 @@ def llm_sft(args: SftArguments) -> str: logger.info(f'val_dataset: {val_dataset}') template: Template = get_template(args.template_type, tokenizer, args.system, args.max_length, - args.truncation_strategy) + args.truncation_strategy, model=model) args.system = template.default_system logger.info(f'system: {args.system}') if not args.lazy_tokenize: diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 0845fe03ea..5d76b2a297 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -106,6 +106,7 @@ class DatasetName: # vision coco_en = 'coco-en' coco_mini_en = 'coco-mini-en' + capcha_images = 'capcha-images' # audio aishell1_zh = 'aishell1-zh' aishell1_mini_zh = 'aishell1-mini-zh' @@ -599,6 +600,29 @@ def _preprocess_sharegpt(dataset: HfDataset) -> HfDataset: get_dataset_from_repo, tags=['chat', 'general', 'multi-round']) + +def _preprocess_capcha_images(dataset: HfDataset) -> HfDataset: + dataset = dataset.rename_columns({ + 'image': 'query', + 'solution': 'response', + }) + def add_system(row): + row['system'] = 'CAPTCHA:' + return row + dataset = dataset.map(add_system) + return dataset + + +register_dataset( + DatasetName.capcha_images, + 'AI-ModelScope/captcha-images', + [('default', 'train')], + [('default', 'validation')], + _preprocess_capcha_images, + get_dataset_from_repo, + tags=['chat', 'multi-modal', 'vision', '🔥']) + + register_dataset( DatasetName.cls_fudan_news_zh, 'damo/zh_cls_fudan-news', ['train'], diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 370eee36d9..48824a7732 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -294,15 +294,15 @@ def _register_model( @register_model( ModelType.cogagent_chat, 'ZhipuAI/cogagent-chat', - LoRATM.cogagent_chat, - TemplateType.llama, + LoRATM.cogagent, + TemplateType.cogagent, requires=['transformers>=4.36'], support_vllm=False) @register_model( ModelType.cogagent_vqa, 'ZhipuAI/cogagent-vqa', LoRATM.cogagent, - TemplateType.cogagent_vqa, + TemplateType.cogagent, requires=['transformers>=4.36'], support_vllm=False) def get_model_tokenizer_from_repo(model_dir: str, diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 9c357228d9..31143dc0f4 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -297,7 +297,8 @@ def _init_template( tokenizer: PreTrainedTokenizerBase, default_system: Optional[str] = None, max_length: Optional[int] = None, - truncation_strategy: Literal['delete', 'truncation_left'] = 'delete' + truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', + **kwargs ) -> None: assert self._is_init is False self._is_init = True @@ -335,11 +336,21 @@ def encode(self, example: Dict[str, class CogAgentTemplate(Template): + def _init_template( + self, + tokenizer: PreTrainedTokenizerBase, + default_system: Optional[str] = None, + max_length: Optional[int] = None, + truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', + **kwargs + ) -> None: + self.model = kwargs.pop('model') + super()._init_template(tokenizer, default_system, max_length, truncation_strategy) + def encode(self, example: Dict[str, - Any], model) -> Dict[str, Optional[List[int]]]: - image = Image.open(context).convert('RGB') - return model.build_conversation_input_ids(self.tokenizer, query=example['query'], - history=example['history'], images=[example['image']]) + Any]) -> Dict[str, Optional[List[int]]]: + return self.model.build_conversation_input_ids(self.tokenizer, query=example['response'], + history=None, images=[example['query'].convert('RGB')]) TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} @@ -505,11 +516,12 @@ def get_template( tokenizer: PreTrainedTokenizerBase, default_system: Optional[str] = None, max_length: Optional[int] = None, - truncation_strategy: Literal['delete', 'truncation_left'] = 'delete' + truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', + **kwargs, ) -> Template: template_info = TEMPLATE_MAPPING[template_type] template = deepcopy(template_info['template']) template._init_template(tokenizer, default_system, max_length, - truncation_strategy) + truncation_strategy, **kwargs) template.template_type = template_type return template From d6c940d8605f27829d153d0c71bd8f26c9d24adc Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 19 Dec 2023 21:20:52 +0800 Subject: [PATCH 03/10] fix --- swift/llm/utils/template.py | 195 +++++++++++++++++++++++++++++------- 1 file changed, 161 insertions(+), 34 deletions(-) diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 31143dc0f4..c2a469a404 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -80,7 +80,7 @@ def get_audio_info( *, context: Optional[str] = None, audio_info: Optional[Dict[str, - Any]] = None) -> Optional[Dict[str, Any]]: + Any]] = None) -> Optional[Dict[str, Any]]: assert context is not None or audio_info is not None assert context is None or audio_info is None if context is None: @@ -93,13 +93,13 @@ def get_audio_info( def _concat_context_list( - context_list: List[Context], - res_context_list: List[Context], - compute_loss_idx: List[int], - system: Optional[str] = None, - query: Optional[str] = None, - response: Optional[str] = None, - round0: Optional[int] = None, + context_list: List[Context], + res_context_list: List[Context], + compute_loss_idx: List[int], + system: Optional[str] = None, + query: Optional[str] = None, + response: Optional[str] = None, + round0: Optional[int] = None, ) -> None: # concat context list and replace placeholder round1 = None @@ -124,10 +124,10 @@ def _concat_context_list( def _encode_context_list( - tokenizer: PreTrainedTokenizerBase, - context_list: List[Context], - compute_loss_idx: Optional[List[int]] = None, - **args, + tokenizer: PreTrainedTokenizerBase, + context_list: List[Context], + compute_loss_idx: Optional[List[int]] = None, + **args, ) -> Tuple[List[int], Optional[List[int]], Dict[str, Any]]: input_ids: List[int] = [] labels: List[int] = [] @@ -293,12 +293,12 @@ def __init__(self, self._is_init = False def _init_template( - self, - tokenizer: PreTrainedTokenizerBase, - default_system: Optional[str] = None, - max_length: Optional[int] = None, - truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', - **kwargs + self, + tokenizer: PreTrainedTokenizerBase, + default_system: Optional[str] = None, + max_length: Optional[int] = None, + truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', + **kwargs ) -> None: assert self._is_init is False self._is_init = True @@ -310,7 +310,7 @@ def _init_template( self.truncation_strategy = truncation_strategy def encode(self, example: Dict[str, - Any]) -> Dict[str, Optional[List[int]]]: + Any]) -> Dict[str, Optional[List[int]]]: if not self._is_init: raise ValueError( 'Template has not been initialized, please call init_template(...) first.' @@ -335,22 +335,149 @@ def encode(self, example: Dict[str, class CogAgentTemplate(Template): + LANGUAGE_TOKEN_TYPE = 0 + VISION_TOKEN_TYPE = 1 def _init_template( - self, - tokenizer: PreTrainedTokenizerBase, - default_system: Optional[str] = None, - max_length: Optional[int] = None, - truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', - **kwargs + self, + tokenizer: PreTrainedTokenizerBase, + default_system: Optional[str] = None, + max_length: Optional[int] = None, + truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', + **kwargs ) -> None: self.model = kwargs.pop('model') super()._init_template(tokenizer, default_system, max_length, truncation_strategy) + @staticmethod + def vqa_history_to_prompt(history, query): + # Only support single round chat in vqa mode + prompt = "Question: " + # for i, (old_query, response) in enumerate(history): + # prompt += old_query + " Short answer: " + response + " Question: " + prompt += query + " Short answer:" + return prompt + + @staticmethod + def chat_old_history_to_prompt(history, query): + prompt = "Question: " + for i, (old_query, response) in enumerate(history): + prompt += old_query + " Answer: " + response + "\nQuestion: " + prompt += query + " Answer:" + return prompt + + @staticmethod + def chat_history_to_prompt(history, query): + prompt = " [INST] " + for i, (old_query, response) in enumerate(history): + prompt += old_query + " [/INST] " + response + " [INST] " + prompt += query + " [/INST] " + return prompt + + @staticmethod + def base_history_to_prompt(history, query): + prompt = query + return prompt + + _history_to_prompt = { + "base": base_history_to_prompt, + "chat": chat_history_to_prompt, + "chat_old": chat_old_history_to_prompt, + "vqa": vqa_history_to_prompt + } + + def build_conversation_input_ids( + self, + tokenizer: "PreTrainedTokenizer", + *, + query: str, + label: str, + history: Optional[List[Tuple[str, str]]] = None, + images: Optional[List["PIL.Image"]] = None, + template_version: Optional[Literal["base", "chat", "vqa"]] = None, + ): + from torchvision import transforms + image_size: int = self.config.vision_config['image_size'] + cross_image_size: int = self.config.cross_image_size + patch_size: int = self.config.vision_config['patch_size'] + template_version = template_version or self.config.template_version + assert images is None or len(images) <= 1, f"not support multi images by now." + history = history or [] + text = self._history_to_prompt[template_version](history, query) + + input_ids = [tokenizer.bos_token_id] + token_type_ids = [self.LANGUAGE_TOKEN_TYPE] + if images is not None and len(images) == 1: + ori = images + # vision + transform = transforms.Compose( + [ + transforms.Resize( + (image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC + ), + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ] + ) + images = [transform(ori[0])] + cross_transform = transforms.Compose( + [ + transforms.Resize( + (cross_image_size, cross_image_size), interpolation=transforms.InterpolationMode.BICUBIC + ), + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ] + ) + cross_images = [cross_transform(ori[0])] + # language + vision_token_num = (image_size // patch_size) * (image_size // patch_size) + 2 + input_ids += [tokenizer.pad_token_id] * vision_token_num + token_type_ids += [self.VISION_TOKEN_TYPE] * vision_token_num + text_ids = tokenizer.encode(text, add_special_tokens=False) + label_ids = tokenizer.encode(label, add_special_tokens=False) + if len(text_ids) + len(input_ids) + len(label_ids) > self.max_length - 1: + if self.truncation_strategy == 'delete' or (len(input_ids) + len(label_ids) >= self.max_length - 1): + return None + else: + text_ids = text_ids[-(self.max_length - len(input_ids) - len(label_ids) - 1):] + + input_ids += text_ids + labels = [-100] * len(input_ids) + label_ids + [tokenizer.eos_token_id] + input_ids += label_ids + [tokenizer.eos_token_id] + token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * len(text_ids) + attention_mask = [1] * len(input_ids) + + if len(input_ids) < self.max_length: + padding_len = self.max_length - len(input_ids) + input_ids += [tokenizer.pad_token_id] * padding_len + token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * padding_len + attention_mask += [0] * padding_len + labels += [-100] * padding_len + + return { + 'input_ids': torch.tensor(input_ids, dtype=torch.long), + 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long), + 'attention_mask': torch.tensor(attention_mask, dtype=torch.long), + 'images': images, + 'cross_images': cross_images, + 'labels': labels + } + def encode(self, example: Dict[str, - Any]) -> Dict[str, Optional[List[int]]]: - return self.model.build_conversation_input_ids(self.tokenizer, query=example['response'], - history=None, images=[example['query'].convert('RGB')]) + Any]) -> Dict[str, Optional[List[int]]]: + input_kwargs = self.build_conversation_input_ids(self.tokenizer, query=example['system'], + label=example['response'], + history=example.get('history'), + images=[example['query'].convert('RGB')]) + if len(input_kwargs['input_ids']) > self.max_length - 1: + if self.truncation_strategy == 'delete': + return None + else: + input_kwargs['input_ids'] = input_kwargs['input_ids'][-self.max_length - 1:] + input_kwargs['attention_mask'] = input_kwargs['attention_mask'][-self.max_length - 1:] + input_kwargs['token_type_ids'] = input_kwargs['input_ids'][:self.max_length - 1] + TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} @@ -512,12 +639,12 @@ def register_template(template_type: str, def get_template( - template_type: str, - tokenizer: PreTrainedTokenizerBase, - default_system: Optional[str] = None, - max_length: Optional[int] = None, - truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', - **kwargs, + template_type: str, + tokenizer: PreTrainedTokenizerBase, + default_system: Optional[str] = None, + max_length: Optional[int] = None, + truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', + **kwargs, ) -> Template: template_info = TEMPLATE_MAPPING[template_type] template = deepcopy(template_info['template']) From 21a3477c756a9bfef04eafa0c8538de8b29c661a Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 20 Dec 2023 13:16:11 +0800 Subject: [PATCH 04/10] fix --- swift/llm/utils/model.py | 34 ++++++++++++++++++++++++++++++++-- swift/llm/utils/template.py | 19 ++++++------------- swift/llm/utils/utils.py | 15 +++++++++++++++ 3 files changed, 53 insertions(+), 15 deletions(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 48824a7732..7c6d82b217 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -291,6 +291,36 @@ def _register_model( TemplateType.default_generation, requires=['transformers<4.34'], support_vllm=True) +def get_model_tokenizer_from_repo(model_dir: str, + torch_dtype: Dtype, + model_kwargs: Dict[str, Any], + load_model: bool = True, + model_config=None, + tokenizer=None, + automodel_class=AutoModelForCausalLM, + **kwargs): + """load from an independent repository""" + if model_config is None: + model_config = AutoConfig.from_pretrained( + model_dir, trust_remote_code=True) + model_config.torch_dtype = torch_dtype + if tokenizer is None: + tokenizer = AutoTokenizer.from_pretrained( + model_dir, trust_remote_code=True) + eos_token = kwargs.get('eos_token') + if eos_token is not None: + tokenizer.eos_token = eos_token + model = None + if load_model: + model = automodel_class.from_pretrained( + model_dir, + config=model_config, + torch_dtype=torch_dtype, + trust_remote_code=True, + **model_kwargs) + return model, tokenizer + + @register_model( ModelType.cogagent_chat, 'ZhipuAI/cogagent-chat', @@ -305,7 +335,7 @@ def _register_model( TemplateType.cogagent, requires=['transformers>=4.36'], support_vllm=False) -def get_model_tokenizer_from_repo(model_dir: str, +def get_model_tokenizer_from_repo_cogagent(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], load_model: bool = True, @@ -320,7 +350,7 @@ def get_model_tokenizer_from_repo(model_dir: str, model_config.torch_dtype = torch_dtype if tokenizer is None: tokenizer = AutoTokenizer.from_pretrained( - model_dir, trust_remote_code=True) + 'AI-ModelScope/vicuna-7b-v1.5', trust_remote_code=True) eos_token = kwargs.get('eos_token') if eos_token is not None: tokenizer.eos_token = eos_token diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index c2a469a404..dd383b91da 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -397,10 +397,10 @@ def build_conversation_input_ids( template_version: Optional[Literal["base", "chat", "vqa"]] = None, ): from torchvision import transforms - image_size: int = self.config.vision_config['image_size'] - cross_image_size: int = self.config.cross_image_size - patch_size: int = self.config.vision_config['patch_size'] - template_version = template_version or self.config.template_version + image_size: int = self.model.config.vision_config['image_size'] + cross_image_size: int = self.model.config.cross_image_size + patch_size: int = self.model.config.vision_config['patch_size'] + template_version = template_version or self.model.config.template_version assert images is None or len(images) <= 1, f"not support multi images by now." history = history or [] text = self._history_to_prompt[template_version](history, query) @@ -445,7 +445,7 @@ def build_conversation_input_ids( input_ids += text_ids labels = [-100] * len(input_ids) + label_ids + [tokenizer.eos_token_id] input_ids += label_ids + [tokenizer.eos_token_id] - token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * len(text_ids) + token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * (len(text_ids) + len(label_ids) + 1) attention_mask = [1] * len(input_ids) if len(input_ids) < self.max_length: @@ -466,17 +466,10 @@ def build_conversation_input_ids( def encode(self, example: Dict[str, Any]) -> Dict[str, Optional[List[int]]]: - input_kwargs = self.build_conversation_input_ids(self.tokenizer, query=example['system'], + return self.build_conversation_input_ids(self.tokenizer, query=example['system'], label=example['response'], history=example.get('history'), images=[example['query'].convert('RGB')]) - if len(input_kwargs['input_ids']) > self.max_length - 1: - if self.truncation_strategy == 'delete': - return None - else: - input_kwargs['input_ids'] = input_kwargs['input_ids'][-self.max_length - 1:] - input_kwargs['attention_mask'] = input_kwargs['attention_mask'][-self.max_length - 1:] - input_kwargs['token_type_ids'] = input_kwargs['input_ids'][:self.max_length - 1] TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index 23d5a61816..60782a618c 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -341,6 +341,21 @@ def data_collate_fn(batch: List[Dict[str, Any]], get_audio_info(tokenizer, audio_info=b['audio_info']) for b in batch ] + if batch[0].get('images') is not None: + res['images'] = [ + b['images'] + for b in batch + ] + if batch[0].get('cross_images') is not None: + res['cross_images'] = [ + b['cross_images'] + for b in batch + ] + if batch[0].get('token_type_ids') is not None: + res['token_type_ids'] = torch.stack([ + b['token_type_ids'] + for b in batch + ]) return res From 2be6724f25e454f6157ae0553694f6594a01d65f Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 20 Dec 2023 21:23:38 +0800 Subject: [PATCH 05/10] fix --- .../llm/scripts/cogagent_chat/lora/infer.sh | 4 +- swift/llm/infer.py | 8 ++- swift/llm/utils/dataset.py | 5 +- swift/llm/utils/model.py | 2 +- swift/llm/utils/template.py | 71 ++++++++++++------- swift/llm/utils/utils.py | 16 ++++- 6 files changed, 71 insertions(+), 35 deletions(-) diff --git a/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh b/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh index 5edb32c604..6dd1a0604d 100644 --- a/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh +++ b/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh @@ -2,9 +2,9 @@ PYTHONPATH=../../.. \ CUDA_VISIBLE_DEVICES=0 \ python llm_infer.py \ - --ckpt_dir "output/codefuse-codellama-34b-chat/vx_xxx/checkpoint-xxx" \ + --ckpt_dir "/mnt/workspace/yzhao/tastelikefeet/swift/examples/pytorch/llm/output/cogagent-chat/v47-20231220-132558/checkpoint-400" \ --load_args_from_ckpt_dir true \ - --eval_human false \ + --eval_human true \ --max_length 4096 \ --use_flash_attn true \ --max_new_tokens 2048 \ diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 869e6ab4c0..55474f7962 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -142,7 +142,7 @@ def prepare_model_template( template: Template = get_template(args.template_type, tokenizer, args.system, args.max_length, - args.truncation_strategy) + args.truncation_strategy, model=model) args.system = template.default_system logger.info(f'system: {args.system}') return model, template @@ -175,6 +175,10 @@ def llm_infer(args: InferArguments) -> None: logger.info( 'The current template only supports single-round dialogues.') history = [] + if 'cogagent' in args.model_type: + image = input('Input an image url<<< ') + from PIL import Image + image = Image.open(image) while True: if input_mode == 'S': query = input('<<< ') @@ -210,7 +214,7 @@ def llm_infer(args: InferArguments) -> None: print(response[print_idx:], end='', flush=True) print_idx = len(response) else: - gen = inference_stream(model, template, query, history) + gen = inference_stream(model, template, query, history, image=image) for response, new_history in gen: if len(response) > print_idx: print(response[print_idx:], end='', flush=True) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 5d76b2a297..9cc74cc6a9 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -28,7 +28,7 @@ def _remove_useless_columns(dataset: HfDataset) -> HfDataset: k_list = [] for k in dataset.features.keys(): - if k in {'query', 'response', 'system', 'history'}: + if k in {'query', 'response', 'system', 'history', 'image'}: k_list.append(k) dataset = dataset.select_columns(k_list) return dataset @@ -603,11 +603,10 @@ def _preprocess_sharegpt(dataset: HfDataset) -> HfDataset: def _preprocess_capcha_images(dataset: HfDataset) -> HfDataset: dataset = dataset.rename_columns({ - 'image': 'query', 'solution': 'response', }) def add_system(row): - row['system'] = 'CAPTCHA:' + row['query'] = 'CAPTCHA:' return row dataset = dataset.map(add_system) return dataset diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 7c6d82b217..3d579dfe14 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -350,7 +350,7 @@ def get_model_tokenizer_from_repo_cogagent(model_dir: str, model_config.torch_dtype = torch_dtype if tokenizer is None: tokenizer = AutoTokenizer.from_pretrained( - 'AI-ModelScope/vicuna-7b-v1.5', trust_remote_code=True) + 'AI-ModelScope/vicuna-7b-v1.5', trust_remote_code=True, padding_side='left') eos_token = kwargs.get('eos_token') if eos_token is not None: tokenizer.eos_token = eos_token diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index dd383b91da..3e7c97c3c4 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -310,7 +310,7 @@ def _init_template( self.truncation_strategy = truncation_strategy def encode(self, example: Dict[str, - Any]) -> Dict[str, Optional[List[int]]]: + Any], **kwargs) -> Dict[str, Optional[List[int]]]: if not self._is_init: raise ValueError( 'Template has not been initialized, please call init_template(...) first.' @@ -347,6 +347,8 @@ def _init_template( **kwargs ) -> None: self.model = kwargs.pop('model') + self.suffix = [tokenizer.eos_token] + tokenizer.padding_side = 'left' super()._init_template(tokenizer, default_system, max_length, truncation_strategy) @staticmethod @@ -391,10 +393,11 @@ def build_conversation_input_ids( tokenizer: "PreTrainedTokenizer", *, query: str, - label: str, + label: Optional[str] = None, history: Optional[List[Tuple[str, str]]] = None, images: Optional[List["PIL.Image"]] = None, template_version: Optional[Literal["base", "chat", "vqa"]] = None, + train: Optional[bool] = True, ): from torchvision import transforms image_size: int = self.model.config.vision_config['image_size'] @@ -435,7 +438,10 @@ def build_conversation_input_ids( input_ids += [tokenizer.pad_token_id] * vision_token_num token_type_ids += [self.VISION_TOKEN_TYPE] * vision_token_num text_ids = tokenizer.encode(text, add_special_tokens=False) - label_ids = tokenizer.encode(label, add_special_tokens=False) + if label is not None: + label_ids = tokenizer.encode(label, add_special_tokens=False) + else: + label_ids = [] if len(text_ids) + len(input_ids) + len(label_ids) > self.max_length - 1: if self.truncation_strategy == 'delete' or (len(input_ids) + len(label_ids) >= self.max_length - 1): return None @@ -443,33 +449,48 @@ def build_conversation_input_ids( text_ids = text_ids[-(self.max_length - len(input_ids) - len(label_ids) - 1):] input_ids += text_ids - labels = [-100] * len(input_ids) + label_ids + [tokenizer.eos_token_id] - input_ids += label_ids + [tokenizer.eos_token_id] - token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * (len(text_ids) + len(label_ids) + 1) + if label_ids: + labels = [-100] * len(input_ids) + label_ids + [tokenizer.eos_token_id] + if train: + input_ids += label_ids + [tokenizer.eos_token_id] + token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * (len(text_ids) + len(label_ids) + 1) + else: + token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * len(text_ids) attention_mask = [1] * len(input_ids) - if len(input_ids) < self.max_length: - padding_len = self.max_length - len(input_ids) - input_ids += [tokenizer.pad_token_id] * padding_len - token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * padding_len - attention_mask += [0] * padding_len - labels += [-100] * padding_len - - return { - 'input_ids': torch.tensor(input_ids, dtype=torch.long), - 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long), - 'attention_mask': torch.tensor(attention_mask, dtype=torch.long), - 'images': images, - 'cross_images': cross_images, - 'labels': labels - } + # if len(input_ids) < self.max_length: + # padding_len = self.max_length - len(input_ids) + # input_ids += [tokenizer.pad_token_id] * padding_len + # token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * padding_len + # attention_mask += [0] * padding_len + # if label_ids: + # labels += [-100] * padding_len + + if train: + return { + 'input_ids': torch.tensor(input_ids, dtype=torch.long), + 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long), + 'attention_mask': torch.tensor(attention_mask, dtype=torch.long), + 'images': images, + 'cross_images': cross_images, + 'labels': labels, + } + else: + return { + 'input_ids': torch.tensor(input_ids, dtype=torch.long), + 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).unsqueeze(0), + 'attention_mask': torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0), + 'images': [images], + 'cross_images': [cross_images], + } def encode(self, example: Dict[str, - Any]) -> Dict[str, Optional[List[int]]]: - return self.build_conversation_input_ids(self.tokenizer, query=example['system'], - label=example['response'], + Any], train: Optional[bool] = True) -> Dict[str, Optional[List[int]]]: + return self.build_conversation_input_ids(self.tokenizer, query=example['query'], + label=example.get('response'), history=example.get('history'), - images=[example['query'].convert('RGB')]) + images=[example['image'].convert('RGB')], + train=train) TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index 60782a618c..ef6ef209a8 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -460,6 +460,7 @@ def inference_stream( query: str, history: Optional[History] = None, system: Optional[str] = None, + image: Optional['Image'] = None, *, generation_config: Optional[GenerationConfig] = None ) -> Iterator[Tuple[str, History]]: @@ -471,13 +472,18 @@ def inference_stream( else: history = deepcopy(history) example = {'query': query, 'history': history, 'system': system} - inputs = template.encode(example) + if image is not None: + example['image'] = image + inputs = template.encode(example, train=False) audio_info = inputs.get('audio_info') # Compatible with qwen-audio input_ids = inputs['input_ids'] tokenizer = template.tokenizer device = next(model.parameters()).device input_ids = torch.tensor(input_ids)[None].to(device) - attention_mask = torch.ones_like(input_ids).to(device) + if 'attention_mask' not in inputs: + attention_mask = torch.ones_like(input_ids).to(device) + else: + attention_mask = inputs['attention_mask'].to(device) model.eval() if generation_config is None: generation_config = getattr(model, 'generation_config', None) @@ -497,6 +503,12 @@ def inference_stream( stop_words = [template.suffix[-1]] decode_kwargs = {} model_kwargs = {} + if 'token_type_ids' in inputs: + model_kwargs['token_type_ids'] = inputs['token_type_ids'].to(device) + if 'images' in inputs: + model_kwargs['images'] = [[inputs['images'][0][0].to(device).to(torch.float16)]] + if 'cross_images' in inputs: + model_kwargs['cross_images'] = [[inputs['cross_images'][0][0].to(device).to(torch.float16)]] if audio_info is not None: audio_info = get_audio_info(tokenizer, audio_info=audio_info) decode_kwargs['audio_info'] = audio_info From 27c7631820ce0fa4b44cec96fd5e62023acce22d Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 20 Dec 2023 21:38:40 +0800 Subject: [PATCH 06/10] fix --- swift/llm/infer.py | 13 +- swift/llm/sft.py | 10 +- swift/llm/utils/dataset.py | 6 +- swift/llm/utils/model.py | 29 +++-- swift/llm/utils/template.py | 230 +++++++++++++++++++----------------- swift/llm/utils/utils.py | 26 ++-- 6 files changed, 166 insertions(+), 148 deletions(-) diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 55474f7962..e0f5ade9c9 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -140,9 +140,13 @@ def prepare_model_template( logger.info(get_model_info(model)) show_layers(model) - template: Template = get_template(args.template_type, tokenizer, - args.system, args.max_length, - args.truncation_strategy, model=model) + template: Template = get_template( + args.template_type, + tokenizer, + args.system, + args.max_length, + args.truncation_strategy, + model=model) args.system = template.default_system logger.info(f'system: {args.system}') return model, template @@ -214,7 +218,8 @@ def llm_infer(args: InferArguments) -> None: print(response[print_idx:], end='', flush=True) print_idx = len(response) else: - gen = inference_stream(model, template, query, history, image=image) + gen = inference_stream( + model, template, query, history, image=image) for response, new_history in gen: if len(response) > print_idx: print(response[print_idx:], end='', flush=True) diff --git a/swift/llm/sft.py b/swift/llm/sft.py index b81998ded7..8c57a11fc4 100644 --- a/swift/llm/sft.py +++ b/swift/llm/sft.py @@ -172,9 +172,13 @@ def llm_sft(args: SftArguments) -> str: logger.info(f'train_dataset: {train_dataset}') logger.info(f'val_dataset: {val_dataset}') - template: Template = get_template(args.template_type, tokenizer, - args.system, args.max_length, - args.truncation_strategy, model=model) + template: Template = get_template( + args.template_type, + tokenizer, + args.system, + args.max_length, + args.truncation_strategy, + model=model) args.system = template.default_system logger.info(f'system: {args.system}') if not args.lazy_tokenize: diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 9cc74cc6a9..d6a8803625 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -605,23 +605,23 @@ def _preprocess_capcha_images(dataset: HfDataset) -> HfDataset: dataset = dataset.rename_columns({ 'solution': 'response', }) + def add_system(row): row['query'] = 'CAPTCHA:' return row + dataset = dataset.map(add_system) return dataset register_dataset( DatasetName.capcha_images, - 'AI-ModelScope/captcha-images', - [('default', 'train')], + 'AI-ModelScope/captcha-images', [('default', 'train')], [('default', 'validation')], _preprocess_capcha_images, get_dataset_from_repo, tags=['chat', 'multi-modal', 'vision', '🔥']) - register_dataset( DatasetName.cls_fudan_news_zh, 'damo/zh_cls_fudan-news', ['train'], diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 3d579dfe14..8253bb4e83 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -173,9 +173,11 @@ class LoRATM(NamedTuple): qwen = ['c_attn'] polylm = ['c_attn'] bloom = ['query_key_value'] - cogagent = ['vision_expert_query_key_value', 'vision_expert_dense', - 'language_expert_query_key_value', 'language_expert_dense', - 'query', 'key_value', 'dense'] + cogagent = [ + 'vision_expert_query_key_value', 'vision_expert_dense', + 'language_expert_query_key_value', 'language_expert_dense', 'query', + 'key_value', 'dense' + ] GetModelTokenizerFunction = Callable[..., Tuple[Optional[PreTrainedModel], @@ -335,14 +337,15 @@ def get_model_tokenizer_from_repo(model_dir: str, TemplateType.cogagent, requires=['transformers>=4.36'], support_vllm=False) -def get_model_tokenizer_from_repo_cogagent(model_dir: str, - torch_dtype: Dtype, - model_kwargs: Dict[str, Any], - load_model: bool = True, - model_config=None, - tokenizer=None, - automodel_class=AutoModelForCausalLM, - **kwargs): +def get_model_tokenizer_from_repo_cogagent( + model_dir: str, + torch_dtype: Dtype, + model_kwargs: Dict[str, Any], + load_model: bool = True, + model_config=None, + tokenizer=None, + automodel_class=AutoModelForCausalLM, + **kwargs): """load from an independent repository""" if model_config is None: model_config = AutoConfig.from_pretrained( @@ -350,7 +353,9 @@ def get_model_tokenizer_from_repo_cogagent(model_dir: str, model_config.torch_dtype = torch_dtype if tokenizer is None: tokenizer = AutoTokenizer.from_pretrained( - 'AI-ModelScope/vicuna-7b-v1.5', trust_remote_code=True, padding_side='left') + 'AI-ModelScope/vicuna-7b-v1.5', + trust_remote_code=True, + padding_side='left') eos_token = kwargs.get('eos_token') if eos_token is not None: tokenizer.eos_token = eos_token diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 3e7c97c3c4..6488b49aa3 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -80,7 +80,7 @@ def get_audio_info( *, context: Optional[str] = None, audio_info: Optional[Dict[str, - Any]] = None) -> Optional[Dict[str, Any]]: + Any]] = None) -> Optional[Dict[str, Any]]: assert context is not None or audio_info is not None assert context is None or audio_info is None if context is None: @@ -93,13 +93,13 @@ def get_audio_info( def _concat_context_list( - context_list: List[Context], - res_context_list: List[Context], - compute_loss_idx: List[int], - system: Optional[str] = None, - query: Optional[str] = None, - response: Optional[str] = None, - round0: Optional[int] = None, + context_list: List[Context], + res_context_list: List[Context], + compute_loss_idx: List[int], + system: Optional[str] = None, + query: Optional[str] = None, + response: Optional[str] = None, + round0: Optional[int] = None, ) -> None: # concat context list and replace placeholder round1 = None @@ -124,10 +124,10 @@ def _concat_context_list( def _encode_context_list( - tokenizer: PreTrainedTokenizerBase, - context_list: List[Context], - compute_loss_idx: Optional[List[int]] = None, - **args, + tokenizer: PreTrainedTokenizerBase, + context_list: List[Context], + compute_loss_idx: Optional[List[int]] = None, + **args, ) -> Tuple[List[int], Optional[List[int]], Dict[str, Any]]: input_ids: List[int] = [] labels: List[int] = [] @@ -292,14 +292,13 @@ def __init__(self, self.use_default_system = True self._is_init = False - def _init_template( - self, - tokenizer: PreTrainedTokenizerBase, - default_system: Optional[str] = None, - max_length: Optional[int] = None, - truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', - **kwargs - ) -> None: + def _init_template(self, + tokenizer: PreTrainedTokenizerBase, + default_system: Optional[str] = None, + max_length: Optional[int] = None, + truncation_strategy: Literal[ + 'delete', 'truncation_left'] = 'delete', + **kwargs) -> None: assert self._is_init is False self._is_init = True self.tokenizer = tokenizer @@ -310,7 +309,7 @@ def _init_template( self.truncation_strategy = truncation_strategy def encode(self, example: Dict[str, - Any], **kwargs) -> Dict[str, Optional[List[int]]]: + Any]) -> Dict[str, Optional[List[int]]]: if not self._is_init: raise ValueError( 'Template has not been initialized, please call init_template(...) first.' @@ -338,42 +337,41 @@ class CogAgentTemplate(Template): LANGUAGE_TOKEN_TYPE = 0 VISION_TOKEN_TYPE = 1 - def _init_template( - self, - tokenizer: PreTrainedTokenizerBase, - default_system: Optional[str] = None, - max_length: Optional[int] = None, - truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', - **kwargs - ) -> None: + def _init_template(self, + tokenizer: PreTrainedTokenizerBase, + default_system: Optional[str] = None, + max_length: Optional[int] = None, + truncation_strategy: Literal[ + 'delete', 'truncation_left'] = 'delete', + **kwargs) -> None: self.model = kwargs.pop('model') self.suffix = [tokenizer.eos_token] - tokenizer.padding_side = 'left' - super()._init_template(tokenizer, default_system, max_length, truncation_strategy) + super()._init_template(tokenizer, default_system, max_length, + truncation_strategy) @staticmethod def vqa_history_to_prompt(history, query): # Only support single round chat in vqa mode - prompt = "Question: " + prompt = 'Question: ' # for i, (old_query, response) in enumerate(history): # prompt += old_query + " Short answer: " + response + " Question: " - prompt += query + " Short answer:" + prompt += query + ' Short answer:' return prompt @staticmethod def chat_old_history_to_prompt(history, query): - prompt = "Question: " + prompt = 'Question: ' for i, (old_query, response) in enumerate(history): - prompt += old_query + " Answer: " + response + "\nQuestion: " - prompt += query + " Answer:" + prompt += old_query + ' Answer: ' + response + '\nQuestion: ' + prompt += query + ' Answer:' return prompt @staticmethod def chat_history_to_prompt(history, query): - prompt = " [INST] " + prompt = ' [INST] ' for i, (old_query, response) in enumerate(history): - prompt += old_query + " [/INST] " + response + " [INST] " - prompt += query + " [/INST] " + prompt += old_query + ' [/INST] ' + response + ' [INST] ' + prompt += query + ' [/INST] ' return prompt @staticmethod @@ -382,29 +380,29 @@ def base_history_to_prompt(history, query): return prompt _history_to_prompt = { - "base": base_history_to_prompt, - "chat": chat_history_to_prompt, - "chat_old": chat_old_history_to_prompt, - "vqa": vqa_history_to_prompt + 'base': base_history_to_prompt, + 'chat': chat_history_to_prompt, + 'chat_old': chat_old_history_to_prompt, + 'vqa': vqa_history_to_prompt } def build_conversation_input_ids( - self, - tokenizer: "PreTrainedTokenizer", - *, - query: str, - label: Optional[str] = None, - history: Optional[List[Tuple[str, str]]] = None, - images: Optional[List["PIL.Image"]] = None, - template_version: Optional[Literal["base", "chat", "vqa"]] = None, - train: Optional[bool] = True, + self, + tokenizer: 'PreTrainedTokenizer', + *, + query: str, + label: Optional[str] = None, + history: Optional[List[Tuple[str, str]]] = None, + images: Optional[List['PIL.Image']] = None, + template_version: Optional[Literal['base', 'chat', 'vqa']] = None, ): from torchvision import transforms image_size: int = self.model.config.vision_config['image_size'] cross_image_size: int = self.model.config.cross_image_size patch_size: int = self.model.config.vision_config['patch_size'] template_version = template_version or self.model.config.template_version - assert images is None or len(images) <= 1, f"not support multi images by now." + assert images is None or len( + images) <= 1, 'not support multi images by now.' history = history or [] text = self._history_to_prompt[template_version](history, query) @@ -413,84 +411,95 @@ def build_conversation_input_ids( if images is not None and len(images) == 1: ori = images # vision - transform = transforms.Compose( - [ - transforms.Resize( - (image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC - ), - transforms.ToTensor(), - transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), - ] - ) + transform = transforms.Compose([ + transforms.Resize( + (image_size, image_size), + interpolation=transforms.InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) images = [transform(ori[0])] - cross_transform = transforms.Compose( - [ - transforms.Resize( - (cross_image_size, cross_image_size), interpolation=transforms.InterpolationMode.BICUBIC - ), - transforms.ToTensor(), - transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), - ] - ) + cross_transform = transforms.Compose([ + transforms.Resize( + (cross_image_size, cross_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) cross_images = [cross_transform(ori[0])] # language - vision_token_num = (image_size // patch_size) * (image_size // patch_size) + 2 + vision_token_num = (image_size // patch_size) * (image_size + // patch_size) + 2 input_ids += [tokenizer.pad_token_id] * vision_token_num token_type_ids += [self.VISION_TOKEN_TYPE] * vision_token_num text_ids = tokenizer.encode(text, add_special_tokens=False) - if label is not None: - label_ids = tokenizer.encode(label, add_special_tokens=False) - else: - label_ids = [] - if len(text_ids) + len(input_ids) + len(label_ids) > self.max_length - 1: - if self.truncation_strategy == 'delete' or (len(input_ids) + len(label_ids) >= self.max_length - 1): + train = label is not None + label_ids = tokenizer.encode( + label, add_special_tokens=False) if train else [] + if len(text_ids) + len(input_ids) + len( + label_ids) > self.max_length - 1: + if self.truncation_strategy == 'delete' or ( + len(input_ids) + len(label_ids) >= self.max_length - 1): return None else: - text_ids = text_ids[-(self.max_length - len(input_ids) - len(label_ids) - 1):] + text_ids = text_ids[-(self.max_length - len(input_ids) + - len(label_ids) - 1):] input_ids += text_ids - if label_ids: - labels = [-100] * len(input_ids) + label_ids + [tokenizer.eos_token_id] if train: + labels = [-100] * len(input_ids) + label_ids + [ + tokenizer.eos_token_id + ] input_ids += label_ids + [tokenizer.eos_token_id] - token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * (len(text_ids) + len(label_ids) + 1) + token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * ( + len(text_ids) + len(label_ids) + 1) else: token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * len(text_ids) attention_mask = [1] * len(input_ids) - # if len(input_ids) < self.max_length: - # padding_len = self.max_length - len(input_ids) - # input_ids += [tokenizer.pad_token_id] * padding_len - # token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * padding_len - # attention_mask += [0] * padding_len - # if label_ids: - # labels += [-100] * padding_len - + if len(input_ids) < self.max_length and train: + padding_len = self.max_length - len(input_ids) + input_ids += [tokenizer.pad_token_id] * padding_len + token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * padding_len + attention_mask += [0] * padding_len + if label_ids: + labels += [-100] * padding_len + if train: return { 'input_ids': torch.tensor(input_ids, dtype=torch.long), - 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long), - 'attention_mask': torch.tensor(attention_mask, dtype=torch.long), + 'token_type_ids': + torch.tensor(token_type_ids, dtype=torch.long), + 'attention_mask': + torch.tensor(attention_mask, dtype=torch.long), 'images': images, 'cross_images': cross_images, 'labels': labels, } else: return { - 'input_ids': torch.tensor(input_ids, dtype=torch.long), - 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).unsqueeze(0), - 'attention_mask': torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0), + 'input_ids': + torch.tensor(input_ids, dtype=torch.long), + 'token_type_ids': + torch.tensor(token_type_ids, dtype=torch.long).unsqueeze(0), + 'attention_mask': + torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0), 'images': [images], 'cross_images': [cross_images], } - def encode(self, example: Dict[str, - Any], train: Optional[bool] = True) -> Dict[str, Optional[List[int]]]: - return self.build_conversation_input_ids(self.tokenizer, query=example['query'], - label=example.get('response'), - history=example.get('history'), - images=[example['image'].convert('RGB')], - train=train) + def encode(self, + example: Dict[str, Any], + train: Optional[bool] = True) -> Dict[str, Optional[List[int]]]: + return self.build_conversation_input_ids( + self.tokenizer, + query=example['query'], + label=example.get('response'), + history=example.get('history'), + images=[example['image'].convert('RGB')], + train=train) TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} @@ -647,18 +656,17 @@ def register_template(template_type: str, Template(['{{SYSTEM}}'], ['### Human: {{QUERY}}\n\n### Assistant: '], ['<|endoftext|>'], ['<|endoftext|>'], '')) -register_template( - TemplateType.cogagent, - CogAgentTemplate([], [], [], [], None, [])) +register_template(TemplateType.cogagent, + CogAgentTemplate([], [], [], [], None, [])) def get_template( - template_type: str, - tokenizer: PreTrainedTokenizerBase, - default_system: Optional[str] = None, - max_length: Optional[int] = None, - truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', - **kwargs, + template_type: str, + tokenizer: PreTrainedTokenizerBase, + default_system: Optional[str] = None, + max_length: Optional[int] = None, + truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', + **kwargs, ) -> Template: template_info = TEMPLATE_MAPPING[template_type] template = deepcopy(template_info['template']) diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index ef6ef209a8..8a3b208ed0 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -342,20 +342,12 @@ def data_collate_fn(batch: List[Dict[str, Any]], for b in batch ] if batch[0].get('images') is not None: - res['images'] = [ - b['images'] - for b in batch - ] + res['images'] = [b['images'] for b in batch] if batch[0].get('cross_images') is not None: - res['cross_images'] = [ - b['cross_images'] - for b in batch - ] + res['cross_images'] = [b['cross_images'] for b in batch] if batch[0].get('token_type_ids') is not None: - res['token_type_ids'] = torch.stack([ - b['token_type_ids'] - for b in batch - ]) + res['token_type_ids'] = torch.stack( + [b['token_type_ids'] for b in batch]) return res @@ -474,7 +466,7 @@ def inference_stream( example = {'query': query, 'history': history, 'system': system} if image is not None: example['image'] = image - inputs = template.encode(example, train=False) + inputs = template.encode(example) audio_info = inputs.get('audio_info') # Compatible with qwen-audio input_ids = inputs['input_ids'] tokenizer = template.tokenizer @@ -506,9 +498,13 @@ def inference_stream( if 'token_type_ids' in inputs: model_kwargs['token_type_ids'] = inputs['token_type_ids'].to(device) if 'images' in inputs: - model_kwargs['images'] = [[inputs['images'][0][0].to(device).to(torch.float16)]] + model_kwargs['images'] = [[ + inputs['images'][0][0].to(device).to(torch.float16) + ]] if 'cross_images' in inputs: - model_kwargs['cross_images'] = [[inputs['cross_images'][0][0].to(device).to(torch.float16)]] + model_kwargs['cross_images'] = [[ + inputs['cross_images'][0][0].to(device).to(torch.float16) + ]] if audio_info is not None: audio_info = get_audio_info(tokenizer, audio_info=audio_info) decode_kwargs['audio_info'] = audio_info From 7a9bcdb5400edd6eea93e2431e2528fcc9bd374e Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 20 Dec 2023 21:46:46 +0800 Subject: [PATCH 07/10] fix --- swift/llm/utils/template.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 6488b49aa3..fe27c62e7b 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -491,15 +491,13 @@ def build_conversation_input_ids( } def encode(self, - example: Dict[str, Any], - train: Optional[bool] = True) -> Dict[str, Optional[List[int]]]: + example: Dict[str, Any]) -> Dict[str, Optional[List[int]]]: return self.build_conversation_input_ids( self.tokenizer, query=example['query'], label=example.get('response'), history=example.get('history'), - images=[example['image'].convert('RGB')], - train=train) + images=[example['image'].convert('RGB')]) TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} From c5520ee882d1b5f02dc8d9de268c22052742b875 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 20 Dec 2023 21:47:41 +0800 Subject: [PATCH 08/10] fix --- swift/llm/utils/template.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index fe27c62e7b..c117132903 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -490,8 +490,8 @@ def build_conversation_input_ids( 'cross_images': [cross_images], } - def encode(self, - example: Dict[str, Any]) -> Dict[str, Optional[List[int]]]: + def encode(self, example: Dict[str, + Any]) -> Dict[str, Optional[List[int]]]: return self.build_conversation_input_ids( self.tokenizer, query=example['query'], From 2973d9b91fb280466e576a3dfc0fc309619695d2 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 20 Dec 2023 22:03:31 +0800 Subject: [PATCH 09/10] fix --- .../llm/scripts/cogagent_chat/lora/infer.sh | 2 +- .../llm/scripts/cogagent_chat/lora/sft.sh | 17 ++++++++--------- swift/llm/utils/model.py | 3 +++ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh b/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh index 6dd1a0604d..4d0e48de20 100644 --- a/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh +++ b/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh @@ -2,7 +2,7 @@ PYTHONPATH=../../.. \ CUDA_VISIBLE_DEVICES=0 \ python llm_infer.py \ - --ckpt_dir "/mnt/workspace/yzhao/tastelikefeet/swift/examples/pytorch/llm/output/cogagent-chat/v47-20231220-132558/checkpoint-400" \ + --ckpt_dir "/xxx/xxx/cogagent-chat/vx-xxx/checkpoint-xx" \ --load_args_from_ckpt_dir true \ --eval_human true \ --max_length 4096 \ diff --git a/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh b/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh index c23bbb8749..0b642444db 100644 --- a/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh +++ b/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh @@ -1,23 +1,22 @@ -# Experimental environment: V100, A10, 3090 -# 18GB GPU memory +# Experimental environment: 2 * A100 +# 2 * 45GB PYTHONPATH=../../.. \ -CUDA_VISIBLE_DEVICES=0 \ +CUDA_VISIBLE_DEVICES=0,1 \ python llm_sft.py \ --model_type cogagent-chat \ --sft_type lora \ --tuner_backend swift \ --dtype fp16 \ --output_dir output \ - --custom_train_dataset_path xxx.jsonl \ - --custom_val_dataset_path yyy.jsonl \ + --dataset capcha-images \ --train_dataset_sample -1 \ - --num_train_epochs 1 \ - --max_length 4096 \ + --num_train_epochs 2 \ + --max_length 1024 \ --check_dataset_strategy warning \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0.05 \ - --gradient_checkpointing true \ + --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0.01 \ --learning_rate 1e-4 \ @@ -27,7 +26,7 @@ python llm_sft.py \ --eval_steps 100 \ --save_steps 100 \ --save_total_limit 2 \ - --logging_steps 10 \ + --logging_steps 10 --push_to_hub false \ --hub_model_id cogagent-chat-lora \ --hub_private_repo true \ diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 8253bb4e83..72f3fc8049 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -367,6 +367,9 @@ def get_model_tokenizer_from_repo_cogagent( torch_dtype=torch_dtype, trust_remote_code=True, **model_kwargs) + logger.info( + 'CogAgent with FusedLayerNorm will cause an training loss of Nan, ' + 'to avoid this, please uninstall apex.') return model, tokenizer From df1c0073e48d6c508c6c994206428a73d854cc83 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 20 Dec 2023 22:25:00 +0800 Subject: [PATCH 10/10] fix --- swift/llm/utils/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index e10217721e..5ddc05745e 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -180,7 +180,7 @@ class LoRATM(NamedTuple): 'language_expert_query_key_value', 'language_expert_dense', 'query', 'key_value', 'dense' ] - phi = ['Wqkv'] + phi = ['Wqkv'] GetModelTokenizerFunction = Callable[..., Tuple[Optional[PreTrainedModel],