From a9618198d004f05299f75c2f1b8bebf947495a29 Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Tue, 19 Dec 2023 16:54:29 +0800
Subject: [PATCH 01/10] unfinished work

---
 .../llm/scripts/cogagent_chat/lora/infer.sh   | 15 ++++++++
 .../llm/scripts/cogagent_chat/lora/sft.sh     | 34 +++++++++++++++++++
 swift/llm/utils/model.py                      | 20 +++++++++++
 swift/llm/utils/template.py                   | 17 +++++++++-
 4 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh
 create mode 100644 examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh

diff --git a/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh b/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh
new file mode 100644
index 0000000000..5edb32c604
--- /dev/null
+++ b/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh
@@ -0,0 +1,15 @@
+# Experimental environment: V100, A10, 3090
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python llm_infer.py \
+    --ckpt_dir "output/codefuse-codellama-34b-chat/vx_xxx/checkpoint-xxx" \
+    --load_args_from_ckpt_dir true \
+    --eval_human false \
+    --max_length 4096 \
+    --use_flash_attn true \
+    --max_new_tokens 2048 \
+    --temperature 0.3 \
+    --top_p 0.7 \
+    --repetition_penalty 1.05 \
+    --do_sample true \
+    --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh b/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh
new file mode 100644
index 0000000000..c23bbb8749
--- /dev/null
+++ b/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh
@@ -0,0 +1,34 @@
+# Experimental environment: V100, A10, 3090
+# 18GB GPU memory
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python llm_sft.py \
+    --model_type cogagent-chat \
+    --sft_type lora \
+    --tuner_backend swift \
+    --dtype fp16 \
+    --output_dir output \
+    --custom_train_dataset_path xxx.jsonl \
+    --custom_val_dataset_path yyy.jsonl \
+    --train_dataset_sample -1 \
+    --num_train_epochs 1 \
+    --max_length 4096 \
+    --check_dataset_strategy warning \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --lora_dropout_p 0.05 \
+    --gradient_checkpointing true \
+    --batch_size 1 \
+    --weight_decay 0.01 \
+    --learning_rate 1e-4 \
+    --gradient_accumulation_steps 16 \
+    --max_grad_norm 0.5 \
+    --warmup_ratio 0.03 \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 2 \
+    --logging_steps 10 \
+    --push_to_hub false \
+    --hub_model_id cogagent-chat-lora \
+    --hub_private_repo true \
+    --hub_token 'your-sdk-token' \
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 4aeb013db3..370eee36d9 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -152,6 +152,9 @@ class ModelType:
     deepseek_coder_33b = 'deepseek-coder-33b'
     deepseek_coder_33b_chat = 'deepseek-coder-33b-chat'
 
+    cogagent_chat = 'cogagent-chat'
+    cogagent_vqa = 'cogagent-vqa'
+
     @classmethod
     def get_model_name_list(cls) -> List[str]:
         res = []
@@ -170,6 +173,9 @@ class LoRATM(NamedTuple):
     qwen = ['c_attn']
     polylm = ['c_attn']
     bloom = ['query_key_value']
+    cogagent = ['vision_expert_query_key_value', 'vision_expert_dense',
+                     'language_expert_query_key_value', 'language_expert_dense',
+                     'query', 'key_value', 'dense']
 
 
 GetModelTokenizerFunction = Callable[..., Tuple[Optional[PreTrainedModel],
@@ -285,6 +291,20 @@ def _register_model(
     TemplateType.default_generation,
     requires=['transformers<4.34'],
     support_vllm=True)
+@register_model(
+    ModelType.cogagent_chat,
+    'ZhipuAI/cogagent-chat',
+    LoRATM.cogagent_chat,
+    TemplateType.llama,
+    requires=['transformers>=4.36'],
+    support_vllm=False)
+@register_model(
+    ModelType.cogagent_vqa,
+    'ZhipuAI/cogagent-vqa',
+    LoRATM.cogagent,
+    TemplateType.cogagent_vqa,
+    requires=['transformers>=4.36'],
+    support_vllm=False)
 def get_model_tokenizer_from_repo(model_dir: str,
                                   torch_dtype: Dtype,
                                   model_kwargs: Dict[str, Any],
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index fb46b7ad24..9c357228d9 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -35,6 +35,7 @@ class TemplateType:
     deepseek = 'deepseek'
     codefuse_codellama = 'codefuse-codellama'
     deepseek_coder = 'deepseek-coder'
+    cogagent = 'cogagent'
 
     @classmethod
     def get_template_name_list(cls) -> List[str]:
@@ -125,7 +126,8 @@ def _concat_context_list(
 def _encode_context_list(
     tokenizer: PreTrainedTokenizerBase,
     context_list: List[Context],
-    compute_loss_idx: Optional[List[int]] = None
+    compute_loss_idx: Optional[List[int]] = None,
+    **args,
 ) -> Tuple[List[int], Optional[List[int]], Dict[str, Any]]:
     input_ids: List[int] = []
     labels: List[int] = []
@@ -154,6 +156,7 @@ def _encode_context_list(
                             [old_audio_info[k], audio_info[k]], dim=0)
                     for k in ['audio_span_tokens', 'audio_urls']:
                         old_audio_info[k] = old_audio_info[k] + audio_info[k]
+
             token_list = tokenizer(
                 context,
                 return_attention_mask=False,
@@ -330,6 +333,14 @@ def encode(self, example: Dict[str,
                        self.truncation_strategy)
 
 
+class CogAgentTemplate(Template):
+
+    def encode(self, example: Dict[str,
+                                   Any], model) -> Dict[str, Optional[List[int]]]:
+        image = Image.open(context).convert('RGB')
+        return model.build_conversation_input_ids(self.tokenizer, query=example['query'],
+                                                               history=example['history'], images=[example['image']])
+
 TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}
 
 
@@ -484,6 +495,10 @@ def register_template(template_type: str,
     Template(['{{SYSTEM}}'], ['### Human: {{QUERY}}\n\n### Assistant: '],
              ['<|endoftext|>'], ['<|endoftext|>'], ''))
 
+register_template(
+    TemplateType.cogagent,
+    CogAgentTemplate([], [], [], [], None, []))
+
 
 def get_template(
     template_type: str,

From 175e20893da086ee468ee3ed8666f7e5debafee3 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Tue, 19 Dec 2023 20:17:19 +0800
Subject: [PATCH 02/10] fix

---
 swift/llm/sft.py            |  2 +-
 swift/llm/utils/dataset.py  | 24 ++++++++++++++++++++++++
 swift/llm/utils/model.py    |  6 +++---
 swift/llm/utils/template.py | 26 +++++++++++++++++++-------
 4 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/swift/llm/sft.py b/swift/llm/sft.py
index 96854339da..b81998ded7 100644
--- a/swift/llm/sft.py
+++ b/swift/llm/sft.py
@@ -174,7 +174,7 @@ def llm_sft(args: SftArguments) -> str:
     logger.info(f'val_dataset: {val_dataset}')
     template: Template = get_template(args.template_type, tokenizer,
                                       args.system, args.max_length,
-                                      args.truncation_strategy)
+                                      args.truncation_strategy, model=model)
     args.system = template.default_system
     logger.info(f'system: {args.system}')
     if not args.lazy_tokenize:
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index 0845fe03ea..5d76b2a297 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -106,6 +106,7 @@ class DatasetName:
     # vision
     coco_en = 'coco-en'
     coco_mini_en = 'coco-mini-en'
+    capcha_images = 'capcha-images'
     # audio
     aishell1_zh = 'aishell1-zh'
     aishell1_mini_zh = 'aishell1-mini-zh'
@@ -599,6 +600,29 @@ def _preprocess_sharegpt(dataset: HfDataset) -> HfDataset:
     get_dataset_from_repo,
     tags=['chat', 'general', 'multi-round'])
 
+
+def _preprocess_capcha_images(dataset: HfDataset) -> HfDataset:
+    dataset = dataset.rename_columns({
+        'image': 'query',
+        'solution': 'response',
+    })
+    def add_system(row):
+        row['system'] = 'CAPTCHA:'
+        return row
+    dataset = dataset.map(add_system)
+    return dataset
+
+
+register_dataset(
+    DatasetName.capcha_images,
+    'AI-ModelScope/captcha-images',
+    [('default', 'train')],
+    [('default', 'validation')],
+    _preprocess_capcha_images,
+    get_dataset_from_repo,
+    tags=['chat', 'multi-modal', 'vision', '🔥'])
+
+
 register_dataset(
     DatasetName.cls_fudan_news_zh,
     'damo/zh_cls_fudan-news', ['train'],
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 370eee36d9..48824a7732 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -294,15 +294,15 @@ def _register_model(
 @register_model(
     ModelType.cogagent_chat,
     'ZhipuAI/cogagent-chat',
-    LoRATM.cogagent_chat,
-    TemplateType.llama,
+    LoRATM.cogagent,
+    TemplateType.cogagent,
     requires=['transformers>=4.36'],
     support_vllm=False)
 @register_model(
     ModelType.cogagent_vqa,
     'ZhipuAI/cogagent-vqa',
     LoRATM.cogagent,
-    TemplateType.cogagent_vqa,
+    TemplateType.cogagent,
     requires=['transformers>=4.36'],
     support_vllm=False)
 def get_model_tokenizer_from_repo(model_dir: str,
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 9c357228d9..31143dc0f4 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -297,7 +297,8 @@ def _init_template(
         tokenizer: PreTrainedTokenizerBase,
         default_system: Optional[str] = None,
         max_length: Optional[int] = None,
-        truncation_strategy: Literal['delete', 'truncation_left'] = 'delete'
+        truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
+        **kwargs
     ) -> None:
         assert self._is_init is False
         self._is_init = True
@@ -335,11 +336,21 @@ def encode(self, example: Dict[str,
 
 class CogAgentTemplate(Template):
 
+    def _init_template(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        default_system: Optional[str] = None,
+        max_length: Optional[int] = None,
+        truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
+        **kwargs
+    ) -> None:
+        self.model = kwargs.pop('model')
+        super()._init_template(tokenizer, default_system, max_length, truncation_strategy)
+
     def encode(self, example: Dict[str,
-                                   Any], model) -> Dict[str, Optional[List[int]]]:
-        image = Image.open(context).convert('RGB')
-        return model.build_conversation_input_ids(self.tokenizer, query=example['query'],
-                                                               history=example['history'], images=[example['image']])
+                                   Any]) -> Dict[str, Optional[List[int]]]:
+        return self.model.build_conversation_input_ids(self.tokenizer, query=example['response'],
+                                                               history=None, images=[example['query'].convert('RGB')])
 
 TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}
 
@@ -505,11 +516,12 @@ def get_template(
     tokenizer: PreTrainedTokenizerBase,
     default_system: Optional[str] = None,
     max_length: Optional[int] = None,
-    truncation_strategy: Literal['delete', 'truncation_left'] = 'delete'
+    truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
+    **kwargs,
 ) -> Template:
     template_info = TEMPLATE_MAPPING[template_type]
     template = deepcopy(template_info['template'])
     template._init_template(tokenizer, default_system, max_length,
-                            truncation_strategy)
+                            truncation_strategy, **kwargs)
     template.template_type = template_type
     return template

From d6c940d8605f27829d153d0c71bd8f26c9d24adc Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Tue, 19 Dec 2023 21:20:52 +0800
Subject: [PATCH 03/10] fix

---
 swift/llm/utils/template.py | 195 +++++++++++++++++++++++++++++-------
 1 file changed, 161 insertions(+), 34 deletions(-)

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 31143dc0f4..c2a469a404 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -80,7 +80,7 @@ def get_audio_info(
         *,
         context: Optional[str] = None,
         audio_info: Optional[Dict[str,
-                                  Any]] = None) -> Optional[Dict[str, Any]]:
+        Any]] = None) -> Optional[Dict[str, Any]]:
     assert context is not None or audio_info is not None
     assert context is None or audio_info is None
     if context is None:
@@ -93,13 +93,13 @@ def get_audio_info(
 
 
 def _concat_context_list(
-    context_list: List[Context],
-    res_context_list: List[Context],
-    compute_loss_idx: List[int],
-    system: Optional[str] = None,
-    query: Optional[str] = None,
-    response: Optional[str] = None,
-    round0: Optional[int] = None,
+        context_list: List[Context],
+        res_context_list: List[Context],
+        compute_loss_idx: List[int],
+        system: Optional[str] = None,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        round0: Optional[int] = None,
 ) -> None:
     # concat context list and replace placeholder
     round1 = None
@@ -124,10 +124,10 @@ def _concat_context_list(
 
 
 def _encode_context_list(
-    tokenizer: PreTrainedTokenizerBase,
-    context_list: List[Context],
-    compute_loss_idx: Optional[List[int]] = None,
-    **args,
+        tokenizer: PreTrainedTokenizerBase,
+        context_list: List[Context],
+        compute_loss_idx: Optional[List[int]] = None,
+        **args,
 ) -> Tuple[List[int], Optional[List[int]], Dict[str, Any]]:
     input_ids: List[int] = []
     labels: List[int] = []
@@ -293,12 +293,12 @@ def __init__(self,
         self._is_init = False
 
     def _init_template(
-        self,
-        tokenizer: PreTrainedTokenizerBase,
-        default_system: Optional[str] = None,
-        max_length: Optional[int] = None,
-        truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
-        **kwargs
+            self,
+            tokenizer: PreTrainedTokenizerBase,
+            default_system: Optional[str] = None,
+            max_length: Optional[int] = None,
+            truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
+            **kwargs
     ) -> None:
         assert self._is_init is False
         self._is_init = True
@@ -310,7 +310,7 @@ def _init_template(
         self.truncation_strategy = truncation_strategy
 
     def encode(self, example: Dict[str,
-                                   Any]) -> Dict[str, Optional[List[int]]]:
+    Any]) -> Dict[str, Optional[List[int]]]:
         if not self._is_init:
             raise ValueError(
                 'Template has not been initialized, please call init_template(...) first.'
@@ -335,22 +335,149 @@ def encode(self, example: Dict[str,
 
 
 class CogAgentTemplate(Template):
+    LANGUAGE_TOKEN_TYPE = 0
+    VISION_TOKEN_TYPE = 1
 
     def _init_template(
-        self,
-        tokenizer: PreTrainedTokenizerBase,
-        default_system: Optional[str] = None,
-        max_length: Optional[int] = None,
-        truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
-        **kwargs
+            self,
+            tokenizer: PreTrainedTokenizerBase,
+            default_system: Optional[str] = None,
+            max_length: Optional[int] = None,
+            truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
+            **kwargs
     ) -> None:
         self.model = kwargs.pop('model')
         super()._init_template(tokenizer, default_system, max_length, truncation_strategy)
 
+    @staticmethod
+    def vqa_history_to_prompt(history, query):
+        # Only support single round chat in vqa mode
+        prompt = "<EOI>Question: "
+        # for i, (old_query, response) in enumerate(history):
+        #     prompt += old_query + " Short answer: " + response + " Question: "
+        prompt += query + " Short answer:"
+        return prompt
+
+    @staticmethod
+    def chat_old_history_to_prompt(history, query):
+        prompt = "<EOI>Question: "
+        for i, (old_query, response) in enumerate(history):
+            prompt += old_query + " Answer: " + response + "\nQuestion: "
+        prompt += query + " Answer:"
+        return prompt
+
+    @staticmethod
+    def chat_history_to_prompt(history, query):
+        prompt = " [INST] "
+        for i, (old_query, response) in enumerate(history):
+            prompt += old_query + " [/INST] " + response + " [INST] "
+        prompt += query + " [/INST] "
+        return prompt
+
+    @staticmethod
+    def base_history_to_prompt(history, query):
+        prompt = query
+        return prompt
+
+    _history_to_prompt = {
+        "base": base_history_to_prompt,
+        "chat": chat_history_to_prompt,
+        "chat_old": chat_old_history_to_prompt,
+        "vqa": vqa_history_to_prompt
+    }
+
+    def build_conversation_input_ids(
+            self,
+            tokenizer: "PreTrainedTokenizer",
+            *,
+            query: str,
+            label: str,
+            history: Optional[List[Tuple[str, str]]] = None,
+            images: Optional[List["PIL.Image"]] = None,
+            template_version: Optional[Literal["base", "chat", "vqa"]] = None,
+    ):
+        from torchvision import transforms
+        image_size: int = self.config.vision_config['image_size']
+        cross_image_size: int = self.config.cross_image_size
+        patch_size: int = self.config.vision_config['patch_size']
+        template_version = template_version or self.config.template_version
+        assert images is None or len(images) <= 1, f"not support multi images by now."
+        history = history or []
+        text = self._history_to_prompt[template_version](history, query)
+
+        input_ids = [tokenizer.bos_token_id]
+        token_type_ids = [self.LANGUAGE_TOKEN_TYPE]
+        if images is not None and len(images) == 1:
+            ori = images
+            # vision
+            transform = transforms.Compose(
+                [
+                    transforms.Resize(
+                        (image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC
+                    ),
+                    transforms.ToTensor(),
+                    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+                ]
+            )
+            images = [transform(ori[0])]
+            cross_transform = transforms.Compose(
+                [
+                    transforms.Resize(
+                        (cross_image_size, cross_image_size), interpolation=transforms.InterpolationMode.BICUBIC
+                    ),
+                    transforms.ToTensor(),
+                    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+                ]
+            )
+            cross_images = [cross_transform(ori[0])]
+            # language
+            vision_token_num = (image_size // patch_size) * (image_size // patch_size) + 2
+            input_ids += [tokenizer.pad_token_id] * vision_token_num
+            token_type_ids += [self.VISION_TOKEN_TYPE] * vision_token_num
+        text_ids = tokenizer.encode(text, add_special_tokens=False)
+        label_ids = tokenizer.encode(label, add_special_tokens=False)
+        if len(text_ids) + len(input_ids) + len(label_ids) > self.max_length - 1:
+            if self.truncation_strategy == 'delete' or (len(input_ids) + len(label_ids) >= self.max_length - 1):
+                return None
+            else:
+                text_ids = text_ids[-(self.max_length - len(input_ids) - len(label_ids) - 1):]
+
+        input_ids += text_ids
+        labels = [-100] * len(input_ids) + label_ids + [tokenizer.eos_token_id]
+        input_ids += label_ids + [tokenizer.eos_token_id]
+        token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * len(text_ids)
+        attention_mask = [1] * len(input_ids)
+
+        if len(input_ids) < self.max_length:
+            padding_len = self.max_length - len(input_ids)
+            input_ids += [tokenizer.pad_token_id] * padding_len
+            token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * padding_len
+            attention_mask += [0] * padding_len
+            labels += [-100] * padding_len
+
+        return {
+            'input_ids': torch.tensor(input_ids, dtype=torch.long),
+            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
+            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
+            'images': images,
+            'cross_images': cross_images,
+            'labels': labels
+        }
+
     def encode(self, example: Dict[str,
-                                   Any]) -> Dict[str, Optional[List[int]]]:
-        return self.model.build_conversation_input_ids(self.tokenizer, query=example['response'],
-                                                               history=None, images=[example['query'].convert('RGB')])
+    Any]) -> Dict[str, Optional[List[int]]]:
+        input_kwargs = self.build_conversation_input_ids(self.tokenizer, query=example['system'],
+                                                         label=example['response'],
+                                                         history=example.get('history'),
+                                                         images=[example['query'].convert('RGB')])
+        if len(input_kwargs['input_ids']) > self.max_length - 1:
+            if self.truncation_strategy == 'delete':
+                return None
+            else:
+                input_kwargs['input_ids'] = input_kwargs['input_ids'][-self.max_length - 1:]
+                input_kwargs['attention_mask'] = input_kwargs['attention_mask'][-self.max_length - 1:]
+                input_kwargs['token_type_ids'] = input_kwargs['input_ids'][:self.max_length - 1]
+
 
 TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}
 
@@ -512,12 +639,12 @@ def register_template(template_type: str,
 
 
 def get_template(
-    template_type: str,
-    tokenizer: PreTrainedTokenizerBase,
-    default_system: Optional[str] = None,
-    max_length: Optional[int] = None,
-    truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
-    **kwargs,
+        template_type: str,
+        tokenizer: PreTrainedTokenizerBase,
+        default_system: Optional[str] = None,
+        max_length: Optional[int] = None,
+        truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
+        **kwargs,
 ) -> Template:
     template_info = TEMPLATE_MAPPING[template_type]
     template = deepcopy(template_info['template'])

From 21a3477c756a9bfef04eafa0c8538de8b29c661a Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 20 Dec 2023 13:16:11 +0800
Subject: [PATCH 04/10] fix

---
 swift/llm/utils/model.py    | 34 ++++++++++++++++++++++++++++++++--
 swift/llm/utils/template.py | 19 ++++++-------------
 swift/llm/utils/utils.py    | 15 +++++++++++++++
 3 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 48824a7732..7c6d82b217 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -291,6 +291,36 @@ def _register_model(
     TemplateType.default_generation,
     requires=['transformers<4.34'],
     support_vllm=True)
+def get_model_tokenizer_from_repo(model_dir: str,
+                                  torch_dtype: Dtype,
+                                  model_kwargs: Dict[str, Any],
+                                  load_model: bool = True,
+                                  model_config=None,
+                                  tokenizer=None,
+                                  automodel_class=AutoModelForCausalLM,
+                                  **kwargs):
+    """load from an independent repository"""
+    if model_config is None:
+        model_config = AutoConfig.from_pretrained(
+            model_dir, trust_remote_code=True)
+    model_config.torch_dtype = torch_dtype
+    if tokenizer is None:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_dir, trust_remote_code=True)
+    eos_token = kwargs.get('eos_token')
+    if eos_token is not None:
+        tokenizer.eos_token = eos_token
+    model = None
+    if load_model:
+        model = automodel_class.from_pretrained(
+            model_dir,
+            config=model_config,
+            torch_dtype=torch_dtype,
+            trust_remote_code=True,
+            **model_kwargs)
+    return model, tokenizer
+
+
 @register_model(
     ModelType.cogagent_chat,
     'ZhipuAI/cogagent-chat',
@@ -305,7 +335,7 @@ def _register_model(
     TemplateType.cogagent,
     requires=['transformers>=4.36'],
     support_vllm=False)
-def get_model_tokenizer_from_repo(model_dir: str,
+def get_model_tokenizer_from_repo_cogagent(model_dir: str,
                                   torch_dtype: Dtype,
                                   model_kwargs: Dict[str, Any],
                                   load_model: bool = True,
@@ -320,7 +350,7 @@ def get_model_tokenizer_from_repo(model_dir: str,
     model_config.torch_dtype = torch_dtype
     if tokenizer is None:
         tokenizer = AutoTokenizer.from_pretrained(
-            model_dir, trust_remote_code=True)
+            'AI-ModelScope/vicuna-7b-v1.5', trust_remote_code=True)
     eos_token = kwargs.get('eos_token')
     if eos_token is not None:
         tokenizer.eos_token = eos_token
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index c2a469a404..dd383b91da 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -397,10 +397,10 @@ def build_conversation_input_ids(
             template_version: Optional[Literal["base", "chat", "vqa"]] = None,
     ):
         from torchvision import transforms
-        image_size: int = self.config.vision_config['image_size']
-        cross_image_size: int = self.config.cross_image_size
-        patch_size: int = self.config.vision_config['patch_size']
-        template_version = template_version or self.config.template_version
+        image_size: int = self.model.config.vision_config['image_size']
+        cross_image_size: int = self.model.config.cross_image_size
+        patch_size: int = self.model.config.vision_config['patch_size']
+        template_version = template_version or self.model.config.template_version
         assert images is None or len(images) <= 1, f"not support multi images by now."
         history = history or []
         text = self._history_to_prompt[template_version](history, query)
@@ -445,7 +445,7 @@ def build_conversation_input_ids(
         input_ids += text_ids
         labels = [-100] * len(input_ids) + label_ids + [tokenizer.eos_token_id]
         input_ids += label_ids + [tokenizer.eos_token_id]
-        token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * len(text_ids)
+        token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * (len(text_ids) + len(label_ids) + 1)
         attention_mask = [1] * len(input_ids)
 
         if len(input_ids) < self.max_length:
@@ -466,17 +466,10 @@ def build_conversation_input_ids(
 
     def encode(self, example: Dict[str,
     Any]) -> Dict[str, Optional[List[int]]]:
-        input_kwargs = self.build_conversation_input_ids(self.tokenizer, query=example['system'],
+        return self.build_conversation_input_ids(self.tokenizer, query=example['system'],
                                                          label=example['response'],
                                                          history=example.get('history'),
                                                          images=[example['query'].convert('RGB')])
-        if len(input_kwargs['input_ids']) > self.max_length - 1:
-            if self.truncation_strategy == 'delete':
-                return None
-            else:
-                input_kwargs['input_ids'] = input_kwargs['input_ids'][-self.max_length - 1:]
-                input_kwargs['attention_mask'] = input_kwargs['attention_mask'][-self.max_length - 1:]
-                input_kwargs['token_type_ids'] = input_kwargs['input_ids'][:self.max_length - 1]
 
 
 TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}
diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py
index 23d5a61816..60782a618c 100644
--- a/swift/llm/utils/utils.py
+++ b/swift/llm/utils/utils.py
@@ -341,6 +341,21 @@ def data_collate_fn(batch: List[Dict[str, Any]],
             get_audio_info(tokenizer, audio_info=b['audio_info'])
             for b in batch
         ]
+    if batch[0].get('images') is not None:
+        res['images'] = [
+            b['images']
+            for b in batch
+        ]
+    if batch[0].get('cross_images') is not None:
+        res['cross_images'] = [
+            b['cross_images']
+            for b in batch
+        ]
+    if batch[0].get('token_type_ids') is not None:
+        res['token_type_ids'] = torch.stack([
+            b['token_type_ids']
+            for b in batch
+        ])
     return res
 
 

From 2be6724f25e454f6157ae0553694f6594a01d65f Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 20 Dec 2023 21:23:38 +0800
Subject: [PATCH 05/10] fix

---
 .../llm/scripts/cogagent_chat/lora/infer.sh   |  4 +-
 swift/llm/infer.py                            |  8 ++-
 swift/llm/utils/dataset.py                    |  5 +-
 swift/llm/utils/model.py                      |  2 +-
 swift/llm/utils/template.py                   | 71 ++++++++++++-------
 swift/llm/utils/utils.py                      | 16 ++++-
 6 files changed, 71 insertions(+), 35 deletions(-)

diff --git a/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh b/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh
index 5edb32c604..6dd1a0604d 100644
--- a/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh
@@ -2,9 +2,9 @@
 PYTHONPATH=../../.. \
 CUDA_VISIBLE_DEVICES=0 \
 python llm_infer.py \
-    --ckpt_dir "output/codefuse-codellama-34b-chat/vx_xxx/checkpoint-xxx" \
+    --ckpt_dir "/mnt/workspace/yzhao/tastelikefeet/swift/examples/pytorch/llm/output/cogagent-chat/v47-20231220-132558/checkpoint-400" \
     --load_args_from_ckpt_dir true \
-    --eval_human false \
+    --eval_human true \
     --max_length 4096 \
     --use_flash_attn true \
     --max_new_tokens 2048 \
diff --git a/swift/llm/infer.py b/swift/llm/infer.py
index 869e6ab4c0..55474f7962 100644
--- a/swift/llm/infer.py
+++ b/swift/llm/infer.py
@@ -142,7 +142,7 @@ def prepare_model_template(
 
     template: Template = get_template(args.template_type, tokenizer,
                                       args.system, args.max_length,
-                                      args.truncation_strategy)
+                                      args.truncation_strategy, model=model)
     args.system = template.default_system
     logger.info(f'system: {args.system}')
     return model, template
@@ -175,6 +175,10 @@ def llm_infer(args: InferArguments) -> None:
             logger.info(
                 'The current template only supports single-round dialogues.')
         history = []
+        if 'cogagent' in args.model_type:
+            image = input('Input an image url<<< ')
+            from PIL import Image
+            image = Image.open(image)
         while True:
             if input_mode == 'S':
                 query = input('<<< ')
@@ -210,7 +214,7 @@ def llm_infer(args: InferArguments) -> None:
                         print(response[print_idx:], end='', flush=True)
                         print_idx = len(response)
             else:
-                gen = inference_stream(model, template, query, history)
+                gen = inference_stream(model, template, query, history, image=image)
                 for response, new_history in gen:
                     if len(response) > print_idx:
                         print(response[print_idx:], end='', flush=True)
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index 5d76b2a297..9cc74cc6a9 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -28,7 +28,7 @@
 def _remove_useless_columns(dataset: HfDataset) -> HfDataset:
     k_list = []
     for k in dataset.features.keys():
-        if k in {'query', 'response', 'system', 'history'}:
+        if k in {'query', 'response', 'system', 'history', 'image'}:
             k_list.append(k)
     dataset = dataset.select_columns(k_list)
     return dataset
@@ -603,11 +603,10 @@ def _preprocess_sharegpt(dataset: HfDataset) -> HfDataset:
 
 def _preprocess_capcha_images(dataset: HfDataset) -> HfDataset:
     dataset = dataset.rename_columns({
-        'image': 'query',
         'solution': 'response',
     })
     def add_system(row):
-        row['system'] = 'CAPTCHA:'
+        row['query'] = 'CAPTCHA:'
         return row
     dataset = dataset.map(add_system)
     return dataset
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 7c6d82b217..3d579dfe14 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -350,7 +350,7 @@ def get_model_tokenizer_from_repo_cogagent(model_dir: str,
     model_config.torch_dtype = torch_dtype
     if tokenizer is None:
         tokenizer = AutoTokenizer.from_pretrained(
-            'AI-ModelScope/vicuna-7b-v1.5', trust_remote_code=True)
+            'AI-ModelScope/vicuna-7b-v1.5', trust_remote_code=True, padding_side='left')
     eos_token = kwargs.get('eos_token')
     if eos_token is not None:
         tokenizer.eos_token = eos_token
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index dd383b91da..3e7c97c3c4 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -310,7 +310,7 @@ def _init_template(
         self.truncation_strategy = truncation_strategy
 
     def encode(self, example: Dict[str,
-    Any]) -> Dict[str, Optional[List[int]]]:
+    Any], **kwargs) -> Dict[str, Optional[List[int]]]:
         if not self._is_init:
             raise ValueError(
                 'Template has not been initialized, please call init_template(...) first.'
@@ -347,6 +347,8 @@ def _init_template(
             **kwargs
     ) -> None:
         self.model = kwargs.pop('model')
+        self.suffix = [tokenizer.eos_token]
+        tokenizer.padding_side = 'left'
         super()._init_template(tokenizer, default_system, max_length, truncation_strategy)
 
     @staticmethod
@@ -391,10 +393,11 @@ def build_conversation_input_ids(
             tokenizer: "PreTrainedTokenizer",
             *,
             query: str,
-            label: str,
+            label: Optional[str] = None,
             history: Optional[List[Tuple[str, str]]] = None,
             images: Optional[List["PIL.Image"]] = None,
             template_version: Optional[Literal["base", "chat", "vqa"]] = None,
+            train: Optional[bool] = True,
     ):
         from torchvision import transforms
         image_size: int = self.model.config.vision_config['image_size']
@@ -435,7 +438,10 @@ def build_conversation_input_ids(
             input_ids += [tokenizer.pad_token_id] * vision_token_num
             token_type_ids += [self.VISION_TOKEN_TYPE] * vision_token_num
         text_ids = tokenizer.encode(text, add_special_tokens=False)
-        label_ids = tokenizer.encode(label, add_special_tokens=False)
+        if label is not None:
+            label_ids = tokenizer.encode(label, add_special_tokens=False)
+        else:
+            label_ids = []
         if len(text_ids) + len(input_ids) + len(label_ids) > self.max_length - 1:
             if self.truncation_strategy == 'delete' or (len(input_ids) + len(label_ids) >= self.max_length - 1):
                 return None
@@ -443,33 +449,48 @@ def build_conversation_input_ids(
                 text_ids = text_ids[-(self.max_length - len(input_ids) - len(label_ids) - 1):]
 
         input_ids += text_ids
-        labels = [-100] * len(input_ids) + label_ids + [tokenizer.eos_token_id]
-        input_ids += label_ids + [tokenizer.eos_token_id]
-        token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * (len(text_ids) + len(label_ids) + 1)
+        if label_ids:
+            labels = [-100] * len(input_ids) + label_ids + [tokenizer.eos_token_id]
+        if train:
+            input_ids += label_ids + [tokenizer.eos_token_id]
+            token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * (len(text_ids) + len(label_ids) + 1)
+        else:
+            token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * len(text_ids)
         attention_mask = [1] * len(input_ids)
 
-        if len(input_ids) < self.max_length:
-            padding_len = self.max_length - len(input_ids)
-            input_ids += [tokenizer.pad_token_id] * padding_len
-            token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * padding_len
-            attention_mask += [0] * padding_len
-            labels += [-100] * padding_len
-
-        return {
-            'input_ids': torch.tensor(input_ids, dtype=torch.long),
-            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
-            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
-            'images': images,
-            'cross_images': cross_images,
-            'labels': labels
-        }
+        # if len(input_ids) < self.max_length:
+        #     padding_len = self.max_length - len(input_ids)
+        #     input_ids += [tokenizer.pad_token_id] * padding_len
+        #     token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * padding_len
+        #     attention_mask += [0] * padding_len
+        #     if label_ids:
+        #         labels += [-100] * padding_len
+        
+        if train:
+            return {
+                'input_ids': torch.tensor(input_ids, dtype=torch.long),
+                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
+                'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
+                'images': images,
+                'cross_images': cross_images,
+                'labels': labels,
+            }
+        else:
+            return {
+                'input_ids': torch.tensor(input_ids, dtype=torch.long),
+                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).unsqueeze(0),
+                'attention_mask': torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0),
+                'images': [images],
+                'cross_images': [cross_images],
+            }
 
     def encode(self, example: Dict[str,
-    Any]) -> Dict[str, Optional[List[int]]]:
-        return self.build_conversation_input_ids(self.tokenizer, query=example['system'],
-                                                         label=example['response'],
+    Any], train: Optional[bool] = True) -> Dict[str, Optional[List[int]]]:
+        return self.build_conversation_input_ids(self.tokenizer, query=example['query'],
+                                                         label=example.get('response'),
                                                          history=example.get('history'),
-                                                         images=[example['query'].convert('RGB')])
+                                                         images=[example['image'].convert('RGB')],
+                                                         train=train)
 
 
 TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}
diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py
index 60782a618c..ef6ef209a8 100644
--- a/swift/llm/utils/utils.py
+++ b/swift/llm/utils/utils.py
@@ -460,6 +460,7 @@ def inference_stream(
     query: str,
     history: Optional[History] = None,
     system: Optional[str] = None,
+    image: Optional['Image'] = None,
     *,
     generation_config: Optional[GenerationConfig] = None
 ) -> Iterator[Tuple[str, History]]:
@@ -471,13 +472,18 @@ def inference_stream(
     else:
         history = deepcopy(history)
     example = {'query': query, 'history': history, 'system': system}
-    inputs = template.encode(example)
+    if image is not None:
+        example['image'] = image
+    inputs = template.encode(example, train=False)
     audio_info = inputs.get('audio_info')  # Compatible with qwen-audio
     input_ids = inputs['input_ids']
     tokenizer = template.tokenizer
     device = next(model.parameters()).device
     input_ids = torch.tensor(input_ids)[None].to(device)
-    attention_mask = torch.ones_like(input_ids).to(device)
+    if 'attention_mask' not in inputs:
+        attention_mask = torch.ones_like(input_ids).to(device)
+    else:
+        attention_mask = inputs['attention_mask'].to(device)
     model.eval()
     if generation_config is None:
         generation_config = getattr(model, 'generation_config', None)
@@ -497,6 +503,12 @@ def inference_stream(
     stop_words = [template.suffix[-1]]
     decode_kwargs = {}
     model_kwargs = {}
+    if 'token_type_ids' in inputs:
+        model_kwargs['token_type_ids'] = inputs['token_type_ids'].to(device)
+    if 'images' in inputs:
+        model_kwargs['images'] = [[inputs['images'][0][0].to(device).to(torch.float16)]]
+    if 'cross_images' in inputs:
+        model_kwargs['cross_images'] = [[inputs['cross_images'][0][0].to(device).to(torch.float16)]]
     if audio_info is not None:
         audio_info = get_audio_info(tokenizer, audio_info=audio_info)
         decode_kwargs['audio_info'] = audio_info

From 27c7631820ce0fa4b44cec96fd5e62023acce22d Mon Sep 17 00:00:00 2001
From: "yuze.zyz" <yuze.zyz@alibaba-inc.com>
Date: Wed, 20 Dec 2023 21:38:40 +0800
Subject: [PATCH 06/10] fix: reformat code and derive CogAgent train mode from label

---
 swift/llm/infer.py          |  13 +-
 swift/llm/sft.py            |  10 +-
 swift/llm/utils/dataset.py  |   6 +-
 swift/llm/utils/model.py    |  29 +++--
 swift/llm/utils/template.py | 230 +++++++++++++++++++-----------------
 swift/llm/utils/utils.py    |  26 ++--
 6 files changed, 166 insertions(+), 148 deletions(-)

diff --git a/swift/llm/infer.py b/swift/llm/infer.py
index 55474f7962..e0f5ade9c9 100644
--- a/swift/llm/infer.py
+++ b/swift/llm/infer.py
@@ -140,9 +140,13 @@ def prepare_model_template(
     logger.info(get_model_info(model))
     show_layers(model)
 
-    template: Template = get_template(args.template_type, tokenizer,
-                                      args.system, args.max_length,
-                                      args.truncation_strategy, model=model)
+    template: Template = get_template(
+        args.template_type,
+        tokenizer,
+        args.system,
+        args.max_length,
+        args.truncation_strategy,
+        model=model)
     args.system = template.default_system
     logger.info(f'system: {args.system}')
     return model, template
@@ -214,7 +218,8 @@ def llm_infer(args: InferArguments) -> None:
                         print(response[print_idx:], end='', flush=True)
                         print_idx = len(response)
             else:
-                gen = inference_stream(model, template, query, history, image=image)
+                gen = inference_stream(
+                    model, template, query, history, image=image)
                 for response, new_history in gen:
                     if len(response) > print_idx:
                         print(response[print_idx:], end='', flush=True)
diff --git a/swift/llm/sft.py b/swift/llm/sft.py
index b81998ded7..8c57a11fc4 100644
--- a/swift/llm/sft.py
+++ b/swift/llm/sft.py
@@ -172,9 +172,13 @@ def llm_sft(args: SftArguments) -> str:
 
     logger.info(f'train_dataset: {train_dataset}')
     logger.info(f'val_dataset: {val_dataset}')
-    template: Template = get_template(args.template_type, tokenizer,
-                                      args.system, args.max_length,
-                                      args.truncation_strategy, model=model)
+    template: Template = get_template(
+        args.template_type,
+        tokenizer,
+        args.system,
+        args.max_length,
+        args.truncation_strategy,
+        model=model)
     args.system = template.default_system
     logger.info(f'system: {args.system}')
     if not args.lazy_tokenize:
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index 9cc74cc6a9..d6a8803625 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -605,23 +605,23 @@ def _preprocess_capcha_images(dataset: HfDataset) -> HfDataset:
     dataset = dataset.rename_columns({
         'solution': 'response',
     })
+
     def add_system(row):
         row['query'] = 'CAPTCHA:'
         return row
+
     dataset = dataset.map(add_system)
     return dataset
 
 
 register_dataset(
     DatasetName.capcha_images,
-    'AI-ModelScope/captcha-images',
-    [('default', 'train')],
+    'AI-ModelScope/captcha-images', [('default', 'train')],
     [('default', 'validation')],
     _preprocess_capcha_images,
     get_dataset_from_repo,
     tags=['chat', 'multi-modal', 'vision', '🔥'])
 
-
 register_dataset(
     DatasetName.cls_fudan_news_zh,
     'damo/zh_cls_fudan-news', ['train'],
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 3d579dfe14..8253bb4e83 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -173,9 +173,11 @@ class LoRATM(NamedTuple):
     qwen = ['c_attn']
     polylm = ['c_attn']
     bloom = ['query_key_value']
-    cogagent = ['vision_expert_query_key_value', 'vision_expert_dense',
-                     'language_expert_query_key_value', 'language_expert_dense',
-                     'query', 'key_value', 'dense']
+    cogagent = [
+        'vision_expert_query_key_value', 'vision_expert_dense',
+        'language_expert_query_key_value', 'language_expert_dense', 'query',
+        'key_value', 'dense'
+    ]
 
 
 GetModelTokenizerFunction = Callable[..., Tuple[Optional[PreTrainedModel],
@@ -335,14 +337,15 @@ def get_model_tokenizer_from_repo(model_dir: str,
     TemplateType.cogagent,
     requires=['transformers>=4.36'],
     support_vllm=False)
-def get_model_tokenizer_from_repo_cogagent(model_dir: str,
-                                  torch_dtype: Dtype,
-                                  model_kwargs: Dict[str, Any],
-                                  load_model: bool = True,
-                                  model_config=None,
-                                  tokenizer=None,
-                                  automodel_class=AutoModelForCausalLM,
-                                  **kwargs):
+def get_model_tokenizer_from_repo_cogagent(
+        model_dir: str,
+        torch_dtype: Dtype,
+        model_kwargs: Dict[str, Any],
+        load_model: bool = True,
+        model_config=None,
+        tokenizer=None,
+        automodel_class=AutoModelForCausalLM,
+        **kwargs):
     """load from an independent repository"""
     if model_config is None:
         model_config = AutoConfig.from_pretrained(
@@ -350,7 +353,9 @@ def get_model_tokenizer_from_repo_cogagent(model_dir: str,
     model_config.torch_dtype = torch_dtype
     if tokenizer is None:
         tokenizer = AutoTokenizer.from_pretrained(
-            'AI-ModelScope/vicuna-7b-v1.5', trust_remote_code=True, padding_side='left')
+            'AI-ModelScope/vicuna-7b-v1.5',
+            trust_remote_code=True,
+            padding_side='left')
     eos_token = kwargs.get('eos_token')
     if eos_token is not None:
         tokenizer.eos_token = eos_token
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 3e7c97c3c4..6488b49aa3 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -80,7 +80,7 @@ def get_audio_info(
         *,
         context: Optional[str] = None,
         audio_info: Optional[Dict[str,
-        Any]] = None) -> Optional[Dict[str, Any]]:
+                                  Any]] = None) -> Optional[Dict[str, Any]]:
     assert context is not None or audio_info is not None
     assert context is None or audio_info is None
     if context is None:
@@ -93,13 +93,13 @@ def get_audio_info(
 
 
 def _concat_context_list(
-        context_list: List[Context],
-        res_context_list: List[Context],
-        compute_loss_idx: List[int],
-        system: Optional[str] = None,
-        query: Optional[str] = None,
-        response: Optional[str] = None,
-        round0: Optional[int] = None,
+    context_list: List[Context],
+    res_context_list: List[Context],
+    compute_loss_idx: List[int],
+    system: Optional[str] = None,
+    query: Optional[str] = None,
+    response: Optional[str] = None,
+    round0: Optional[int] = None,
 ) -> None:
     # concat context list and replace placeholder
     round1 = None
@@ -124,10 +124,10 @@ def _concat_context_list(
 
 
 def _encode_context_list(
-        tokenizer: PreTrainedTokenizerBase,
-        context_list: List[Context],
-        compute_loss_idx: Optional[List[int]] = None,
-        **args,
+    tokenizer: PreTrainedTokenizerBase,
+    context_list: List[Context],
+    compute_loss_idx: Optional[List[int]] = None,
+    **args,
 ) -> Tuple[List[int], Optional[List[int]], Dict[str, Any]]:
     input_ids: List[int] = []
     labels: List[int] = []
@@ -292,14 +292,13 @@ def __init__(self,
         self.use_default_system = True
         self._is_init = False
 
-    def _init_template(
-            self,
-            tokenizer: PreTrainedTokenizerBase,
-            default_system: Optional[str] = None,
-            max_length: Optional[int] = None,
-            truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
-            **kwargs
-    ) -> None:
+    def _init_template(self,
+                       tokenizer: PreTrainedTokenizerBase,
+                       default_system: Optional[str] = None,
+                       max_length: Optional[int] = None,
+                       truncation_strategy: Literal[
+                           'delete', 'truncation_left'] = 'delete',
+                       **kwargs) -> None:
         assert self._is_init is False
         self._is_init = True
         self.tokenizer = tokenizer
@@ -310,7 +309,7 @@ def _init_template(
         self.truncation_strategy = truncation_strategy
 
     def encode(self, example: Dict[str,
-    Any], **kwargs) -> Dict[str, Optional[List[int]]]:
+                                   Any]) -> Dict[str, Optional[List[int]]]:
         if not self._is_init:
             raise ValueError(
                 'Template has not been initialized, please call init_template(...) first.'
@@ -338,42 +337,41 @@ class CogAgentTemplate(Template):
     LANGUAGE_TOKEN_TYPE = 0
     VISION_TOKEN_TYPE = 1
 
-    def _init_template(
-            self,
-            tokenizer: PreTrainedTokenizerBase,
-            default_system: Optional[str] = None,
-            max_length: Optional[int] = None,
-            truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
-            **kwargs
-    ) -> None:
+    def _init_template(self,
+                       tokenizer: PreTrainedTokenizerBase,
+                       default_system: Optional[str] = None,
+                       max_length: Optional[int] = None,
+                       truncation_strategy: Literal[
+                           'delete', 'truncation_left'] = 'delete',
+                       **kwargs) -> None:
         self.model = kwargs.pop('model')
         self.suffix = [tokenizer.eos_token]
-        tokenizer.padding_side = 'left'
-        super()._init_template(tokenizer, default_system, max_length, truncation_strategy)
+        super()._init_template(tokenizer, default_system, max_length,
+                               truncation_strategy)
 
     @staticmethod
     def vqa_history_to_prompt(history, query):
         # Only support single round chat in vqa mode
-        prompt = "<EOI>Question: "
+        prompt = '<EOI>Question: '
         # for i, (old_query, response) in enumerate(history):
         #     prompt += old_query + " Short answer: " + response + " Question: "
-        prompt += query + " Short answer:"
+        prompt += query + ' Short answer:'
         return prompt
 
     @staticmethod
     def chat_old_history_to_prompt(history, query):
-        prompt = "<EOI>Question: "
+        prompt = '<EOI>Question: '
         for i, (old_query, response) in enumerate(history):
-            prompt += old_query + " Answer: " + response + "\nQuestion: "
-        prompt += query + " Answer:"
+            prompt += old_query + ' Answer: ' + response + '\nQuestion: '
+        prompt += query + ' Answer:'
         return prompt
 
     @staticmethod
     def chat_history_to_prompt(history, query):
-        prompt = " [INST] "
+        prompt = ' [INST] '
         for i, (old_query, response) in enumerate(history):
-            prompt += old_query + " [/INST] " + response + " [INST] "
-        prompt += query + " [/INST] "
+            prompt += old_query + ' [/INST] ' + response + ' [INST] '
+        prompt += query + ' [/INST] '
         return prompt
 
     @staticmethod
@@ -382,29 +380,29 @@ def base_history_to_prompt(history, query):
         return prompt
 
     _history_to_prompt = {
-        "base": base_history_to_prompt,
-        "chat": chat_history_to_prompt,
-        "chat_old": chat_old_history_to_prompt,
-        "vqa": vqa_history_to_prompt
+        'base': base_history_to_prompt,
+        'chat': chat_history_to_prompt,
+        'chat_old': chat_old_history_to_prompt,
+        'vqa': vqa_history_to_prompt
     }
 
     def build_conversation_input_ids(
-            self,
-            tokenizer: "PreTrainedTokenizer",
-            *,
-            query: str,
-            label: Optional[str] = None,
-            history: Optional[List[Tuple[str, str]]] = None,
-            images: Optional[List["PIL.Image"]] = None,
-            template_version: Optional[Literal["base", "chat", "vqa"]] = None,
-            train: Optional[bool] = True,
+        self,
+        tokenizer: 'PreTrainedTokenizer',
+        *,
+        query: str,
+        label: Optional[str] = None,
+        history: Optional[List[Tuple[str, str]]] = None,
+        images: Optional[List['PIL.Image']] = None,
+        template_version: Optional[Literal['base', 'chat', 'vqa']] = None,
     ):
         from torchvision import transforms
         image_size: int = self.model.config.vision_config['image_size']
         cross_image_size: int = self.model.config.cross_image_size
         patch_size: int = self.model.config.vision_config['patch_size']
         template_version = template_version or self.model.config.template_version
-        assert images is None or len(images) <= 1, f"not support multi images by now."
+        assert images is None or len(
+            images) <= 1, 'not support multi images by now.'
         history = history or []
         text = self._history_to_prompt[template_version](history, query)
 
@@ -413,84 +411,95 @@ def build_conversation_input_ids(
         if images is not None and len(images) == 1:
             ori = images
             # vision
-            transform = transforms.Compose(
-                [
-                    transforms.Resize(
-                        (image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC
-                    ),
-                    transforms.ToTensor(),
-                    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
-                ]
-            )
+            transform = transforms.Compose([
+                transforms.Resize(
+                    (image_size, image_size),
+                    interpolation=transforms.InterpolationMode.BICUBIC),
+                transforms.ToTensor(),
+                transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
+                                     (0.26862954, 0.26130258, 0.27577711)),
+            ])
             images = [transform(ori[0])]
-            cross_transform = transforms.Compose(
-                [
-                    transforms.Resize(
-                        (cross_image_size, cross_image_size), interpolation=transforms.InterpolationMode.BICUBIC
-                    ),
-                    transforms.ToTensor(),
-                    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
-                ]
-            )
+            cross_transform = transforms.Compose([
+                transforms.Resize(
+                    (cross_image_size, cross_image_size),
+                    interpolation=transforms.InterpolationMode.BICUBIC),
+                transforms.ToTensor(),
+                transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
+                                     (0.26862954, 0.26130258, 0.27577711)),
+            ])
             cross_images = [cross_transform(ori[0])]
             # language
-            vision_token_num = (image_size // patch_size) * (image_size // patch_size) + 2
+            vision_token_num = (image_size // patch_size) * (image_size
+                                                             // patch_size) + 2
             input_ids += [tokenizer.pad_token_id] * vision_token_num
             token_type_ids += [self.VISION_TOKEN_TYPE] * vision_token_num
         text_ids = tokenizer.encode(text, add_special_tokens=False)
-        if label is not None:
-            label_ids = tokenizer.encode(label, add_special_tokens=False)
-        else:
-            label_ids = []
-        if len(text_ids) + len(input_ids) + len(label_ids) > self.max_length - 1:
-            if self.truncation_strategy == 'delete' or (len(input_ids) + len(label_ids) >= self.max_length - 1):
+        train = label is not None
+        label_ids = tokenizer.encode(
+            label, add_special_tokens=False) if train else []
+        if len(text_ids) + len(input_ids) + len(
+                label_ids) > self.max_length - 1:
+            if self.truncation_strategy == 'delete' or (
+                    len(input_ids) + len(label_ids) >= self.max_length - 1):
                 return None
             else:
-                text_ids = text_ids[-(self.max_length - len(input_ids) - len(label_ids) - 1):]
+                text_ids = text_ids[-(self.max_length - len(input_ids)
+                                      - len(label_ids) - 1):]
 
         input_ids += text_ids
-        if label_ids:
-            labels = [-100] * len(input_ids) + label_ids + [tokenizer.eos_token_id]
         if train:
+            labels = [-100] * len(input_ids) + label_ids + [
+                tokenizer.eos_token_id
+            ]
             input_ids += label_ids + [tokenizer.eos_token_id]
-            token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * (len(text_ids) + len(label_ids) + 1)
+            token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * (
+                len(text_ids) + len(label_ids) + 1)
         else:
             token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * len(text_ids)
         attention_mask = [1] * len(input_ids)
 
-        # if len(input_ids) < self.max_length:
-        #     padding_len = self.max_length - len(input_ids)
-        #     input_ids += [tokenizer.pad_token_id] * padding_len
-        #     token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * padding_len
-        #     attention_mask += [0] * padding_len
-        #     if label_ids:
-        #         labels += [-100] * padding_len
-        
+        if len(input_ids) < self.max_length and train:
+            padding_len = self.max_length - len(input_ids)
+            input_ids += [tokenizer.pad_token_id] * padding_len
+            token_type_ids += [self.LANGUAGE_TOKEN_TYPE] * padding_len
+            attention_mask += [0] * padding_len
+            if label_ids:
+                labels += [-100] * padding_len
+
         if train:
             return {
                 'input_ids': torch.tensor(input_ids, dtype=torch.long),
-                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
-                'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
+                'token_type_ids':
+                torch.tensor(token_type_ids, dtype=torch.long),
+                'attention_mask':
+                torch.tensor(attention_mask, dtype=torch.long),
                 'images': images,
                 'cross_images': cross_images,
                 'labels': labels,
             }
         else:
             return {
-                'input_ids': torch.tensor(input_ids, dtype=torch.long),
-                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).unsqueeze(0),
-                'attention_mask': torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0),
+                'input_ids':
+                torch.tensor(input_ids, dtype=torch.long),
+                'token_type_ids':
+                torch.tensor(token_type_ids, dtype=torch.long).unsqueeze(0),
+                'attention_mask':
+                torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0),
                 'images': [images],
                 'cross_images': [cross_images],
             }
 
-    def encode(self, example: Dict[str,
-    Any], train: Optional[bool] = True) -> Dict[str, Optional[List[int]]]:
-        return self.build_conversation_input_ids(self.tokenizer, query=example['query'],
-                                                         label=example.get('response'),
-                                                         history=example.get('history'),
-                                                         images=[example['image'].convert('RGB')],
-                                                         train=train)
+    def encode(self,
+               example: Dict[str, Any],
+               train: Optional[bool] = True) -> Dict[str, Optional[List[int]]]:
+        return self.build_conversation_input_ids(
+            self.tokenizer,
+            query=example['query'],
+            label=example.get('response'),
+            history=example.get('history'),
+            images=[example['image'].convert('RGB')],
+            train=train)
 
 
 TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}
@@ -647,18 +656,17 @@ def register_template(template_type: str,
     Template(['{{SYSTEM}}'], ['### Human: {{QUERY}}\n\n### Assistant: '],
              ['<|endoftext|>'], ['<|endoftext|>'], ''))
 
-register_template(
-    TemplateType.cogagent,
-    CogAgentTemplate([], [], [], [], None, []))
+register_template(TemplateType.cogagent,
+                  CogAgentTemplate([], [], [], [], None, []))
 
 
 def get_template(
-        template_type: str,
-        tokenizer: PreTrainedTokenizerBase,
-        default_system: Optional[str] = None,
-        max_length: Optional[int] = None,
-        truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
-        **kwargs,
+    template_type: str,
+    tokenizer: PreTrainedTokenizerBase,
+    default_system: Optional[str] = None,
+    max_length: Optional[int] = None,
+    truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
+    **kwargs,
 ) -> Template:
     template_info = TEMPLATE_MAPPING[template_type]
     template = deepcopy(template_info['template'])
diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py
index ef6ef209a8..8a3b208ed0 100644
--- a/swift/llm/utils/utils.py
+++ b/swift/llm/utils/utils.py
@@ -342,20 +342,12 @@ def data_collate_fn(batch: List[Dict[str, Any]],
             for b in batch
         ]
     if batch[0].get('images') is not None:
-        res['images'] = [
-            b['images']
-            for b in batch
-        ]
+        res['images'] = [b['images'] for b in batch]
     if batch[0].get('cross_images') is not None:
-        res['cross_images'] = [
-            b['cross_images']
-            for b in batch
-        ]
+        res['cross_images'] = [b['cross_images'] for b in batch]
     if batch[0].get('token_type_ids') is not None:
-        res['token_type_ids'] = torch.stack([
-            b['token_type_ids']
-            for b in batch
-        ])
+        res['token_type_ids'] = torch.stack(
+            [b['token_type_ids'] for b in batch])
     return res
 
 
@@ -474,7 +466,7 @@ def inference_stream(
     example = {'query': query, 'history': history, 'system': system}
     if image is not None:
         example['image'] = image
-    inputs = template.encode(example, train=False)
+    inputs = template.encode(example)
     audio_info = inputs.get('audio_info')  # Compatible with qwen-audio
     input_ids = inputs['input_ids']
     tokenizer = template.tokenizer
@@ -506,9 +498,13 @@ def inference_stream(
     if 'token_type_ids' in inputs:
         model_kwargs['token_type_ids'] = inputs['token_type_ids'].to(device)
     if 'images' in inputs:
-        model_kwargs['images'] = [[inputs['images'][0][0].to(device).to(torch.float16)]]
+        model_kwargs['images'] = [[
+            inputs['images'][0][0].to(device).to(torch.float16)
+        ]]
     if 'cross_images' in inputs:
-        model_kwargs['cross_images'] = [[inputs['cross_images'][0][0].to(device).to(torch.float16)]]
+        model_kwargs['cross_images'] = [[
+            inputs['cross_images'][0][0].to(device).to(torch.float16)
+        ]]
     if audio_info is not None:
         audio_info = get_audio_info(tokenizer, audio_info=audio_info)
         decode_kwargs['audio_info'] = audio_info

From 7a9bcdb5400edd6eea93e2431e2528fcc9bd374e Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 20 Dec 2023 21:46:46 +0800
Subject: [PATCH 07/10] fix

---
 swift/llm/utils/template.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 6488b49aa3..fe27c62e7b 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -491,15 +491,13 @@ def build_conversation_input_ids(
             }
 
     def encode(self,
-               example: Dict[str, Any],
-               train: Optional[bool] = True) -> Dict[str, Optional[List[int]]]:
+               example: Dict[str, Any]) -> Dict[str, Optional[List[int]]]:
         return self.build_conversation_input_ids(
             self.tokenizer,
             query=example['query'],
             label=example.get('response'),
             history=example.get('history'),
-            images=[example['image'].convert('RGB')],
-            train=train)
+            images=[example['image'].convert('RGB')])
 
 
 TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}

From c5520ee882d1b5f02dc8d9de268c22052742b875 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 20 Dec 2023 21:47:41 +0800
Subject: [PATCH 08/10] fix

---
 swift/llm/utils/template.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index fe27c62e7b..c117132903 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -490,8 +490,8 @@ def build_conversation_input_ids(
                 'cross_images': [cross_images],
             }
 
-    def encode(self,
-               example: Dict[str, Any]) -> Dict[str, Optional[List[int]]]:
+    def encode(self, example: Dict[str,
+                                   Any]) -> Dict[str, Optional[List[int]]]:
         return self.build_conversation_input_ids(
             self.tokenizer,
             query=example['query'],

From 2973d9b91fb280466e576a3dfc0fc309619695d2 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 20 Dec 2023 22:03:31 +0800
Subject: [PATCH 09/10] fix

---
 .../llm/scripts/cogagent_chat/lora/infer.sh     |  2 +-
 .../llm/scripts/cogagent_chat/lora/sft.sh       | 17 ++++++++---------
 swift/llm/utils/model.py                        |  3 +++
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh b/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh
index 6dd1a0604d..4d0e48de20 100644
--- a/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/cogagent_chat/lora/infer.sh
@@ -2,7 +2,7 @@
 PYTHONPATH=../../.. \
 CUDA_VISIBLE_DEVICES=0 \
 python llm_infer.py \
-    --ckpt_dir "/mnt/workspace/yzhao/tastelikefeet/swift/examples/pytorch/llm/output/cogagent-chat/v47-20231220-132558/checkpoint-400" \
+    --ckpt_dir "/xxx/xxx/cogagent-chat/vx-xxx/checkpoint-xx" \
     --load_args_from_ckpt_dir true \
     --eval_human true \
     --max_length 4096 \
diff --git a/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh b/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh
index c23bbb8749..0b642444db 100644
--- a/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh
+++ b/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh
@@ -1,23 +1,22 @@
-# Experimental environment: V100, A10, 3090
-# 18GB GPU memory
+# Experimental environment: 2 * A100
+# 2 * 45GB
 PYTHONPATH=../../.. \
-CUDA_VISIBLE_DEVICES=0 \
+CUDA_VISIBLE_DEVICES=0,1 \
 python llm_sft.py \
     --model_type cogagent-chat \
     --sft_type lora \
     --tuner_backend swift \
     --dtype fp16 \
     --output_dir output \
-    --custom_train_dataset_path xxx.jsonl \
-    --custom_val_dataset_path yyy.jsonl \
+    --dataset capcha-images \
     --train_dataset_sample -1 \
-    --num_train_epochs 1 \
-    --max_length 4096 \
+    --num_train_epochs 2 \
+    --max_length 1024 \
     --check_dataset_strategy warning \
     --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0.05 \
-    --gradient_checkpointing true \
+    --gradient_checkpointing false \
     --batch_size 1 \
     --weight_decay 0.01 \
     --learning_rate 1e-4 \
@@ -27,7 +26,7 @@ python llm_sft.py \
     --eval_steps 100 \
     --save_steps 100 \
     --save_total_limit 2 \
-    --logging_steps 10 \
+    --logging_steps 10 \
     --push_to_hub false \
     --hub_model_id cogagent-chat-lora \
     --hub_private_repo true \
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 8253bb4e83..72f3fc8049 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -367,6 +367,9 @@ def get_model_tokenizer_from_repo_cogagent(
             torch_dtype=torch_dtype,
             trust_remote_code=True,
             **model_kwargs)
+        logger.info(
+            'CogAgent with FusedLayerNorm will cause an training loss of Nan, '
+            'to avoid this, please uninstall apex.')
     return model, tokenizer
 
 

From df1c0073e48d6c508c6c994206428a73d854cc83 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 20 Dec 2023 22:25:00 +0800
Subject: [PATCH 10/10] fix

---
 swift/llm/utils/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index e10217721e..5ddc05745e 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -180,7 +180,7 @@ class LoRATM(NamedTuple):
         'language_expert_query_key_value', 'language_expert_dense', 'query',
         'key_value', 'dense'
     ]
-	phi = ['Wqkv']
+    phi = ['Wqkv']
 
 
 GetModelTokenizerFunction = Callable[..., Tuple[Optional[PreTrainedModel],