diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 85c82980ee..ecf11ad48d 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -491,6 +491,7 @@ |internvl2-26b-awq|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)| |internvl2-40b-awq|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)| |internvl2-llama3-76b-awq|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)| +|deepseek-janus|[deepseek-ai/Janus-1.3B](https://modelscope.cn/models/deepseek-ai/Janus-1.3B/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-janus|✔|✘|✘|✘||vision|[deepseek-ai/Janus-1.3B](https://huggingface.co/deepseek-ai/Janus-1.3B)| |deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)| |deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)| |ovis1_6-gemma2-9b|[AIDC-AI/Ovis1.6-Gemma2-9B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-9B/summary)|^(llm)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|ovis1_6|✔|✘|✘|✘|transformers>=4.42|vision|[AIDC-AI/Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)| diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md index 4cb94ec658..43c1ac19d0 100644 --- a/docs/source_en/Instruction/Supported-models-datasets.md +++ b/docs/source_en/Instruction/Supported-models-datasets.md @@ -491,6 +491,7 @@ The table below introcudes all models supported by SWIFT: |internvl2-26b-awq|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)| 
 |internvl2-40b-awq|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)|
 |internvl2-llama3-76b-awq|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)|
+|deepseek-janus|[deepseek-ai/Janus-1.3B](https://modelscope.cn/models/deepseek-ai/Janus-1.3B/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-janus|✔|✘|✘|✘||vision|[deepseek-ai/Janus-1.3B](https://huggingface.co/deepseek-ai/Janus-1.3B)|
 |deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
 |deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
 |ovis1_6-gemma2-9b|[AIDC-AI/Ovis1.6-Gemma2-9B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-9B/summary)|^(llm)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|ovis1_6|✔|✘|✘|✘|transformers>=4.42|vision|[AIDC-AI/Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)|
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 6acb24a0da..f72ba4afea 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -439,6 +439,7 @@ class ModelType:
     # numina-math
     numina_math_7b = 'numina-math-7b'
     # deepseek-vl
+    deepseek_janus_1_3b = 'deepseek-janus-1_3b'
     deepseek_vl_1_3b_chat = 'deepseek-vl-1_3b-chat'
     deepseek_vl_7b_chat = 'deepseek-vl-7b-chat'
     # deepseek-v2
@@ -664,6 +665,7 @@ class LoRATM(NamedTuple):
     llama3_2_vision = 'llama3_2_vision'
     ovis1_6 = 'ovis1_6'
     molmo = 'molmo'
+    deepseek_janus = 'deepseek_janus'
     # default lora target modules for nlp llms.
     minicpm3 = ['q_a_proj', 'q_b_proj', 'kv_a_proj_with_mqa', 'kv_b_proj']
     baichuan = ['W_pack']
@@ -3800,7 +3802,8 @@ def get_model_tokenizer_qwen2_audio(model_dir: str,
                                     **kwargs):
     from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
     processor = AutoProcessor.from_pretrained(model_dir)
-    kwargs['automodel_class'] = Qwen2AudioForConditionalGeneration
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = Qwen2AudioForConditionalGeneration
     model, tokenizer = get_model_tokenizer_with_flash_attn(model_dir, torch_dtype, model_kwargs, load_model, **kwargs)
     tokenizer.processor = processor
     return model, tokenizer
@@ -3830,7 +3833,8 @@ def _read_from_stream(container: 'av.container.Container', start_offset: float,
     from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
     processor = AutoProcessor.from_pretrained(model_dir)
-    kwargs['automodel_class'] = Qwen2VLForConditionalGeneration
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = Qwen2VLForConditionalGeneration
     model, tokenizer = get_model_tokenizer_with_flash_attn(model_dir, torch_dtype, model_kwargs, load_model, **kwargs)
     tokenizer.processor = processor
     if model is not None:
@@ -4833,6 +4837,37 @@ def _new_func(self, *args, **kwargs):
         setattr(submodel, key, MethodType(_get_new_func(key), submodel))  # fix device_map


+@register_model(
+    ModelType.deepseek_janus_1_3b,
+    'deepseek-ai/Janus-1.3B',
+    LoRATM.deepseek_janus,
+    TemplateType.deepseek_janus,
+    support_flash_attn=True,
+    tags=['multi-modal', 'vision'],
+    placeholder_tokens=['<image_placeholder>'],
+    hf_model_id='deepseek-ai/Janus-1.3B')
+def get_model_tokenizer_deepseek_janus(model_dir: str, *args, **kwargs):
+    if 'local_repo_path' in kwargs:
+        local_repo_path = kwargs['local_repo_path']
+    else:
+        local_repo_path = git_clone_github('https://github.com/deepseek-ai/Janus')
+    sys.path.append(os.path.join(local_repo_path))
+    from janus.models import MultiModalityCausalLM, VLChatProcessor
+    from janus.utils.io import load_pil_images
+
+    processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_dir)
+    tokenizer = processor.tokenizer
+    model, tokenizer = get_model_tokenizer_with_flash_attn(model_dir, *args, tokenizer=tokenizer, **kwargs)
+    tokenizer.processor = processor
+    if model:
+        model.language_model.model.embed_tokens.register_forward_hook(_clone_hook)
+        model.language_model.model.embed_tokens.register_forward_hook(_output_device_map_hook)
+        func_list = ['generate', 'get_input_embeddings', 'forward', 'gradient_checkpointing_enable']
+        _use_submodel_func(model, 'language_model', func_list)
+        model.generation_config = model.language_model.generation_config
+    return model, tokenizer
+
+
 @register_model(
     ModelType.deepseek_vl_7b_chat,
     'deepseek-ai/deepseek-vl-7b-chat',
@@ -6418,7 +6453,8 @@ def get_model_tokenizer_llava_hf(model_dir: str, *args, **kwargs):
     hf_model_id='meta-llama/Llama-3.2-90B-Vision-Instruct')
 def get_model_tokenizer_llama3_2_vision(*args, **kwargs):
     from transformers import MllamaForConditionalGeneration
-    kwargs['automodel_class'] = MllamaForConditionalGeneration
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = MllamaForConditionalGeneration
     return get_model_tokenizer_llava_hf(*args, **kwargs)
@@ -6484,7 +6520,8 @@ def get_model_tokenizer_llava_1_5(*args, **kwargs):
     hf_model_id='llava-hf/llava-onevision-qwen2-72b-ov-hf')
 def get_model_tokenizer_llava_onevision(*args, **kwargs):
     from transformers import LlavaOnevisionForConditionalGeneration
-    kwargs['automodel_class'] = LlavaOnevisionForConditionalGeneration
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = LlavaOnevisionForConditionalGeneration
     return get_model_tokenizer_llava_hf(*args, **kwargs)
@@ -6614,7 +6651,8 @@ def get_model_tokenizer_llava_next_yi(*args, **kwargs):
     hf_model_id='llava-hf/LLaVA-NeXT-Video-7B-hf')
 def get_model_tokenizer_llava_next_video(*args, **kwargs):
     from transformers import LlavaNextVideoForConditionalGeneration
-    kwargs['automodel_class'] = LlavaNextVideoForConditionalGeneration
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = LlavaNextVideoForConditionalGeneration
     return get_model_tokenizer_llava_hf(*args, **kwargs)
@@ -6737,7 +6775,8 @@ def _new_forward(*args, **kwargs):
 def get_model_tokenizer_idefics(model_dir: str, *args, **kwargs):
     from transformers import AutoProcessor, AutoModelForVision2Seq
     processor = AutoProcessor.from_pretrained(model_dir)
-    kwargs['automodel_class'] = AutoModelForVision2Seq
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = AutoModelForVision2Seq
     model, tokenizer = get_model_tokenizer_with_flash_attn(model_dir, *args, **kwargs)
     tokenizer.processor = processor
     return model, tokenizer
@@ -6823,7 +6862,8 @@ def get_model_tokenizer_omnli(model_dir: str,
     model_config.speech_encoder = os.path.join(model_dir, 'large-v3.pt')
     if not os.path.exists(model_config.speech_encoder):
         whisper.load_model('large-v3', download_root=model_dir)
-    kwargs['automodel_class'] = OmniSpeech2SLlamaForCausalLM
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = OmniSpeech2SLlamaForCausalLM
     kwargs['model_config'] = model_config
     for key in ['forward', 'generate']:
         try:
@@ -6851,7 +6891,8 @@ def get_model_tokenizer_omnli(model_dir: str,
     tags=['multi-modal', 'audio'],
     hf_model_id='stepfun-ai/GOT-OCR2_0')
 def get_model_tokenizer_got_ocr2(*args, **kwargs):
-    kwargs['automodel_class'] = AutoModel
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = AutoModel
     model, tokenizer = get_model_tokenizer_with_flash_attn(*args, **kwargs)
     return model, tokenizer
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 9f8d4093b1..0e0e7b1459 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -145,11 +145,12 @@ class TemplateType:
     chatml = 'chatml'
     got_ocr2 = 'got_ocr2'
     ovis1_6 = 'ovis1_6'
+    molmo = 'molmo'
+    deepseek_janus = 'deepseek-janus'
     # compatibility. (Deprecated)
     default_generation_bos = 'default-generation-bos'
     yi = 'yi'
     yi1_5 = 'yi1_5'
-    molmo = 'molmo'

     @classmethod
     def get_template_name_list(cls) -> List[str]:
@@ -3322,6 +3323,8 @@ def __init__(self):
                          ['<|end▁of▁sentence|>'], ['<|end▁of▁sentence|>'], self.DEEPSEEK_VL_SYSTEM)

     def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        is_janus = getattr(self, 'is_janus', False)
+
         inputs, _ = super()._encode(example)
         if len(inputs) == 0:
             return inputs, {}
@@ -3335,15 +3338,22 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An
             new_input_ids += input_ids[lo:hi]
             if labels is not None:
                 new_labels += labels[lo:hi]
-            new_input_ids += [processor.image_id] * processor.num_image_tokens
-            new_labels += [-100] * processor.num_image_tokens
+            image_tokens = [processor.image_id] * processor.num_image_tokens
+            if is_janus:
+                image_tokens = [processor.image_start_id] + image_tokens + [processor.image_end_id]
+            new_input_ids += image_tokens
+            new_labels += [-100] * len(image_tokens)
             lo = hi + 1
         new_input_ids += input_ids[lo:]
         if labels is not None:
             new_labels += labels[lo:]
         else:
             new_labels = None
-        from deepseek_vl.models.processing_vlm import VLChatProcessorOutput
+        if is_janus:
+            from janus.models.processing_vlm import VLChatProcessorOutput
+        else:
+            from deepseek_vl.models.processing_vlm import VLChatProcessorOutput
+
         images_outputs = processor.image_processor(images, return_tensors='pt')
         output = VLChatProcessorOutput(
             sft_format=None,
@@ -3366,6 +3376,14 @@ def _get_generate_ids(generate_ids: List[int], input_token_len: int) -> List[int

 register_template(TemplateType.deepseek_vl, DeepseekVLTemplate(), use_model=True, lazy_tokenize=True)

+
+class DeepseekJanus(DeepseekVLTemplate):
+    is_janus = True
+    image_placeholder = ['<image_placeholder>\n']
+
+
+register_template(TemplateType.deepseek_janus, DeepseekJanus(), use_model=True, lazy_tokenize=True)
+
 register_template(
     TemplateType.zephyr,
     Template([], ['<|user|>\n{{QUERY}}</s>\n<|assistant|>\n'], ['</s>\n'], ['</s>'], None,
diff --git a/swift/utils/module_mapping.py b/swift/utils/module_mapping.py
index 36af7a10c1..10666ab165 100644
--- a/swift/utils/module_mapping.py
+++ b/swift/utils/module_mapping.py
@@ -311,6 +311,11 @@ def __post_init__(self):
     language_model='model.transformer',
     vision_tower='model.vision_backbone',
 )
+DEEPSPEED_JANUS = MultiModelKeys(
+    language_model='language_model',
+    vision_tower='vision_model',
+    connector='aligner',
+    generator=['gen_vision_model', 'gen_aligner', 'gen_head', 'gen_embed'])

 MODEL_KEYS_MAPPING = OrderedDict([
     # MLLM here
@@ -336,6 +341,7 @@ def __post_init__(self):
     ('llama3_2_vision', LLAMA3_2_VISION),
     ('ovis1_6', OVIS1_6),
     ('molmo', MOLMO_KEYS),
+    ('deepseek_janus', DEEPSPEED_JANUS),
     # LLM begins here
     ('llama', LLAMA_KEYS),
     ('mistral', LLAMA_KEYS),
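A note on the recurring `- kwargs['automodel_class'] = ...` / `+ if 'automodel_class' not in kwargs:` edits in `swift/llm/utils/model.py`: they demote the hard-coded auto class to a default, so a caller that already chose a class (for example another loader delegating to the same helper) is no longer silently overwritten. A minimal self-contained sketch of the pattern, using hypothetical stand-in classes rather than swift or transformers APIs:

```python
class DefaultAutoClass:
    """Hypothetical stand-in for the loader's default auto class."""


class CallerChosenClass:
    """Hypothetical stand-in for a class chosen by the caller."""


def load_with_default(**kwargs):
    # Old behaviour: the loader always overwrote kwargs['automodel_class'].
    # New behaviour (as in the diff): only set it when the caller did not.
    if 'automodel_class' not in kwargs:
        kwargs['automodel_class'] = DefaultAutoClass
    return kwargs['automodel_class']


print(load_with_default().__name__)                                    # DefaultAutoClass
print(load_with_default(automodel_class=CallerChosenClass).__name__)   # CallerChosenClass
```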
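The `@register_model(ModelType.deepseek_janus_1_3b, ...)` hunk wires `deepseek-ai/Janus-1.3B` into swift's loader, which clones the Janus GitHub repo and attaches its `VLChatProcessor` to the tokenizer. A minimal usage sketch, assuming the ms-swift 2.x Python API (`get_model_tokenizer`, `get_default_template_type`, `get_template`); the dtype and device_map values here are illustrative:

```python
import torch

from swift.llm import ModelType, get_default_template_type, get_model_tokenizer, get_template

model_type = ModelType.deepseek_janus_1_3b              # 'deepseek-janus-1_3b'
template_type = get_default_template_type(model_type)   # 'deepseek-janus', per the registration above

# Dispatches to get_model_tokenizer_deepseek_janus defined in the diff.
model, tokenizer = get_model_tokenizer(model_type, torch.bfloat16, model_kwargs={'device_map': 'auto'})
template = get_template(template_type, tokenizer)       # the lazy_tokenize DeepseekJanus template
print(type(tokenizer.processor))                        # the Janus VLChatProcessor attached by the loader
```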
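In `swift/llm/utils/template.py`, the shared `_encode` now expands each image placeholder into `num_image_tokens` copies of `image_id` and, only when `is_janus` is set, brackets that block with `image_start_id` / `image_end_id`. A toy, self-contained illustration of that splice (the token ids are made up; only the structure mirrors the diff):

```python
# Toy illustration of the token splice in DeepseekVLTemplate._encode.
image_id, image_start_id, image_end_id = 100, 101, 102
num_image_tokens = 4
input_ids = [1, 2, image_id, 3]          # one image placeholder at index 2
idx_list = [i for i, t in enumerate(input_ids) if t == image_id]


def expand(input_ids, idx_list, is_janus):
    new_input_ids, lo = [], 0
    for hi in idx_list:
        new_input_ids += input_ids[lo:hi]
        image_tokens = [image_id] * num_image_tokens
        if is_janus:                      # Janus brackets the block with start/end ids
            image_tokens = [image_start_id] + image_tokens + [image_end_id]
        new_input_ids += image_tokens
        lo = hi + 1
    return new_input_ids + input_ids[lo:]


print(expand(input_ids, idx_list, is_janus=False))  # [1, 2, 100, 100, 100, 100, 3]
print(expand(input_ids, idx_list, is_janus=True))   # [1, 2, 101, 100, 100, 100, 100, 102, 3]
```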
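The new docs table row and the `MultiModelKeys` entry describe the same split: LoRA targets only the `language_model` and `aligner` (connector) submodules, while the `vision_model` tower and the `gen_*` generation branch stay frozen, and embedding/head modules are excluded by the negative lookahead. A small check of the table's target regex against hypothetical module names (pattern unescaped from the markdown cell):

```python
import re

# LoRA target pattern from the deepseek-janus row of the supported-models table.
pattern = re.compile(r'^(language_model|aligner)(?!.*(lm_head|output|emb|wte|shared)).*')

# Hypothetical module names, used only to show which parts the pattern selects.
candidates = [
    'language_model.model.layers.0.self_attn.q_proj',  # tuned
    'aligner.layers.0',                                 # tuned (connector)
    'language_model.model.embed_tokens',                # skipped: contains 'emb'
    'language_model.lm_head',                           # skipped: contains 'lm_head'
    'vision_model.blocks.0.attn.qkv',                   # skipped: vision tower
    'gen_vision_model.decoder.conv_in',                 # skipped: generator branch
]
for name in candidates:
    print(f'{name}: {bool(pattern.match(name))}')
```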