1 change: 1 addition & 0 deletions docs/source/Instruction/支持的模型和数据集.md
@@ -491,6 +491,7 @@
 |internvl2-26b-awq|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)|
 |internvl2-40b-awq|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)|
 |internvl2-llama3-76b-awq|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)|
+|deepseek-janus|[deepseek-ai/Janus-1.3B](https://modelscope.cn/models/deepseek-ai/Janus-1.3B/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-janus|✔|✘|✘|✘||vision|[deepseek-ai/Janus-1.3B](https://huggingface.co/deepseek-ai/Janus-1.3B)|
 |deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
 |deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
 |ovis1_6-gemma2-9b|[AIDC-AI/Ovis1.6-Gemma2-9B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-9B/summary)|^(llm)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|ovis1_6|✔|✘|✘|✘|transformers>=4.42|vision|[AIDC-AI/Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)|
1 change: 1 addition & 0 deletions docs/source_en/Instruction/Supported-models-datasets.md
@@ -491,6 +491,7 @@ The table below introduces all models supported by SWIFT:
 |internvl2-26b-awq|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)|
 |internvl2-40b-awq|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)|
 |internvl2-llama3-76b-awq|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)|
+|deepseek-janus|[deepseek-ai/Janus-1.3B](https://modelscope.cn/models/deepseek-ai/Janus-1.3B/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-janus|✔|✘|✘|✘||vision|[deepseek-ai/Janus-1.3B](https://huggingface.co/deepseek-ai/Janus-1.3B)|
 |deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
 |deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
 |ovis1_6-gemma2-9b|[AIDC-AI/Ovis1.6-Gemma2-9B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-9B/summary)|^(llm)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|ovis1_6|✔|✘|✘|✘|transformers>=4.42|vision|[AIDC-AI/Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)|
57 changes: 49 additions & 8 deletions swift/llm/utils/model.py
@@ -439,6 +439,7 @@ class ModelType:
     # numina-math
     numina_math_7b = 'numina-math-7b'
     # deepseek-vl
+    deepseek_janus_1_3b = 'deepseek-janus-1_3b'
     deepseek_vl_1_3b_chat = 'deepseek-vl-1_3b-chat'
     deepseek_vl_7b_chat = 'deepseek-vl-7b-chat'
     # deepseek-v2
@@ -664,6 +665,7 @@ class LoRATM(NamedTuple):
     llama3_2_vision = 'llama3_2_vision'
     ovis1_6 = 'ovis1_6'
     molmo = 'molmo'
+    deepseek_janus = 'deepseek_janus'
     # default lora target modules for nlp llms.
     minicpm3 = ['q_a_proj', 'q_b_proj', 'kv_a_proj_with_mqa', 'kv_b_proj']
     baichuan = ['W_pack']
@@ -3800,7 +3802,8 @@ def get_model_tokenizer_qwen2_audio(model_dir: str,
                                     **kwargs):
     from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
     processor = AutoProcessor.from_pretrained(model_dir)
-    kwargs['automodel_class'] = Qwen2AudioForConditionalGeneration
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = Qwen2AudioForConditionalGeneration
     model, tokenizer = get_model_tokenizer_with_flash_attn(model_dir, torch_dtype, model_kwargs, load_model, **kwargs)
     tokenizer.processor = processor
     return model, tokenizer
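This guard recurs throughout the diff: each multimodal loader used to overwrite `kwargs['automodel_class']` unconditionally, which prevented a calling loader from supplying its own auto-model class. A minimal sketch of the pattern, with hypothetical names standing in for `get_model_tokenizer_with_flash_attn`:

```python
from transformers import AutoModelForCausalLM


def load_model(model_dir: str, **kwargs):
    # Fall back to a default class only if the caller did not choose one.
    automodel_class = kwargs.pop('automodel_class', AutoModelForCausalLM)
    return automodel_class.from_pretrained(model_dir)


# A wrapper can now pass its own class without it being clobbered:
# load_model('some/model-dir', automodel_class=SomeConditionalGenerationClass)
```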
@@ -3830,7 +3833,8 @@ def _read_from_stream(container: 'av.container.Container', start_offset: float,

     from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
     processor = AutoProcessor.from_pretrained(model_dir)
-    kwargs['automodel_class'] = Qwen2VLForConditionalGeneration
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = Qwen2VLForConditionalGeneration
     model, tokenizer = get_model_tokenizer_with_flash_attn(model_dir, torch_dtype, model_kwargs, load_model, **kwargs)
     tokenizer.processor = processor
     if model is not None:
@@ -4833,6 +4837,37 @@ def _new_func(self, *args, **kwargs):
         setattr(submodel, key, MethodType(_get_new_func(key), submodel))  # fix device_map
 
 
+@register_model(
+    ModelType.deepseek_janus_1_3b,
+    'deepseek-ai/Janus-1.3B',
+    LoRATM.deepseek_janus,
+    TemplateType.deepseek_janus,
+    support_flash_attn=True,
+    tags=['multi-modal', 'vision'],
+    placeholder_tokens=['<image_placeholder>'],
+    hf_model_id='deepseek-ai/Janus-1.3B')
+def get_model_tokenizer_deepseek_janus(model_dir: str, *args, **kwargs):
+    if 'local_repo_path' in kwargs:
+        local_repo_path = kwargs['local_repo_path']
+    else:
+        local_repo_path = git_clone_github('https://github.com/deepseek-ai/Janus')
+    sys.path.append(os.path.join(local_repo_path))
+    from janus.models import MultiModalityCausalLM, VLChatProcessor
+    from janus.utils.io import load_pil_images
+
+    processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_dir)
+    tokenizer = processor.tokenizer
+    model, tokenizer = get_model_tokenizer_with_flash_attn(model_dir, *args, tokenizer=tokenizer, **kwargs)
+    tokenizer.processor = processor
+    if model:
+        model.language_model.model.embed_tokens.register_forward_hook(_clone_hook)
+        model.language_model.model.embed_tokens.register_forward_hook(_output_device_map_hook)
+        func_list = ['generate', 'get_input_embeddings', 'forward', 'gradient_checkpointing_enable']
+        _use_submodel_func(model, 'language_model', func_list)
+        model.generation_config = model.language_model.generation_config
+    return model, tokenizer
+
+
 @register_model(
     ModelType.deepseek_vl_7b_chat,
     'deepseek-ai/deepseek-vl-7b-chat',
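Once registered, the new model type should be reachable through SWIFT's normal loading path. A hedged usage sketch, assuming `get_model_tokenizer` is exported from `swift.llm` and accepts the registered model-type string as it does for other models:

```python
import torch
from swift.llm import get_model_tokenizer

# 'deepseek-janus-1_3b' is the ModelType string registered above.
model, tokenizer = get_model_tokenizer(
    'deepseek-janus-1_3b',
    torch_dtype=torch.bfloat16,
    model_kwargs={'device_map': 'auto'})
```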
@@ -6418,7 +6453,8 @@ def get_model_tokenizer_llava_hf(model_dir: str, *args, **kwargs):
     hf_model_id='meta-llama/Llama-3.2-90B-Vision-Instruct')
 def get_model_tokenizer_llama3_2_vision(*args, **kwargs):
     from transformers import MllamaForConditionalGeneration
-    kwargs['automodel_class'] = MllamaForConditionalGeneration
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = MllamaForConditionalGeneration
     return get_model_tokenizer_llava_hf(*args, **kwargs)


@@ -6484,7 +6520,8 @@ def get_model_tokenizer_llava_1_5(*args, **kwargs):
     hf_model_id='llava-hf/llava-onevision-qwen2-72b-ov-hf')
 def get_model_tokenizer_llava_onevision(*args, **kwargs):
     from transformers import LlavaOnevisionForConditionalGeneration
-    kwargs['automodel_class'] = LlavaOnevisionForConditionalGeneration
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = LlavaOnevisionForConditionalGeneration
     return get_model_tokenizer_llava_hf(*args, **kwargs)


@@ -6614,7 +6651,8 @@ def get_model_tokenizer_llava_next_yi(*args, **kwargs):
     hf_model_id='llava-hf/LLaVA-NeXT-Video-7B-hf')
 def get_model_tokenizer_llava_next_video(*args, **kwargs):
     from transformers import LlavaNextVideoForConditionalGeneration
-    kwargs['automodel_class'] = LlavaNextVideoForConditionalGeneration
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = LlavaNextVideoForConditionalGeneration
     return get_model_tokenizer_llava_hf(*args, **kwargs)


@@ -6737,7 +6775,8 @@ def _new_forward(*args, **kwargs):
 def get_model_tokenizer_idefics(model_dir: str, *args, **kwargs):
     from transformers import AutoProcessor, AutoModelForVision2Seq
     processor = AutoProcessor.from_pretrained(model_dir)
-    kwargs['automodel_class'] = AutoModelForVision2Seq
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = AutoModelForVision2Seq
     model, tokenizer = get_model_tokenizer_with_flash_attn(model_dir, *args, **kwargs)
     tokenizer.processor = processor
     return model, tokenizer
@@ -6823,7 +6862,8 @@ def get_model_tokenizer_omnli(model_dir: str,
     model_config.speech_encoder = os.path.join(model_dir, 'large-v3.pt')
     if not os.path.exists(model_config.speech_encoder):
         whisper.load_model('large-v3', download_root=model_dir)
-    kwargs['automodel_class'] = OmniSpeech2SLlamaForCausalLM
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = OmniSpeech2SLlamaForCausalLM
     kwargs['model_config'] = model_config
     for key in ['forward', 'generate']:
         try:
@@ -6851,7 +6891,8 @@ def get_model_tokenizer_omnli(model_dir: str,
     tags=['multi-modal', 'audio'],
     hf_model_id='stepfun-ai/GOT-OCR2_0')
 def get_model_tokenizer_got_ocr2(*args, **kwargs):
-    kwargs['automodel_class'] = AutoModel
+    if 'automodel_class' not in kwargs:
+        kwargs['automodel_class'] = AutoModel
     model, tokenizer = get_model_tokenizer_with_flash_attn(*args, **kwargs)
     return model, tokenizer

26 changes: 22 additions & 4 deletions swift/llm/utils/template.py
@@ -145,11 +145,12 @@ class TemplateType:
     chatml = 'chatml'
     got_ocr2 = 'got_ocr2'
     ovis1_6 = 'ovis1_6'
+    molmo = 'molmo'
+    deepseek_janus = 'deepseek-janus'
     # compatibility. (Deprecated)
     default_generation_bos = 'default-generation-bos'
     yi = 'yi'
     yi1_5 = 'yi1_5'
-    molmo = 'molmo'
 
     @classmethod
     def get_template_name_list(cls) -> List[str]:
@@ -3322,6 +3323,8 @@ def __init__(self):
                          ['<|end▁of▁sentence|>'], ['<|end▁of▁sentence|>'], self.DEEPSEEK_VL_SYSTEM)
 
     def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        is_janus = getattr(self, 'is_janus', False)
+
         inputs, _ = super()._encode(example)
         if len(inputs) == 0:
             return inputs, {}
@@ -3335,15 +3338,22 @@
             new_input_ids += input_ids[lo:hi]
             if labels is not None:
                 new_labels += labels[lo:hi]
-            new_input_ids += [processor.image_id] * processor.num_image_tokens
-            new_labels += [-100] * processor.num_image_tokens
+            image_tokens = [processor.image_id] * processor.num_image_tokens
+            if is_janus:
+                image_tokens = [processor.image_start_id] + image_tokens + [processor.image_end_id]
+            new_input_ids += image_tokens
+            new_labels += [-100] * len(image_tokens)
             lo = hi + 1
         new_input_ids += input_ids[lo:]
         if labels is not None:
             new_labels += labels[lo:]
         else:
             new_labels = None
-        from deepseek_vl.models.processing_vlm import VLChatProcessorOutput
+        if is_janus:
+            from janus.models.processing_vlm import VLChatProcessorOutput
+        else:
+            from deepseek_vl.models.processing_vlm import VLChatProcessorOutput
+
         images_outputs = processor.image_processor(images, return_tensors='pt')
         output = VLChatProcessorOutput(
             sft_format=None,
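The Janus branch only changes the shape of each image's token run: the run is bracketed with explicit start/end markers, and the label mask grows accordingly. A toy illustration with made-up token ids and a deliberately small `num_image_tokens`:

```python
# Hypothetical ids for illustration; the real values come from VLChatProcessor.
image_id, image_start_id, image_end_id = 100, 101, 102
num_image_tokens = 4

image_tokens = [image_id] * num_image_tokens
is_janus = True
if is_janus:
    image_tokens = [image_start_id] + image_tokens + [image_end_id]

print(image_tokens)                # [101, 100, 100, 100, 100, 102]
print([-100] * len(image_tokens))  # labels are masked over the whole run
```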
@@ -3366,6 +3376,14 @@ def _get_generate_ids(generate_ids: List[int], input_token_len: int) -> List[int]:

 register_template(TemplateType.deepseek_vl, DeepseekVLTemplate(), use_model=True, lazy_tokenize=True)
 
+
+class DeepseekJanus(DeepseekVLTemplate):
+    is_janus = True
+    image_placeholder = ['<image_placeholder>\n']
+
+
+register_template(TemplateType.deepseek_janus, DeepseekJanus(), use_model=True, lazy_tokenize=True)
+
 register_template(
     TemplateType.zephyr,
     Template([], ['<|user|>\n{{QUERY}}</s>\n<|assistant|>\n'], ['</s>\n'], ['</s>'], None,
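Branching on a class attribute read via `getattr` keeps the Janus template down to two lines of configuration while `DeepseekVLTemplate` retains the entire encoding path; instances that never define `is_janus` fall back to the old behaviour. A stripped-down sketch of the idiom:

```python
class Base:
    def encode(self) -> str:
        # getattr with a default keeps the base class and old subclasses unchanged.
        if getattr(self, 'is_janus', False):
            return 'janus token layout'
        return 'deepseek-vl token layout'


class Janus(Base):
    is_janus = True


print(Base().encode())   # deepseek-vl token layout
print(Janus().encode())  # janus token layout
```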
6 changes: 6 additions & 0 deletions swift/utils/module_mapping.py
@@ -311,6 +311,11 @@ def __post_init__(self):
     language_model='model.transformer',
     vision_tower='model.vision_backbone',
 )
+DEEPSEEK_JANUS = MultiModelKeys(
+    language_model='language_model',
+    vision_tower='vision_model',
+    connector='aligner',
+    generator=['gen_vision_model', 'gen_aligner', 'gen_head', 'gen_embed'])
 
 MODEL_KEYS_MAPPING = OrderedDict([
     # MLLM here
@@ -336,6 +341,7 @@
     ('llama3_2_vision', LLAMA3_2_VISION),
     ('ovis1_6', OVIS1_6),
     ('molmo', MOLMO_KEYS),
+    ('deepseek_janus', DEEPSEEK_JANUS),
     # LLM begins here
     ('llama', LLAMA_KEYS),
     ('mistral', LLAMA_KEYS),
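The `MultiModelKeys` entry records which submodules play which role (language model, vision tower, connector, and the image-generation stack), so tuners can target or freeze each group by parameter-name prefix. A rough, illustrative sketch of how such a grouping can be consumed (not SWIFT's actual internals):

```python
import torch.nn as nn

# Prefixes taken from the DEEPSEEK_JANUS grouping above.
NON_LLM_PREFIXES = ('vision_model', 'aligner', 'gen_vision_model',
                    'gen_aligner', 'gen_head', 'gen_embed')


def freeze_non_llm(model: nn.Module) -> None:
    """Freeze everything except the language_model parameters."""
    for name, param in model.named_parameters():
        if name.startswith(NON_LLM_PREFIXES):
            param.requires_grad = False
```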