Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/source/Instruction/支持的模型和数据集.md
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,13 @@
|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ)|internvl2|internvl2|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)|
|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ)|internvl2|internvl2|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)|
|[OpenGVLab/InternVL2-4B](https://modelscope.cn/models/OpenGVLab/InternVL2-4B)|internvl2_phi3|internvl2_phi3|transformers>=4.36,<4.42, timm|vision, video|[OpenGVLab/InternVL2-4B](https://huggingface.co/OpenGVLab/InternVL2-4B)|
|[OpenGVLab/InternVL2_5-1B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-1B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-1B](https://huggingface.co/OpenGVLab/InternVL2_5-1B)|
|[OpenGVLab/InternVL2_5-2B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-2B](https://huggingface.co/OpenGVLab/InternVL2_5-2B)|
|[OpenGVLab/InternVL2_5-4B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-4B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-4B](https://huggingface.co/OpenGVLab/InternVL2_5-4B)|
|[OpenGVLab/InternVL2_5-8B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-8B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-8B](https://huggingface.co/OpenGVLab/InternVL2_5-8B)|
|[OpenGVLab/InternVL2_5-26B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-26B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-26B](https://huggingface.co/OpenGVLab/InternVL2_5-26B)|
|[OpenGVLab/InternVL2_5-38B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-38B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-38B](https://huggingface.co/OpenGVLab/InternVL2_5-38B)|
|[OpenGVLab/InternVL2_5-78B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-78B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-78B](https://huggingface.co/OpenGVLab/InternVL2_5-78B)|
|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b)|xcomposer2|ixcomposer2|-|vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)|
|[Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b)|xcomposer2_4khd|ixcomposer2|-|vision|[internlm/internlm-xcomposer2-4khd-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b)|
|[Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b)|xcomposer2_5|xcomposer2_5|decord|vision|[internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b)|
Expand Down
7 changes: 7 additions & 0 deletions docs/source_en/Instruction/Supported-models-datasets.md
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,13 @@ The table below provides relevant information about the models accessed by ms-sw
|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ)|internvl2|internvl2|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)|
|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ)|internvl2|internvl2|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)|
|[OpenGVLab/InternVL2-4B](https://modelscope.cn/models/OpenGVLab/InternVL2-4B)|internvl2_phi3|internvl2_phi3|transformers>=4.36,<4.42, timm|vision, video|[OpenGVLab/InternVL2-4B](https://huggingface.co/OpenGVLab/InternVL2-4B)|
|[OpenGVLab/InternVL2_5-1B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-1B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-1B](https://huggingface.co/OpenGVLab/InternVL2_5-1B)|
|[OpenGVLab/InternVL2_5-2B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-2B](https://huggingface.co/OpenGVLab/InternVL2_5-2B)|
|[OpenGVLab/InternVL2_5-4B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-4B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-4B](https://huggingface.co/OpenGVLab/InternVL2_5-4B)|
|[OpenGVLab/InternVL2_5-8B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-8B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-8B](https://huggingface.co/OpenGVLab/InternVL2_5-8B)|
|[OpenGVLab/InternVL2_5-26B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-26B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-26B](https://huggingface.co/OpenGVLab/InternVL2_5-26B)|
|[OpenGVLab/InternVL2_5-38B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-38B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-38B](https://huggingface.co/OpenGVLab/InternVL2_5-38B)|
|[OpenGVLab/InternVL2_5-78B](https://modelscope.cn/models/OpenGVLab/InternVL2_5-78B)|internvl2_5|internvl2_5|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2_5-78B](https://huggingface.co/OpenGVLab/InternVL2_5-78B)|
|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b)|xcomposer2|ixcomposer2|-|vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)|
|[Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b)|xcomposer2_4khd|ixcomposer2|-|vision|[internlm/internlm-xcomposer2-4khd-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b)|
|[Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b)|xcomposer2_5|xcomposer2_5|decord|vision|[internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b)|
Expand Down
1 change: 1 addition & 0 deletions swift/llm/model/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ class MLLMModelType:
internvl_phi3 = 'internvl_phi3'
internvl2 = 'internvl2'
internvl2_phi3 = 'internvl2_phi3'
internvl2_5 = 'internvl2_5'
xcomposer2 = 'xcomposer2'
xcomposer2_4khd = 'xcomposer2_4khd'
xcomposer2_5 = 'xcomposer2_5'
Expand Down
22 changes: 22 additions & 0 deletions swift/llm/model/model/internlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,28 @@ def get_model_tokenizer_internvl(model_dir: str,
tags=['vision', 'video'],
))

# Register the InternVL2.5 model family so `model_type=internvl2_5` resolves to
# these checkpoints. Each Model pairs a ModelScope id (first) with the
# corresponding Hugging Face id (second).
register_model(
ModelMeta(
MLLMModelType.internvl2_5,
[
ModelGroup([
Model('OpenGVLab/InternVL2_5-1B', 'OpenGVLab/InternVL2_5-1B'),
Model('OpenGVLab/InternVL2_5-2B', 'OpenGVLab/InternVL2_5-2B'),
Model('OpenGVLab/InternVL2_5-4B', 'OpenGVLab/InternVL2_5-4B'),
Model('OpenGVLab/InternVL2_5-8B', 'OpenGVLab/InternVL2_5-8B'),
Model('OpenGVLab/InternVL2_5-26B', 'OpenGVLab/InternVL2_5-26B'),
Model('OpenGVLab/InternVL2_5-38B', 'OpenGVLab/InternVL2_5-38B'),
Model('OpenGVLab/InternVL2_5-78B', 'OpenGVLab/InternVL2_5-78B'),
], ),
],
TemplateType.internvl2_5,
get_model_tokenizer_internvl,  # reuses the InternVL loader — NOTE(review): confirm 2.5 needs no loader changes
architectures=['InternVLChatModel'],
model_arch=ModelArch.internvl,
requires=['transformers>=4.36', 'timm'],
tags=['vision', 'video'],
))

register_model(
ModelMeta(
MLLMModelType.xcomposer2_5,
Expand Down
1 change: 1 addition & 0 deletions swift/llm/template/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ class MLLMTemplateType:
internvl_phi3 = 'internvl_phi3'
internvl2 = 'internvl2'
internvl2_phi3 = 'internvl2_phi3'
internvl2_5 = 'internvl2_5'

xcomposer2 = 'ixcomposer2'
xcomposer2_4khd = 'xcomposer2_4khd'
Expand Down
7 changes: 7 additions & 0 deletions swift/llm/template/template/internvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,3 +182,10 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
template_cls=Internvl2Template,
placeholder_tokens=['<IMG_CONTEXT>'],
))

# Register the InternVL2.5 chat template: ChatML layout, reusing the existing
# InternVL2 template implementation and its <IMG_CONTEXT> image placeholder.
# The default_system string is the Chinese self-introduction prompt shipped
# with the 2.5 release — NOTE(review): verify it matches the upstream model card.
register_template(
ChatmlTemplateMeta(
MLLMTemplateType.internvl2_5,
template_cls=Internvl2Template,
placeholder_tokens=['<IMG_CONTEXT>'],
default_system='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。'))
32 changes: 25 additions & 7 deletions tests/test_align/test_template/test_video.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
from swift.llm import PtEngine, RequestConfig, get_template
from swift.utils import get_logger, seed_everything
import os

logger = get_logger()
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'


def _infer_model(pt_engine, system=None, messages=None, images=None):
def _infer_model(pt_engine, system=None, messages=None, videos=None):
seed_everything(42)
request_config = RequestConfig(max_tokens=128, temperature=0)
if messages is None:
Expand All @@ -14,17 +13,36 @@ def _infer_model(pt_engine, system=None, messages=None, images=None):
messages += [{'role': 'user', 'content': '你好'}]
resp = pt_engine.infer([{'messages': messages}], request_config=request_config)
response = resp[0].choices[0].message.content
messages += [{'role': 'assistant', 'content': response}, {'role': 'user', 'content': '<image>这是什么'}]
images = ['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png']
resp = pt_engine.infer([{'messages': messages, 'images': images}], request_config=request_config)
messages += [{'role': 'assistant', 'content': response}, {'role': 'user', 'content': '<video>描述视频'}]
videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
resp = pt_engine.infer([{'messages': messages, 'videos': videos}], request_config=request_config)
response = resp[0].choices[0].message.content
messages += [{'role': 'assistant', 'content': response}]
logger.info(f'model: {pt_engine.model_info.model_name}, messages: {messages}')
return messages


def test_qwen2_vl():
    """Smoke-test Qwen2-VL video inference under both template backends."""
    # Bound the frame count and pixel budget before the engine is created —
    # presumably these env vars are read at preprocessing time (TODO confirm).
    os.environ.update({
        'NFRAMES': '24',
        'MAX_PIXELS': '100352',
        'VIDEO_MAX_PIXELS': str(100352 // 4),
        'SIZE_FACTOR': '12',
    })
    engine = PtEngine('qwen/Qwen2-VL-2B-Instruct')
    _infer_model(engine)
    # Second pass with the jinja chat-template backend for comparison.
    engine.default_template.template_backend = 'jinja'
    _infer_model(engine)


def test_internvl2_5():
    """Smoke-test InternVL2.5-2B video inference under both template backends."""
    engine = PtEngine('OpenGVLab/InternVL2_5-2B')
    _infer_model(engine)
    # With the jinja backend the system prompt is passed explicitly —
    # presumably jinja does not inject the registered default (TODO confirm).
    engine.default_template.template_backend = 'jinja'
    _infer_model(engine, system='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。')


if __name__ == '__main__':
    # The imports and the module-level `logger` are already set up at the top
    # of this file; repeating them here only shadowed the module-level names.
    test_qwen2_vl()
    test_internvl2_5()
8 changes: 8 additions & 0 deletions tests/test_align/test_template/test_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,13 @@ def test_glm_edge_v():
_infer_model(pt_engine, messages=[{'role': 'user', 'content': '<image>这是什么'}])


def test_internvl2_5():
    """Smoke-test InternVL2.5-26B image inference under both template backends."""
    engine = PtEngine('OpenGVLab/InternVL2_5-26B')
    _infer_model(engine)
    # With the jinja backend the system prompt is passed explicitly —
    # presumably jinja does not inject the registered default (TODO confirm).
    engine.default_template.template_backend = 'jinja'
    _infer_model(engine, system='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。')


if __name__ == '__main__':
from swift.llm import PtEngine, RequestConfig, get_template
from swift.utils import get_logger, seed_everything
Expand Down Expand Up @@ -201,3 +208,4 @@ def test_glm_edge_v():
#
# test_mplug_owl3()
# test_phi3_vision()
test_internvl2_5()
Loading