1 change: 1 addition & 0 deletions docs/source/Instruction/支持的模型和数据集.md
@@ -517,6 +517,7 @@
 |[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b)|xcomposer2|ixcomposer2|-|vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)|
 |[Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b)|xcomposer2_4khd|ixcomposer2|-|vision|[internlm/internlm-xcomposer2-4khd-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b)|
 |[Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b)|xcomposer2_5|xcomposer2_5|decord|vision|[internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b)|
+|[Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:base](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:base)|xcomposer2_5|xcomposer2_5|decord|vision|[internlm/internlm-xcomposer2d5-ol-7b:base](https://huggingface.co/internlm/internlm-xcomposer2d5-ol-7b:base)|
 |[Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:audio](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:audio)|xcomposer2_5_ol_audio|qwen2_audio|transformers>=4.45|audio|[internlm/internlm-xcomposer2d5-ol-7b:audio](https://huggingface.co/internlm/internlm-xcomposer2d5-ol-7b:audio)|
 |[LLM-Research/Llama-3.2-11B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision-Instruct)|llama3_2_vision|llama3_2_vision|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)|
 |[LLM-Research/Llama-3.2-90B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision-Instruct)|llama3_2_vision|llama3_2_vision|transformers>=4.45|vision|[meta-llama/Llama-3.2-90B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct)|
1 change: 1 addition & 0 deletions (English counterpart of the table above; file path not captured in this view)
@@ -517,6 +517,7 @@ The table below introduces the models integrated with ms-swift:
 |[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b)|xcomposer2|ixcomposer2|-|vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)|
 |[Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b)|xcomposer2_4khd|ixcomposer2|-|vision|[internlm/internlm-xcomposer2-4khd-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b)|
 |[Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b)|xcomposer2_5|xcomposer2_5|decord|vision|[internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b)|
+|[Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:base](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:base)|xcomposer2_5|xcomposer2_5|decord|vision|[internlm/internlm-xcomposer2d5-ol-7b:base](https://huggingface.co/internlm/internlm-xcomposer2d5-ol-7b:base)|
 |[Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:audio](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:audio)|xcomposer2_5_ol_audio|qwen2_audio|transformers>=4.45|audio|[internlm/internlm-xcomposer2d5-ol-7b:audio](https://huggingface.co/internlm/internlm-xcomposer2d5-ol-7b:audio)|
 |[LLM-Research/Llama-3.2-11B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision-Instruct)|llama3_2_vision|llama3_2_vision|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)|
 |[LLM-Research/Llama-3.2-90B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision-Instruct)|llama3_2_vision|llama3_2_vision|transformers>=4.45|vision|[meta-llama/Llama-3.2-90B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct)|
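The new `:base` rows can be smoke-tested end-to-end with the engine API this PR itself uses in `tests/test_align/test_template/test_video.py`. A minimal sketch; model ID, dtype, and the `PtEngine`/`RequestConfig` usage are copied from those tests, nothing here is new API:

```python
# Minimal smoke test for the new ':base' entry; usage mirrors the tests added in this PR.
import torch
from swift.llm import PtEngine, RequestConfig

pt_engine = PtEngine('Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:base', torch.float16)
resp = pt_engine.infer([{'messages': [{'role': 'user', 'content': '你好'}]}],
                       request_config=RequestConfig(max_tokens=128, temperature=0))
print(resp[0].choices[0].message.content)
```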
4 changes: 3 additions & 1 deletion swift/llm/model/model/internlm.py
@@ -229,7 +229,9 @@ def get_model_tokenizer_internvl(model_dir: str,
         [
             ModelGroup([
                 Model('Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b', 'internlm/internlm-xcomposer2d5-7b'),
-            ], ),
+                Model('Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:base',
+                      'internlm/internlm-xcomposer2d5-ol-7b:base')
+            ]),
         ],
         TemplateType.xcomposer2_5,
         partial(get_model_tokenizer_xcomposer2, version='v2.5'),
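The change above appends the OL checkpoint's `base` sub-folder to the existing `xcomposer2_5` `ModelGroup`, so it inherits the group's template (`TemplateType.xcomposer2_5`) and loader (`get_model_tokenizer_xcomposer2`). A loading sketch under two assumptions: that `swift.llm` exports `get_model_tokenizer`, and that the `model_id:subfolder` suffix selects a sub-directory of the repository, as the new `Model(...)` entry suggests:

```python
# Sketch, not part of this diff: resolve the newly registered checkpoint.
# Assumption: ':base' selects the 'base' sub-directory of the OL repository.
import torch
from swift.llm import get_model_tokenizer  # assumed public export

model, processor = get_model_tokenizer(
    'Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:base', torch_dtype=torch.float16)
```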
28 changes: 24 additions & 4 deletions swift/llm/template/template/internlm.py
@@ -1,16 +1,18 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Literal, Optional
 
 import torch
+from PIL import Image
 from transformers.dynamic_module_utils import get_class_from_dynamic_module
 
 from swift.utils import get_env_args
 from ..base import Template
 from ..constant import LLMTemplateType, MLLMTemplateType
 from ..register import TemplateMeta, register_template
 from ..template_inputs import StdTemplateInputs
-from ..utils import Prompt, Word
+from ..utils import Context, Prompt, Word
+from ..vision_utils import load_file
 from .utils import ChatmlTemplateMeta
 
 INTERNLM_SYSTEM = (
@@ -39,6 +41,13 @@ class InternLMXComposer2Template(Template):
     skip_prompt = False
     use_model = True
 
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    inputs: StdTemplateInputs) -> List[Context]:
+        if media_type == 'video':
+            inputs.images.insert(inputs.image_idx, inputs.videos[index])
+            inputs.image_idx += 1
+        return self.image_placeholder
+
     def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         model = self.model
         encoded = super()._encode(inputs)
@@ -49,8 +58,19 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
             if len(images) > 1:
                 hd_num = 6
             hd_num = get_env_args('hd_num', int, hd_num)
-            Image_transform = get_class_from_dynamic_module('ixc_utils.Image_transform', model.model_dir)
-            images = [Image_transform(image, hd_num=hd_num) for image in images]
+            images_origin = images
+            images = []
+            for image in images_origin:
+                if isinstance(image, Image.Image):
+                    Image_transform = get_class_from_dynamic_module('ixc_utils.Image_transform', model.model_dir)
+                    images.append(Image_transform(image, hd_num=hd_num))
+                else:
+                    load_video = get_class_from_dynamic_module('ixc_utils.load_video', model.model_dir)
+                    frame2img = get_class_from_dynamic_module('ixc_utils.frame2img', model.model_dir)
+                    Video_transform = get_class_from_dynamic_module('ixc_utils.Video_transform', model.model_dir)
+                    image = load_video(load_file(image))
+                    image = frame2img(image, model.font)
+                    images.append(Video_transform(image, hd_num=hd_num))
         elif self.version == 'v2-4khd':
             hd_num = 55
             hd_num = get_env_args('hd_num', int, hd_num)
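Net effect of the template change: `replace_tag` now maps a `<video>` tag to an entry in `inputs.images`, and `_encode` routes any non-PIL entry through the repository's `ixc_utils` helpers (`load_video`, then `frame2img` with `model.font`, then `Video_transform`) before the usual `hd_num` tiling; plain images keep the old `Image_transform` path. A condensed usage sketch, taken from the `test_xcomposer2_5` test added below:

```python
import torch
from swift.llm import PtEngine, RequestConfig

pt_engine = PtEngine('Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:base', torch.float16)
resp = pt_engine.infer(
    [{
        'messages': [{'role': 'user', 'content': '<video>Describe the video'}],
        'videos': ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4'],
    }],
    request_config=RequestConfig(max_tokens=128, temperature=0))
print(resp[0].choices[0].message.content)
```

The tile count can still be overridden via `get_env_args('hd_num', ...)`; if that helper follows its name, it reads an `HD_NUM` environment variable (an assumption, not verified in this diff).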
40 changes: 34 additions & 6 deletions tests/test_align/test_template/test_video.py
@@ -1,25 +1,29 @@
 import os
 
+import torch
+
 os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
 os.environ['SWIFT_DEBUG'] = '1'
 
 
-def _infer_model(pt_engine, system=None, messages=None, videos=None):
+def _infer_model(pt_engine, system=None, messages=None, videos=None, max_tokens=128):
     seed_everything(42)
-    request_config = RequestConfig(max_tokens=128, temperature=0)
+    request_config = RequestConfig(max_tokens=max_tokens, temperature=0)
     if messages is None:
         messages = []
+    if not messages:
         if system is not None:
             messages += [{'role': 'system', 'content': system}]
         messages += [{'role': 'user', 'content': '你好'}]
         resp = pt_engine.infer([{'messages': messages}], request_config=request_config)
         response = resp[0].choices[0].message.content
         messages += [{'role': 'assistant', 'content': response}, {'role': 'user', 'content': '<video>描述视频'}]
-        videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
+    videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
     resp = pt_engine.infer([{'messages': messages, 'videos': videos}], request_config=request_config)
     response = resp[0].choices[0].message.content
     messages += [{'role': 'assistant', 'content': response}]
     logger.info(f'model: {pt_engine.model_info.model_name}, messages: {messages}')
-    return messages
+    return response
 
 
 def test_qwen2_vl():
@@ -40,9 +44,33 @@ def test_internvl2_5():
     _infer_model(pt_engine, system='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。')
 
 
+def test_xcomposer2_5():
+    pt_engine = PtEngine('Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:base', torch.float16)
+    messages = [{'role': 'user', 'content': '<video>Describe the video'}]
+    messages_with_system = messages.copy()
+    messages_with_system.insert(0, {'role': 'system', 'content': ''})
+    response = _infer_model(pt_engine, messages=messages_with_system)
+    pt_engine.default_template.template_backend = 'jinja'
+    response2 = _infer_model(pt_engine, messages=messages, system='')
+    assert response == response2
+
+    response = _infer_model(pt_engine, messages=messages)
+    std_response = (
+        'The video features a young child sitting on a bed, deeply engaged in reading a book. '
+        'The child is dressed in a light blue sleeveless top and pink pants, and is wearing glasses. '
+        'The bed is covered with a textured white blanket, and there are various items scattered on it, '
+        'including a white cloth and a striped piece of clothing. In the background, '
+        'a wooden crib and a dresser with a mirror can be seen. The child flips through the pages of the book, '
+        'occasionally pausing to look at the illustrations. The child appears to be enjoying the book, '
+        'and the overall atmosphere is one of quiet concentration and enjoyment.')
+
+    assert response == std_response[:len(response)]
+
+
 if __name__ == '__main__':
     from swift.llm import PtEngine, RequestConfig, get_template
     from swift.utils import get_logger, seed_everything
     logger = get_logger()
-    test_qwen2_vl()
-    test_internvl2_5()
+    # test_qwen2_vl()
+    # test_internvl2_5()
+    test_xcomposer2_5()
14 changes: 9 additions & 5 deletions tests/test_align/test_template/test_vision.py
@@ -1,5 +1,7 @@
 import os
 
+import torch
+
 os.environ['CUDA_VISIBLE_DEVICES'] = '2'
 os.environ['SWIFT_DEBUG'] = '1'
 
@@ -125,10 +127,12 @@ def test_llava_onevision_hf():


 def test_xcomposer2_5():
-    pt_engine = PtEngine('Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b')
-    _infer_model(pt_engine, system='')
+    pt_engine = PtEngine('Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:base', torch.float16)
+    # pt_engine = PtEngine('Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b')
+    response = _infer_model(pt_engine, system='')
     pt_engine.default_template.template_backend = 'jinja'
-    _infer_model(pt_engine)
+    response2 = _infer_model(pt_engine)
+    assert response == response2
 
 
 def test_deepseek_vl():
@@ -197,7 +201,7 @@ def test_internvl2_5():
     # test_ovis1_6()
     # test_yi_vl()
     # test_deepseek_vl()
-    test_deepseek_vl2()
+    # test_deepseek_vl2()
     # test_qwen_vl()
     # test_glm4v()
     # test_minicpmv()
@@ -212,4 +216,4 @@ def test_internvl2_5():
     # test_phi3_vision()
     # test_internvl2_5()
     # test_mplug_owl3()
-    # test_xcomposer2_5()
+    test_xcomposer2_5()