diff --git a/examples/infer/demo_mllm.py b/examples/infer/demo_mllm.py
index 8d3c77212a..b277818957 100644
--- a/examples/infer/demo_mllm.py
+++ b/examples/infer/demo_mllm.py
@@ -104,20 +104,33 @@ def get_data(mm_type: Literal['text', 'image', 'video', 'audio']):
     if infer_backend == 'pt':
         model = 'Qwen/Qwen2-Audio-7B-Instruct'
         mm_type = 'audio'
-        dataset = 'speech_asr/speech_asr_aishell1_trainsets:validation#1000'
         engine = PtEngine(model, max_batch_size=64)
     elif infer_backend == 'vllm':
+        # test env: vllm==0.6.5, transformers==4.48.*
         from swift.llm import VllmEngine
         model = 'Qwen/Qwen2-VL-2B-Instruct'
-        mm_type = 'video'
-        dataset = 'AI-ModelScope/LaTeX_OCR:small#1000'
+        # If you encounter insufficient GPU memory, please reduce `max_model_len` and set `max_num_seqs=5`.
         engine = VllmEngine(model, max_model_len=32768, limit_mm_per_prompt={'image': 5, 'video': 2})
+        mm_type = 'video'  # or 'image'
     elif infer_backend == 'lmdeploy':
+        # test env: lmdeploy==0.6.4
         from swift.llm import LmdeployEngine
         model = 'OpenGVLab/InternVL2_5-1B'
-        mm_type = 'video'
-        dataset = 'AI-ModelScope/LaTeX_OCR:small#1000'
         engine = LmdeployEngine(model, vision_batch_size=8)
+        mm_type = 'video'  # or 'image'
+
+    # infer dataset
+    if mm_type == 'audio':
+        dataset = 'speech_asr/speech_asr_aishell1_trainsets:validation#1000'
+    elif mm_type == 'image':
+        # The meaning of environment variables can be found at:
+        # https://swift.readthedocs.io/zh-cn/latest/Instruction/%E5%91%BD%E4%BB%A4%E8%A1%8C%E5%8F%82%E6%95%B0.html#id17
+        os.environ['MAX_PIXELS'] = '1003520'
+        dataset = 'AI-ModelScope/LaTeX_OCR:small#1000'
+    elif mm_type == 'video':
+        os.environ['VIDEO_MAX_PIXELS'] = '50176'
+        os.environ['FPS_MAX_FRAMES'] = '12'
+        dataset = 'swift/VideoChatGPT:Generic#100'
+
     # Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset.
     dataset = load_dataset([dataset], seed=42)[0]
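
Usage note (a sketch, not part of the patch): the in-code comment above says `infer_batch` does not require creating a dataset, so the `engine` configured in this hunk can also be driven with hand-built requests. The sketch below assumes the `swift.llm` batch-inference API (`InferRequest`, `RequestConfig`, `engine.infer`); the video path and generation settings are illustrative placeholders, not taken from this diff.

from swift.llm import InferRequest, RequestConfig

# `engine` is whichever backend was constructed above (PtEngine / VllmEngine / LmdeployEngine).
# Build a single multimodal request by hand; the '<video>' tag marks where the media is
# inserted, and the file path is a hypothetical placeholder.
infer_requests = [
    InferRequest(
        messages=[{'role': 'user', 'content': '<video>Describe the video.'}],
        videos=['/path/to/demo.mp4'],
    )
]
# Illustrative sampling settings.
request_config = RequestConfig(max_tokens=512, temperature=0)
resp_list = engine.infer(infer_requests, request_config)
print(resp_list[0].choices[0].message.content)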