In [1]:
# Notebook dependencies: transformers is installed straight from its GitHub repository,
# and the PyTorch CPU wheel index is added as an extra package index.
%pip install -q "git+https://github.com/huggingface/transformers.git" "torch>=2.1" "torchvision" "qwen-vl-utils" "Pillow" "gradio>=4.36" --extra-index-url https://download.pytorch.org/whl/cpu
# OpenVINO and NNCF, upgraded (-U) so the minimum versions below are satisfied.
%pip install -qU "openvino>=2024.3.0" "nncf>=2.12.0"

Note: you may need to restart the kernel to use updated packages.
    numpy (>=1.19.*) ; python_version >= "3.7"
           ~~~~~~~^[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path
import requests

# Helper modules this notebook imports later, keyed by the local file name
# they are saved under.
HELPER_SOURCES = {
    "ov_qwen2_vl.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/qwen2-vl/ov_qwen2_vl.py",
    "notebook_utils.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
}

for helper_name, helper_url in HELPER_SOURCES.items():
    helper_path = Path(helper_name)
    if helper_path.exists():
        # Already downloaded (or supplied by the user); keep the local copy.
        continue
    r = requests.get(url=helper_url, timeout=60)
    # Fail loudly on HTTP errors instead of saving an error page as a .py
    # file, which would then be skipped by the exists() check on later runs.
    r.raise_for_status()
    # write_text closes the file handle; UTF-8 matches how Python reads
    # source files regardless of the platform's locale encoding.
    helper_path.write_text(r.text, encoding="utf-8")

In [3]:
# model_selector is provided by the ov_qwen2_vl helper module.
from ov_qwen2_vl import model_selector

# Keep the returned object around: its `.value` is read in a later cell.
model_id = model_selector()

# Bare trailing expression so the notebook renders the returned object.
model_id

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino


2024-09-11 10:13:52.108624: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-11 10:13:52.110484: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-11 10:13:52.147922: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Dropdown(description='Model:', options=('Qwen/Qwen2-VL-2B-Instruct', 'Qwen/Qwen2-VL-7B-Instruct'), value='Qwen…

In [4]:
# Read the selection once, derive the local directory name from the last
# path component of the model id, then report the choice.
pt_model_id = model_id.value
model_dir = Path(pt_model_id.rsplit("/", 1)[-1])
print(f"Selected {pt_model_id}")

Selected Qwen/Qwen2-VL-2B-Instruct


In [5]:
# Conversion entry point from the ov_qwen2_vl helper module.
from ov_qwen2_vl import convert_qwen2vl_model

# Uncomment the line below to view the model conversion source (IPython `??`).
# convert_qwen2vl_model??

In [6]:
import nncf

# Weight-compression settings forwarded to the conversion helper.
# NOTE(review): group_size / ratio semantics are defined by NNCF's
# compress_weights API -- confirm against the NNCF documentation before tuning.
compression_configuration = dict(
    mode=nncf.CompressWeightsMode.INT4_ASYM,
    group_size=128,
    ratio=1.0,
)

# Convert the selected checkpoint into model_dir using the settings above.
convert_qwen2vl_model(pt_model_id, model_dir, compression_configuration)

✅ Qwen/Qwen2-VL-2B-Instruct model already converted. You can find results in Qwen2-VL-2B-Instruct


In [7]:
from transformers import AutoProcessor, AutoTokenizer

# Pixel bounds handed to the processor, expressed as multiples of 28 * 28.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# If the processor has no chat template, borrow the tokenizer's one.
# The tokenizer is loaded from the same directory as the processor instead of
# a hard-coded folder name, so the fallback works for whichever model was
# selected and on case-sensitive filesystems.
if processor.chat_template is None:
    tok = AutoTokenizer.from_pretrained(model_dir)
    processor.chat_template = tok.chat_template

In [8]:
# Inference wrapper class from the ov_qwen2_vl helper module.
from ov_qwen2_vl import OVQwen2VLModel
# Uncomment the line below to view the model inference class source (IPython `??`).
# OVQwen2VLModel??

In [9]:
from notebook_utils import device_widget

# Device picker that defaults to "AUTO" and is asked to exclude "NPU".
device = device_widget(default="AUTO", exclude=["NPU"])

# Bare trailing expression renders the widget; `device.value` is read when the model is created.
device

Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')

In [10]:
# Instantiate the inference wrapper from the converted model directory on the selected device.
model = OVQwen2VLModel(model_dir, device.value)

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


In [11]:
from qwen_vl_utils import process_vision_info
from transformers import TextStreamer

# Single user turn made of one image (referenced by URL) and one text prompt.
image_part = {
    "type": "image",
    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
}
prompt_part = {"type": "text", "text": "Describe this image."}
messages = [{"role": "user", "content": [image_part, prompt_part]}]

# Render the conversation to a prompt string (not token ids) that ends with
# the generation prompt, then collect the vision inputs referenced by it.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)

# Pack text and vision inputs into one batch of PyTorch tensors.
inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")

# Stream decoded text as it is generated, omitting the prompt and special tokens.
streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
generated_ids = model.generate(**inputs, max_new_tokens=100, streamer=streamer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


The image depicts a woman sitting on a sandy beach with a large dog. The dog is wearing a colorful harness and is sitting on its hind legs, with its front paws raised in a high-five gesture. The woman is smiling and appears to be enjoying the moment. The background shows the ocean with gentle waves, and the sky is clear with a soft light, suggesting either sunrise or sunset. The scene conveys a sense of companionship and relaxation.


In [12]:
# Fetch the Gradio helper that belongs to this notebook. The URL targets the
# qwen2-vl folder (the same folder ov_qwen2_vl.py is downloaded from) rather
# than the internvl2 one, since make_demo is called with this notebook's
# model and processor.
# NOTE(review): a gradio_helper.py left over from an earlier run is kept as-is;
# delete it manually if it was downloaded from the wrong notebook folder.
gradio_helper_path = Path("gradio_helper.py")
if not gradio_helper_path.exists():
    r = requests.get(
        url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/qwen2-vl/gradio_helper.py",
        timeout=60,
    )
    # Do not save an HTTP error page as a Python module.
    r.raise_for_status()
    # write_text closes the file handle; UTF-8 matches Python's source encoding.
    gradio_helper_path.write_text(r.text, encoding="utf-8")

In [13]:
from gradio_helper import make_demo


# Build the demo object around the loaded model and processor.
demo = make_demo(model, processor)

# Start serving the demo; pass `share=True` to `launch()` to get a public link.
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


