# Lab 2. Multimodal

In [None]:
import requests
from pathlib import Path

In [None]:
# Helper modules this notebook imports later (ov_phi3_vision, notebook_utils),
# mapped from local filename to the raw URL they are fetched from.
_HELPER_SOURCES = {
    "ov_phi3_vision.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/phi-3-vision/ov_phi3_vision.py",
    "notebook_utils.py": "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
}

for _filename, _url in _HELPER_SOURCES.items():
    # Keep any copy that is already on disk; only missing files are fetched.
    if Path(_filename).exists():
        continue

    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url=_url, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an HTTP error page as a .py
    # module that would only break later at import time.
    r.raise_for_status()
    # write_text opens, writes and closes the file in one call; UTF-8 is set
    # explicitly so the result does not depend on the platform default encoding.
    Path(_filename).write_text(r.text, encoding="utf-8")

### Download VLM model from ModelScope

In [None]:
from modelscope import snapshot_download

# ModelScope repository id of the model to fetch.
vlm_model_id = "snake7gun/Phi-3.5-vision-instruct-int4-ov"
# Local directory that is checked before downloading and that later cells load
# the model and processor from. Presumably this is where snapshot_download
# places the repository under cache_dir="./model/" -- TODO confirm the "___"
# directory naming against ModelScope.
vlm_local_path  = "./model/snake7gun/Phi-3___5-vision-instruct-int4-ov"

# Download only when the local directory is absent. NOTE: model_dir is assigned
# only on this path, so it is undefined when the model is already on disk;
# later cells rely on vlm_local_path instead.
if not Path(vlm_local_path).exists():
    model_dir = snapshot_download(vlm_model_id, cache_dir="./model/")

### Initialize Phi-3.5-vision

In [None]:
from notebook_utils import device_widget

# Device selector built by the downloaded notebook_utils helper. Going by the
# argument names, "GPU" is the default choice and "NPU" is left out of the
# options -- confirm against notebook_utils.device_widget.
device = device_widget(default="GPU", exclude=["NPU"])

# Bare expression as the cell's last statement so the notebook displays it.
device

In [None]:
from ov_phi3_vision import OvPhi3Vision

# Build the model wrapper from the local model directory, passing the value
# currently held by the device selector (device.value).
model = OvPhi3Vision(vlm_local_path, device.value)

In [None]:
import requests  # NOTE(review): not used in this cell; already imported in the first cell.
from PIL import Image

# Open the demo picture from the local examples directory; this `image` is the
# one passed to the processor in the following Q&A cell.
image = Image.open(r"./examples/demo.png")
# Bare expression as the cell's last statement so the notebook displays the image.
image

### Q&A over image

In [None]:
from transformers import AutoProcessor, TextStreamer

# Load the processor (tokenizer + image preprocessing) from the same local
# directory the model was loaded from.
processor = AutoProcessor.from_pretrained(vlm_local_path, trust_remote_code=True)

# Single user turn; "<|image_1|>" marks where the first image in the list
# passed to the processor belongs (presumably -- confirm against the model card).
messages = [
    dict(
        role="user",
        content="<|image_1|>\nPlease create Python code for image, and use plt to save the new picture under imgs/ and name it phi-3-vision.jpg.",
    ),
]

# Render the chat as a plain prompt string (tokenize=False) that ends with the
# generation prompt, then let the processor combine it with the image.
prompt = processor.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
inputs = processor(prompt, [image], return_tensors="pt")

# Shared generation settings, reused by the Chinese Q&A cell further down.
# do_sample=False disables sampling; the streamer is configured to skip the
# prompt and special tokens when emitting text.
generation_args = dict(
    max_new_tokens=3072,
    do_sample=False,
    streamer=TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True),
)

print("Coding:")
generate_ids = model.generate(
    **inputs,
    eos_token_id=processor.tokenizer.eos_token_id,
    **generation_args,
)

### Q&A over image in Chinese

In [None]:
# Replace `image` with a second demo picture for the Chinese Q&A below.
image = Image.open(r"./examples/demo.jpeg")
# Bare expression as the cell's last statement so the notebook displays the image.
image

In [None]:
# Single user turn asking, in Chinese, for a description of the picture in
# Chinese. The processor and generation_args from the earlier Q&A cell are reused.
messages = [
    dict(role="user", content="<|image_1|>\n用中文描述一下该图片"),
]

# Render the chat to a prompt string, then pair it with the current `image`.
prompt = processor.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
inputs = processor(prompt, [image], return_tensors="pt")

print("Answering:")
generate_ids = model.generate(
    **inputs,
    eos_token_id=processor.tokenizer.eos_token_id,
    **generation_args,
)