In [None]:
# Install the notebook's Python dependencies into the kernel's environment.
# The first command adds the PyTorch CPU wheel index as an extra package index.
%pip install "transformers>=4.45.0" "torch>=2.1" "torchvision" "Pillow" "tqdm" "datasets>=2.14.6" "gradio>=4.36" "nncf>=2.13.0" --extra-index-url https://download.pytorch.org/whl/cpu
# OpenVINO is upgraded separately; `--pre` allows pre-release builds from the nightly wheel index.
%pip install -U --pre "openvino>=2024.4.0" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly

In [2]:
import requests
from pathlib import Path

# Helper modules imported by later cells. Keys are the local file names (they
# must match the module names used in the `import` statements below); values
# are the paths of the files inside the openvino_notebooks repository.
NOTEBOOKS_REPO_URL = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest"
helper_sources = {
    "ov_mllama_helper.py": "notebooks/mllama3.2/ov_mllama_helper.py",
    # Taken from this notebook's own folder like the other helpers: the demo
    # cell calls make_demo(ov_model, processor).
    "gradio_helper.py": "notebooks/mllama3.2/gradio_helper.py",
    "ov_mllama_compression.py": "notebooks/mllama3.2/ov_mllama_compression.py",
    "data_preprocessing.py": "notebooks/mllama3.2/data_preprocessing.py",
    "notebook_utils.py": "utils/notebook_utils.py",
}

for file_name, repo_path in helper_sources.items():
    target = Path(file_name)
    if target.exists():
        # Already downloaded by a previous run; keep the local copy.
        continue
    r = requests.get(url=f"{NOTEBOOKS_REPO_URL}/{repo_path}", timeout=60)
    # Fail loudly instead of saving an HTTP error page as a Python module,
    # which the exists() guard above would then keep forever.
    r.raise_for_status()
    # write_text closes the file handle and pins the encoding.
    target.write_text(r.text, encoding="utf-8")

In [3]:
from pathlib import Path
from ov_mllama_helper import convert_mllama

# Hugging Face Hub repository id in "<organization>/<model>" form, so the
# checkpoint can be resolved on a machine that has no local copy.
# NOTE(review): access to this repository may require accepting the model
# license and logging in to the Hub - confirm before running.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# The converted model is stored under "<model name>/OV"; only the last path
# component of the id is used, so the organization prefix does not affect it.
model_dir = Path(model_id.split("/")[-1]) / "OV"

# uncomment the line to see model conversion code
# convert_mllama??

2024-09-23 14:24:20.729672: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-23 14:24:20.731581: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-23 14:24:20.767644: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)
  deprecate("VQEncoderOutput", "0.31", deprecation_message)
  deprecate("VQModel", "0.31", deprecation_message)


In [4]:
# Convert the model identified by `model_id` and store the result in `model_dir`.
# The recorded output below suggests the helper reports an existing conversion
# instead of repeating it - TODO confirm in ov_mllama_helper.py.
convert_mllama(model_id, model_dir)

model already converted and can be found in Llama-3.2-11B-Vision-Instruct/OV


In [5]:
from notebook_utils import device_widget

# Inference device selector; later cells read the chosen device via `device.value`.
device = device_widget()

# Bare last expression so the notebook displays the widget.
device

Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')

In [6]:
from ov_mllama_compression import compress
# uncomment the line to see compression code
# compress??

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino


In [7]:
# Compress the language model found in `model_dir`, with AWQ and scale
# estimation both disabled. The returned path's `.name` is later passed to the
# model loader as `language_model_name`.
language_model_path = compress(model_dir, awq=False, scale_estimation=False)

Compressed model already exists and can be found in Llama-3.2-11B-Vision-Instruct/OV/llm_int4_asym_r10_gs64_max_activation_variance_all_layers.xml


In [8]:
from transformers import AutoProcessor
import nncf
import openvino as ov
import gc

from data_preprocessing import prepare_dataset_vision

# The processor is handed to prepare_dataset_vision to build calibration samples.
processor = AutoProcessor.from_pretrained(model_dir)
core = ov.Core()

vision_encoder_path = model_dir / "openvino_vision_encoder.xml"
int8_vision_encoder_path = model_dir / f"{vision_encoder_path.stem}_int8.xml"

# The INT8 vision encoder is only produced and used when the selected device is not GPU.
use_int8_encoder = device.value != "GPU"

if use_int8_encoder and not int8_vision_encoder_path.exists():
    calibration_data = prepare_dataset_vision(processor, 100)
    ov_model = core.read_model(vision_encoder_path)
    calibration_dataset = nncf.Dataset(calibration_data)
    quantized_model = nncf.quantize(
        model=ov_model,
        calibration_dataset=calibration_dataset,
        model_type=nncf.ModelType.TRANSFORMER,
        advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.6),
    )
    ov.save_model(quantized_model, int8_vision_encoder_path)
    # Drop the references held by this cell and collect garbage before the
    # full pipeline is loaded in the next cell.
    del quantized_model, ov_model, calibration_dataset, calibration_data
    gc.collect()

if use_int8_encoder:
    vision_encoder_path = int8_vision_encoder_path

In [9]:
from ov_mllama_helper import OVMLlamaForConditionalGeneration

# Uncomment this line to see model inference code
# OVMLlamaForConditionalGeneration??

# Build the OpenVINO pipeline from the artifacts selected in the previous
# cells: the compressed language model and the chosen vision encoder file.
ov_model = OVMLlamaForConditionalGeneration(
    model_dir,
    device=device.value,
    language_model_name=language_model_path.name,
    image_encoder_name=vision_encoder_path.name,
)
processor = AutoProcessor.from_pretrained(model_dir)

applied slice for lm head


In [10]:
from PIL import Image
from transformers import TextStreamer
import numpy as np

# Single-turn chat request: one image placeholder followed by the text prompt.
messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Describe image in two sentences"}
            ]
        },
]
text = processor.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
url = "https://llava-vl.github.io/static/images/view.jpg"
# Check the HTTP status before decoding: without it a failed download surfaces
# as an obscure PIL "cannot identify image file" error. The timeout keeps the
# cell from hanging indefinitely on a stalled connection.
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
raw_image = Image.open(response.raw)

inputs = processor(text=text, images=[raw_image], return_tensors="pt")
# Stream generated text to the cell output as it is produced, hiding the prompt.
streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
output = ov_model.generate(**inputs, do_sample=False, max_new_tokens=50, streamer=streamer)
print(f"Visual encoder time {ov_model.vision_encoder_infer_time[0] * 1000} ms")
# The "Second token latency" figure is the mean over every LLM inference after the first one.
print(f"First token latency {ov_model.llm_infer_time[0] * 1000}ms, Second token latency {np.mean(np.array(ov_model.llm_infer_time[1:])) * 1000}ms")



The image depicts a serene lake scene, featuring a long wooden dock that extends from the foreground into the distance, with a mountain range in the background and a cloudy sky above.

The dock is constructed from wooden planks and features metal railings along its
Visual encoder time 19400.806584046222 ms
First token latency 2414.042363059707ms, Second token latency 423.10670324677255ms


In [11]:
from gradio_helper import make_demo

# Copy the tokenizer's chat template onto the processor before building the demo.
# NOTE(review): presumably gradio_helper formats prompts through the processor
# itself - confirm against make_demo.
processor.chat_template = processor.tokenizer.chat_template
demo = make_demo(ov_model, processor)

# Start the Gradio app; the recorded output shows it serving on a local URL.
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


