# Installation

In [None]:
!pip install -q "openvino-dev>=2023.0.0"
!pip install -q "python-ffmpeg<=1.0.16" moviepy transformers onnx
!pip install -q -I "git+https://github.com/garywu007/pytube.git"
!pip install optimum
!pip install git+https://github.com/huggingface/optimum-intel.git
!pip install sounddevice

from IPython import display
display.clear_output()

In [None]:
from pathlib import Path

REPO_DIR = Path("whisper")
if not REPO_DIR.exists():
    !git clone https://github.com/openai/whisper.git -b v20230124
!cd whisper && pip install .
from IPython import display
display.clear_output()

<a id="6"></a>
# 1. Prepare Whisper inference pipeline [&#8657;](#0)

In [None]:
from openvino.runtime import Core
from collections import namedtuple
from functools import partial
from OV_whisper_helper_utils import *
from whisper_preprocess_helper_utils import *

In [None]:
import ipywidgets as widgets
from openvino.runtime import Core

# The dropdown lists the devices reported by the OpenVINO runtime, so a Core
# instance has to exist before the widget is built (previously `core` was only
# created in a later cell, which fails on a fresh kernel).
core = Core()

device = widgets.Dropdown(
    options=core.available_devices + ["AUTO"],
    value='AUTO',
    description='Device:',
    disabled=False,
)
# Show the widget so a device can actually be selected.
device

In [None]:
import whisper

# Load the "base" checkpoint, keep it on the CPU and switch it to eval mode.
model = whisper.load_model("base")
model.to("cpu")
model.eval()

# Drop the original decoder and encoder attributes; later cells assign
# replacements under the same names.
del model.decoder, model.encoder

In [None]:
# Runtime object handed to the OpenVINO encoder/decoder wrappers below.
core = Core()
# Attach the wrapper objects under the attribute names that were deleted from
# the model earlier; both use the device currently selected in the dropdown.
model.encoder = OpenVINOAudioEncoder(core, 'whisper_encoder.xml', device=device.value)
model.decoder = OpenVINOTextDecoder(core, 'whisper_decoder.xml', device=device.value)
# Replace model.decode with the helper `decode`, pre-bound to this model.
model.decode = partial(decode, model)

# Stand-in for model.parameters(): an iterator over a single object that only
# exposes a CPU `.device`.
# NOTE(review): presumably Whisper only reads next(model.parameters()).device — confirm.
# NOTE(review): `torch` is not imported in any visible cell before this one;
# presumably one of the star imports provides it — confirm.
Parameter = namedtuple('Parameter', ['device'])
def parameters():
    """Return an iterator yielding one ``Parameter`` whose device is the CPU."""
    return iter([Parameter(torch.device('cpu'))])
model.parameters = parameters

# Replace model.logits with the helper `logits`, pre-bound to this model.
model.logits = partial(logits, model)

<a id="9"></a>
## 1.1 Run transcription pipeline [&#8657;](#0)

In [None]:
# Load the audio for transcription via the `get_audio` helper.
# NOTE(review): `output_file` is not assigned in any visible cell; it is later
# used with `.with_suffix(...)`, so presumably it is a Path to the input video
# created by a cell that is missing here — confirm before Restart & Run All.
audio = get_audio(output_file)

In [None]:
# Selection widget for the Whisper task; its current value is passed to
# model.transcribe() in the next code cell. Defaults to "translate".
task = widgets.Select(
    options=["transcribe", "translate"],
    value="translate",
    description="Select task:",
    disabled=False
)
# Last expression of the cell, so the widget is rendered.
task

Select(description='Select task:', index=1, options=('transcribe', 'translate'), value='translate')

In [None]:
# Run the patched Whisper model on the loaded audio with the task chosen in
# the widget above.
transcription = model.transcribe(
    audio,
    task=task.value,
    beam_size=5,
    best_of=5,
)




In [None]:
# Turn the transcription result into subtitle lines (`prepare_srt` is a helper
# defined outside this cell).
srt_lines = prepare_srt(transcription)
# Save the subtitles next to the input file. The encoding is set explicitly so
# non-ASCII text is written the same way on every platform instead of
# depending on the locale's default encoding.
with output_file.with_suffix(".srt").open("w", encoding="utf-8") as srt_file:
    srt_file.writelines(srt_lines)

In [None]:
# Embed the source video in the notebook (800x800, no looping).
widgets.Video.from_file(
    output_file,
    loop=False,
    width=800,
    height=800,
)

Video(value=b'\x00\x00\x00\x18ftypmp42\x00\x00\x00\x00isommp42\x00\x00Aimoov\x00\x00\x00lmvhd\x00\x00\x00\x00\…

In [None]:
# Print the subtitle lines back-to-back, exactly as they were written to disk.
print(*srt_lines, sep="")

1
00:00:00,000 --> 00:00:05,000
 Oh, what's that?

2
00:00:05,000 --> 00:00:09,000
 Oh, wow.

3
00:00:09,000 --> 00:00:10,000
 Hello, humans.

4
00:00:13,000 --> 00:00:15,000
 Focus on me.

5
00:00:15,000 --> 00:00:18,000
 Focus on the guard.

6
00:00:18,000 --> 00:00:22,000
 Don't tell anyone what you've seen in here.

7
00:00:22,000 --> 00:00:30,000
 Have you seen what's in there?




# 2. LLAMA

In [None]:
# Authenticate with the Hugging Face Hub; the CLI prompts interactively for an
# access token.
# NOTE(review): presumably needed because the Llama 2 checkpoint used below is
# gated — confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Instruction given to the language model: ask it to write a Stable Diffusion
# prompt around a fixed phrase.
prompt = (
    "Write a prompt for an AI stable diffusion prompt, "
    "building on the phrase 'a beautiful scenic landscape'"
)

In [None]:
from optimum.intel.openvino import OVModelForCausalLM

In [None]:
# One-off export step (marked by the author for deletion after use): fetch the
# Llama 2 7B checkpoint, export it through optimum-intel and store the result
# locally.
from pathlib import Path

model_path = Path('./ir_model')

ov_model = OVModelForCausalLM.from_pretrained(
    'meta-llama/Llama-2-7b-hf',
    export=True,
    compile=False,
)
# NOTE(review): presumably converts the weights to half precision before
# saving — confirm against the optimum-intel documentation.
ov_model.half()
ov_model.save_pretrained(model_path)

Downloading (…)lve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Framework not specified. Using pt to export to ONNX.


Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

In [None]:
from transformers import LlamaTokenizer
from optimum.intel.openvino import OVModelForCausalLM
import time
from pathlib import Path

In [None]:
# Settings that this cell used to read from an `args` object, which is never
# created in the notebook (presumably a leftover from a command-line script).
MODEL_ID = "meta-llama/Llama-2-7b-hf"  # same checkpoint as the export cell
DEVICE = "CPU"
MAX_SEQUENCE_LENGTH = 128  # passed to generate() as max_length

model_path = Path('../quantized_model')

# Reuse a locally saved model when one exists; otherwise export it from the
# Hub and save it so that later runs take the fast path.
if model_path.exists():
    print("--- using local model ---")
    ov_model = OVModelForCausalLM.from_pretrained(model_path, compile=False, device=DEVICE)
else:
    print("--- using remote model ---")
    ov_model = OVModelForCausalLM.from_pretrained(MODEL_ID, compile=False, device=DEVICE, export=True)
    ov_model.save_pretrained(model_path)

ov_model.compile()
tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)

inputs = tokenizer(prompt, return_tensors="pt")
# Time only the generation call. These lines were previously half commented
# out, which left a dangling indented line (a syntax error) and left `start`
# and `generate_ids` undefined for the code below.
start = time.perf_counter()
generate_ids = ov_model.generate(inputs.input_ids,
                                 max_length=MAX_SEQUENCE_LENGTH)
end = time.perf_counter()

print(" --- text decoding --- ")
output_text = tokenizer.batch_decode(generate_ids,
                                     skip_special_tokens=True,
                                     clean_up_tokenization_spaces=False)[0]
print(f"Generation took {end - start:.3f} s on {DEVICE}")
print(f"Response: {output_text}")

In [None]:
# Hard-coded text that replaces whatever `output_text` the generation cell
# produced; this value is what the Stable Diffusion section uses as its prompt.
output_text = "A beautiful scenic landscape filled with flowers"

# 3. Stable Diffusion

In [None]:
from openvino.runtime import Core

# OpenVINO runtime object for this section.
ie = Core()

# Disabled diagnostic: lists every available device with its full device name.
#devices = ie.available_devices
#for device in devices:
#    device_name = ie.get_property(device, "FULL_DEVICE_NAME")
#    print(f"{device}: {device_name}")

In [None]:
# Install optimum-intel with its OpenVINO and diffusers extras, plus ipywidgets.
%pip install -q "optimum-intel[openvino,diffusers]" "ipywidgets"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from optimum.intel.openvino import OVStableDiffusionPipeline

# Pre-converted Stable Diffusion v2.1 (base) model hosted on the Hugging Face Hub.
name = "helenai/stabilityai-stable-diffusion-2-1-base-ov"

# Load without compiling, then reshape the pipeline for a single 512x512
# image per prompt. Compilation happens in a later cell.
pipe = OVStableDiffusionPipeline.from_pretrained(name, compile=False)
pipe.reshape(
    batch_size=1,
    height=512,
    width=512,
    num_images_per_prompt=1,
)

Downloading (…)ain/model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Downloading (…)cheduler_config.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading (…)r/openvino_model.xml:   0%|          | 0.00/2.25M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/520 [00:00<?, ?B/s]

Downloading (…)tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading (…)tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/834 [00:00<?, ?B/s]

Downloading (…)t/openvino_model.xml:   0%|          | 0.00/7.64M [00:00<?, ?B/s]

Downloading (…)r/openvino_model.xml:   0%|          | 0.00/1.13M [00:00<?, ?B/s]

Downloading openvino_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

Downloading openvino_model.bin:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

Downloading openvino_model.bin:   0%|          | 0.00/99.0M [00:00<?, ?B/s]



OVStableDiffusionPipeline {
  "_class_name": "OVStableDiffusionPipeline",
  "_diffusers_version": "0.19.3",
  "feature_extractor": [
    "transformers",
    "CLIPFeatureExtractor"
  ],
  "requires_safety_checker": false,
  "safety_checker": [
    null,
    null
  ],
  "scheduler": [
    "diffusers",
    "PNDMScheduler"
  ],
  "text_encoder": [
    "optimum",
    "OVModelTextEncoder"
  ],
  "text_encoder_2": [
    null,
    null
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "optimum",
    "OVModelUnet"
  ],
  "vae_decoder": [
    "optimum",
    "OVModelVaeDecoder"
  ],
  "vae_encoder": [
    null,
    null
  ]
}

In [None]:
# Select the inference device (the trailing comment records "GPU" as the
# author's alternative), then compile the pipeline's sub-models for it.
pipe.to("CPU") #GPU
pipe.compile()

Compiling the vae_decoder...
Compiling the unet...
Compiling the text_encoder...


In [None]:
# Generate one image from the text produced in the language-model section,
# write it to disk and show it as the cell output.
prompt = output_text
output = pipe(
    prompt,
    num_inference_steps=17,
    output_type="pil",
).images[0]
output.save("image.png")
output

  0%|          | 0/18 [00:00<?, ?it/s]

# 4. CLIP

"query"

In [None]:
from pathlib import Path
from typing import Tuple, Union

from matplotlib import colors
import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
import tqdm
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

In [None]:
import ipywidgets as widgets


def build_saliency_map(image: Image.Image, query: str, n_iters: int = n_iters, min_crop_size=min_crop_size):
    """
    Build and plot a saliency map of ``image`` with respect to ``query``.

    The full image and the query are embedded once to obtain a reference
    similarity. Then ``n_iters`` random crops are submitted asynchronously to
    ``infer_queue``, each with the data its completion callback needs, and the
    resulting map is plotted once all requests have finished.

    Relies on objects defined outside this function: ``processor``,
    ``text_model``, ``image_model``, ``infer_queue``, ``cosine_similarity``,
    ``get_random_crop_params``, ``get_cropped_image`` and ``plot_saliency_map``.

    NOTE(review): the defaults of ``n_iters`` and ``min_crop_size`` are read
    from module-level variables of the same names when this ``def`` executes,
    so those variables must already exist at that point.

    Parameters:
        image: PIL image to analyse.
        query: text to compare the image and its crops against.
        n_iters: number of random crops to evaluate.
        min_crop_size: lower bound forwarded to ``get_random_crop_params``.
    Returns:
        None. The result is displayed through ``plot_saliency_map``.
    """
    # A bare `import tqdm` does not load the `tqdm.notebook` submodule, so
    # import the notebook progress bar explicitly instead of relying on some
    # other package having imported it as a side effect.
    from tqdm.notebook import tqdm as notebook_tqdm

    x_dim, y_dim = image.size
    im_tensor = np.array(image)

    # Process text and full image together, then split off the pixel values so
    # each model receives only its own inputs.
    text_inputs = dict(
        processor(text=[query], images=[im_tensor], return_tensors="np")
    )
    image_inputs = text_inputs.pop("pixel_values")

    text_embeds = text_model(text_inputs)[text_model.output()]
    image_embeds = image_model(image_inputs)[image_model.output()]

    # Similarity of the query to the whole image, handed to every request.
    initial_similarity = cosine_similarity(text_embeds, image_embeds)
    saliency_map = np.zeros((y_dim, x_dim))

    with notebook_tqdm(total=n_iters) as pbar:
        for _ in range(n_iters):
            x, y, crop_size = get_random_crop_params(y_dim, x_dim, min_crop_size)
            im_crop = get_cropped_image(im_tensor, x, y, crop_size)

            image_inputs = processor(images=[im_crop], return_tensors="np")
            # The second argument is per-request user data.
            # NOTE(review): presumably the queue's callback uses it to update
            # `saliency_map` in place and advance `pbar` — confirm.
            infer_queue.start_async(
                image_inputs.pixel_values,
                {
                    "text_embeds": text_embeds,
                    "saliency_map": saliency_map,
                    "initial_similarity": initial_similarity,
                    "x": x,
                    "y": y,
                    "crop_size": crop_size,
                    "pbar": pbar,
                }
            )
        # Block until every queued request has completed.
        infer_queue.wait_all()

    plot_saliency_map(im_tensor, saliency_map, query)

In [None]:
    # NOTE(review): these statements are indented, but no enclosing `def`/`if`
    # header is visible in this cell — presumably the header of a callback was
    # lost. Run as a standalone cell, this would raise IndentationError.
    # `image_bytes` and `query` are also not assigned in any visible cell — confirm.
    # Open the image and normalise it to three-channel RGB.
    image = Image.open(image_bytes)
    image = image.convert("RGB")  # remove transparency channel or convert grayscale 1 channel to 3 channels

    # Compute and plot the saliency map for the query.
    build_saliency_map(image, query, n_iters, min_crop_size)