In [None]:
import requests
import platform
from pathlib import Path

# On macOS (platform.system() reports "Darwin") install NumPy below 2.0.0 into the
# kernel's environment. NOTE(review): the reason for the pin is not visible here — confirm it is still needed.
if platform.system() == "Darwin":
    %pip install -q "numpy<2.0.0"

# Download the shared helper module on first run only; an existing local copy is reused as-is.
if not Path("notebook_utils.py").exists():
    r = requests.get(
        url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
        timeout=60,  # do not hang the notebook forever on a stalled connection
    )
    # Fail loudly on an HTTP error instead of saving an error page as a Python module
    # that the import below would then choke on.
    r.raise_for_status()
    # write_text closes the file handle; UTF-8 matches how Python reads source files,
    # independent of the platform's locale encoding.
    Path("notebook_utils.py").write_text(r.text, encoding="utf-8")

# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry
from notebook_utils import collect_telemetry

collect_telemetry("distil-whisper-asr.ipynb")

In [None]:
import ipywidgets as widgets

# Hugging Face checkpoint ids offered by this notebook, grouped by model family.
# The first entry of each list is the default shown by the model dropdown.
model_ids = {
    "Distil-Whisper": [
        "distil-whisper/distil-large-v2",
        "distil-whisper/distil-large-v3",
        "distil-whisper/distil-medium.en",
        "distil-whisper/distil-small.en",
    ],
    "Whisper": [
        "openai/whisper-large-v3-turbo",
        "openai/whisper-large-v3",
        "openai/whisper-large-v2",
        "openai/whisper-large",
        "openai/whisper-medium",
        "openai/whisper-small",
        "openai/whisper-base",
        "openai/whisper-tiny",
        "openai/whisper-medium.en",
        "openai/whisper-small.en",
        "openai/whisper-base.en",
        "openai/whisper-tiny.en",
    ],
}

# Family selector; the model dropdown in the next cell reads `model_type.value`.
model_type = widgets.Dropdown(
    options=model_ids.keys(),
    value="Distil-Whisper",
    description="Model type:",
    disabled=False,
)

# Bare last expression so the widget is rendered as the cell output.
model_type

In [None]:
# Checkpoint selector for the chosen family. The option list is read from
# `model_type.value` when this cell runs, so re-run the cell after changing the model type.
model_id = widgets.Dropdown(
    options=model_ids[model_type.value],
    value=model_ids[model_type.value][0],
    description="Model:",
    disabled=False,
)

# Bare last expression so the widget is rendered as the cell output.
model_id

In [None]:
# Display the checkpoint id currently selected in the dropdown.
model_id.value

In [None]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Processor of the selected checkpoint; later cells use it to build input features
# and to decode generated token ids back to text.
processor = AutoProcessor.from_pretrained(model_id.value)

# PyTorch model of the same checkpoint, put into eval mode.
# The trailing semicolon suppresses the model repr in the cell output.
pt_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id.value)
pt_model.eval();

In [None]:
from datasets import load_dataset


def extract_input_features(sample):
    """
    Turn one dataset sample into model input features.

    The sample's ``audio`` entry must provide ``array`` (the waveform) and
    ``sampling_rate``. Both are handed to the module-level ``processor``, and the
    ``input_features`` of its result are returned as PyTorch tensors
    (``return_tensors="pt"``).
    """
    audio = sample["audio"]
    processed = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    )
    return processed.input_features


# Validation split of the `librispeech_asr_dummy` test dataset; its first sample is the
# short clip used by the inference and benchmark cells below.
# NOTE(review): trust_remote_code=True allows the dataset repository to run its own loading code — confirm it is still required.
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
sample = dataset[0]
input_features = extract_input_features(sample)

In [None]:
import IPython.display as ipd

# Transcribe the sample with the PyTorch model and decode the generated token ids to text.
predicted_ids = pt_model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

# Audio player, then the dataset's reference text next to the model output.
display(ipd.Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"]))
print(f"Reference: {sample['text']}")
print(f"Result: {transcription[0]}")

In [None]:
#from transformers import AutoModelForSpeechSeq2Seq
from optimum.intel.openvino import OVModelForSpeechSeq2Seq
from transformers import AutoTokenizer, pipeline

# Resolve the checkpoint chosen in the dropdown to a plain string id instead of
# hard-coding one. Later cells use `model_id` as a string (`model_id.replace(...)`,
# f-strings), and the OpenVINO model has to come from the same checkpoint as
# `processor` and `pt_model`, which were loaded from the dropdown selection: a different
# checkpoint can use a different vocabulary, so decoding its token ids with `processor`
# would give wrong text and the PyTorch-vs-OpenVINO timing would not be like-for-like.
# `getattr` keeps the cell safe to re-run once `model_id` is already a string.
model_id = getattr(model_id, "value", model_id)
# No export here: the conversion cell below exports (or reloads) the checkpoint as
# `ov_model`; an additional `from_pretrained(..., export=True)` would only repeat that
# expensive conversion for an object that nothing else uses.

In [None]:
# Display the checkpoint id that the following cells convert to OpenVINO.
model_id

In [None]:
from pathlib import Path
from optimum.intel.openvino import OVModelForSpeechSeq2Seq

# Local directory for the converted model: the checkpoint id with "/" replaced by "_".
model_path = Path(model_id.replace("/", "_"))
# NOTE(review): an empty CACHE_DIR presumably turns off OpenVINO's compiled-model cache — confirm.
ov_config = {"CACHE_DIR": ""}

if not model_path.exists():
    # First run: export the checkpoint to OpenVINO without compiling it yet
    # (device selection and compilation happen in later cells).
    ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        ov_config=ov_config,
        export=True,
        compile=False,
        load_in_8bit=False,
    )
    # NOTE(review): half() presumably converts the weights to FP16 before saving — confirm.
    ov_model.half()
    ov_model.save_pretrained(model_path)
else:
    # Later runs: reuse the model already saved under model_path.
    ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_path, ov_config=ov_config, compile=False)

In [None]:
from notebook_utils import device_widget

# Widget from notebook_utils for picking the inference device; later cells read `device.value`.
device = device_widget()

# Bare last expression so the widget is rendered as the cell output.
device

In [None]:
# Point the OpenVINO model at the selected device, then compile it
# (it was loaded with compile=False).
ov_model.to(device.value)
ov_model.compile()

In [None]:
# Same transcription as the PyTorch cell, now with the OpenVINO model on the same input features.
predicted_ids = ov_model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

# Audio player, then the dataset's reference text next to the model output.
display(ipd.Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"]))
print(f"Reference: {sample['text']}")
print(f"Result: {transcription[0]}")

In [None]:
import time
import numpy as np
from tqdm.notebook import tqdm


def measure_perf(model, sample, n=10):
    """
    Time ``model.generate`` on a single sample and return the median duration.

    The input features are extracted once, outside the timed region; ``generate`` is
    then called ``n`` times and each call is timed with ``time.perf_counter``.

    Returns the median of the ``n`` wall-clock durations, in seconds.
    """
    features = extract_input_features(sample)
    durations = []
    for _ in tqdm(range(n), desc="Measuring performance"):
        began = time.perf_counter()
        model.generate(features)
        durations.append(time.perf_counter() - began)
    return np.median(durations)

In [None]:
# Time both backends on the same sample with measure_perf's default number of runs.
perf_torch = measure_perf(pt_model, sample)
perf_ov = measure_perf(ov_model, sample)

In [None]:
# `measure_perf` returns the median of the timed runs (np.median), so the labels say
# "Median" rather than "Mean". The speedup is the ratio of the two medians.
print(f"Median torch {model_id} generation time: {perf_torch:.3f}s")
print(f"Median openvino {model_id} generation time: {perf_ov:.3f}s")
print(f"Performance {model_id} openvino speedup: {perf_torch / perf_ov:.3f}")

In [None]:
from transformers import pipeline
import torch

# Give the OpenVINO model the same generation settings as the PyTorch model.
ov_model.generation_config = pt_model.generation_config

# Speech-recognition pipeline around the OpenVINO model, reusing the tokenizer and
# feature extractor of `processor`. Long audio is handled in chunks
# (chunk_length_s=15) that are batched (batch_size=16).
pipe = pipeline(
    "automatic-speech-recognition",
    model=ov_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=16,
    device=torch.device("cpu"),
)

In [None]:
# Long-form sample for the chunked pipeline. Note that this rebinds `dataset`, which
# until now held the short dummy split; `sample` itself is left untouched.
# NOTE(review): trust_remote_code=True allows the dataset repository to run its own loading code — confirm it is still required.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation", trust_remote_code=True)
sample_long = dataset[0]


def format_timestamp(seconds: float):
    """
    Format a time offset as an SRT timestamp, ``HH:MM:SS,mmm``.

    Args:
        seconds: non-negative offset in seconds; it is rounded to the nearest millisecond.

    Returns:
        The timestamp string. Hours are always zero-padded to at least two digits
        (``01:01:01,500``, not ``1:01:01,500``), as the SRT timecode format expects.

    Raises:
        AssertionError: if ``seconds`` is negative.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    total_ms = round(seconds * 1000.0)

    # Peel off the units from largest to smallest.
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, milliseconds = divmod(remainder, 1_000)

    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"


def prepare_srt(transcription, fallback_end=None):
    """
    Format a pipeline transcription as the lines of an SRT file.

    Args:
        transcription: mapping with a ``"chunks"`` list; every chunk has a
            ``"timestamp"`` pair ``(start, end)`` in seconds and a ``"text"`` string.
        fallback_end: end time in seconds to use for a chunk whose end timestamp is
            ``None`` (for example the audio duration). When omitted, such a chunk
            ends at its own start time.

    Returns:
        A list of strings, three per chunk: the 1-based cue number, the
        ``start --> end`` line, and the text followed by a blank line. Joining the
        list with ``""`` gives the SRT document.
    """
    segment_lines = []
    for idx, segment in enumerate(transcription["chunks"]):
        timestamps = segment["timestamp"]
        start, end = timestamps[0], timestamps[1]
        # The ASR pipeline can report None as the end of a chunk when the model did not
        # predict an end timestamp (e.g. audio cut off mid-word); format_timestamp
        # cannot handle None, so substitute a usable value instead of crashing.
        if end is None:
            end = fallback_end if fallback_end is not None else start

        segment_lines.append(str(idx + 1) + "\n")
        segment_lines.append(f"{format_timestamp(start)} --> {format_timestamp(end)}\n")
        segment_lines.append(segment["text"] + "\n\n")
    return segment_lines

In [None]:
# Transcribe the long sample with per-chunk timestamps. A copy of the audio dict is passed,
# presumably because the pipeline may modify the dict it receives while `sample_long`
# is still needed for playback below — confirm.
result = pipe(sample_long["audio"].copy(), return_timestamps=True)

In [None]:
# Convert the timestamped chunks into SRT lines.
srt_lines = prepare_srt(result)

# Audio player for the long sample, followed by the generated subtitles as one text block.
display(ipd.Audio(sample_long["audio"]["array"], rate=sample_long["audio"]["sampling_rate"]))
print("".join(srt_lines))

In [None]:
from notebook_utils import quantization_widget

# Widget from notebook_utils; the `%%skip not $to_quantize.value` cells below read
# `to_quantize.value` to decide whether the quantization cells run.
to_quantize = quantization_widget()

# Bare last expression so the widget is rendered as the cell output.
to_quantize

In [None]:
# Fetch `skip_kernel_extension` module
import requests

if not Path("skip_kernel_extension.py").exists():
    r = requests.get(
        url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py",
    )
    open("skip_kernel_extension.py", "w").write(r.text)

%load_ext skip_kernel_extension

In [None]:
%%skip not $to_quantize.value

from itertools import islice
from optimum.intel.openvino.quantization import InferRequestWrapper


def collect_calibration_dataset(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):
    """
    Record the encoder and decoder inputs seen while transcribing calibration audio.

    The ``request`` objects of the encoder and the decoder are temporarily replaced by
    ``InferRequestWrapper`` instances that append what they receive to two lists. The
    first ``calibration_dataset_size`` samples of the streamed LibriSpeech "clean"
    validation split are then run through ``ov_model.generate``. The original request
    objects are put back afterwards, even if generation fails.

    Returns:
        ``(encoder_calibration_data, decoder_calibration_data)``, the two recorded lists.
    """
    recorded_encoder_inputs = []
    recorded_decoder_inputs = []

    # Swap in the recording wrappers; the originals are restored in `finally` below.
    ov_model.encoder.request = InferRequestWrapper(
        ov_model.encoder.request, recorded_encoder_inputs, apply_caching=True
    )
    ov_model.decoder.request = InferRequestWrapper(
        ov_model.decoder.request, recorded_decoder_inputs, apply_caching=True
    )

    try:
        stream = load_dataset(
            "openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True
        )
        first_samples = islice(stream, calibration_dataset_size)
        progress = tqdm(first_samples, desc="Collecting calibration data", total=calibration_dataset_size)
        for item in progress:
            ov_model.generate(extract_input_features(item))
    finally:
        # Each wrapper exposes the object it wraps as `.request`; unwrap both again.
        ov_model.encoder.request = ov_model.encoder.request.request
        ov_model.decoder.request = ov_model.decoder.request.request

    return recorded_encoder_inputs, recorded_decoder_inputs

In [None]:
%%skip not $to_quantize.value

import gc
import shutil
import nncf
import openvino as ov

# Number of audio samples streamed to collect calibration data.
CALIBRATION_DATASET_SIZE = 50
# The quantized model is stored next to the converted one, with a "_quantized" suffix.
quantized_model_path = Path(f"{model_path}_quantized")


def quantize(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):
    """
    Return a quantized copy of ``ov_model``, compiled for the selected device.

    The encoder and the decoder are quantized with NNCF on calibration data collected
    from ``calibration_dataset_size`` samples and saved under ``quantized_model_path``.
    When a complete quantized model is already stored there, quantization is skipped
    and the stored model is loaded instead.

    Args:
        ov_model: the OpenVINO model to quantize; it is only read, not replaced.
        calibration_dataset_size: number of samples used for calibration.

    Returns:
        The quantized ``OVModelForSpeechSeq2Seq``, moved to ``device.value`` and compiled.
    """
    # Files that together make up a loadable quantized model. Checking for the files
    # rather than just the directory means that a run interrupted half-way (directory
    # created, files missing) is repaired by quantizing again instead of failing at load time.
    required_files = ("openvino_encoder_model.xml", "openvino_decoder_model.xml", "config.json")
    cache_complete = all((quantized_model_path / file_name).exists() for file_name in required_files)

    if not cache_complete:
        encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(
            ov_model, calibration_dataset_size
        )
        print("Quantizing encoder")
        quantized_encoder = nncf.quantize(
            ov_model.encoder.model,
            nncf.Dataset(encoder_calibration_data),
            subset_size=len(encoder_calibration_data),
            model_type=nncf.ModelType.TRANSFORMER,
            # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search
            advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.50)
        )
        ov.save_model(quantized_encoder, quantized_model_path / "openvino_encoder_model.xml")
        # Release the large intermediates before the decoder is quantized.
        del quantized_encoder
        del encoder_calibration_data
        gc.collect()

        print("Quantizing decoder with past")
        quantized_decoder = nncf.quantize(
            ov_model.decoder.model,
            nncf.Dataset(decoder_calibration_data),
            subset_size=len(decoder_calibration_data),
            model_type=nncf.ModelType.TRANSFORMER,
            # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search
            advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.95)
        )
        ov.save_model(quantized_decoder, quantized_model_path / "openvino_decoder_model.xml")
        del quantized_decoder
        del decoder_calibration_data
        gc.collect()

        # Copy the model config manually. It is written last, so its presence also
        # marks the quantized directory as complete for the check above.
        shutil.copy(model_path / "config.json", quantized_model_path / "config.json")

    quantized_ov_model = OVModelForSpeechSeq2Seq.from_pretrained(quantized_model_path, ov_config=ov_config, compile=False)
    quantized_ov_model.to(device.value)
    quantized_ov_model.compile()
    return quantized_ov_model


ov_quantized_model = quantize(ov_model, CALIBRATION_DATASET_SIZE)

In [None]:
%%skip not $to_quantize.value

# Reload the short dummy split: `dataset` was rebound to the long-form dataset earlier.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
sample = dataset[0]
input_features = extract_input_features(sample)

# Transcribe the same input features with the original OpenVINO model ...
predicted_ids = ov_model.generate(input_features)
transcription_original = processor.batch_decode(predicted_ids, skip_special_tokens=True)

# ... and with the quantized one.
predicted_ids = ov_quantized_model.generate(input_features)
transcription_quantized = processor.batch_decode(predicted_ids, skip_special_tokens=True)

# Audio player, then both transcriptions one above the other for comparison.
display(ipd.Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"]))
print(f"Original : {transcription_original[0]}")
print(f"Quantized: {transcription_quantized[0]}")

In [None]:


# import time
# from contextlib import contextmanager
# from jiwer import wer, wer_standardize


# TEST_DATASET_SIZE = 50
# MEASURE_TIME = False

# @contextmanager
# def time_measurement():
#     global MEASURE_TIME
#     try:
#         MEASURE_TIME = True
#         yield
#     finally:
#         MEASURE_TIME = False

# def time_fn(obj, fn_name, time_list):
#     original_fn = getattr(obj, fn_name)

#     def wrapper(*args, **kwargs):
#         if not MEASURE_TIME:
#             return original_fn(*args, **kwargs)
#         start_time = time.perf_counter()
#         result = original_fn(*args, **kwargs)
#         end_time = time.perf_counter()
#         time_list.append(end_time - start_time)
#         return result

#     setattr(obj, fn_name, wrapper)

# def calculate_transcription_time_and_accuracy(ov_model, test_samples):
#     encoder_infer_times = []
#     decoder_with_past_infer_times = []
#     whole_infer_times = []
#     time_fn(ov_model, "generate", whole_infer_times)
#     time_fn(ov_model.encoder, "forward", encoder_infer_times)
#     time_fn(ov_model.decoder, "forward", decoder_with_past_infer_times)

#     ground_truths = []
#     predictions = []
#     for data_item in tqdm(test_samples, desc="Measuring performance and accuracy"):
#         input_features = extract_input_features(data_item)

#         with time_measurement():
#             predicted_ids = ov_model.generate(input_features)
#         transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

#         ground_truths.append(data_item["text"])
#         predictions.append(transcription[0])

#     word_accuracy = (1 - wer(ground_truths, predictions, reference_transform=wer_standardize,
#                              hypothesis_transform=wer_standardize)) * 100
#     mean_whole_infer_time = sum(whole_infer_times)
#     mean_encoder_infer_time = sum(encoder_infer_times)
#     mean_decoder_with_time_infer_time = sum(decoder_with_past_infer_times)
#     return word_accuracy, (mean_whole_infer_time, mean_encoder_infer_time, mean_decoder_with_time_infer_time)

# test_dataset = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
# test_dataset = test_dataset.shuffle(seed=42).take(TEST_DATASET_SIZE)
# test_samples = [sample for sample in test_dataset]

# accuracy_original, times_original = calculate_transcription_time_and_accuracy(ov_model, test_samples)
# accuracy_quantized, times_quantized = calculate_transcription_time_and_accuracy(ov_quantized_model, test_samples)
# print(f"Encoder performance speedup: {times_original[1] / times_quantized[1]:.3f}")
# print(f"Decoder with past performance speedup: {times_original[2] / times_quantized[2]:.3f}")
# print(f"Whole pipeline performance speedup: {times_original[0] / times_quantized[0]:.3f}")
# print(f"Whisper transcription word accuracy. Original model: {accuracy_original:.2f}%. Quantized model: {accuracy_quantized:.2f}%.")
# print(f"Accuracy drop: {accuracy_original - accuracy_quantized:.2f}%.")