# Whisper and Distil-Whisper model evaluation

Here we evaluate the Whisper and Distil-Whisper speech-to-text models on the Common Voice 13 dataset.

In order to run this notebook, first the [267-distil-whisper-asr](../../../notebooks/267-distil-whisper-asr/267-distil-whisper-asr.ipynb) notebook needs to be run to export `openai/whisper-large-v2` and `distil-whisper/distil-large-v2` models.

In [1]:
import ipywidgets as widgets
from openvino import Core
core = Core()

# Offer every device reported by OpenVINO plus the "AUTO" option; CPU is preselected.
device = widgets.Dropdown(
    options=core.available_devices + ["AUTO"],
    value='CPU',
    description='Device:',
    disabled=False,
)

# Render the dropdown as the cell output. Without this the widget is never shown,
# so `device.value` (read by the model loading cells) could not be changed from 'CPU'.
device

In [2]:
# Samples per second: audio is resampled to this rate and the same value is
# passed to the processors and used to compute clip durations.
SAMPLING_RATE = 16_000

# Filled by the loading cells with (model, processor, display name) tuples.
AVAILABLE_MODELS = list()

In [3]:
from IPython.display import display

def _load_checkbox(description):
    """Create an enabled, pre-ticked checkbox carrying the given label."""
    return widgets.Checkbox(value=True, description=description, disabled=False)


# One checkbox per model variant; the loading cells read `.value` of each.
load_whisper_model_pt = _load_checkbox('Load Whisper large-v2 PyTorch')
load_whisper_model_ov = _load_checkbox('Load Whisper large-v2 OpenVINO')
load_whisper_model_ov_int8 = _load_checkbox('Load Whisper large-v2 OpenVINO Quantized')

load_distil_whisper_model_pt = _load_checkbox('Load Distil-Whisper large-v2 PyTorch')
load_distil_whisper_model_ov = _load_checkbox('Load Distil-Whisper large-v2 OpenVINO')
load_distil_whisper_model_ov_int8 = _load_checkbox('Load Distil-Whisper large-v2 OpenVINO Quantized')

# Render the checkboxes in the same order as they are declared.
for checkbox in (
    load_whisper_model_pt,
    load_whisper_model_ov,
    load_whisper_model_ov_int8,
    load_distil_whisper_model_pt,
    load_distil_whisper_model_ov,
    load_distil_whisper_model_ov_int8,
):
    display(checkbox)

Checkbox(value=True, description='Load Whisper large-v2 PyTorch')

Checkbox(value=True, description='Load Whisper large-v2 OpenVINO')

Checkbox(value=True, description='Load Whisper large-v2 OpenVINO Quantized')

Checkbox(value=True, description='Load Distil-Whisper large-v2 PyTorch')

Checkbox(value=True, description='Load Distil-Whisper large-v2 OpenVINO')

Checkbox(value=True, description='Load Distil-Whisper large-v2 OpenVINO Quantized')

## Whisper

In [4]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# PyTorch Whisper

whisper_model_id = "openai/whisper-large-v2"

# The processor is shared by every Whisper variant, so it is loaded as soon as
# any of the three Whisper checkboxes is ticked.
whisper_requested = any((
    load_whisper_model_pt.value,
    load_whisper_model_ov.value,
    load_whisper_model_ov_int8.value,
))
if whisper_requested:
    whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)

if load_whisper_model_pt.value:
    # Load the PyTorch model, switch it to eval mode and register it for evaluation.
    whisper_model_pt = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_id)
    whisper_model_pt = whisper_model_pt.eval()
    AVAILABLE_MODELS.append((whisper_model_pt, whisper_processor, "Whisper large-v2 PyTorch"))

In [5]:
from pathlib import Path
from optimum.intel.openvino import OVModelForSpeechSeq2Seq

# OV FP32 Whisper

# Directory of the notebook that exported the OpenVINO models.
DISTIL_WHISPER_DIR = Path("../../../notebooks/267-distil-whisper-asr")
# The export directory is named after the model id with '/' replaced by '_'.
WHISPER_OV_PATH = DISTIL_WHISPER_DIR.joinpath(whisper_model_id.replace('/', '_'))


if load_whisper_model_ov.value:
    # Load without compiling, move to the selected device, then compile once.
    whisper_model_ov_fp32 = OVModelForSpeechSeq2Seq.from_pretrained(WHISPER_OV_PATH, compile=False)
    whisper_model_ov_fp32 = whisper_model_ov_fp32.to(device.value)
    whisper_model_ov_fp32.compile()
    AVAILABLE_MODELS.append((whisper_model_ov_fp32, whisper_processor, "Whisper large-v2 OpenVINO"))

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda-11.7'
Compiling the encoder to CPU ...
Compiling the decoder to CPU ...
Compiling the decoder to CPU ...


In [6]:
# OV INT8 Whisper

# The quantized export sits next to the FP32 one, with a "_quantized" suffix.
WHISPER_OV_INT8_PATH = Path(str(WHISPER_OV_PATH) + "_quantized")

if load_whisper_model_ov_int8.value:
    # Load without compiling, move to the selected device, then compile once.
    whisper_model_ov_int8 = OVModelForSpeechSeq2Seq.from_pretrained(WHISPER_OV_INT8_PATH, compile=False)
    whisper_model_ov_int8 = whisper_model_ov_int8.to(device.value)
    whisper_model_ov_int8.compile()
    AVAILABLE_MODELS.append((whisper_model_ov_int8, whisper_processor, "Whisper large-v2 OpenVINO Quantized"))

Compiling the encoder to CPU ...
Compiling the decoder to CPU ...
Compiling the decoder to CPU ...


In [7]:
def recognize_audio(model, processor, audio):
    """
    Transcribe a single audio clip.

    Parameters:
      model: object exposing `generate(input_features)`
      processor: callable feature extractor that also provides `batch_decode`
      audio: audio samples, passed to the processor with SAMPLING_RATE
    Returns:
      the first decoded sequence, with special tokens skipped
    """
    features = processor(audio, sampling_rate=SAMPLING_RATE, return_tensors="pt")
    token_ids = model.generate(features.input_features)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

## Distil Whisper

In [8]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# PyTorch Distil Whisper

distil_whisper_model_id = "distil-whisper/distil-large-v2"

# The processor is shared by every Distil-Whisper variant, so it is loaded as
# soon as any of the three Distil-Whisper checkboxes is ticked.
distil_whisper_requested = any((
    load_distil_whisper_model_pt.value,
    load_distil_whisper_model_ov.value,
    load_distil_whisper_model_ov_int8.value,
))
if distil_whisper_requested:
    distil_whisper_processor = AutoProcessor.from_pretrained(distil_whisper_model_id)

if load_distil_whisper_model_pt.value:
    # Load the PyTorch model, switch it to eval mode and register it for evaluation.
    distil_whisper_model_pt = AutoModelForSpeechSeq2Seq.from_pretrained(distil_whisper_model_id)
    distil_whisper_model_pt = distil_whisper_model_pt.eval()
    AVAILABLE_MODELS.append((distil_whisper_model_pt, distil_whisper_processor, "Distil-Whisper large-v2 PyTorch"))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
from optimum.intel.openvino import OVModelForSpeechSeq2Seq

# OV FP32 Distil Whisper

# The export directory is named after the model id with '/' replaced by '_'.
DISTIL_WHISPER_OV_PATH = DISTIL_WHISPER_DIR.joinpath(distil_whisper_model_id.replace('/', '_'))

if load_distil_whisper_model_ov.value:
    # Load without compiling, move to the selected device, then compile once.
    distil_whisper_model_ov_fp32 = OVModelForSpeechSeq2Seq.from_pretrained(DISTIL_WHISPER_OV_PATH, compile=False)
    distil_whisper_model_ov_fp32 = distil_whisper_model_ov_fp32.to(device.value)
    distil_whisper_model_ov_fp32.compile()
    AVAILABLE_MODELS.append((distil_whisper_model_ov_fp32, distil_whisper_processor, "Distil-Whisper large-v2 OpenVINO"))

Compiling the encoder to CPU ...
Compiling the decoder to CPU ...
Compiling the decoder to CPU ...


In [10]:
# OV INT8 Distil Whisper

# The quantized export sits next to the FP32 one, with a "_quantized" suffix.
DISTIL_WHISPER_OV_INT8_PATH = Path(str(DISTIL_WHISPER_OV_PATH) + "_quantized")

if load_distil_whisper_model_ov_int8.value:
    # Load without compiling, move to the selected device, then compile once.
    distil_whisper_model_ov_int8 = OVModelForSpeechSeq2Seq.from_pretrained(DISTIL_WHISPER_OV_INT8_PATH, compile=False)
    distil_whisper_model_ov_int8 = distil_whisper_model_ov_int8.to(device.value)
    distil_whisper_model_ov_int8.compile()
    AVAILABLE_MODELS.append((distil_whisper_model_ov_int8, distil_whisper_processor, "Distil-Whisper large-v2 OpenVINO Quantized"))

Compiling the encoder to CPU ...
Compiling the decoder to CPU ...
Compiling the decoder to CPU ...


## Prepare test dataset

To avoid downloading all splits for all languages of `common_voice_13`, the cell below caches only the needed part of the `en/test` split. This is performed only once.

In [11]:
from huggingface_hub import login
import os

# Use the token from the HF_TOKEN environment variable when it is set;
# otherwise call login() without arguments.
hf_token = os.environ.get("HF_TOKEN")
if hf_token is None:
    login()
else:
    login(hf_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/nsavel/.cache/huggingface/token
Login successful


In [12]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm

def resample(audio, src_sample_rate, dst_sample_rate):
    """
    Resample audio to specific sample rate using linear interpolation

    Parameters:
      audio: input audio signal (array whose first axis indexes samples)
      src_sample_rate: source audio sample rate
      dst_sample_rate: destination audio sample rate
    Returns:
      resampled_audio: input audio signal resampled with dst_sample_rate.
        When both rates are equal the input is returned unchanged; otherwise
        a float32 array is returned (empty if the input or the result is empty).
    """
    if src_sample_rate == dst_sample_rate:
        return audio
    num_src_samples = audio.shape[0]
    duration = num_src_samples / src_sample_rate
    # Only the output length is needed; no scratch buffer has to be allocated.
    num_dst_samples = int(duration * dst_sample_rate)
    if num_src_samples == 0 or num_dst_samples == 0:
        # np.interp raises ValueError on an empty sample grid, so return the
        # empty result directly.
        return np.zeros(0, dtype=np.float32)
    # Timestamps of the source and destination samples over the same duration.
    x_old = np.linspace(0, duration, num_src_samples, dtype=np.float32)
    x_new = np.linspace(0, duration, num_dst_samples, dtype=np.float32)
    resampled_audio = np.interp(x_new, x_old, audio)
    return resampled_audio.astype(np.float32)

def save_parquet(df: pd.DataFrame, save_filepath: Path):
    """Write `df` to `save_filepath` with the fastparquet engine, appending when the file already exists."""
    # `append=True` is only passed for an existing file; a new file is written with defaults.
    extra_kwargs = {"append": True} if save_filepath.exists() else {}
    df.to_parquet(save_filepath, engine='fastparquet', **extra_kwargs)


# Expected number of samples in the streamed split; checked after caching.
TEST_DATASET_SIZE = 16372
# Number of rows buffered in memory before they are flushed to the parquet file.
chunk_size = 1000
test_dataset = load_dataset("mozilla-foundation/common_voice_13_0", "en", split="test", streaming=True)
dataset_filepath = Path("common_voice_13_0_en_test.parquet")

if not dataset_filepath.exists():
    columns = ["audio_bytes", "transcription"]
    # Build the cache under a temporary name and rename it only after the size
    # check passes. Otherwise an interrupted run would leave a truncated file
    # that later runs would treat as a complete cache.
    partial_filepath = dataset_filepath.with_name(dataset_filepath.name + ".partial")
    if partial_filepath.exists():
        # Left over from an earlier interrupted run; start from scratch.
        partial_filepath.unlink()

    rows = []
    total_size = 0
    for data_item in tqdm(test_dataset, total=TEST_DATASET_SIZE, desc="Preparing test dataset"):
        audio_data = data_item["audio"]
        audio = resample(audio_data["array"].astype(np.float32), audio_data["sampling_rate"], SAMPLING_RATE)
        # Audio is stored as raw float32 bytes next to its reference sentence.
        rows.append((audio.tobytes(), data_item["sentence"]))
        if len(rows) == chunk_size:
            total_size += len(rows)
            save_parquet(pd.DataFrame(rows, columns=columns), partial_filepath)
            rows = []
    if rows:
        # Flush the final, shorter chunk.
        total_size += len(rows)
        save_parquet(pd.DataFrame(rows, columns=columns), partial_filepath)
    assert total_size == TEST_DATASET_SIZE, f"Acquired dataset size does not equal the expected size: {total_size} {TEST_DATASET_SIZE}"
    partial_filepath.rename(dataset_filepath)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


## Inference

In [None]:
import pyarrow.parquet as pq
import time
from datetime import datetime
from itertools import islice
from jiwer import wer, wer_standardize
import warnings
warnings.filterwarnings("ignore")


def calculate_transcription_time_and_accuracy(model, processor, dataset_size, model_name):
    """
    Transcribe samples from the cached test set and measure accuracy and speed.

    Parameters:
      model: model passed to `recognize_audio`
      processor: processor passed to `recognize_audio`
      dataset_size: number of samples to evaluate; -1 means TEST_DATASET_SIZE
      model_name: label shown on the progress bar
    Returns:
      (word accuracy in percent, computed as (1 - WER) * 100,
       total audio duration divided by total transcription time)
    """
    if dataset_size == -1:
        dataset_size = TEST_DATASET_SIZE

    references, hypotheses = [], []
    audio_seconds = 0
    inference_seconds = 0

    # Read the cache one row at a time, stopping after `dataset_size` rows.
    row_batches = pq.ParquetFile(dataset_filepath).iter_batches(batch_size=1)
    for batch in tqdm(islice(row_batches, dataset_size), desc=model_name, total=dataset_size):
        row = batch.to_pandas().iloc[0].to_dict()
        # Audio was stored as raw float32 bytes.
        audio = np.frombuffer(row["audio_bytes"], dtype=np.float32)
        audio_seconds += audio.shape[0] / SAMPLING_RATE

        # Only the recognition call itself is timed.
        started_at = time.perf_counter()
        hypothesis = recognize_audio(model, processor, audio)
        finished_at = time.perf_counter()
        inference_seconds += finished_at - started_at

        references.append(row["transcription"])
        hypotheses.append(hypothesis)

    error_rate = wer(
        references,
        hypotheses,
        reference_transform=wer_standardize,
        hypothesis_transform=wer_standardize,
    )
    word_accuracy = (1 - error_rate) * 100
    return word_accuracy, audio_seconds / inference_seconds

def evaluate(dataset_size, print_intermediate_results=False):
    """
    Evaluate every entry of AVAILABLE_MODELS and store the results.

    Creates `results/eval_<dataset_size>_<timestamp>/` containing `result.csv`
    (one row per model) and `log.txt` (the printed messages).

    Parameters:
      dataset_size: number of samples forwarded to
        `calculate_transcription_time_and_accuracy`
      print_intermediate_results: when True, print and log each model's result
        as soon as it is available
    """
    # NOTE(review): the timestamp renders as "YYYY-MM-DD HH:MM:SS"; the ':' is
    # not a valid path character on Windows - confirm the target platforms.
    save_dir = Path(f"results/eval_{dataset_size}_{datetime.now().replace(microsecond=0)}")
    save_dir.mkdir(parents=True, exist_ok=True)

    def left_aligned(width):
        # Return a formatter that pads a string to `width` characters, left-justified.
        return lambda value: f"{value:<{width}s}"

    # The context manager closes the log file even if evaluating a model raises.
    with open(save_dir / "log.txt", "a") as log_file:
        df = pd.DataFrame(columns=["Model", "Accuracy", "Real-time Performance"])
        for model, processor, model_name in AVAILABLE_MODELS:
            accuracy, mean_performace_coefficient = \
                calculate_transcription_time_and_accuracy(model, processor, dataset_size, model_name)
            df.loc[len(df)] = (model_name, f"{accuracy:.2f}%", f"{mean_performace_coefficient:.2f}x")
            if print_intermediate_results:
                log_msg = f"{model_name}: {accuracy:.2f}% {mean_performace_coefficient:.2f}x"
                print(log_msg)
                log_file.write(log_msg + '\n')
                # Flush so the partial results are on disk while later models still run.
                log_file.flush()

        # Pad every text column to its widest entry so the printed table is left-aligned.
        formatters = {
            col: left_aligned(df[col].str.len().max())
            for col in df.select_dtypes("object")
        }
        df.to_csv(save_dir / "result.csv", index=False)
        log_msg = df.to_string(index=False, formatters=formatters, justify="left")
        print(log_msg)
        log_file.write(log_msg + '\n')

# Evaluate on progressively larger subsets first, then on the whole test set
# (-1), printing and logging each model's result during the full run.
for subset_size in (1, 10, 100):
    evaluate(dataset_size=subset_size)
evaluate(dataset_size=-1, print_intermediate_results=True)