### Evaluate Whisper on ATCO2

In [None]:
from tqdm import tqdm
from transformers import pipeline
from datasets import load_from_disk

from evaluate import load

# these two files come from OpenAI Whisper and are used for text normalization
from basic import *
from english import *

In [2]:
# text normalizer following Whisper's English rules, so that references and
# predictions go through the same normalization before scoring
normalizer = EnglishTextNormalizer()

# definition of the word error rate (WER) metric
wer = load("wer")

In [3]:
# task identifier handed to `pipeline(task=...)` when the pipeline is built
TASK = "automatic-speech-recognition"
# model identifier handed to `pipeline(model=...)`
# NOTE(review): presumably a Hugging Face Hub repo id — confirm
MODEL_LABEL = "luigisaetta/whisper-atco2-medium"

# local directory the dataset is read from with `load_from_disk`
HF_DIR = "atco2_hf"

In [4]:
# load the dataset from the local directory given by HF_DIR
atco2_hf = load_from_disk(HF_DIR)

In [5]:
# keep only the "test" split: it is the one the model is evaluated on
ds_test = atco2_hf["test"]

# bare last expression: displays the split summary (features and row count)
ds_test

Dataset({
    features: ['path', 'audio', 'sentence'],
    num_rows: 56
})

In [6]:
# build the speech recognition pipeline once; `transcribe` below reuses it
pipe = pipeline(model=MODEL_LABEL, task=TASK)


def transcribe(audio):
    """Run the module-level `pipe` on `audio` and return the "text" entry of its result."""
    return pipe(audio)["text"]

#### Loop over the whole test dataset and compute the transcriptions

In [11]:
expected, predicted = [], []

# walk through every sample of the test split
for sample in tqdm(ds_test):
    # the sentences stored in the local HF dataset are NOT normalized, so the
    # reference gets the same normalization rules as Whisper to obtain the right WER
    reference = normalizer(sample["sentence"])
    expected.append(reference)

    # the model was trained on luigisaetta/atco2_normalized_augmented, so its
    # output should already be normalized; the prediction is still passed
    # through the normalizer, exactly like the reference
    hypothesis = normalizer(transcribe(sample["audio"]))
    predicted.append(hypothesis)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [09:14<00:00,  9.89s/it]


#### Compute WER

In [12]:
# score the normalized predictions against the normalized references
wer_score = wer.compute(references=expected, predictions=predicted)

print(f"WER computed on test set is {round(wer_score, 2)}.")

WER computed on test set is 0.01.


On this dataset the WER is really good.

Maybe we're overfitting...