In [4]:
%pip install transformers unidecode tabulate numpy

Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: unidecode
Successfully installed unidecode-1.3.6


In [18]:
import os
from tqdm.auto import tqdm

import soundfile
import numpy as np
from unidecode import unidecode
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import tabulate
import librosa

In [13]:
FILES = [
    # My sentences
    ["en", "Audio_Files/EN/your_sentence1.wav", "Where is my gate?"],
    ["en", "Audio_Files/EN/your_sentence2.wav", "Which country is it?"],

    # Files provided by the coursework
    ["en", "Audio_Files/EN/checkin.wav", "Where is the check-in desk?"],
    ["en", "Audio_Files/EN/checkin_child.wav", "Where is the check-in desk?"],
    ["en", "Audio_Files/EN/parents.wav", "I have lost my parents."],
    ["en", "Audio_Files/EN/parents_child.wav", "I have lost my parents."],
    ["en", "Audio_Files/EN/suitcase.wav", "Please, I have lost my suitcase."],
    ["en", "Audio_Files/EN/suitcase_child.wav", "Please, I have lost my suitcase."],
    ["en", "Audio_Files/EN/what_time.wav", "What time is my plane?"],
    ["en", "Audio_Files/EN/what_time_child.wav", "What time is my plane?"],
    ["en", "Audio_Files/EN/where.wav", "Where are the restaurants and shops?"],
    ["en", "Audio_Files/EN/where_child.wav", "Where are the restaurants and shops?"],
]

In [22]:
LANGUAGES = {
    "en": "English",
    "it": "Italian",
    "es": "Spanish"
}

In [8]:
def calculate_wer(reference, hypothesis):
    ref_words = reference.split()
    ref_words_count = len(ref_words)

    hyp_words = hypothesis.split()
    hyp_words_count = len(hyp_words)

    substitutions = sum(1 for ref, hyp in zip(ref_words, hyp_words) if ref != hyp)
    deletions = ref_words_count - hyp_words_count
    insertions = hyp_words_count - ref_words_count

    # Calculating the Word Error Rate (WER)
    wer = (substitutions + deletions + insertions) / ref_words_count
    return wer

In [9]:
def normalise_text(text):
    text = unidecode(text).lower().strip()

    # remove all special characters
    return(''.join(c for c in text if c.isalnum() or c == ' '))

In [10]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
results = []

for lang, audio_file, translation in tqdm(FILES):
      data, sr = soundfile.read(audio_file)
      data = librosa.resample(data, orig_sr=sr, target_sr=16000)
      inputs = processor(data, sampling_rate=16000, return_tensors="pt")
      with torch.no_grad():
          y = model(**inputs)
          logits = y.logits
          ids = torch.argmax(logits, dim=-1)
          hypothesis = processor.batch_decode(ids)[0]
          print(f"'{audio_file} : '{normalise_text(translation)}' vs '{normalise_text(hypothesis)}'")
          results.append([lang, audio_file, calculate_wer(normalise_text(translation), normalise_text(hypothesis))])

  0%|          | 0/12 [00:00<?, ?it/s]

'Audio_Files/EN/your_sentence1.wav : 'where is my gate' vs 'where is my gat'
'Audio_Files/EN/your_sentence2.wav : 'which country is it' vs 'which country is it'
'Audio_Files/EN/checkin.wav : 'where is the checkin desk' vs 'where is the checken desk'
'Audio_Files/EN/checkin_child.wav : 'where is the checkin desk' vs 'where is the jackin guess'
'Audio_Files/EN/parents.wav : 'i have lost my parents' vs 'i have lost my parenis'
'Audio_Files/EN/parents_child.wav : 'i have lost my parents' vs 'have lost my parentts'
'Audio_Files/EN/suitcase.wav : 'please i have lost my suitcase' vs 'please owi have lost my siccesse'
'Audio_Files/EN/suitcase_child.wav : 'please i have lost my suitcase' vs 'dreas i lost my threet case'
'Audio_Files/EN/what_time.wav : 'what time is my plane' vs 'what time is my playing'
'Audio_Files/EN/what_time_child.wav : 'what time is my plane' vs 'wot tine is my gra'
'Audio_Files/EN/where.wav : 'where are the restaurants and shops' vs 'where are the restaurats and shops'
'A

In [23]:
table_data = [
    [
        LANGUAGES[lang],
        os.path.basename(audio_file),
        "{0:.0%}".format(wer)
    ]
    for lang, audio_file, wer in results
]

table = tabulate.tabulate(table_data, tablefmt="grid", headers=["Langugae", "File", "WER"])
print(table)

+------------+---------------------+-------+
| Langugae   | File                | WER   |
| English    | your_sentence1.wav  | 25%   |
+------------+---------------------+-------+
| English    | your_sentence2.wav  | 0%    |
+------------+---------------------+-------+
| English    | checkin.wav         | 20%   |
+------------+---------------------+-------+
| English    | checkin_child.wav   | 40%   |
+------------+---------------------+-------+
| English    | parents.wav         | 20%   |
+------------+---------------------+-------+
| English    | parents_child.wav   | 80%   |
+------------+---------------------+-------+
| English    | suitcase.wav        | 33%   |
+------------+---------------------+-------+
| English    | suitcase_child.wav  | 83%   |
+------------+---------------------+-------+
| English    | what_time.wav       | 20%   |
+------------+---------------------+-------+
| English    | what_time_child.wav | 60%   |
+------------+---------------------+-------+
| English 

In [24]:
mean_wer = np.mean([r[2] for r in results])
print("Mean WER: {:.0%}".format(mean_wer))

Mean WER: 39%
