In [1]:
# %pip install -r requirements.txt

In [2]:
import os
from unidecode import unidecode
from tqdm.auto import tqdm

import numpy as np
import librosa as lr
import noisereduce as nr
from deepspeech import Model, version
import tabulate


Here's a list of sample files to process:

[lang, audio_file, translation]

In [3]:
FILES = [
    # My sentences
    ["en", "Audio_Files/EN/your_sentence1.wav", "Where is my gate?"],
    ["en", "Audio_Files/EN/your_sentence2.wav", "Which country is it?"],

    # Files provided by the coursework    
    ["en", "Audio_Files/EN/checkin.wav", "Where is the check-in desk?"],
    ["en", "Audio_Files/EN/checkin_child.wav", "Where is the check-in desk?"],
    ["en", "Audio_Files/EN/parents.wav", "I have lost my parents."],
    ["en", "Audio_Files/EN/parents_child.wav", "I have lost my parents."],
    ["en", "Audio_Files/EN/suitcase.wav", "Please, I have lost my suitcase."],
    ["en", "Audio_Files/EN/suitcase_child.wav", "Please, I have lost my suitcase."],
    ["en", "Audio_Files/EN/what_time.wav", "What time is my plane?"],
    ["en", "Audio_Files/EN/what_time_child.wav", "What time is my plane?"],
    ["en", "Audio_Files/EN/where.wav", "Where are the restaurants and shops?"],
    ["en", "Audio_Files/EN/where_child.wav", "Where are the restaurants and shops?"],

    ["it", "Audio_Files/IT/checkin_it.wav", "Dove e' il bancone?"],
    ["it", "Audio_Files/IT/parents_it.wav", "Ho perso i miei genitori."],
    ["it", "Audio_Files/IT/suitcase_it.wav", "Per favore, ho perso la mia valigia."],
    ["it", "Audio_Files/IT/what_time_it.wav", "A che ora e’ il mio aereo?"],
    ["it", "Audio_Files/IT/where_it.wav", "Dove sono i ristoranti e i negozi?"],

    ["es", "Audio_Files/ES/checkin_es.wav", "¿Dónde están los mostradores?"],
    ["es", "Audio_Files/ES/parents_es.wav", "He perdido a mis padres."],
    ["es", "Audio_Files/ES/suitcase_es.wav", "Por favor, he perdido mi maleta."],
    ["es", "Audio_Files/ES/what_time_es.wav", "¿A qué hora es mi avión?"],
    ["es", "Audio_Files/ES/where_es.wav", "¿Dónde están los restaurantes y las tiendas?"]
]

A language models for DeepSpeech:

In [4]:
MODELS = [
    ["en", "Models/deepspeech-0.9.3-models.pbmm", "Models/deepspeech-0.9.3-models.scorer"],
    ["it", "Models/output_graph_it.pbmm", "Models/kenlm_it.scorer"],
    ["es", "Models/output_graph_es.pbmm", "Models/kenlm_es.scorer"]
]

In [5]:
LANGUAGES = {
    "en": "English",
    "it": "Italian",
    "es": "Spanish"
}

In [6]:
print("Load language models...")
models = {}
for lang, model, scorer in MODELS:
    print(LANGUAGES[lang])
    ds = Model(model)
    ds.enableExternalScorer(scorer)
    models[lang] = ds

Load language models...
English
Italian
Spanish


A function to calculate a Word error rate (WER):

In [7]:
def calculate_wer(reference, hypothesis):
    ref_words = reference.split()
    ref_words_count = len(ref_words)

    hyp_words = hypothesis.split()
    hyp_words_count = len(hyp_words)

    substitutions = sum(1 for ref, hyp in zip(ref_words, hyp_words) if ref != hyp)
    deletions = ref_words_count - hyp_words_count
    insertions = hyp_words_count - ref_words_count

    # Calculating the Word Error Rate (WER)
    wer = (substitutions + deletions + insertions) / ref_words_count
    return wer

To ensure accurate comparison between reference and hypothesis strings, it is necessary to normalize them, considering that different languages may have distinct conventions for character representation. The normalization process includes the following steps:

* Converting all characters to lowercase.
* Removing leading and trailing whitespace characters from the strings.
* Eliminating all special characters except spaces.

In [8]:
def normalise_text(text):
    text = unidecode(text).lower().strip()

    # remove all special characters    
    return(''.join(c for c in text if c.isalnum() or c == ' '))

Run files through Mozilla DeepSpeech:

In [9]:
results = []

for lang, audio_file, translation in tqdm(FILES):
    ds = models[lang]
    audio, sr = lr.load(audio_file, sr=ds.sampleRate())
    
    audio = (audio * 32767).astype(np.int16) # scale from -1 to 1 to +/-32767
    hypothesis = ds.stt(audio)
    
    print(f"'{audio_file} : '{normalise_text(translation)}' vs '{normalise_text(hypothesis)}'")
    
    results.append([lang, audio_file, calculate_wer(normalise_text(translation), normalise_text(hypothesis))])

HBox(children=(FloatProgress(value=0.0, max=22.0), HTML(value='')))

'Audio_Files/EN/your_sentence1.wav : 'where is my gate' vs 'or is my gate'
'Audio_Files/EN/your_sentence2.wav : 'which country is it' vs 'which countries it'
'Audio_Files/EN/checkin.wav : 'where is the checkin desk' vs 'where is the checking desk'
'Audio_Files/EN/checkin_child.wav : 'where is the checkin desk' vs 'aristeides'
'Audio_Files/EN/parents.wav : 'i have lost my parents' vs 'i had lost my parents'
'Audio_Files/EN/parents_child.wav : 'i have lost my parents' vs 'i had lost my parents'
'Audio_Files/EN/suitcase.wav : 'please i have lost my suitcase' vs 'please i have lost my suitcase'
'Audio_Files/EN/suitcase_child.wav : 'please i have lost my suitcase' vs 'this i had lost my sakes'
'Audio_Files/EN/what_time.wav : 'what time is my plane' vs 'what time is my plan'
'Audio_Files/EN/what_time_child.wav : 'what time is my plane' vs 'what time is my plan'
'Audio_Files/EN/where.wav : 'where are the restaurants and shops' vs 'where are the restaurants and shops'
'Audio_Files/EN/where_chi

Display a table with results:

In [10]:
table_data = [
    [
        LANGUAGES[lang],
        os.path.basename(audio_file),
        "{0:.0%}".format(wer)
    ]
    for lang, audio_file, wer in results
]

table = tabulate.tabulate(table_data, tablefmt='html', headers=["Langugae", "File", "WER"])
table

Langugae,File,WER
English,your_sentence1.wav,25%
English,your_sentence2.wav,50%
English,checkin.wav,20%
English,checkin_child.wav,20%
English,parents.wav,20%
English,parents_child.wav,20%
English,suitcase.wav,0%
English,suitcase_child.wav,50%
English,what_time.wav,20%
English,what_time_child.wav,20%


In [11]:
for code, lang in list(LANGUAGES.items()):
    mean_wer = np.mean([r[2] for r in results if r[0] == code])
    print("Mean WER in {}: {:.0%}".format(lang, mean_wer))

Mean WER in English: 20%
Mean WER in Italian: 32%
Mean WER in Spanish: 24%
