# SpeechBrain

Evaluation of the SpeechBrain toolkit on the provided dataset.

BEGIN: I wrote this code personally without assistance. Any fragments taken from external sources will be explicitly marked.

In [26]:
# Install the notebook's third-party dependencies with the %pip magic.
# Only speechbrain is pinned to an exact version; the other packages are unpinned.
# NOTE(review): tqdm is imported below but is not listed here - presumably it is
# installed transitively by one of these packages; confirm or add it explicitly.
%pip install speechbrain==0.5.14 transformers unidecode tabulate numpy



In [27]:
import os
from tqdm.auto import tqdm

import numpy as np
from unidecode import unidecode
from speechbrain.pretrained import EncoderDecoderASR
import tabulate

In [7]:
# Load the pretrained English wav2vec2 CommonVoice recogniser.
# `source` names the pretrained model to fetch and `savedir` is the local
# directory handed to SpeechBrain for the model files.
asr_model = EncoderDecoderASR.from_hparams(
    savedir="./speechbrain",
    source="speechbrain/asr-wav2vec2-commonvoice-en",
)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-large-lv60 were not used when initializing Wav2Vec2Model: ['project_hid.weight', 'project_hid.bias', 'quantizer.weight_proj.bias', 'project_q.bias', 'project_q.weight', 'quantizer.codevectors', 'quantizer.weight_proj.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading wav2vec2.ckpt:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Downloading asr.ckpt:   0%|          | 0.00/64.9M [00:00<?, ?B/s]

Downloading tokenizer.ckpt:   0%|          | 0.00/253k [00:00<?, ?B/s]

In [16]:
# Evaluation clips as [language code, audio path, reference sentence] rows.
# Every clip listed here is English and is stored under Audio_Files/EN/, so the
# rows are generated from (file stem, reference sentence) pairs.
FILES = [
    ["en", f"Audio_Files/EN/{stem}.wav", sentence]
    for stem, sentence in (
        # My sentences
        ("your_sentence1", "Where is my gate?"),
        ("your_sentence2", "Which country is it?"),

        # Files provided by the coursework
        ("checkin", "Where is the check-in desk?"),
        ("checkin_child", "Where is the check-in desk?"),
        ("parents", "I have lost my parents."),
        ("parents_child", "I have lost my parents."),
        ("suitcase", "Please, I have lost my suitcase."),
        ("suitcase_child", "Please, I have lost my suitcase."),
        ("what_time", "What time is my plane?"),
        ("what_time_child", "What time is my plane?"),
        ("where", "Where are the restaurants and shops?"),
        ("where_child", "Where are the restaurants and shops?"),
    )
]

In [20]:
# Display names for the language codes used in the first column of FILES.
LANGUAGES = dict(
    en="English",
    it="Italian",
    es="Spanish",
)

In [9]:
def calculate_wer(reference, hypothesis):
    """Return the Word Error Rate (WER) of `hypothesis` against `reference`.

    WER is the minimum number of word-level substitutions, deletions and
    insertions needed to turn the reference into the hypothesis (the
    Levenshtein distance over words), divided by the number of reference
    words. Both strings are split on whitespace; no other normalisation is
    applied here.

    Args:
        reference: The ground-truth sentence.
        hypothesis: The recognised sentence to score.

    Returns:
        The WER as a float. 0.0 means a perfect match; the value can exceed
        1.0 when the hypothesis contains many inserted words.

    Raises:
        ValueError: If the reference has no words but the hypothesis does,
            because the ratio is undefined in that case.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()

    if not ref_words:
        if not hyp_words:
            # Nothing expected and nothing recognised: no errors.
            return 0.0
        raise ValueError(
            "WER is undefined for an empty reference with a non-empty hypothesis"
        )

    # Row-by-row dynamic programme: previous_row[j] holds the edit distance
    # between the reference words consumed so far and hyp_words[:j].
    previous_row = list(range(len(hyp_words) + 1))
    for ref_index, ref_word in enumerate(ref_words, start=1):
        # Turning ref_index reference words into an empty hypothesis costs
        # ref_index deletions.
        current_row = [ref_index]
        for hyp_index, hyp_word in enumerate(hyp_words, start=1):
            substitution = previous_row[hyp_index - 1] + (ref_word != hyp_word)
            deletion = previous_row[hyp_index] + 1
            insertion = current_row[hyp_index - 1] + 1
            current_row.append(min(substitution, deletion, insertion))
        previous_row = current_row

    return previous_row[-1] / len(ref_words)

In [10]:
def normalise_text(text):
    """Reduce `text` to lower-case alphanumeric words separated by spaces.

    The text is passed through `unidecode`, lower-cased and stripped of
    leading/trailing whitespace. Every remaining character that is neither
    alphanumeric nor a plain space is then dropped. Characters are removed
    rather than replaced by a space, so a hyphenated word such as "check-in"
    becomes the single token "checkin".
    """
    cleaned = unidecode(text).lower().strip()

    # Keep letters, digits and plain spaces; discard everything else.
    kept_chars = [ch for ch in cleaned if ch == ' ' or ch.isalnum()]
    return ''.join(kept_chars)

In [17]:
# Transcribe every clip in FILES and score it against its reference sentence.
# Each entry appended to `results` is [language code, audio path, WER].
results = []

for lang, audio_file, translation in tqdm(FILES):
    # `translation` is the reference sentence for this clip.
    hypothesis = asr_model.transcribe_file(audio_file)

    # Normalise each text once and reuse it for both the log line and the score.
    reference_text = normalise_text(translation)
    hypothesis_text = normalise_text(hypothesis)

    print(f"'{audio_file}' : '{reference_text}' vs '{hypothesis_text}'")

    results.append([lang, audio_file, calculate_wer(reference_text, hypothesis_text)])

  0%|          | 0/12 [00:00<?, ?it/s]

'Audio_Files/EN/your_sentence1.wav : 'where is my gate' vs 'where is my gate'
'Audio_Files/EN/your_sentence2.wav : 'which country is it' vs 'which country is it'
'Audio_Files/EN/checkin.wav : 'where is the checkin desk' vs 'where is the check in desk'
'Audio_Files/EN/checkin_child.wav : 'where is the checkin desk' vs 'where is the checking desk'
'Audio_Files/EN/parents.wav : 'i have lost my parents' vs 'i have lost my parents'
'Audio_Files/EN/parents_child.wav : 'i have lost my parents' vs 'i had lost my parents'
'Audio_Files/EN/suitcase.wav : 'please i have lost my suitcase' vs 'please i have lost my suitcase'
'Audio_Files/EN/suitcase_child.wav : 'please i have lost my suitcase' vs 'please i ve lost my suitcase'
'Audio_Files/EN/what_time.wav : 'what time is my plane' vs 'what time is my plane'
'Audio_Files/EN/what_time_child.wav : 'what time is my plane' vs 'what time is my play'
'Audio_Files/EN/where.wav : 'where are the restaurants and shops' vs 'where are the restaurants and shops'

In [24]:
# Build one display row per clip: language name, bare file name and the WER
# formatted as a whole-number percentage.
table_data = [
    [
        LANGUAGES[lang],
        os.path.basename(audio_file),
        "{0:.0%}".format(wer)
    ]
    for lang, audio_file, wer in results
]

# Render the rows as a grid table with a header row.
table = tabulate.tabulate(table_data, tablefmt="grid", headers=["Language", "File", "WER"])
print(table)

+------------+---------------------+-------+
| Langugae   | File                | WER   |
| English    | your_sentence1.wav  | 0%    |
+------------+---------------------+-------+
| English    | your_sentence2.wav  | 0%    |
+------------+---------------------+-------+
| English    | checkin.wav         | 40%   |
+------------+---------------------+-------+
| English    | checkin_child.wav   | 20%   |
+------------+---------------------+-------+
| English    | parents.wav         | 0%    |
+------------+---------------------+-------+
| English    | parents_child.wav   | 20%   |
+------------+---------------------+-------+
| English    | suitcase.wav        | 0%    |
+------------+---------------------+-------+
| English    | suitcase_child.wav  | 17%   |
+------------+---------------------+-------+
| English    | what_time.wav       | 0%    |
+------------+---------------------+-------+
| English    | what_time_child.wav | 20%   |
+------------+---------------------+-------+
| English 

In [28]:
# Each results row is [language code, audio path, WER]; average the WER column
# and report it as a whole-number percentage.
mean_wer = np.mean([wer for _lang, _audio_file, wer in results])
print("Mean WER: {:.0%}".format(mean_wer))

Mean WER: 10%
