# LibriSpeech Benchmarks

Dependencies:
- joeynmt
- datasets
- transformers
- pytorch
- numpy

check package versions

In [1]:
from pathlib import Path
import numpy as np
import torch
import datasets
import transformers

# Record the installed version of each core library next to the results.
for label, module in (('torch', torch),
                      ('datasets', datasets),
                      ('transformers', transformers)):
    print(label, module.__version__)

torch 1.11.0+cu115
datasets 2.3.3.dev0
transformers 4.20.1


## metrics: WER

use joeyS2T implementation

In [2]:
from joeynmt.metrics import wer
from joeynmt.tokenizers import EvaluationTokenizer

# Single tokenizer instance that is handed to every `wer(...)` call below.
tok = EvaluationTokenizer(
    lowercase=True,
    tokenize="13a",
    no_punc=True,
)

## data: LibriSpeech dev/test

In [3]:
from datasets import load_dataset

# Load the "all" configuration; the cells below only score the
# validation.* and test.* splits.
librispeech_eval = load_dataset(
    "librispeech_asr",
    name="all",
)
librispeech_eval  # display the available splits

Reusing dataset librispeech_asr (/workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb)


  0%|          | 0/7 [00:00<?, ?it/s]

DatasetDict({
    train.clean.100: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 28539
    })
    train.clean.360: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 104014
    })
    train.other.500: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 148688
    })
    validation.clean: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 2703
    })
    validation.other: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 2864
    })
    test.clean: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 2620
    })
    test.other: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 2939
    })
})

## SpeechBrain

https://huggingface.co/speechbrain/asr-transformer-transformerlm-librispeech

In [None]:
from speechbrain.pretrained import EncoderDecoderASR

# Number of examples `Dataset.map` hands to `map_to_pred` per call.
batch_size = 4

# Pretrained SpeechBrain encoder-decoder ASR model, run on the GPU.
asr_model = EncoderDecoderASR.from_hparams(
    run_opts={"device": "cuda"},
    source="speechbrain/asr-transformer-transformerlm-librispeech",
)
asr_model.eval()

def map_to_pred(batch):
    """Transcribe one batch of examples with the global `asr_model`.

    Args:
        batch: batched example dict as passed by `Dataset.map(batched=True)`;
            `batch['audio']` is a list of dicts whose 'array' entry holds the
            waveform samples of one utterance.

    Returns:
        The same `batch` with a "transcription" entry holding one hypothesis
        per input example.
    """
    waveforms = [example['array'] for example in batch['audio']]
    lengths = [len(waveform) for waveform in waveforms]
    curr_batch_size = len(lengths)
    max_len = max(lengths)

    # Size the buffers by the number of examples actually received. The last
    # batch of a split is usually smaller than the global `batch_size`; sizing
    # by the global value appended all-zero, zero-length rows that were
    # decoded and then discarded (and would overflow on a larger batch).
    input_array = np.zeros((curr_batch_size, max_len))
    length_array = np.zeros((curr_batch_size,))

    for i, waveform in enumerate(waveforms):
        input_array[i, :lengths[i]] = waveform   # right-pad with zeros
        length_array[i] = lengths[i] / max_len   # length relative to the longest

    transcription, _ = asr_model.transcribe_batch(
        torch.tensor(input_array), torch.tensor(length_array)
    )

    batch["transcription"] = transcription
    return batch

# Transcribe each dev/test split with the SpeechBrain model and report its WER.
for split in ['validation.clean', 'validation.other', 'test.clean', 'test.other']:
    result = librispeech_eval[split].map(
        map_to_pred,
        batched=True,
        batch_size=batch_size,
    )

    split_wer = wer(
        hypotheses=result["transcription"],
        references=result["text"],
        tokenizer=tok,
    )
    print(split, split_wer)



  0%|          | 0/676 [00:00<?, ?ba/s]

validation.clean 2.1322745487298262


  0%|          | 0/716 [00:00<?, ?ba/s]

validation.other 5.513464709115176


  0%|          | 0/655 [00:00<?, ?ba/s]

test.clean 2.3128423615337796


  0%|          | 0/735 [00:00<?, ?ba/s]

In [4]:
# Relies on `split` and `result` still being bound from the last iteration
# of the SpeechBrain evaluation loop in the previous cell.
last_split_wer = wer(
    hypotheses=result["transcription"],
    references=result["text"],
    tokenizer=tok,
)
print(split, last_split_wer)

test.other 5.614886422253215


number of parameters

In [5]:
import numpy as np

# Count every parameter of the SpeechBrain model, trainable or not.
# `numel()` returns plain Python ints, so the total is an exact int;
# `np.prod(p.size())` yields NumPy scalars and would turn the total into a
# float if any parameter were 0-dimensional (np.prod of an empty shape is 1.0).
n_params = sum(p.numel() for p in asr_model.parameters())
n_params

164859096

## facebook wav2vec2

https://huggingface.co/facebook/wav2vec2-base-960h

In [8]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the pretrained CTC model (moved to the GPU, eval mode) and its processor.
checkpoint = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint).to("cuda")
model.eval()

def map_to_pred(batch):
    """Transcribe a single example with the global wav2vec2 `model`.

    Args:
        batch: one example dict; `batch["audio"]` provides the "array" and
            "sampling_rate" entries passed to `processor`.

    Returns:
        The same dict with "transcription" set to the list returned by
        `processor.batch_decode`.
    """
    audio = batch["audio"]
    encoded = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
        padding="longest",
    )

    with torch.no_grad():
        logits = model(encoded.input_values.to("cuda")).logits

    # Keep the highest-scoring id along the last axis, then decode to text.
    batch["transcription"] = processor.batch_decode(logits.argmax(dim=-1))
    return batch

# Transcribe each dev/test split with wav2vec2 and report its WER.
for split in ['validation.clean', 'validation.other', 'test.clean', 'test.other']:
    # `map_to_pred` handles one example per call. `batch_size` only applies
    # when batched=True, so the former no-op `batch_size=1` is not passed.
    result = librispeech_eval[split].map(map_to_pred, batched=False)

    # Each stored transcription is the list returned by `batch_decode` for a
    # single example; unwrap its only element.
    hyp = [s[0] for s in result["transcription"]]
    print(split, wer(hypotheses=hyp, references=result['text'], tokenizer=tok))

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2703 [00:00<?, ?ex/s]

validation.clean 3.167162971949561


  0%|          | 0/2864 [00:00<?, ?ex/s]

validation.other 8.860014132056214


  0%|          | 0/2620 [00:00<?, ?ex/s]

test.clean 3.3855751673767496


  0%|          | 0/2939 [00:00<?, ?ex/s]

test.other 8.568480981220029


number of parameters

In [9]:
import numpy as np

# Count trainable parameters only (those with requires_grad=True).
# `numel()` returns plain Python ints, so the total is an exact int;
# `np.prod(p.size())` yields NumPy scalars and would turn the total into a
# float if any parameter were 0-dimensional (np.prod of an empty shape is 1.0).
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_params

94396320

## facebook s2t

https://huggingface.co/facebook/s2t-medium-librispeech-asr

In [12]:
from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
import torch

# Load the pretrained speech-to-text model (moved to the GPU, eval mode)
# and its processor.
checkpoint = "facebook/s2t-medium-librispeech-asr"
processor = Speech2TextProcessor.from_pretrained(checkpoint, do_upper_case=False)
model = Speech2TextForConditionalGeneration.from_pretrained(checkpoint).to("cuda")
model.eval()

def map_to_pred(batch):
    """Transcribe a single example with the global speech-to-text `model`.

    Args:
        batch: one example dict; `batch["audio"]` provides the "array" and
            "sampling_rate" entries passed to `processor`.

    Returns:
        The same dict with "transcription" set to the list returned by
        `processor.batch_decode`.
    """
    audio = batch["audio"]
    features = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        padding=True,
        return_tensors="pt",
    )
    generate_kwargs = {
        "input_features": features.input_features.to("cuda"),
        "attention_mask": features.attention_mask.to("cuda"),
    }

    with torch.no_grad():
        gen_tokens = model.generate(**generate_kwargs)

    batch["transcription"] = processor.batch_decode(gen_tokens, skip_special_tokens=True)
    return batch

# Transcribe each dev/test split with the speech-to-text model and report its WER.
for split in ['validation.clean', 'validation.other', 'test.clean', 'test.other']:
    # `map_to_pred` handles one example per call. `batch_size` only applies
    # when batched=True, so the former no-op `batch_size=1` is not passed.
    result = librispeech_eval[split].map(map_to_pred, batched=False)

    ref = [s.lower() for s in result['text']]
    # Each stored transcription is the list returned by `batch_decode` for a
    # single example; unwrap its only element.
    hyp = [s[0] for s in result["transcription"]]
    print(split, wer(hypotheses=hyp, references=ref, tokenizer=tok))

  0%|          | 0/2703 [00:00<?, ?ex/s]

validation.clean 3.23149884195434


  0%|          | 0/2864 [00:00<?, ?ex/s]

validation.other 8.008165188034859


  0%|          | 0/2620 [00:00<?, ?ex/s]

test.clean 3.5225197808886186


  0%|          | 0/2939 [00:00<?, ?ex/s]

test.other 7.832948054181074


number of parameters

In [11]:
import numpy as np

# Count trainable parameters only (those with requires_grad=True).
# `numel()` returns plain Python ints, so the total is an exact int;
# `np.prod(p.size())` yields NumPy scalars and would turn the total into a
# float if any parameter were 0-dimensional (np.prod of an empty shape is 1.0).
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_params

71207936

## joeynmt

In [5]:
from pathlib import Path

# Experiment layout: the scoring cell below reads one directory per
# (model, seed) pair, named `<model>_seed<seed>`, under `root_dir`.
models = ['librispeech100h', 'librispeech960h']
random_seeds = [321, 42, 987]
root_dir = Path('/workspace/mitarb/ohta/models')

In [20]:
from collections import defaultdict

# Pairs each hypothesis-file suffix with the matching LibriSpeech split key.
split_keys = [('clean.dev', 'validation.clean'),
              ('other.dev', 'validation.other'),
              ('clean.test', 'test.clean'),
              ('other.test', 'test.other')]

# The references depend only on the split, not on the model or seed, so
# sort and lowercase each split once instead of once per (model, seed) pair.
# NOTE(review): assumes the hypothesis files list utterances in 'id'-sorted
# order -- confirm against the script that wrote them.
references = {
    split: [s.lower() for s in librispeech_eval[key].sort('id')['text']]
    for split, key in split_keys
}

# `model_name` (rather than `model`) keeps this loop from overwriting the
# global `model` object loaded in an earlier cell with a plain string.
for model_name in models:
    print('---', model_name, '---')
    scores = defaultdict(list)
    for seed in random_seeds:
        model_dir = root_dir / f'{model_name}_seed{seed}'
        for split, ref in references.items():
            hyp_path = model_dir / f'avg10_{split}'
            hyp = hyp_path.read_text().splitlines()
            assert len(hyp) == len(ref), (
                f'{hyp_path}: {len(hyp)} hypotheses vs {len(ref)} references')
            score = wer(hypotheses=hyp, references=ref, tokenizer=tok)
            scores[split].append(score)
    # One line per split: mean and std over the seeds, then the raw scores.
    # np.std defaults to ddof=0, i.e. the population standard deviation.
    for k, v in scores.items():
        print('%10s: mean=%.2f std=%.2f %r' % (k, np.mean(v), np.std(v), v))
    print()

Loading cached sorted indices for dataset at /workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-5adcea4a61262a78.arrow


--- librispeech100h ---


Loading cached sorted indices for dataset at /workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-7b98d6c54890e5b7.arrow
Loading cached sorted indices for dataset at /workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-27804ea70aad4fbd.arrow
Loading cached sorted indices for dataset at /workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-2f192d8dc95cb9bc.arrow
Loading cached sorted indices for dataset at /workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-5adcea4a61262a78.arrow
Loading cached sorted indices for dataset at /workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-7b98d6c54890e5b7.arrow
Loading cached sorted indices 

 clean.dev: mean=10.66 std=0.36 [11.036358957391272, 10.769824638800044, 10.179772802470497]
 other.dev: mean=23.82 std=0.34 [24.228625264976053, 23.847844861427337, 23.386590248881213]
clean.test: mean=12.02 std=0.32 [12.334525258673159, 12.150030432136335, 11.577525867315885]
other.test: mean=24.75 std=0.37 [25.204898458246568, 24.744473950671534, 24.310796094988824]

--- librispeech960h ---


Loading cached sorted indices for dataset at /workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-27804ea70aad4fbd.arrow
Loading cached sorted indices for dataset at /workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-2f192d8dc95cb9bc.arrow
Loading cached sorted indices for dataset at /workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-5adcea4a61262a78.arrow
Loading cached sorted indices for dataset at /workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-7b98d6c54890e5b7.arrow
Loading cached sorted indices for dataset at /workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-27804ea70aad4fbd.arrow
Loading cached sorted indices 

 clean.dev: mean=3.79 std=0.27 [4.145068196022205, 3.501709495974413, 3.720451453990662]
 other.dev: mean=8.84 std=0.39 [9.362487241893696, 8.436052445630839, 8.728507497840935]
clean.test: mean=4.31 std=0.52 [5.025106512477176, 3.78119293974437, 4.123554473524042]
other.test: mean=8.66 std=0.35 [9.132071146094034, 8.318208738513269, 8.524540053111208]

