In [17]:
import torch
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
from datasets import load_dataset
import soundfile as sf
import time
from jiwer import wer
import time

import numpy as np
import pandas as pd

In [18]:
# CSV of evaluation transcriptions: this notebook reads it, adds columns for
# the current model, and writes it back to the same path.
PATH_TRANSCRIPTIONS = "./../eval/transcriptions.csv"

In [19]:
# Checkpoint identifier; the same string is reused later as the name of the
# results column for this model.
pre_train_file = "facebook/s2t-large-librispeech-asr"

# The processor and the model are both loaded from the same checkpoint.
processor = Speech2TextProcessor.from_pretrained(pre_train_file)
model = Speech2TextForConditionalGeneration.from_pretrained(pre_train_file)



In [20]:
def map_to_array(batch):
    """Read the audio file referenced by ``batch["file"]`` into ``batch["speech"]``.

    Args:
        batch: Mapping with a ``"file"`` entry that ``sf.read`` can open.

    Returns:
        The same ``batch`` object, with the decoded audio stored under
        ``"speech"``. The second value returned by ``sf.read`` is discarded.
    """
    decoded = sf.read(batch["file"])
    batch["speech"] = decoded[0]
    return batch

# Load the "clean" validation split of the dummy LibriSpeech dataset and add a
# "speech" entry to every example via map_to_array.
ds = load_dataset(
    "patrickvonplaten/librispeech_asr_dummy",
    "clean",
    split="validation",
).map(map_to_array)

Reusing dataset librispeech_asr (/Users/lucasagrizzi/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)
Loading cached processed dataset at /Users/lucasagrizzi/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc/cache-9d56804bc919a18f.arrow


In [21]:
# Transcribe the dataset batch by batch and record how long each sample took.
# After the loop, `all_transcriptions` and `inference_time` both hold exactly
# one entry per sample, so they can be assigned directly as DataFrame columns.
all_transcriptions = []
inference_time = []

# Fetch the audio column once; indexing ds["speech"] on every iteration may
# re-materialise the whole column each time.
speech = ds["speech"]
len_ds = len(speech)
BATCH_SIZE = 1
SAMPLING_RATE = 16000  # sampling rate passed to the processor for every clip

# Step by BATCH_SIZE (instead of iterating len_ds // BATCH_SIZE times) so a
# trailing partial batch is still transcribed.
for batch_start in range(0, len_ds, BATCH_SIZE):

    audio = speech[batch_start:batch_start + BATCH_SIZE]

    # perf_counter is the clock intended for measuring elapsed intervals.
    start = time.perf_counter()

    inputs = processor(
        audio,
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt",
        padding=True
    )
    with torch.no_grad():
        # Forward the attention mask (when the processor returns one) so that
        # padded frames in a batch of unequal-length clips are ignored.
        generated_ids = model.generate(
            inputs["input_features"],
            attention_mask=inputs.get("attention_mask"),
        )

    # Drop special tokens (e.g. end-of-sequence / padding) from the text.
    transcription = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True
    )

    elapsed = time.perf_counter() - start

    # extend() keeps a flat list of strings even when the last batch is shorter.
    all_transcriptions.extend(transcription)
    # One timing per sample: the batch time split evenly across its samples.
    # With BATCH_SIZE = 1 this is simply the time for that sample.
    inference_time.extend([elapsed / len(audio)] * len(audio))


In [22]:
# Start from the table already stored on disk; its first column is the index.
df_transcriptions = pd.read_csv(PATH_TRANSCRIPTIONS, index_col=0)

# Add this model's results under columns named after the checkpoint.
# Text is upper-cased, presumably to match the casing of the reference
# transcriptions -- confirm against the CSV.
df_transcriptions[pre_train_file] = [text.upper() for text in all_transcriptions]
df_transcriptions[pre_train_file + "_inf_time"] = inference_time

df_transcriptions

Unnamed: 0,ground_truth,facebook/wav2vec2-large-960h-lv60-self,facebook/wav2vec2-large-960h-lv60-self_inf_time,facebook/s2t-large-librispeech-asr,facebook/s2t-large-librispeech-asr_inf_time
0,A MAN SAID TO THE UNIVERSE SIR I EXIST,A MAN SAID TO THE UNIVERSE SIR I EXIST,5.948975,A MAN SAID TO THE UNIVERSE SIR I EXIST,28.164044
1,SWEAT COVERED BRION'S BODY TRICKLING INTO THE ...,SWEAT COVERED BRION'S BODY TRICKLING INTO THE ...,7.317595,SWEAT COVERED BRION'S BODY TRICKLING INTO THE ...,21.338112
2,THE CUT ON HIS CHEST STILL DRIPPING BLOOD THE ...,THE CUT ON HIS CHEST STILL DRIPPING BLOOD THE ...,14.338364,THE CUT ON HIS CHEST STILL DRIPPING BLOOD THE ...,32.019719
3,HIS INSTANT OF PANIC WAS FOLLOWED BY A SMALL S...,HIS INSTANT PANIC WAS FOLLOWED BY A SMALL SHAR...,6.099911,HIS INSTANT PANIC WAS FOLLOWED BY A SMALL SHAR...,14.342224
4,ONE MINUTE A VOICE SAID AND THE TIME BUZZER SO...,ONE MINUTE A VOICE SAID AND THE TIME BUZZER SO...,5.535601,ONE MINUTE A VOICE SAID AND A TIME BUZZER SOUNDED,12.898235
...,...,...,...,...,...
68,I DON'T BELIEVE ANN KNEW ANY MAGIC OR SHE'D HA...,I DON'T BELIEVE ANNE KNEW ANY MAGIC OR SHE'D H...,12.336592,I DON'T BELIEVE ANNE KNEW ANY MAGIC OR SHE'D H...,18.082567
69,I DO NOT KNOW CONFESSED SHAGGY,I DO NOT KNOW CONFESSED SHAGGY,19.080031,I DO NOT KNOW CONFESSED SHAGGY,22.220248
70,TRUE AGREED KALIKO,TRUE A GREEN CALICO,15.415310,TRUE AGREED KALIKO,30.322897
71,KALIKO WENT TO THE BIG GONG AND POUNDED ON IT ...,CALICO WENT TO THE BIG GONG AND POUNDED ON IT ...,18.959109,KALIKO WENT TO THE BIG GONG AND POUNDED ON IT ...,2244.000314


In [23]:
# Write the table back to the same CSV path it was read from.
df_transcriptions.to_csv(PATH_TRANSCRIPTIONS)