In [1]:
import torch
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
from datasets import load_dataset
import soundfile as sf
import time
from jiwer import wer
import time

import numpy as np
import pandas as pd

In [2]:
# CSV file (relative to the notebook's working directory) that the final cell
# writes the transcription table to.
PATH_TRANSCRIPTIONS = "transcriptions.csv"

In [3]:
# Checkpoint identifier handed to both `from_pretrained` calls below. Later cells
# also reuse this string as the DataFrame column name for this model's output.
pre_train_file = "facebook/s2t-large-librispeech-asr"
# Speech-to-text model used for generation in the inference loop.
model = Speech2TextForConditionalGeneration.from_pretrained(pre_train_file)
# Matching processor: turns raw audio into model input features and decodes
# generated token ids back into text.
processor = Speech2TextProcessor.from_pretrained(pre_train_file)



In [4]:
def map_to_array(batch):
    """Read the audio file named by ``batch["file"]`` and attach its samples.

    The decoded samples are stored under the ``"speech"`` key of the same
    mapping, which is then returned. The sample rate reported by the reader
    is discarded.
    """
    audio_path = batch["file"]
    # sf.read yields a (samples, sample_rate) pair; only the samples are kept.
    waveform, _sample_rate = sf.read(audio_path)
    batch["speech"] = waveform
    return batch

# Load the "validation" split of the "clean" configuration of the dummy
# LibriSpeech dataset and, in the same expression, run map_to_array over every
# row so each one gains a "speech" entry holding the decoded audio samples.
ds = load_dataset(
    "patrickvonplaten/librispeech_asr_dummy",
    "clean",
    split="validation",
).map(map_to_array)

Reusing dataset librispeech_asr (/Users/lucasagrizzi/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)
Loading cached processed dataset at /Users/lucasagrizzi/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc/cache-9d56804bc919a18f.arrow


In [5]:
# Per-sample results, kept index-aligned with each other so they can be placed
# side by side as DataFrame columns in the next cell:
#   all_transcriptions[k] - decoded model output for sample k
#   inference_time[k]     - seconds spent on sample k (see note at the end of the loop)
#   texts[k]              - reference text for sample k
all_transcriptions = []
inference_time = []
texts = []

# Read each column once up front instead of re-indexing `ds` by column name on
# every iteration (column access may materialise the entire column each time).
speech_column = ds["speech"]
text_column = ds["text"]

len_ds = len(speech_column)
BATCH_SIZE = 1

# Step through the dataset in strides of BATCH_SIZE. Iterating over start
# offsets (rather than len_ds // BATCH_SIZE whole batches) means a trailing
# partial batch is still transcribed when len_ds is not a multiple of BATCH_SIZE.
for batch_start in range(0, len_ds, BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE  # slicing clamps this to len_ds

    audio = speech_column[batch_start:batch_end]
    # Take the reference texts for exactly the rows in this batch so that
    # `texts` stays aligned with `all_transcriptions` for any BATCH_SIZE.
    texts.extend(text_column[batch_start:batch_end])

    # Timed region covers feature extraction, generation and decoding.
    start = time.time()

    inputs = processor(
        audio,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    )
    with torch.no_grad():
        # Forward the padding mask (when the processor produced one) so padded
        # frames of shorter clips in a batch are not treated as real audio.
        generated_ids = model.generate(
            inputs.input_features,
            attention_mask=inputs.get("attention_mask"),
        )

    # One decoded string per sample in the batch.
    transcription = processor.batch_decode(generated_ids)

    elapsed = time.time() - start

    all_transcriptions.extend(transcription)
    # Record one timing per sample (batch time split evenly) so this list has
    # the same length as `all_transcriptions`; with BATCH_SIZE = 1 this is
    # simply the elapsed time for that sample.
    inference_time.extend([elapsed / len(audio)] * len(audio))


  input_lengths = (input_lengths - 1) // 2 + 1


In [6]:
# Build a fresh results table holding only this model's output. To add these
# columns to results saved by an earlier run instead, start from
# pd.read_csv(PATH_TRANSCRIPTIONS, index_col=0) rather than a new frame.
#
# Columns, in order:
#   <pre_train_file>            - model transcription, upper-cased (the reference
#                                 texts shown in the output below are upper-case)
#   <pre_train_file>_inf_time   - seconds spent transcribing that sample
#   ground_truth                - reference text from the dataset
df_transcriptions = pd.DataFrame(
    {
        # Vectorised string method instead of a per-element Python lambda;
        # dtype is pinned so an empty result list still exposes `.str`.
        pre_train_file: pd.Series(all_transcriptions, dtype="object").str.upper(),
        pre_train_file + "_inf_time": inference_time,
        "ground_truth": texts,
    }
)

df_transcriptions

Unnamed: 0,facebook/s2t-large-librispeech-asr,facebook/s2t-large-librispeech-asr_inf_time,ground_truth
0,A MAN SAID TO THE UNIVERSE SIR I EXIST,2.873919,A MAN SAID TO THE UNIVERSE SIR I EXIST
1,SWEAT COVERED BRION'S BODY TRICKLING INTO THE ...,3.680889,SWEAT COVERED BRION'S BODY TRICKLING INTO THE ...
2,THE CUT ON HIS CHEST STILL DRIPPING BLOOD THE ...,7.644466,THE CUT ON HIS CHEST STILL DRIPPING BLOOD THE ...
3,HIS INSTANT PANIC WAS FOLLOWED BY A SMALL SHAR...,3.364908,HIS INSTANT OF PANIC WAS FOLLOWED BY A SMALL S...
4,ONE MINUTE A VOICE SAID AND A TIME BUZZER SOUNDED,2.906659,ONE MINUTE A VOICE SAID AND THE TIME BUZZER SO...
...,...,...,...
68,I DON'T BELIEVE ANNE KNEW ANY MAGIC OR SHE'D H...,2.605747,I DON'T BELIEVE ANN KNEW ANY MAGIC OR SHE'D HA...
69,I DO NOT KNOW CONFESSED SHAGGY,1.352835,I DO NOT KNOW CONFESSED SHAGGY
70,TRUE AGREED KALIKO,1.373060,TRUE AGREED KALIKO
71,KALIKO WENT TO THE BIG GONG AND POUNDED ON IT ...,4.733405,KALIKO WENT TO THE BIG GONG AND POUNDED ON IT ...


In [8]:
# Persist the results table to PATH_TRANSCRIPTIONS, replacing any existing file
# at that path. No `index` argument is passed, so pandas' default of writing the
# row index as the first CSV column applies.
df_transcriptions.to_csv(PATH_TRANSCRIPTIONS)