In [1]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import soundfile as sf
import torch
from jiwer import wer
import time

import numpy as np
import pandas as pd
 

In [2]:
# Destination CSV for the transcription results table (relative path,
# resolved against the working directory the notebook is run from).
PATH_TRANSCRIPTIONS = "./../eval/transcriptions.csv"

In [3]:
# load model and tokenizer
# The processor and the CTC model are both loaded from the same checkpoint id.
# `pre_train_file` is also reused further down as the name of the results column.
# NOTE(review): loading emits a warning that `wav2vec2.masked_spec_embed` is newly
# initialized; presumably irrelevant for inference-only use - confirm.
pre_train_file = "facebook/wav2vec2-large-960h-lv60-self"
processor = Wav2Vec2Processor.from_pretrained(pre_train_file)
model = Wav2Vec2ForCTC.from_pretrained(pre_train_file)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# define function to read in sound file
def map_to_array(batch):
    """Attach the decoded audio of ``batch["file"]`` to the record.

    Args:
        batch: Mapping with a ``"file"`` entry that ``sf.read`` can open.

    Returns:
        The same ``batch`` object, with the audio samples stored under
        ``"speech"``. The sampling rate returned by ``sf.read`` is discarded.
    """
    # sf.read returns (samples, sampling_rate); only the samples are kept.
    samples = sf.read(batch["file"])[0]
    batch["speech"] = samples
    return batch
    
# Load the "clean" validation split of the dummy LibriSpeech dataset and, in the
# same chained step, run map_to_array over it to add the decoded "speech" column.
ds = load_dataset(
    "patrickvonplaten/librispeech_asr_dummy",
    "clean",
    split="validation",
).map(map_to_array)

Reusing dataset librispeech_asr (/Users/lucasagrizzi/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)
Loading cached processed dataset at /Users/lucasagrizzi/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc/cache-9d56804bc919a18f.arrow


In [5]:
# from IPython.display import Audio
# import numpy

# wave_audio = numpy.sin(numpy.linspace(0, 3000, 20000))
# Audio(wave_audio, rate=20000)

In [6]:

# Greedy CTC inference over the whole dataset, BATCH_SIZE samples at a time.
# After this cell, `transcriptions` and `inference_time` each hold exactly one
# entry per dataset row (in dataset order), so they can be used as columns.

# initialize constants
transcriptions = []
error = []  # not used in this cell; kept in case other cells rely on it
inference_time = []

len_ds = len(ds["text"])
BATCH_SIZE = 1
SAMPLING_RATE = 16000  # Hz; sampling rate declared to the processor

# Step through the dataset by BATCH_SIZE so that a trailing partial batch is
# still processed when BATCH_SIZE does not divide len_ds evenly.
for batch_start in range(0, len_ds, BATCH_SIZE):

    # import audio
    # Slice the rows first and then pick the column, so only this batch is
    # read instead of the full "speech" column on every iteration.
    audio = ds[batch_start:batch_start + BATCH_SIZE]["speech"]

    # count time of inference (perf_counter is the clock meant for intervals)
    start = time.perf_counter()

    # tokenize
    inputs = processor(audio, sampling_rate=SAMPLING_RATE, return_tensors="pt", padding="longest")

    # retrieve logits
    # Forward the attention mask when the processor provides one, so padded
    # positions are ignored for BATCH_SIZE > 1; `.get` yields None otherwise,
    # which is the model's default.
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.get("attention_mask")).logits

    # take argmax and decode
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    # store time of inference: one entry per sample, the batch time split
    # evenly, so the list stays aligned with `transcriptions`
    elapsed = time.perf_counter() - start
    n_decoded = len(transcription)
    inference_time.extend([elapsed / n_decoded] * n_decoded)

    # store transcription (batch_decode returns one string per sample)
    transcriptions.extend(transcription)

In [7]:
# Assemble the per-utterance results table: the reference text, the upper-cased
# model output (column named after the checkpoint), and the inference time.
df_transcriptions = pd.DataFrame({"ground_truth": ds["text"][:len_ds]})
df_transcriptions[pre_train_file] = [hypothesis.upper() for hypothesis in transcriptions]
df_transcriptions[pre_train_file + "_inf_time"] = inference_time

In [8]:
# Show the results table as this cell's output.
df_transcriptions

Unnamed: 0,ground_truth,facebook/wav2vec2-large-960h-lv60-self,facebook/wav2vec2-large-960h-lv60-self_inf_time
0,A MAN SAID TO THE UNIVERSE SIR I EXIST,A MAN SAID TO THE UNIVERSE SIR I EXIST,5.948975
1,SWEAT COVERED BRION'S BODY TRICKLING INTO THE ...,SWEAT COVERED BRION'S BODY TRICKLING INTO THE ...,7.317595
2,THE CUT ON HIS CHEST STILL DRIPPING BLOOD THE ...,THE CUT ON HIS CHEST STILL DRIPPING BLOOD THE ...,14.338364
3,HIS INSTANT OF PANIC WAS FOLLOWED BY A SMALL S...,HIS INSTANT PANIC WAS FOLLOWED BY A SMALL SHAR...,6.099911
4,ONE MINUTE A VOICE SAID AND THE TIME BUZZER SO...,ONE MINUTE A VOICE SAID AND THE TIME BUZZER SO...,5.535601
...,...,...,...
68,I DON'T BELIEVE ANN KNEW ANY MAGIC OR SHE'D HA...,I DON'T BELIEVE ANNE KNEW ANY MAGIC OR SHE'D H...,12.336592
69,I DO NOT KNOW CONFESSED SHAGGY,I DO NOT KNOW CONFESSED SHAGGY,19.080031
70,TRUE AGREED KALIKO,TRUE A GREEN CALICO,15.415310
71,KALIKO WENT TO THE BIG GONG AND POUNDED ON IT ...,CALICO WENT TO THE BIG GONG AND POUNDED ON IT ...,18.959109


In [9]:
# Persist the results table to PATH_TRANSCRIPTIONS. No `index` argument is
# passed, so with to_csv's documented default the row index is written as well.
df_transcriptions.to_csv(PATH_TRANSCRIPTIONS)