source: https://huggingface.co/facebook/wav2vec2-base-960h
paper: https://arxiv.org/abs/2006.11477

In [1]:

from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch

# load model and tokenizer
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    
# load dummy dataset and read soundfiles
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Found cached dataset librispeech_asr_dummy (/home/nursyah/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr_dummy/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)


In [2]:
# tokenize
input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest", sampling_rate=16000).input_values  # Batch size 1

# retrieve logits
logits = model(input_values).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)


In [3]:
# what inside 
ds[0]["audio"]['array']

array([-0.00048828, -0.00018311, -0.00137329, ...,  0.00079346,
        0.00091553,  0.00085449])

In [4]:
from pydub import AudioSegment
from pydub.playback import play

In [19]:
test_audio = ds[0]['audio']['path']
print(test_audio)
song = AudioSegment.from_file(test_audio, format='flac')
print(f"playing sound using  pydub")
play(song)

/home/nursyah/.cache/huggingface/datasets/downloads/extracted/52084c21ae4fb2e8c3d62e87346b48521aaf0420040788a6baf22ce7825b982e/dev_clean/1272/141231/1272-141231-0000.flac
playing sound using  pydub


ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'


In [6]:
print(transcription)

['A MAN SAID TO THE UNIVERSE SIR I EXIST']


In [29]:
# using custom audio

# convert audio to array
audio = AudioSegment.from_file('./audio_yt.m4a')
x = torch.FloatTensor(audio.get_array_of_samples())


In [26]:
# processing

inputs = processor(x, return_tensors="pt", padding="longest", sampling_rate=16000).input_values 
logits = model(inputs).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

In [27]:
print(transcription)

['']


In [28]:
test_audio = './audio_yt.m4a'
print(test_audio)
song = AudioSegment.from_file(test_audio)
print(f"playing sound using  pydub")
play(song)

./audio_yt.m4a
playing sound using  pydub


ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'


KeyboardInterrupt: 