In [2]:
import os
import torch
import jiwer
import librosa

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [3]:
# Load the pretrained processor and CTC model.
# Both must come from the same checkpoint, so the checkpoint id is defined once
# and shared; editing it in one place keeps the processor and model in sync.
MODEL_NAME = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def wav2vec_inference(wav_file):
    """Transcribe one audio recording with the module-level wav2vec2 model.

    Uses the notebook-level ``processor`` and ``model`` objects.

    Args:
        wav_file: Audio source handed straight to ``librosa.load``
            (in this notebook, a path to a ``.wav`` file).

    Returns:
        The text produced by ``processor.decode`` for the per-frame argmax
        token ids of the first (only) item in the batch.
    """
    # Load the audio, asking librosa to resample it to 16 kHz.
    audio_input, sample_rate = librosa.load(wav_file, sr=16000)

    # Turn the raw waveform into the tensor the model consumes; the processor
    # is told the same sampling rate the audio was just resampled to.
    input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values

    # INFERENCE
    # Gradients are never used here, so disable autograd for the forward pass:
    # this avoids building a computation graph and keeps memory usage down.
    with torch.no_grad():
        logits = model(input_values).logits

    # Pick the highest-scoring token id at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the ids of the single batch item into text.
    transcription = processor.decode(predicted_ids[0])

    return transcription

In [5]:
# Reference (ground-truth) transcript of the passage read aloud in the recordings.
# It is written in upper case without punctuation and serves as the reference
# text when the word error rate of each prediction is computed.
original_text = "THE NORTH WIND AND THE SUN WERE DISPUTING WHICH WAS THE STRONGER WHEN A TRAVELLER CAME ALONG WRAPPED IN A WARM CLOAK THEY AGREED THAT THE ONE WHO FIRST SUCCEEDED IN MAKING THE TRAVELLER TAKE HIS CLOAK OFF SHOULD BE CONSIDERED STRONGER THAN THE OTHER THEN THE NORTH WIND BLEW AS HARD AS HE COULD BUT THE MORE HE BLEW THE MORE CLOSELY DID THE TRAVELLER FOLD HIS CLOAK AROUND HIM AND AT LAST THE NORTH WIND GAVE UP THE ATTEMPT THEN THE SUN SHINED OUT WARMLY AND IMMEDIATELY THE TRAVELLER TOOK OFF HIS CLOAK AND SO THE NORTH WIND WAS OBLIGED TO CONFESS THAT THE SUN WAS THE STRONGER OF THE TWO"

In [6]:
# The two recordings to transcribe.
leminh_recording = 'audio_samples/leminh-wind-09092021.wav'
tatsu_recording = 'audio_samples/tatsu-wind-09092021.wav'

# Show the reference text first, then the model's transcription of each
# recording; `end='\n\n'` leaves one blank line between the sections.
print(f'Original transcription:\n{original_text}', end='\n\n')

leminh_prediction = wav2vec_inference(leminh_recording)
print(f'Prediction of {os.path.basename(leminh_recording)}:\n{leminh_prediction}', end='\n\n')

tatsu_prediction = wav2vec_inference(tatsu_recording)
print(f'Prediction of {os.path.basename(tatsu_recording)}:\n{tatsu_prediction}')

Original transcription:
THE NORTH WIND AND THE SUN WERE DISPUTING WHICH WAS THE STRONGER WHEN A TRAVELLER CAME ALONG WRAPPED IN A WARM CLOAK THEY AGREED THAT THE ONE WHO FIRST SUCCEEDED IN MAKING THE TRAVELLER TAKE HIS CLOAK OFF SHOULD BE CONSIDERED STRONGER THAN THE OTHER THEN THE NORTH WIND BLEW AS HARD AS HE COULD BUT THE MORE HE BLEW THE MORE CLOSELY DID THE TRAVELLER FOLD HIS CLOAK AROUND HIM AND AT LAST THE NORTH WIND GAVE UP THE ATTEMPT THEN THE SUN SHINED OUT WARMLY AND IMMEDIATELY THE TRAVELLER TOOK OFF HIS CLOAK AND SO THE NORTH WIND WAS OBLIGED TO CONFESS THAT THE SUN WAS THE STRONGER OF THE TWO

Prediction of leminh-wind-09092021.wav:
THE NORTH WIND AND THE SUN WERE DISPUTING WHICH WAS THE STRONGER WHEN HE TRAVELLER CAME ALONG WRAPPED IN A WARM CLOAK DAY AGREED THAT THE ONE WHO FIRST SUCCEEDED IN MAKING THE TRAVELR TAKE HIS CLOAK OFF SHOULD BE CONSIDERED STRONGER THAN THE OTHER THEN THE NORTH WIND BLEW AS HARD AS HE COULD BUT THE MORE HE BLEW THE MORE CLOSELY TID THE TRAVEL

## The code snippet below calculates the word error rate (WER).

In [7]:
from jiwer import wer

# Score each prediction against the reference transcript. `wer` returns a
# fraction, so it is scaled to a percentage and rounded to two decimals.
WER_leminh, WER_tatsu = (
    round(wer(original_text, hypothesis) * 100, 2)
    for hypothesis in (leminh_prediction, tatsu_prediction)
)

# Report both scores, one per line.
print(
    f'WER_leminh: {WER_leminh}% WER',
    f'WER_tatsu: {WER_tatsu}% WER',
    sep='\n',
)

WER_leminh: 7.96% WER
WER_tatsu: 21.24% WER
