Automatic speech recognition (ASR) converts a speech signal to text, mapping a sequence of audio inputs to text outputs. Virtual assistants like Siri and Alexa use ASR models to help users every day, and there are many other useful user-facing applications like live captioning and note-taking during meetings.

This guide shows how to:
1. Finetune Wav2Vec2 on the MInDS-14 dataset to transcribe audio to text.
2. Use your finetuned model for inference.

# Libraries

In [None]:
pip install transformers datasets evaluate jiwer

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from datasets import load_dataset, Audio
from transformers import AutoProcessor

# Load Data

In [None]:
# Load a smaller subset of MInDS-14 (the first 100 examples of the en-US
# train split) to experiment on a small dataset first
minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]")

# Hold out 20% of the examples as a test split; no seed is passed, so the
# train/test membership can differ between runs
minds = minds.train_test_split(test_size=0.2)

In [None]:
# Inspect dataset detail
# NB: focusing on the audio and transcription
# audio: a 1-dimensional array of the speech signal that must be called to load and resample the audio file.
# transcription: the target text.
minds

In [None]:
# Inspect an example
minds["train"][0]

# Preprocessing

In [None]:
# Load a Wav2Vec2 processor to process the audio signal
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")

In [None]:
# MInDS-14 dataset has a sampling rate of 8kHz (8000 Hz; you can find this information in its dataset card)
# You’ll need to resample the dataset to 16kHz (16000 Hz) to use the pretrained Wav2Vec2 model
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
# Display the first training example again after the cast
minds["train"][0]

In [None]:
# The transcription text contains a mix of upper and lowercase characters
# The Wav2Vec2 tokenizer is only trained on uppercase characters
# ...make sure the text matches the tokenizer’s vocabulary
def uppercase(example):
    """Return a dict whose "transcription" is the example's transcription in upper case.

    Only the "transcription" key is read from ``example``; the returned dict
    contains just that one key.
    """
    transcription = example["transcription"]
    upper_cased = transcription.upper()
    return {"transcription": upper_cased}

# Apply the uppercasing to every example in the train and test splits
minds = minds.map(uppercase)

In [None]:
def prepare_dataset(batch):
    """Convert one raw example into the processor's outputs plus an input length.

    Reads the "audio" and "transcription" fields of ``batch``, runs both
    through the module-level ``processor``, and returns the processor's
    result with an extra "input_length" entry.
    """
    # Accessing the audio column loads and resamples the audio file
    waveform = batch["audio"]

    # A single processor call extracts the input_values from the audio and
    # tokenizes the transcription
    encoded = processor(
        waveform["array"],
        sampling_rate=waveform["sampling_rate"],
        text=batch["transcription"],
    )

    # Record the length of the first (only) input_values entry
    first_input = encoded["input_values"][0]
    encoded["input_length"] = len(first_input)
    return encoded

# Run prepare_dataset over the dataset, dropping the original columns so that only
# the values returned by prepare_dataset remain; num_proc=4 uses 4 worker processes
encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4)

In [None]:
# Transformers doesn’t have a data collator for speech recognition
# Adapt the DataCollatorWithPadding to create a batch of examples
# The collator also pads dynamically, to the longest example in each batch, which is more efficient than padding the whole dataset up front with padding=True
@dataclass
class DataCollatorCTCWithPadding:
    """Collate preprocessed speech examples into one padded batch.

    Audio inputs and label ids are padded separately through
    ``processor.pad`` using the same ``padding`` strategy, and padded label
    positions are replaced with -100.
    """

    # Processor whose ``pad`` method is used for both the inputs and the labels
    processor: AutoProcessor
    # Padding strategy forwarded unchanged to ``processor.pad``
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a "labels" entry added."""
        # Inputs and labels have different lengths and need different padding
        # methods, so they are gathered into two separate lists
        audio_items = []
        label_items = []
        for feature in features:
            audio_items.append({"input_values": feature["input_values"][0]})
            label_items.append({"input_ids": feature["labels"]})

        pad_kwargs = {"padding": self.padding, "return_tensors": "pt"}
        batch = self.processor.pad(audio_items, **pad_kwargs)
        padded_labels = self.processor.pad(labels=label_items, **pad_kwargs)

        # Wherever the attention mask is not 1 the position is padding; set it
        # to -100 so the loss ignores it
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [None]:
# Instantiate the collator; padding="longest" is passed through to processor.pad
data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")