In [3]:
# skipped:
# https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english
# https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

LANG_ID = "en"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
SAMPLES = 10

test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["sentence"].upper()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Reference:", test_dataset[i]["sentence"])
    print("Prediction:", predicted_sentence)

Downloading builder script:   0%|          | 0.00/5.21k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading and preparing dataset common_voice/en (download: 56.45 GiB, generated: 84.74 GiB, post-processed: Unknown size, total: 141.19 GiB) to /Users/matthias/.cache/huggingface/datasets/common_voice/en/6.1.0/d3d5467c15716a2699f2ea3710fdc8bed7c20ae8ed66c248185735a0695dcc3b...


Downloading data:   0%|          | 0.00/60.6G [00:00<?, ?B/s]

KeyboardInterrupt: 

# Fine-tuning a pretrained model
[Automatic speech recognition](https://huggingface.co/docs/transformers/tasks/asr) (ASR) converts a speech signal to text. It is an example of a sequence-to-sequence task, going from a sequence of audio inputs to textual outputs. Voice assistants like Siri and Alexa utilize ASR models to assist users.

In [1]:
from IPython.display import HTML
HTML('<iframe width="640" height="360" src="https://www.youtube.com/embed/TksaY_FDgnk" allowfullscreen></iframe>')



This guide will show you how to fine-tune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to transcribe audio to text.
> <font color="darkgreen">See the automatic speech recognition [task page](https://huggingface.co/tasks/automatic-speech-recognition) for more information about its associated models, datasets, and metrics.</font>"

## Load MInDS-14 dataset
Load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset from the 🤗 Datasets library:

In [2]:
from datasets import load_dataset, Audio
minds = load_dataset("PolyAI/minds14", name="en-US", split="train")
minds

Reusing dataset minds14 (/Users/matthias/.cache/huggingface/datasets/PolyAI___minds14/en-US/1.0.0/aa40414f15e0f919231d617440192034af844835dc1e6a697f4b552e0551fd26)


Dataset({
    features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
    num_rows: 563
})

Split this dataset into a train and test set:

In [3]:
minds = minds.train_test_split(test_size=0.2)

Then take a look at the dataset:

In [4]:
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 450
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 113
    })
})

While the dataset contains a lot of helpful information, like `lang_id` and `intent_class`, you will focus on the `audio` and `transcription` columns in this guide. Remove the other columns:

In [5]:
minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"])

Take a look at the example again:

In [6]:
print(minds["train"])
minds["train"][0]

Dataset({
    features: ['path', 'audio', 'transcription'],
    num_rows: 450
})


{'path': '/Users/matthias/.cache/huggingface/datasets/downloads/extracted/fb791ac7ade81c7c43dde6c9ec61964b6bf332984c72ad98e9c19ed1694ec798/en-US~ADDRESS/602ba79f05f96973d6794473.wav',
 'audio': {'path': '/Users/matthias/.cache/huggingface/datasets/downloads/extracted/fb791ac7ade81c7c43dde6c9ec61964b6bf332984c72ad98e9c19ed1694ec798/en-US~ADDRESS/602ba79f05f96973d6794473.wav',
  'array': array([ 0.00024414, -0.00024414,  0.        , ..., -0.0012207 ,
         -0.0012207 , -0.00146484], dtype=float32),
  'sampling_rate': 8000},
 'transcription': 'hi I just moved and I need to change my address on file'}

The `audio` column contains a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file.

## Preprocess
Load the Wav2Vec2 processor to process the audio signal and transcribed text:

In [7]:
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
processor



Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: PreTrainedTokenizer(name_or_path='facebook/wav2vec2-base', vocab_size=32, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})

In [21]:
def prepare_dataset(batch):
    audio = batch["audio"]

    batch = processor(audio=audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    batch["labels"] = processor(text=batch["transcription"]).input_ids
    return batch
#
encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4)

        

#0:   0%|          | 0/113 [00:00<?, ?ex/s]

#1:   0%|          | 0/113 [00:00<?, ?ex/s]

#2:   0%|          | 0/112 [00:00<?, ?ex/s]

#3:   0%|          | 0/112 [00:00<?, ?ex/s]

TypeError: __call__() missing 1 required positional argument: 'raw_speech'

The [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset has a sampling rate of 8000khz. You will need to resample the dataset to use the pretrained Wav2Vec2 model:

In [8]:
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
minds["train"][0]

{'path': '/Users/matthias/.cache/huggingface/datasets/downloads/extracted/fb791ac7ade81c7c43dde6c9ec61964b6bf332984c72ad98e9c19ed1694ec798/en-US~ADDRESS/602ba79f05f96973d6794473.wav',
 'audio': {'path': '/Users/matthias/.cache/huggingface/datasets/downloads/extracted/fb791ac7ade81c7c43dde6c9ec61964b6bf332984c72ad98e9c19ed1694ec798/en-US~ADDRESS/602ba79f05f96973d6794473.wav',
  'array': array([ 1.7701100e-04, -4.0352734e-06, -1.7177974e-04, ...,
         -1.4763844e-03, -1.3631504e-03, -8.0110703e-04], dtype=float32),
  'sampling_rate': 16000},
 'transcription': 'hi I just moved and I need to change my address on file'}

The preprocessing function needs to:
- Call the `audio` column to load and resample the audio file.
- Extract the `input_values` from the audio file.
- Typically, when you call the processor, you call the feature extractor. Since you also want to tokenize text, instruct the processor to call the tokenizer instead with a context manager.

In [20]:
16000 /40

400.0

In [15]:
def prepare_dataset(batch):
    
    print(batch)
    
    print("DO\n")
    audioBatch = batch["audio"]
    print(audioBatch)
    print("audio array(s)")
    audio = list(map(lambda x: x["array"], audioBatch))
    print(audio)
    print("sample rate(s)")
    sampling_rate = list(map(lambda x: x["sampling_rate"], audioBatch))
    print(sampling_rate)
    #batch = processor(audio=audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    #batch = processor(audio=audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    print("\nDONE\n")
    
    batch = processor(audio=audio, sampling_rate=sampling_rate).input_values[0]
    
    return batch

prepare_dataset(minds["train"][0:2])

{'path': ['/Users/matthias/.cache/huggingface/datasets/downloads/extracted/fb791ac7ade81c7c43dde6c9ec61964b6bf332984c72ad98e9c19ed1694ec798/en-US~ADDRESS/602ba79f05f96973d6794473.wav', '/Users/matthias/.cache/huggingface/datasets/downloads/extracted/fb791ac7ade81c7c43dde6c9ec61964b6bf332984c72ad98e9c19ed1694ec798/en-US~FREEZE/602b9cd7bb1e6d0fbce91f98.wav'], 'audio': [{'path': '/Users/matthias/.cache/huggingface/datasets/downloads/extracted/fb791ac7ade81c7c43dde6c9ec61964b6bf332984c72ad98e9c19ed1694ec798/en-US~ADDRESS/602ba79f05f96973d6794473.wav', 'array': array([ 1.7701100e-04, -4.0352734e-06, -1.7177974e-04, ...,
       -1.4763844e-03, -1.3631504e-03, -8.0110703e-04], dtype=float32), 'sampling_rate': 16000}, {'path': '/Users/matthias/.cache/huggingface/datasets/downloads/extracted/fb791ac7ade81c7c43dde6c9ec61964b6bf332984c72ad98e9c19ed1694ec798/en-US~FREEZE/602b9cd7bb1e6d0fbce91f98.wav', 'array': array([-6.72695251e-06, -1.14509785e-05,  5.77080345e-06, ...,
       -1.09173852e-05,  

TypeError: __call__() missing 1 required positional argument: 'raw_speech'

In [35]:
myMap = list(map(lambda x: x["array"], minds["train"][0:2]["audio"]))
myMap

[array([2.2707417e-04, 1.6862030e-04, 1.7948631e-05, ..., 1.2631404e-03,
        1.1549155e-03, 6.3748658e-04], dtype=float32),
 array([-5.6353033e-06, -6.7926835e-06,  5.0695940e-06, ...,
         2.2004682e-05,  2.0106493e-06, -1.7000570e-05], dtype=float32)]

Use 🤗 Datasets [map](https://huggingface.co/docs/datasets/v2.4.0/en/package_reference/main_classes#datasets.Dataset.map) function to apply the preprocessing function over the entire dataset. You can speed up the map function by increasing the number of processes with `num_proc`. Remove the columns you don't need:

In [25]:
encoded_minds = minds.map(prepare_dataset, num_proc=4)

        

#1:   0%|          | 0/113 [00:00<?, ?ex/s]

#2:   0%|          | 0/112 [00:00<?, ?ex/s]

#3:   0%|          | 0/112 [00:00<?, ?ex/s]

#0:   0%|          | 0/113 [00:00<?, ?ex/s]

audio
{'path': '/Users/matthias/.cache/huggingface/datasets/downloads/extracted/fb791ac7ade81c7c43dde6c9ec61964b6bf332984c72ad98e9c19ed1694ec798/en-US~DIRECT_DEBIT/602bab91bb1e6d0fbce921a3.wav', 'array': array([ 2.1389985e-04,  2.1389624e-05, -2.1230409e-04, ...,
        1.6469065e-05,  2.6599606e-04,  2.0366881e-04], dtype=float32), 'sampling_rate': 16000}

array
[ 2.1389985e-04  2.1389624e-05 -2.1230409e-04 ...  1.6469065e-05
  2.6599606e-04  2.0366881e-04]
audio

sampling_rate
{'path': '/Users/matthias/.cache/huggingface/datasets/downloads/extracted/fb791ac7ade81c7c43dde6c9ec61964b6bf332984c72ad98e9c19ed1694ec798/en-US~PAY_BILL/602ba53bbb1e6d0fbce92061.wav', 'array': array([ 2.1329646e-05,  4.2766180e-05, -2.4604093e-05, ...,
       -1.8323808e-05, -2.0185173e-04, -1.9399276e-04], dtype=float32), 'sampling_rate': 16000}16000

array

[ 2.1329646e-05  4.2766180e-05 -2.4604093e-05 ... -1.8323808e-05
 -2.0185173e-04 -1.9399276e-04]

sampling_rate
16000
audio

TypeError: __call__() missing 1 required positional argument: 'raw_speech'

In [23]:
!conda env list

# conda environments:
#
base                     /Users/matthias/opt/anaconda3
aws39                    /Users/matthias/opt/anaconda3/envs/aws39
aws3920220921            /Users/matthias/opt/anaconda3/envs/aws3920220921
base20220121             /Users/matthias/opt/anaconda3/envs/base20220121
hf                    *  /Users/matthias/opt/anaconda3/envs/hf
hf20220505               /Users/matthias/opt/anaconda3/envs/hf20220505
hf20220921               /Users/matthias/opt/anaconda3/envs/hf20220921
nlu                      /Users/matthias/opt/anaconda3/envs/nlu
nlu20220407              /Users/matthias/opt/anaconda3/envs/nlu20220407
pytorch                  /Users/matthias/opt/anaconda3/envs/pytorch
pytorch20220121          /Users/matthias/opt/anaconda3/envs/pytorch20220121

