Audio classification, just like text classification, assigns a class label to the input data. The only difference is that, instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, classifying the spoken language, and even identifying animal species by their sounds.

This guide shows how to:
1. Finetune Wav2Vec2 on the MInDS-14 dataset to classify speaker intent.
2. Use the finetuned model for inference.

# Libraries

In [1]:
pip install transformers datasets evaluate

Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
import evaluate
import numpy as np
from datasets import load_dataset, Audio
from transformers import TrainingArguments, Trainer
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification,  TrainingArguments, Trainer, pipeline

# Prefer Apple's Metal (MPS) backend when it is available; otherwise fall back to the CPU so
# the notebook also runs on machines without MPS support. The variable name is kept because
# later cells refer to it.
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

2024-04-29 21:24:24.439137: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load Data

In [None]:
# Fetch the en-US train split of MInDS-14 and immediately carve it into smaller
# train/test subsets (test_size=0.2), so the whole workflow can be tried out
# before spending more time on the full dataset.
minds = load_dataset("PolyAI/minds14", name="en-US", split="train").train_test_split(test_size=0.2)

# Display the resulting splits
minds

In [None]:
# This guide only needs the audio and intent_class columns, so the remaining
# metadata columns (such as lang_id and english_transcription) are dropped.
unused_columns = ["path", "transcription", "english_transcription", "lang_id"]
minds = minds.remove_columns(unused_columns)

# Look at one training example. The two remaining fields are:
#   audio: a 1-dimensional array of the speech signal; accessing it loads (and resamples) the audio file
#   intent_class: the class id of the speaker's intent
minds["train"][0]

In [None]:
# Build both directions of the label-name <-> label-id lookup (ids are kept as strings)
labels = minds["train"].features["intent_class"].names
label2id = {label: str(i) for i, label in enumerate(labels)}
id2label = {str(i): label for i, label in enumerate(labels)}

# A label id can now be translated back into its label name
id2label[str(2)]

# Preprocessing

In [None]:
# Feature extractor of the Wav2Vec2 base checkpoint; used below to process the audio signal
feature_extractor = AutoFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path="facebook/wav2vec2-base",
)

In [None]:
# MInDS-14 audio has a sampling rate of 8 kHz (8000 Hz; see its dataset card), while the
# pretrained Wav2Vec2 model needs 16 kHz (16000 Hz) audio, so the audio column is resampled.
resampled_audio = Audio(sampling_rate=16_000)
minds = minds.cast_column("audio", resampled_audio)

# Display one training example after the cast
minds["train"][0]

In [None]:
# Preprocessing applied to each batch of examples:
# - reads the waveform ("array") from every entry of the audio column
# - hands the feature extractor its own sampling rate together with the waveforms
# - caps the input length at max_length=16000 samples; truncation=True asks the
#   extractor to cut longer inputs down to that length
def preprocess_function(examples):
    """Run the module-level feature extractor over a batch of audio examples.

    Args:
        examples: Batch whose "audio" entry is a sequence of items that each
            expose their waveform under the "array" key.

    Returns:
        The output of ``feature_extractor`` for the batch.
    """
    waveforms = [audio["array"] for audio in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

In [None]:
# Apply the preprocessing in batches, dropping the raw audio column,
# then rename intent_class to label
encoded_minds = minds.map(
    preprocess_function,
    remove_columns="audio",
    batched=True,
).rename_column("intent_class", "label")

# Evaluation

In [None]:
# Accuracy metric loaded through the evaluate library; compute_metrics relies on it
accuracy = evaluate.load(path="accuracy")

In [None]:
# Metric callback: turns the raw predictions into class ids and scores them with accuracy
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, reduced
            with argmax along axis 1) and ``label_ids`` (the reference labels).

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Training

In [None]:
# Instantiate Wav2Vec2 through AutoModelForAudioClassification, passing the number
# of expected labels together with the label mappings
num_labels = len(id2label)
label_config = {"num_labels": num_labels, "label2id": label2id, "id2label": id2label}
model = AutoModelForAudioClassification.from_pretrained("facebook/wav2vec2-base", **label_config)

# Move the model onto the device chosen at the top of the notebook
model.to(mps_device)

In [None]:
# Define your training hyperparameters in TrainingArguments.
# The only required parameter is output_dir which specifies where to save your model.
# At the end of each epoch, the Trainer will evaluate the accuracy and save a training checkpoint.
training_args = TrainingArguments(
    output_dir="audio_classification_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Pass the training arguments to Trainer
# Also pass the model, the train/eval datasets, the feature extractor (through the
# tokenizer argument), and the compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Call train() on the Trainer object to finetune the model
train_output = trainer.train()

# Epoch checkpoints are written to checkpoint-* subfolders of output_dir, but the inference
# cells below load the model from "audio_classification_model" itself. Save the final model
# (the best checkpoint, because load_best_model_at_end=True) and the feature extractor there.
trainer.save_model()

# Keep displaying the training summary as the cell output
train_output

# Inference

In [None]:
# Reload the en-US train split with its audio column cast to 16000 Hz,
# then pick the first example's file path for inference
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train").cast_column(
    "audio", Audio(sampling_rate=16000)
)
audio_file = dataset[0]["audio"]["path"]
sampling_rate = dataset.features["audio"].sampling_rate

In [None]:
# Inference through a pipeline() loaded from audio_classification_model (the training output_dir)
classifier = pipeline(task="audio-classification", model="audio_classification_model")
classifier(audio_file)

In [None]:
# Inference using PyTorch directly

model_dir = "audio_classification_model"

# Reload the feature extractor and the classification model from the training output directory
feature_extractor = AutoFeatureExtractor.from_pretrained(model_dir)
model = AutoModelForAudioClassification.from_pretrained(model_dir)

# Preprocess the first example's waveform and get the inputs back as PyTorch tensors
waveform = dataset[0]["audio"]["array"]
inputs = feature_extractor(waveform, sampling_rate=sampling_rate, return_tensors="pt")

# Forward pass without gradient tracking; keep only the logits
with torch.no_grad():
    logits = model(**inputs).logits

# Take the index of the highest logit and translate it with the model's id2label mapping
predicted_class_ids = logits.argmax().item()
predicted_label = model.config.id2label[predicted_class_ids]
predicted_label