In [1]:
# https://huggingface.co/learn/audio-course/en/chapter4/fine-tuning
import os

import numpy as np
import torch
import torchaudio
import torchaudio.transforms as T
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    set_seed,
)

In [2]:
# https://huggingface.co/docs/transformers/v4.37.2/en/model_doc/hubert#transformers.HubertForSequenceClassification

In [3]:
# Checkpoint used for both the feature extractor and, later, the classifier.
model_id = "ntu-spml/distilhubert"

# Normalise each waveform and also return an attention mask alongside the input values.
extractor_options = {"do_normalize": True, "return_attention_mask": True}
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **extractor_options)

# Sampling rate the extractor is configured for; the audio is resampled to it further down.
sampling_rate = feature_extractor.sampling_rate

In [4]:
# `sampling_rate` was already read from the feature extractor in the previous cell,
# so it is only displayed here instead of being assigned a second time.
sampling_rate

16000

In [5]:
from datasets import load_dataset, Audio

# Build the dataset with the 'audiofolder' loader from the local 'Rebetika_vowels' directory.
dataset = load_dataset(path='audiofolder', data_dir='Rebetika_vowels')

Resolving data files:   0%|          | 0/337 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/49 [00:00<?, ?it/s]

In [6]:
# Inspect one training example and the label feature of the train split.
raw_train = dataset['train']
print(raw_train[50])
print(raw_train.features['label'])

{'audio': {'path': '/media/datadisk/python/audio_singer_classification/Rebetika_vowels/train/Bithikotsis/Bithikotsis_u_1.mp3', 'array': array([-0.1448793 , -0.1835938 , -0.14170903, ..., -0.04168473,
       -0.07418936, -0.06321772]), 'sampling_rate': 44100}, 'label': 1}
ClassLabel(names=['Bellou', 'Bithikotsis', 'KaitiGkrey', 'Kazantzidis', 'KazantzidisOld', 'Ninou', 'Tsaousakis'], id=None)


In [7]:
# Cast the 'audio' column to the feature extractor's sampling rate
# (the example printed earlier reported 44100 Hz).
target_audio = Audio(sampling_rate=sampling_rate)
dataset = dataset.cast_column('audio', target_audio)

In [8]:
# Print the same training example again to check its 'sampling_rate' after the cast.
resampled_example = dataset['train'][50]
print(resampled_example)

{'audio': {'path': '/media/datadisk/python/audio_singer_classification/Rebetika_vowels/train/Bithikotsis/Bithikotsis_u_1.mp3', 'array': array([-0.10464216, -0.16641691, -0.09649917, ...,  0.11519736,
       -0.02316136,  0.        ]), 'sampling_rate': 16000}, 'label': 1}


In [9]:
# Statistics of the first training waveform before feature extraction.
first_example = dataset["train"][0]
sample = first_example["audio"]
print(f"Mean: {np.mean(sample['array']):.3}, Variance: {np.var(sample['array']):.3}")

Mean: -0.000103, Variance: 0.0218


In [10]:
# Run the feature extractor on the single sample and list the keys it returns.
inputs = feature_extractor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
)
print(f"inputs keys: {list(inputs.keys())}")

# Same statistics as for the raw waveform, now on the extracted input values.
print(f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}")

inputs keys: ['input_values', 'attention_mask']
Mean: 2.75e-09, Variance: 1.0


In [11]:
# Multiplied by the extractor's sampling rate to get the truncation length in samples.
max_duration = 10.0


def preprocess_function(examples):
    """Run the feature extractor over a batch of audio examples.

    Args:
        examples: Batch mapping whose "audio" entry is a sequence of dicts,
            each holding the waveform under the "array" key.

    Returns:
        Whatever `feature_extractor` returns for the batch, called with
        truncation enabled, `max_length` set to `sampling_rate * max_duration`
        samples, and an attention mask requested.
    """
    waveforms = []
    for audio in examples["audio"]:
        waveforms.append(audio["array"])

    target_rate = feature_extractor.sampling_rate
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=int(target_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )

In [12]:
# Apply preprocess_function to every split in batches of 100 examples and drop the
# 'audio' column from the result.
map_options = dict(
    remove_columns=['audio'],
    batched=True,
    batch_size=100,
    num_proc=1,
)
dataset_encoded = dataset.map(preprocess_function, **map_options)
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 337
    })
    test: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 49
    })
})

In [13]:
# int2str of the label feature maps an integer class id to its name.
train_split = dataset['train']
id2label_fn = train_split.features['label'].int2str
id2label_fn(train_split[50]['label'])

'Bithikotsis'

In [14]:
# Build both label mappings; ids are kept as strings on both sides.
label_feature = dataset_encoded["train"].features["label"]
id2label = {}
for class_idx in range(len(label_feature.names)):
    id2label[str(class_idx)] = id2label_fn(class_idx)

# Inverse mapping: class name -> string id.
label2id = dict((name, key) for key, name in id2label.items())

id2label['2']

'KaitiGkrey'

In [15]:
# Seed the Python, NumPy and PyTorch RNGs before the model is built: the checkpoint
# ships without classifier/projector weights, so they are newly initialised here.
# Without a seed every kernel restart starts training from different weights.
# 42 is also the default `seed` of TrainingArguments.
set_seed(42)

num_labels = len(id2label)

# Load the pretrained encoder with a classification head sized for our labels and
# store both label mappings in the model config.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Output directory is named after the checkpoint (the part after the last "/").
model_name = model_id.split("/")[-1]
# 2 examples per device per step, accumulated over 4 steps before each optimizer update.
batch_size = 2
gradient_accumulation_steps = 4
num_train_epochs = 10

training_args = TrainingArguments(
    f"{model_name}-finetuned-vowels_voice",
    # Evaluate and checkpoint once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    # Reload the checkpoint with the best evaluation accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Mixed precision is only enabled when a CUDA device is present; a hard-coded
    # True makes TrainingArguments raise on machines without a supported accelerator.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

In [18]:
import evaluate

# Load the accuracy metric once; compute_metrics calls metric.compute(...) on it.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level `metric`.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (the reference class ids).

    Returns:
        The result of `metric.compute` for the arg-max class of each row.
    """
    scores = eval_pred.predictions
    references = eval_pred.label_ids
    # The highest-scoring class along axis 1 is the predicted label.
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [19]:
from transformers import Trainer

# Train on the encoded train split and evaluate on the encoded test split.
train_split_encoded = dataset_encoded["train"]
eval_split_encoded = dataset_encoded["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split_encoded,
    eval_dataset=eval_split_encoded,
    # The feature extractor is passed in the `tokenizer` slot.
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
0,1.8071,1.71576,0.326531
1,1.3222,1.27833,0.55102
2,1.0286,1.101652,0.55102
4,0.5522,0.871918,0.653061
5,0.4271,0.648291,0.795918
6,0.4032,0.710244,0.77551
8,0.3049,0.630359,0.755102
9,0.2133,0.593282,0.795918


Checkpoint destination directory distilhubert-finetuned-vowels_voice/checkpoint-84 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=420, training_loss=0.7851248786562965, metrics={'train_runtime': 217.9527, 'train_samples_per_second': 15.462, 'train_steps_per_second': 1.927, 'total_flos': 1.3947559504292898e+16, 'train_loss': 0.7851248786562965, 'epoch': 9.94})