In [1]:
# https://huggingface.co/learn/audio-course/en/chapter4/fine-tuning
import os

import numpy as np
import torch
import torchaudio
import torchaudio.transforms as T
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments

In [None]:
# https://huggingface.co/docs/transformers/v4.37.2/en/model_doc/hubert#transformers.HubertForSequenceClassification

In [2]:
# Checkpoint to fine-tune. Its feature extractor is loaded with input
# normalisation and attention-mask output switched on.
model_id = "ntu-spml/distilhubert"
extractor_options = {"do_normalize": True, "return_attention_mask": True}
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **extractor_options)

# Sampling rate the feature extractor is configured for; reused when resampling the dataset.
sampling_rate = feature_extractor.sampling_rate

In [3]:
# `sampling_rate` was already read from the feature extractor in the previous
# cell, so it is only displayed here rather than assigned a second time.
sampling_rate

16000

In [11]:
from datasets import Audio, load_dataset

# Folder handed to the generic 'audiofolder' loader.
DATA_DIR = 'Rebetika_whole'

dataset = load_dataset('audiofolder', data_dir=DATA_DIR)

Resolving data files:   0%|          | 0/94 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

In [6]:
# Inspect one training example and the label feature of the train split.
train_split = dataset['train']
print(train_split[50])
print(train_split.features['label'])

{'audio': {'path': '/media/maximos/9C33-6BBD/python/audio_singer_classification/Rebetika/train/Tsaousakis/Tsaousakis_sil_remov_1.wav', 'array': array([ 2.03247070e-02,  2.17895508e-02,  2.29187012e-02, ...,
       -1.83105469e-04, -6.10351562e-05, -3.05175781e-05]), 'sampling_rate': 44100}, 'label': 3}
ClassLabel(names=['Bellou', 'Kazantzidis', 'Ninou', 'Tsaousakis', 'kazantzidis_old'], id=None)


In [12]:
# Re-cast the audio column so clips are delivered at the feature extractor's sampling rate.
resampled_audio = Audio(sampling_rate=sampling_rate)
dataset = dataset.cast_column('audio', resampled_audio)

In [14]:
# Print the same example again to check the sampling rate reported after the cast.
print(dataset['train'][50])

{'audio': {'path': '/media/maximos/9C33-6BBD/python/audio_singer_classification/Rebetika/train/Tsaousakis/Tsaousakis_sil_remov_1.wav', 'array': array([ 0.01362134,  0.02519781,  0.02488767, ...,  0.00035684,
       -0.00023973, -0.00010438]), 'sampling_rate': 16000}, 'label': 3}


In [15]:
# Mean and variance of the first training clip before feature extraction.
sample = dataset["train"][0]["audio"]
waveform = sample["array"]
print(f"Mean: {np.mean(waveform):.3}, Variance: {np.var(waveform):.3}")

Mean: 0.000378, Variance: 0.0203


In [16]:
# Run the feature extractor on the sample clip and list the keys it returns.
inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
print(f"inputs keys: {list(inputs.keys())}")

# Same statistics as for the raw waveform, now on the extracted input values.
input_values = inputs['input_values']
print(f"Mean: {np.mean(input_values):.3}, Variance: {np.var(input_values):.3}")

inputs keys: ['input_values', 'attention_mask']
Mean: 5.4e-10, Variance: 1.0


In [36]:
# Longest clip length, in seconds, kept by preprocess_function.
max_duration = 10.0


def preprocess_function(examples):
    """Run the feature extractor over a batch of audio examples.

    Args:
        examples: batch mapping whose "audio" entry is a list of dicts,
            each holding the waveform under the "array" key.

    Returns:
        Whatever `feature_extractor` returns for the batch, called with
        truncation enabled, a `max_length` of `max_duration` seconds worth of
        samples, and an attention mask requested.
    """
    rate = feature_extractor.sampling_rate
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])
    return feature_extractor(
        waveforms,
        sampling_rate=rate,
        max_length=int(rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )

In [37]:
# Apply preprocess_function to the dataset in batches and drop the raw
# 'audio' column from the result.
map_options = {
    "remove_columns": ['audio'],
    "batched": True,
    "batch_size": 100,
    "num_proc": 1,
}
dataset_encoded = dataset.map(preprocess_function, **map_options)
dataset_encoded

Map:   0%|          | 0/94 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 94
    })
    test: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 19
    })
})

In [38]:
# Helper that turns an integer class id into its label name; tried on one example.
label_feature = dataset['train'].features['label']
id2label_fn = label_feature.int2str
id2label_fn(dataset['train'][50]['label'])

'Tsaousakis'

In [39]:
# Label maps handed to the model config.
# id2label keeps string keys ("0", "1", ...), so the lookup at the end of this
# cell works as before.
label_names = dataset_encoded["train"].features["label"].names
id2label = {str(i): id2label_fn(i) for i in range(len(label_names))}

# label2id maps a label name to its class index. The index is converted to int
# because inverting id2label directly would leave string values ("0", "1", ...).
label2id = {name: int(idx) for idx, name in id2label.items()}

id2label['2']

'Ninou'

In [40]:
# One output class per entry in the label map.
num_labels = len(id2label)

# Load the pretrained checkpoint for audio classification with our label maps.
model = AutoModelForAudioClassification.from_pretrained(
    model_id, num_labels=num_labels, id2label=id2label, label2id=label2id
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# Output directory is named after the checkpoint (the part after the "/").
model_name = model_id.split("/")[-1]
# 2 examples per step x 4 accumulation steps = 8 examples per optimizer update per device.
batch_size = 2
gradient_accumulation_steps = 4
num_train_epochs = 10

# fp16 mixed precision needs a CUDA device. Hard-coding True makes
# TrainingArguments fail on CPU-only machines, so it is enabled only when a
# GPU is actually available.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    f"{model_name}-finetuned-rebetika_voice",
    # Evaluate and checkpoint on the same schedule, as load_best_model_at_end needs.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    # Restore the checkpoint with the best "accuracy" (reported by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=use_fp16,
    push_to_hub=False,
)

In [52]:
import evaluate

# Accuracy metric object; compute_metrics calls its .compute() on each evaluation.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level `metric`.

    Args:
        eval_pred: object exposing `predictions` (per-class scores, one row
            per example) and `label_ids` (reference class ids).

    Returns:
        The result of `metric.compute` for the arg-max class of each row
        against the reference labels.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [53]:
from transformers import Trainer

# Wire the model, training configuration, encoded splits and metric together.
# The feature extractor is passed through the `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Leaving the call as the last expression displays the returned training summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,1.5728,1.517116,0.473684
1,1.3801,1.282689,0.578947
2,1.1607,1.069747,0.578947
4,0.9615,0.890882,0.578947
5,0.705,0.886153,0.473684
6,0.7092,0.784642,0.736842
8,0.6222,0.698306,0.842105
9,0.6153,0.695647,0.842105


TrainOutput(global_step=110, training_loss=0.9693356600674716, metrics={'train_runtime': 585.635, 'train_samples_per_second': 1.605, 'train_steps_per_second': 0.188, 'total_flos': 1.98602826633354e+16, 'train_loss': 0.9693356600674716, 'epoch': 9.36})