In [1]:
# https://huggingface.co/learn/audio-course/en/chapter4/fine-tuning
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments
import torchaudio
import torchaudio.transforms as T
import numpy as np
import os

In [2]:
model_id = "ntu-spml/distilhubert"
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

sampling_rate = feature_extractor.sampling_rate

In [3]:
# https://pytorch.org/audio/main/tutorials/audio_io_tutorial.html
test_file_path = 'Rebetika/Bellou/Bellou_sil_remov_1.wav'
waveform, sample_rate = torchaudio.load(test_file_path)

In [4]:
print(waveform.shape)

torch.Size([1, 672333])


In [5]:
# https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html
resample_rate = 16000
resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype)
resampled_waveform = resampler(waveform)

In [6]:
print(resampled_waveform.shape)

torch.Size([1, 243931])


In [7]:
# load and resample audio file to desired characteristics
def load_and_resample_audio_file(file_path, resample_rate=16000):
    waveform, sample_rate = torchaudio.load(file_path)
    resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype)
    return resampler(waveform)
# end load_and_resample_audio_file

In [8]:
# feature extractor will handle audio files, e.g. normalize and truncate to max length
inputs = feature_extractor( resampled_waveform , sampling_rate=resample_rate)
print(f"inputs keys: {list(inputs.keys())}")

inputs keys: ['input_values', 'attention_mask']


In [9]:
print(inputs['input_values'][0].shape)
print( np.mean(inputs['input_values']) )
print( np.std(inputs['input_values']) )
print(inputs['attention_mask'])

(1, 243931)
-4.1715507e-09
0.99999756
[array([1], dtype=int32)]


In [10]:
class_names = os.listdir('Rebetika')
print(class_names)

['kazantzidis_old', 'Ninou', 'Bellou', 'Kazantzidis', 'Tsaousakis']


In [11]:
id2label = {
    str(i): c for i, c in enumerate(class_names)
}
label2id = {v: k for k,v in id2label.items()}
print(id2label)
print(label2id)

{'0': 'kazantzidis_old', '1': 'Ninou', '2': 'Bellou', '3': 'Kazantzidis', '4': 'Tsaousakis'}
{'kazantzidis_old': '0', 'Ninou': '1', 'Bellou': '2', 'Kazantzidis': '3', 'Tsaousakis': '4'}


In [12]:
# create dataset
train_test_ratio = 0.8

rebetika = {
    'train': [],
    'test': []
}

for c_i, c in enumerate( class_names ):
    class_files = os.listdir('Rebetika/' + c)
    for f_i, file_name in enumerate( class_files ):
        if not file_name.startswith('.'):
            if f_i <= len(class_files)*train_test_ratio:
                rebetika['train'].append(
                    {
                        'array' : load_and_resample_audio_file('Rebetika/' + c + os.sep + file_name, resample_rate=16000),
                        'class': int( label2id[ c ] )
                    }
                )
            else:
                rebetika['test'].append(
                    {
                        'array' : load_and_resample_audio_file('Rebetika/' + c + os.sep + file_name, resample_rate=16000),
                        'class_idx': int( label2id[ c ] )
                    }
                )
            # end if
        # end if
    # end for class_files
# end for class_names

print(rebetika)

{'train': [{'array': tensor([[ 0.0168,  0.0335,  0.0374,  ..., -0.0438, -0.0348, -0.0249]]), 'class': 0}, {'array': tensor([[ 0.0198,  0.0196, -0.0042,  ...,  0.0197,  0.0208,  0.0199]]), 'class': 0}, {'array': tensor([[-0.0136, -0.0238, -0.0243,  ...,  0.0178,  0.0216,  0.0196]]), 'class': 0}, {'array': tensor([[0.0138, 0.0228, 0.0220,  ..., 0.0412, 0.0447, 0.0423]]), 'class': 0}, {'array': tensor([[0.0138, 0.0222, 0.0203,  ..., 0.0201, 0.0220, 0.0070]]), 'class': 0}, {'array': tensor([[0.0148, 0.0234, 0.0210,  ..., 0.0197, 0.0193, 0.0215]]), 'class': 0}, {'array': tensor([[0.0146, 0.0261, 0.0258,  ..., 0.0209, 0.0228, 0.0085]]), 'class': 0}, {'array': tensor([[ 0.0172, -0.0026, -0.0174,  ...,  0.0334,  0.0423,  0.0241]]), 'class': 0}, {'array': tensor([[0.0140, 0.0242, 0.0244,  ..., 0.0148, 0.0052, 0.0149]]), 'class': 0}, {'array': tensor([[ 4.6234e-05, -2.0508e-05, -1.5664e-04,  ...,  1.3930e-03,
          1.2100e-03,  6.1904e-05]]), 'class': 1}, {'array': tensor([[0.0137, 0.0217, 0

In [22]:
from datasets import load_dataset

dataset = load_dataset('audiofolder', data_dir='Rebetika')

Resolving data files:   0%|          | 0/94 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [30]:
print(dataset['train'][50])
dataset['train'].features['label']

{'audio': {'path': '/media/maximos/9C33-6BBD/python/audio_singer_classification/Rebetika/train/Tsaousakis/Tsaousakis_sil_remov_1.wav', 'array': array([ 2.03247070e-02,  2.17895508e-02,  2.29187012e-02, ...,
       -1.83105469e-04, -6.10351562e-05, -3.05175781e-05]), 'sampling_rate': 44100}, 'label': 3}


ClassLabel(names=['Bellou', 'Kazantzidis', 'Ninou', 'Tsaousakis', 'kazantzidis_old'], id=None)

In [13]:
num_labels = len(id2label)

model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    f"{model_name}-finetuned-rebetika_voice",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=False,
)

In [15]:
import evaluate

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

In [17]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=rebetika["train"],
    eval_dataset=rebetika["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

ValueError: You should supply an instance of `transformers.BatchFeature` or list of `transformers.BatchFeature` to this method that includes input_values, but you provided []