In [1]:
import numpy as np

from datasets import load_dataset, Audio, DatasetDict
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

import evaluate
import torch

2023-08-19 00:08:36.894079: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-19 00:08:37.035312: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-19 00:08:37.036502: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data = load_dataset('audiofolder', data_dir='./data', split="train")

Resolving data files:   0%|          | 0/999 [00:00<?, ?it/s]

In [3]:
data = data.train_test_split(test_size=0.2, stratify_by_column='label')

In [4]:
labels = data["train"].features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [5]:
label2id

{'blues': '0',
 'classical': '1',
 'country': '2',
 'disco': '3',
 'hiphop': '4',
 'jazz': '5',
 'metal': '6',
 'pop': '7',
 'reggae': '8',
 'rock': '9'}

In [6]:
id2label

{'0': 'blues',
 '1': 'classical',
 '2': 'country',
 '3': 'disco',
 '4': 'hiphop',
 '5': 'jazz',
 '6': 'metal',
 '7': 'pop',
 '8': 'reggae',
 '9': 'rock'}

In [7]:
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")



In [8]:
data = data.cast_column("audio", Audio(sampling_rate=16_000))

In [9]:
data["train"][0]

{'audio': {'path': '/home/lucija/Documents/projekat/ml_music_classification/data/rock/rock.00044.wav',
  'array': array([-0.07570747, -0.10663743, -0.07869549, ...,  0.14128385,
          0.17095664,  0.        ]),
  'sampling_rate': 16000},
 'label': 9}

In [10]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True)
    return inputs

In [11]:
data = data.map(preprocess_function, remove_columns="audio", batched=True)

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [12]:
accuracy = evaluate.load("accuracy")

In [13]:
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

In [14]:
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained("facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
data["train"][0]

{'label': 9,
 'input_values': [-0.6751357316970825,
  -0.9508605003356934,
  -0.7017723917961121,
  -0.4074861705303192,
  -0.340657114982605,
  -0.2760833203792572,
  -0.24036329984664917,
  -0.34966912865638733,
  -0.2664985954761505,
  -0.24154341220855713,
  0.004633772186934948,
  0.2162235677242279,
  0.26530033349990845,
  0.24212737381458282,
  0.20699447393417358,
  0.053767427802085876,
  -0.08686371892690659,
  -0.27824875712394714,
  -0.35720741748809814,
  -0.3368380069732666,
  -0.10759948939085007,
  0.0858534500002861,
  0.3394002318382263,
  0.5878491401672363,
  0.6559004783630371,
  0.6773855686187744,
  0.6079670786857605,
  0.5239613056182861,
  0.6136279106140137,
  0.6256782412528992,
  0.4262131452560425,
  0.21265283226966858,
  0.22796066105365753,
  0.47694385051727295,
  0.47127753496170044,
  0.4703630208969116,
  0.5769724249839783,
  0.7193443179130554,
  0.5759080648422241,
  0.2171865999698639,
  -0.21014010906219482,
  -0.6994539499282837,
  -1.0454317

In [16]:
data["test"][0]

{'label': 1,
 'input_values': [0.22685106098651886,
  0.25335249304771423,
  0.3260166645050049,
  0.26833680272102356,
  -0.30424168705940247,
  -0.7372673153877258,
  -0.4714626371860504,
  -0.7289277911186218,
  -0.781032383441925,
  -0.40148794651031494,
  -0.20140375196933746,
  -0.3099370300769806,
  -0.19287674129009247,
  0.21201130747795105,
  -0.1180858463048935,
  -0.06154344603419304,
  -0.03597079962491989,
  -0.3541756272315979,
  -0.5181412100791931,
  -0.4579007625579834,
  -0.396477073431015,
  -0.2891448736190796,
  -0.30989253520965576,
  -0.12557350099086761,
  -0.4374581575393677,
  -0.8267472982406616,
  -0.7415247559547424,
  -0.9374895691871643,
  -0.9307399392127991,
  -0.709122359752655,
  -0.6425746083259583,
  -0.8283587098121643,
  -0.7680423259735107,
  -0.1134151741862297,
  0.1279689222574234,
  -0.21088247001171112,
  -0.20363982021808624,
  -0.33978545665740967,
  -0.4929628372192383,
  -0.7866630554199219,
  -0.4166979193687439,
  -0.5087043642997742,

In [17]:
training_args = TrainingArguments(
    output_dir="music-classification",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["train"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
0,No log,2.305719,0.097622
1,5.765900,,0.100125
2,5.765900,,0.100125


TrainOutput(global_step=18, training_loss=3.203280554877387, metrics={'train_runtime': 1589.575, 'train_samples_per_second': 1.508, 'train_steps_per_second': 0.011, 'total_flos': 2.0899466696832e+16, 'train_loss': 3.203280554877387, 'epoch': 2.88})

In [18]:
from datasets import load_dataset, Audio

dataset = load_dataset('audiofolder', data_dir='./data', split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
sampling_rate = dataset.features["audio"].sampling_rate
audio_file = dataset[1]["audio"]["path"]

Resolving data files:   0%|          | 0/999 [00:00<?, ?it/s]

In [20]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("./music-classification/checkpoint-18/")
inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

In [21]:
from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained("./music-classification/checkpoint-18/")
with torch.no_grad():
    logits = model(**inputs).logits

In [22]:
predicted_class_ids = torch.argmax(logits).item()
predicted_label = model.config.id2label[predicted_class_ids]
predicted_label

'blues'

In [23]:
audio_file

'/home/lucija/Documents/projekat/ml_music_classification/data/blues/blues.00001.wav'