In [1]:
import numpy as np

from datasets import load_dataset, Audio, DatasetDict
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer

import evaluate
import torch

In [2]:
# Colab setup: uncomment on a fresh runtime (and run before the imports above).
# %pip install datasets transformers==4.28.0
# %pip install evaluate

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# Read every audio file under the Drive folder as one "train" split; the genre
# sub-folders (e.g. .../classical/...) supply the `label` column.
data = load_dataset(
    "audiofolder",
    data_dir="drive/MyDrive/data",
    split="train",
)

Resolving data files:   0%|          | 0/999 [00:00<?, ?it/s]

In [5]:
# Hold out 20% of the clips (split into validation and test afterwards) while
# keeping the genre proportions equal on both sides. The fixed seed makes the
# partition, and therefore every reported metric, reproducible across kernel
# restarts.
data_train_testval = data.train_test_split(
    test_size=0.2,
    stratify_by_column="label",
    seed=42,
)

In [6]:
# Halve the held-out portion into validation ("train" side of this split) and
# test, again stratified by genre. Seeded so the same clips land in the same
# partition on every run.
data_val_test = data_train_testval["test"].train_test_split(
    test_size=0.5,
    stratify_by_column="label",
    seed=42,
)

In [7]:
# Bundle the three partitions under the names used by the rest of the notebook.
data = DatasetDict(
    train=data_train_testval["train"],
    val=data_val_test["train"],
    test=data_val_test["test"],
)

In [8]:
# Show the features and row count of each split.
data

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 799
    })
    val: Dataset({
        features: ['audio', 'label'],
        num_rows: 100
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 100
    })
})

In [9]:
# Class names in label-id order, plus the two lookup tables handed to the model
# config. Ids are kept as strings on both sides.
labels = data["train"].features["label"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [10]:
# Inspect the genre-name -> id mapping.
label2id

{'blues': '0',
 'classical': '1',
 'country': '2',
 'disco': '3',
 'hiphop': '4',
 'jazz': '5',
 'metal': '6',
 'pop': '7',
 'reggae': '8',
 'rock': '9'}

In [11]:
# Inspect the id -> genre-name mapping.
id2label

{'0': 'blues',
 '1': 'classical',
 '2': 'country',
 '3': 'disco',
 '4': 'hiphop',
 '5': 'jazz',
 '6': 'metal',
 '7': 'pop',
 '8': 'reggae',
 '9': 'rock'}

In [12]:
# Hub checkpoint used for both the feature extractor and the classifier backbone.
# NOTE(review): this "-960h" checkpoint ships with an `lm_head` that is discarded on
# load (see the warning printed when the model is created); presumably the plain
# pretrained "facebook/wav2vec2-base" would be the more usual starting point — confirm.
PRETRAINED_MODEL = "facebook/wav2vec2-base-960h"

In [13]:
# Load the checkpoint's preprocessing config with input normalisation and
# attention-mask output switched on.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    PRETRAINED_MODEL,
    do_normalize=True,
    return_attention_mask=True,
)

In [14]:
# Sampling rate the feature extractor is configured for; reused when re-declaring the audio column.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [15]:
# Re-declare the "audio" column at the extractor's sampling rate so decoded clips come back at that rate.
data = data.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [16]:
# Peek at one decoded training example (path, waveform array, sampling rate, label).
data["train"][0]

{'audio': {'path': '/content/drive/MyDrive/data/classical/classical.00038.wav',
  'array': array([-0.07147478, -0.11915156, -0.08404056, ..., -0.07367378,
         -0.1258204 ,  0.        ]),
  'sampling_rate': 16000},
 'label': 1}

In [17]:
# Mean of the first training clip's raw waveform, before feature extraction.
np.mean(data["train"][0]["audio"]["array"])

0.00020896931087431938

In [18]:
# Variance of the first training clip's raw waveform, before feature extraction.
np.var(data['train'][0]['audio']['array'])

0.0030606209779248353

In [19]:
max_duration = 30.0  # seconds of audio kept per clip


def preprocess_function(examples):
    """Turn a batch of decoded clips into model inputs.

    Parameters
    ----------
    examples : mapping
        Batch whose ``"audio"`` entry is a list of dicts holding the decoded
        waveform under ``"array"``.

    Returns
    -------
    The feature extractor's output for the batch, with an attention mask
    requested and clips truncated to ``max_duration`` seconds' worth of
    samples at the extractor's sampling rate.
    """
    target_rate = feature_extractor.sampling_rate
    max_samples = int(target_rate * max_duration)
    waveforms = [clip["array"] for clip in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=max_samples,
        truncation=True,
        return_attention_mask=True,
    )
# def preprocess_function(examples):
#     audio_arrays = [x["array"] for x in examples["audio"]]
#     inputs = feature_extractor(audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True)
#     return inputs

In [20]:
# Run the feature extractor over every split in batches of 100 and drop the raw
# "audio" column. NOTE: this cell rebinds `data`, so it cannot be re-run on its
# own output (the "audio" column it reads and removes is gone afterwards).
data = data.map(
    preprocess_function,
    batched=True,
    batch_size=100,
    num_proc=1,
    remove_columns="audio",
)

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

  tensor = as_tensor(value)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [21]:
# Confirm the splits now expose 'label', 'input_values' and 'attention_mask'.
data

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 799
    })
    val: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 100
    })
    test: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 100
    })
})

In [22]:
# Accuracy metric object used by compute_metrics.
accuracy = evaluate.load("accuracy")

In [23]:
def compute_metrics(eval_pred):
    """Score arg-max class predictions with the ``accuracy`` metric.

    ``eval_pred.predictions`` is reduced along axis 1 to one class index per
    row and compared against ``eval_pred.label_ids``; the metric's result is
    returned unchanged.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return accuracy.compute(predictions=predicted_ids, references=reference_ids)

In [24]:
# One output class per genre. The label maps are passed along so predictions
# can be turned back into genre names via the model config.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    PRETRAINED_MODEL,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForSequenceClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['projector.bias', 'projector.weight', 'classifier.bias', 'classifier.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be 

In [25]:
# data["train"][0]

In [26]:
# data["test"][0]

In [27]:
# Per-device batch size (used for both training and evaluation) and number of training epochs.
# NOTE(review): with gradient_accumulation_steps=1 in the TrainingArguments this means one clip
# per optimiser step; the recorded run never left 10% validation accuracy — worth revisiting.
BATCH_SIZE = 1
EPOCHS = 10

In [28]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch so that the
# checkpoint with the best validation accuracy can be restored when training ends.
training_args = TrainingArguments(
    output_dir="music-classification",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # the key produced by compute_metrics
    optim="adamw_torch"
)

# Train on the "train" split and report compute_metrics on the "val" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["val"],
    # presumably passed so the extractor is saved with each checkpoint (the
    # inference cells load it from a checkpoint directory) — confirm
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# NOTE(review): in the recorded run validation accuracy stays at 0.1 (chance level
# for 10 classes) for all 10 epochs, i.e. the model did not learn; revisit the
# checkpoint choice and hyper-parameters before trusting any downstream result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,2.2355,2.306051,0.1
2,2.346,2.303717,0.1
3,2.3422,2.303565,0.1
4,2.3105,2.30304,0.1
5,2.285,2.302766,0.1
6,2.3015,2.302758,0.1
7,2.3066,2.302841,0.1
8,2.3448,2.302815,0.1
9,2.2873,2.302846,0.1
10,2.3077,2.302823,0.1


TrainOutput(global_step=7990, training_loss=2.306934299397379, metrics={'train_runtime': 9206.6749, 'train_samples_per_second': 0.868, 'train_steps_per_second': 0.868, 'total_flos': 2.176185221991791e+18, 'train_loss': 2.306934299397379, 'epoch': 10.0})

In [33]:
# Evaluate the model held by the trainer on the validation split given as eval_dataset.
trainer.evaluate()

{'eval_loss': 2.3060505390167236,
 'eval_accuracy': 0.1,
 'eval_runtime': 74.3174,
 'eval_samples_per_second': 1.346,
 'eval_steps_per_second': 1.346,
 'epoch': 10.0}

In [34]:
# Run the model on the held-out test split.
# NOTE(review): displaying the result prints the whole predictions array.
trainer.predict(data['test'])

PredictionOutput(predictions=array([[-1.14037849e-01, -2.78894287e-02,  4.60867845e-02,
        -1.53860599e-01,  1.21407405e-01, -1.10265195e-01,
         3.17074731e-02,  4.66588564e-04,  7.63235763e-02,
        -3.50783728e-02],
       [-1.14388496e-01, -3.06328423e-02,  4.90814447e-02,
        -1.57790303e-01,  1.25208318e-01, -1.10707916e-01,
         2.91716252e-02,  1.12250133e-03,  7.40353018e-02,
        -3.46048400e-02],
       [-1.13604389e-01, -3.25564221e-02,  5.29878885e-02,
        -1.60764128e-01,  1.27772138e-01, -1.11998737e-01,
         2.80429255e-02,  2.39634723e-03,  7.05061629e-02,
        -3.48887965e-02],
       [-1.14019230e-01, -2.96023525e-02,  4.84721661e-02,
        -1.56216353e-01,  1.23639569e-01, -1.10302024e-01,
         3.06590982e-02,  8.16936721e-04,  7.44524077e-02,
        -3.46023031e-02],
       [-1.14389963e-01, -2.97867469e-02,  4.76725288e-02,
        -1.57096341e-01,  1.23918481e-01, -1.10149279e-01,
         2.98554134e-02,  7.53800501e-04,

In [None]:
from datasets import load_dataset, Audio

# Reload the raw clips for a single-clip inference check, decoded at 16 kHz.
# NOTE(review): this reads './data', not the 'drive/MyDrive/data' folder used for
# training — confirm both point at the same files.
dataset = load_dataset("audiofolder", data_dir="./data", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
sampling_rate = dataset.features["audio"].sampling_rate
# Path of the clip that the inference cells classify (index 0). It previously
# pointed at index 1, so the displayed file did not match the prediction.
audio_file = dataset[0]["audio"]["path"]

In [None]:
from transformers import AutoFeatureExtractor

# Load the feature extractor stored in a training checkpoint directory.
# NOTE(review): "checkpoint-18" does not match the recorded run (7990 steps over
# 10 epochs with per-epoch saves) — confirm this directory exists.
feature_extractor = AutoFeatureExtractor.from_pretrained("./music-classification/checkpoint-18/")
# Encode the first clip as PyTorch tensors for a single forward pass.
inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

In [None]:
from transformers import AutoModelForAudioClassification

# Load the classifier from the same checkpoint directory as the feature extractor.
# NOTE(review): "checkpoint-18" does not match the recorded run's step count — confirm the path.
model = AutoModelForAudioClassification.from_pretrained("./music-classification/checkpoint-18/")
# Forward pass without gradient tracking; keep only the class scores.
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
# Arg-max over the class axis (last dimension). Without `dim`, argmax indexes the
# flattened tensor, which equals the class id only when there is exactly one row.
# With the explicit axis, multi-row logits fail at `.item()` instead of silently
# producing a meaningless id.
predicted_class_ids = torch.argmax(logits, dim=-1).item()
# Map the class id back to its genre name through the model config.
predicted_label = model.config.id2label[predicted_class_ids]
predicted_label

In [None]:
# Show the stored clip path for comparison with the predicted label.
audio_file

In [None]:
# Repeat of the validation-split evaluation run earlier in the notebook.
trainer.evaluate()

In [None]:
# Keep the test-split prediction output so its fields can be inspected separately.
res = trainer.predict(data['test'])

In [None]:
# Display the full prediction output.
res

In [None]:
# Raw per-class scores returned for the test split.
res.predictions

In [None]:
# Label ids stored on the prediction output, for comparison with the scores.
res.label_ids

In [None]:
# Sets the PyTorch CUDA allocator option max_split_size_mb to 512 for this kernel's environment.
# NOTE(review): this is the last cell, after training and prediction; presumably the variable
# has to be set before the first CUDA allocation to take effect — confirm, and move it to the top if so.
%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512