# <center> Music Genre Classification with DistilHuBERT </center>



[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Iq0CrjqxE1UFOBfGxNJR7609oV1ksrv?usp=sharing) &nbsp;  [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/code/mahimairaja/music-genre-classification-with-distilbert) <br><br>
[![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/mahimairaja/music-genre-gtzan-classification)  &nbsp;&nbsp;&nbsp; [![Demo Website](https://img.shields.io/badge/Demo-000000?style=for-the-badge&logo=About.me&logoColor=orange)](https://huggingface.co/spaces/mahimairaja/music-genre-classifier)


In [None]:
! pip install -q gradio transformers datasets[audio] accelerate evaluate wandb

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')

import evaluate
import wandb
import os
import numpy as np
from datasets import load_dataset
from datasets import Audio
from transformers import AutoFeatureExtractor
from huggingface_hub import notebook_login, login
from transformers import AutoModelForAudioClassification
from transformers import TrainingArguments
from transformers import pipeline
from transformers import Trainer

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [None]:
# Read the Hugging Face access token stored under the Kaggle secret label
# "hf_token" and log in with it, so the token itself never appears in the
# notebook. `UserSecretsClient` imported here is reused by the next cell.
from kaggle_secrets import UserSecretsClient

secret_label = "hf_token"
secret_value = UserSecretsClient().get_secret(secret_label)

login(token=secret_value)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Read the Weights & Biases API key stored under the secret label "wb_token"
# and expose it through the WANDB_API_KEY environment variable before
# calling wandb.login().
# NOTE: relies on `UserSecretsClient` imported in the previous cell, and
# overwrites that cell's `secret_label` / `secret_value`.
secret_label = "wb_token"
secret_value = UserSecretsClient().get_secret(secret_label)

os.environ['WANDB_API_KEY'] = secret_value
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmahimairaja[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Loading the dataset

In [None]:
# Load the "all" configuration of the marsyas/gtzan dataset and display the
# resulting DatasetDict (a single "train" split at this point).
gtzan = load_dataset("marsyas/gtzan", "all")
gtzan

Downloading builder script:   0%|          | 0.00/3.38k [00:00<?, ?B/s]

Downloading and preparing dataset gtzan/all to /root/.cache/huggingface/datasets/marsyas___gtzan/all/0.0.0/8bd0e23c2d9b2be30d36bc6834319772dff22a3bd28527996612386cef003910...


Downloading data:   0%|          | 0.00/1.23G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset gtzan downloaded and prepared to /root/.cache/huggingface/datasets/marsyas___gtzan/all/0.0.0/8bd0e23c2d9b2be30d36bc6834319772dff22a3bd28527996612386cef003910. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 999
    })
})

In [None]:
# Shuffle with a fixed seed and hold out 10% of the examples as a test split.
# NOTE: this rebinds `gtzan` to the split result, so re-running this cell
# splits the already-reduced train split again. Re-run the loading cell first
# if this cell has to be executed a second time.
gtzan = gtzan['train'].train_test_split(seed=42, shuffle=True, test_size=0.1)

gtzan

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 899
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 100
    })
})

In [None]:
# Inspect the first training example: file path, decoded audio and genre id.
gtzan["train"][0]

{'file': '/root/.cache/huggingface/datasets/downloads/extracted/5022b0984afa7334ff9a3c60566280b08b5179d4ac96a628052bada7d8940244/genres/pop/pop.00098.wav',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/5022b0984afa7334ff9a3c60566280b08b5179d4ac96a628052bada7d8940244/genres/pop/pop.00098.wav',
  'array': array([ 0.10720825,  0.16122437,  0.28585815, ..., -0.22924805,
         -0.20629883, -0.11334229], dtype=float32),
  'sampling_rate': 22050},
 'genre': 7}

In [None]:
# `int2str` of the "genre" feature maps an integer class id to its genre name;
# check it on the first training example.
id2label_fn = gtzan["train"].features["genre"].int2str
id2label_fn(gtzan["train"][0]["genre"])

'pop'

### Preprocessing the data

In [None]:
# Load the feature extractor that belongs to the pretrained checkpoint.
# do_normalize=True and return_attention_mask=True request normalized input
# values plus an attention mask; the normalization check further down shows
# both keys in the extractor output.

model_id = "ntu-spml/distilhubert"

feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)


Downloading (…)rocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

In [None]:
# Sampling rate (in Hz) that the feature extractor is configured for.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [None]:
# Cast the "audio" column to the feature extractor's sampling rate so that
# examples are returned at that rate instead of the dataset's original one
# (22050 Hz in the example printed earlier).

gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [None]:
# Sanity check: the first training example should now report the new rate.
gtzan["train"][0]['audio']['sampling_rate']

16000

### Normalization

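> The model trains best on inputs with zero mean and unit variance. Below we compare the statistics of a raw clip with the output of the feature extractor, which was loaded with `do_normalize=True`.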

In [None]:
# Statistics of one waveform before it goes through the feature extractor.
# `sample` is reused by the next cell.
sample = gtzan["train"][0]["audio"]

raw_mean = np.mean(sample["array"])
raw_variance = np.var(sample["array"])
print(f"Mean: {raw_mean:.3}, Variance: {raw_variance:.3}")

Mean: 0.000185, Variance: 0.0493


In [None]:
# Run the same clip through the feature extractor and look at the statistics
# of what comes out.
inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])

print(f"inputs keys: {list(inputs.keys())}")

normalized_mean = np.mean(inputs['input_values'])
normalized_variance = np.var(inputs['input_values'])
print(f"Mean: {normalized_mean:.3}, Variance: {normalized_variance:.3}")

inputs keys: ['input_values', 'attention_mask']
Mean: -7.45e-09, Variance: 1.0


In [None]:
# Maximum clip length, in seconds, used by the preprocessing function below:
# it is multiplied by the sampling rate to get the truncation length in
# samples. The 30.0 value is an alternative setting kept for reference.

# max_duration = 30.0
max_duration = 15.0

def preprocess_function(examples):
    """Turn a batch of audio examples into feature-extractor outputs.

    Args:
        examples: A batch mapping whose "audio" entry is a sequence of dicts,
            each carrying the raw waveform under the "array" key.

    Returns:
        Whatever the module-level ``feature_extractor`` returns for the batch,
        called with truncation to ``max_duration`` seconds' worth of samples
        and with an attention mask requested.
    """
    waveforms = [clip["array"] for clip in examples["audio"]]
    target_rate = feature_extractor.sampling_rate
    # Truncation length in samples = samples per second * seconds.
    max_samples = int(feature_extractor.sampling_rate * max_duration)
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=max_samples,
        truncation=True,
        return_attention_mask=True,
    )

In [None]:
# Apply the preprocessing function to both splits in batches of 100 examples.
# The raw "audio" and "file" columns are dropped, leaving "genre" plus the
# feature-extractor outputs (see the column list in the output below).
gtzan_encoded = gtzan.map(
    preprocess_function,
    remove_columns=["audio", "file"],
    batched=True,
    batch_size=100,
    num_proc=1,
)
gtzan_encoded

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['genre', 'input_values', 'attention_mask'],
        num_rows: 899
    })
    test: Dataset({
        features: ['genre', 'input_values', 'attention_mask'],
        num_rows: 100
    })
})

In [None]:
# Rename the target column from "genre" to "label" for training.
# The guard makes the cell safe to re-run: once the column has been renamed,
# a second unconditional rename_column("genre", ...) would fail because
# "genre" no longer exists.
if "genre" in gtzan_encoded["train"].column_names:
    gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

In [None]:
# Build the two label mappings handed to the model:
#   id2label: class id (as a string) -> genre name
#   label2id: genre name -> class id (as a string)
genre_names = gtzan_encoded["train"].features["label"].names
id2label = {str(class_id): id2label_fn(class_id) for class_id in range(len(genre_names))}
label2id = dict((genre, class_id) for class_id, genre in id2label.items())

print('Label 2 ID\n', label2id, '\n')
print('Id 2 Label\n', id2label, '\n')
# Spot check one entry of the mapping.
id2label["7"]

Label 2 ID
 {'blues': '0', 'classical': '1', 'country': '2', 'disco': '3', 'hiphop': '4', 'jazz': '5', 'metal': '6', 'pop': '7', 'reggae': '8', 'rock': '9'} 

Id 2 Label
 {'0': 'blues', '1': 'classical', '2': 'country', '3': 'disco', '4': 'hiphop', '5': 'jazz', '6': 'metal', '7': 'pop', '8': 'reggae', '9': 'rock'} 



'pop'

# Let's start Fine-Tuning

In [None]:
# Load the pretrained checkpoint as an audio-classification model with one
# output per genre. The label mappings are passed along so that predictions
# can be reported by genre name (see the pipeline output at the end).
# As the log below shows, the projector and classifier weights are newly
# initialized and are what the fine-tuning has to learn.

num_labels = len(id2label)

model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/94.0M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['projector.bias', 'classifier.weight', 'projector.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Log in interactively instead, if you are not using the Kaggle secret above

# notebook_login()

In [None]:
# Training configuration.
# Evaluation and checkpointing both happen once per epoch, and the checkpoint
# with the best "accuracy" is reloaded when training ends.

model_name = "distilhubert-music-classifier"
batch_size = 8  # used for both the train and eval per-device batch size
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",  # output directory; the training log below shows a Hub repo of the same name
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # must match the key returned by compute_metrics
    fp16=True,
    push_to_hub=True,  # requires the Hub login performed earlier
)

In [None]:
# Load the accuracy metric once; compute_metrics (defined below) reads it
# as a module-level name.

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a set of predictions with the module-level ``metric``.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the reference class ids).

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row
        compared against the reference ids.
    """
    reference_ids = eval_pred.label_ids
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predicted_ids, references=reference_ids)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Assemble the Trainer and run fine-tuning.
# The held-out "test" split is used as the evaluation set during training.

trainer = Trainer(
    model,
    training_args,
    train_dataset=gtzan_encoded["train"],
    eval_dataset=gtzan_encoded["test"],
    # The feature extractor is passed in the `tokenizer` slot, presumably so it
    # is saved and pushed together with the model; confirm against the Trainer docs.
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

Cloning https://huggingface.co/mahimairaja/distilhubert-music-classifier-finetuned-gtzan into local empty directory.


Download file pytorch_model.bin:   0%|          | 7.38k/90.4M [00:00<?, ?B/s]

Download file runs/Aug31_02-55-55_d25f370cb3c6/events.out.tfevents.1693450566.d25f370cb3c6.23.0:   5%|5       …

Download file training_args.bin: 100%|##########| 3.93k/3.93k [00:00<?, ?B/s]

Clean file runs/Aug31_02-55-55_d25f370cb3c6/events.out.tfevents.1693450566.d25f370cb3c6.23.0:   4%|3         |…

Clean file training_args.bin:  25%|##5       | 1.00k/3.93k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/90.4M [00:00<?, ?B/s]

[34m[1mwandb[0m: wandb version 0.15.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20230831_042738-kjuum9kf[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mhelpful-puddle-4[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/mahimairaja/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/mahimairaja/huggingface/runs/kjuum9kf[0m


Epoch,Training Loss,Validation Loss,Accuracy
1,2.0608,2.036097,0.43
2,1.663,1.538652,0.62
3,1.2399,1.207424,0.68
4,1.0662,1.080526,0.65
5,0.7986,0.888002,0.75
6,0.7328,0.803691,0.74
7,0.5891,0.791759,0.78
8,0.5227,0.723227,0.79
9,0.5123,0.713788,0.78
10,0.5578,0.705669,0.79


TrainOutput(global_step=570, training_loss=1.0428661639230292, metrics={'train_runtime': 4709.6109, 'train_samples_per_second': 1.909, 'train_steps_per_second': 0.121, 'total_flos': 3.066994137312e+17, 'train_loss': 1.0428661639230292, 'epoch': 10.0})

## Uploading the model to HF Hub

In [None]:
# Metadata for the model card that is pushed with the model.
# The card name is derived from the `model_name` defined in the
# training-arguments cell, so it matches the repository the Trainer pushes to
# (`{model_name}-finetuned-gtzan`). `model_name` is deliberately not reassigned
# here: overriding it would give the card a name that differs from the repo.
kwargs = {
    "dataset_tags": "marsyas/gtzan",
    "dataset": "GTZAN",
    "model_name": f"{model_name}-finetuned-gtzan",
    "finetuned_from": model_id,
    "tasks": "audio-classification",
}

In [None]:
# Push the trained model to the Hub, passing the metadata assembled above.
trainer.push_to_hub(**kwargs)

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file runs/Aug31_04-26-55_5a3109e38df4/events.out.tfevents.1693456058.5a3109e38df4.23.0:   0%|          …

To https://huggingface.co/mahimairaja/distilhubert-music-classifier-finetuned-gtzan
   d74f233..0dd32d6  main -> main

To https://huggingface.co/mahimairaja/distilhubert-music-classifier-finetuned-gtzan
   0dd32d6..3a6c6cb  main -> main



'https://huggingface.co/mahimairaja/distilhubert-music-classifier-finetuned-gtzan/commit/0dd32d6f1a6e14dfa29c00a6bf059269f41a5c9c'

## Load the model from hub

In [None]:
# Build an audio-classification pipeline from the fine-tuned checkpoint
# published on the Hub (the repository pushed in the previous section).
classifier = pipeline(
    "audio-classification", model="mahimairaja/distilhubert-music-classifier-finetuned-gtzan"
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/94.8M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

In [None]:
# Take the first training clip as a demo input (its genre was shown to be
# 'pop' earlier in the notebook).
example = gtzan['train'][0]['audio']
example

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/5022b0984afa7334ff9a3c60566280b08b5179d4ac96a628052bada7d8940244/genres/pop/pop.00098.wav',
 'array': array([ 0.0873509 ,  0.20183384,  0.4790867 , ..., -0.18743178,
        -0.23294401, -0.13517427], dtype=float32),
 'sampling_rate': 16000}

In [None]:
# Play the example clip inline.
# The IPython player is imported under an alias because the bare name `Audio`
# is already bound to `datasets.Audio`, which the resampling cell relies on.
# Rebinding `Audio` here would break that cell if it were run again in the
# same kernel session.
from IPython.display import Audio as AudioPlayer

AudioPlayer(example['array'], rate=example["sampling_rate"])

In [None]:
# Classify the clip from its file path. The output below lists candidate
# genres with their scores, highest first.
classifier(example['path'])

[{'score': 0.7938147783279419, 'label': 'pop'},
 {'score': 0.08587687462568283, 'label': 'disco'},
 {'score': 0.03774033114314079, 'label': 'country'},
 {'score': 0.023951858282089233, 'label': 'rock'},
 {'score': 0.016294879838824272, 'label': 'classical'}]