### Installs

In [None]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install accelerate -U
!pip install gradio

### Imports

In [None]:
import sys
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Audio as ipy_audio
import librosa
import librosa.display

In [None]:
import torch
from datasets import load_dataset
from datasets import Audio as hfd_audio
from transformers import pipeline
from transformers import (
    WhisperForConditionalGeneration, WhisperProcessor, AutoFeatureExtractor,
    AutoModelForAudioClassification, TrainingArguments, Trainer
)
import evaluate
import gradio as gr

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

### Prep

Dataset

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset path and the model
# save path used in this notebook both live under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Where the dataset is loaded from. The Hub id is kept for reference; the
# active value is a path on the mounted Google Drive.
# dataset_id = "neerajaabhyankar/hindustani-raag-small"
dataset_id = "/content/drive/MyDrive/HuggingFace/cache/hindustani-raag-small"  # drive path

In [None]:
# Hub variant, pinned to a revision:
# hrs_full = load_dataset(dataset_id, revision="0dfb021e54e0e7489b90a47e23ef15f34fa740ec")
hrs_full = load_dataset(dataset_id)  # from drive path

# Train-val split: 80/20 of the "train" split, shuffled with a fixed seed and
# stratified on the "label" column.
hrs = hrs_full["train"].train_test_split(
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    seed=42,
    stratify_by_column="label",
)
del hrs_full

# Last path component of the dataset id; used later when naming the training run.
dataset_name = dataset_id.rsplit("/", 1)[-1]

In [None]:
# TEMP: very small dataset -- keep only the examples whose label id is 2 or 4.
def _is_kept_label(example):
    """Return True when the example's integer label is one of the kept ids."""
    return example["label"] in (2, 4)

hrs_filtered = hrs.filter(_is_kept_label)
hrs = hrs_filtered

In [None]:
# # TEMP: og dataset

# dataset_name = "gtzan"
# hrs = load_dataset("marsyas/gtzan")
# hrs = hrs["train"].train_test_split(seed=42, shuffle=True, train_size=0.2, test_size=0.1)

In [None]:
# raag dataset
# len(hrs_encoded["test"][0]["attention_mask"])  # 192000
# len(hrs_encoded["test"][0]["input_values"])  # 192000

In [None]:
# og dataset
# len(hrs_encoded["test"][0]["attention_mask"])  # 480214
# len(hrs_encoded["test"][0]["input_values"])  # 480214

Base Model

In [None]:
# Checkpoint id used for the feature extractor here and for the classifier
# loaded further down.
model_id = "ntu-spml/distilhubert"
# Feature extractor for that checkpoint; the two keyword overrides request
# normalisation and an attention mask alongside the input values.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

Prepare Dataset for Model

In [None]:
sampling_rate = feature_extractor.sampling_rate  # 16000
max_duration = 90.0  # multiplied by the sampling rate below to cap clip length


def preprocess_function(examples):
    """Turn a batch of decoded audio examples into feature-extractor outputs.

    Reads the waveform (the ``"array"`` field) from each entry of
    ``examples["audio"]`` and runs the whole batch through ``feature_extractor``
    at the extractor's own sampling rate, truncating to
    ``sampling_rate * max_duration`` samples and requesting an attention mask.
    """
    extractor_rate = feature_extractor.sampling_rate
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])
    return feature_extractor(
        waveforms,
        sampling_rate=extractor_rate,
        max_length=int(extractor_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )

In [None]:
# Cast the "audio" column to an Audio feature at the feature extractor's sampling rate.
hrs = hrs.cast_column("audio", hfd_audio(sampling_rate=feature_extractor.sampling_rate))

In [None]:
# Apply preprocess_function to the dataset in batches (single process) and
# drop the raw "audio" column from the result.
hrs_encoded = hrs.map(
    preprocess_function, remove_columns=["audio"], batched=True, num_proc=1
)

In [None]:
# For some reason, colab tries to read audio from the path. Getting rid of the paths...

In [None]:
# new_hrs = {"train": [], "test": []}

In [None]:
# for sample in hrs["test"]:
#     new_sample = sample.copy()  # Create a shallow copy of the sample
#     if 'audio' in new_sample and 'path' in new_sample['audio']:
#         del new_sample['audio']['path']  # Delete the 'path' key
#     new_hrs["test"].append(new_sample)

In [None]:
# for sample in hrs["train"]:
#     new_sample = sample.copy()  # Create a shallow copy of the sample
#     if 'audio' in new_sample and 'path' in new_sample['audio']:
#         print(new_sample['audio']['path'])
#         del new_sample['audio']['path']  # Delete the 'path' key
#     new_hrs["train"].append(new_sample)

In [None]:
# new_hrs["train"] = []
# len(new_hrs["train"]), len(new_hrs["test"])

In [None]:
# from datasets import Dataset, NamedSplit
# from typing import OrderedDict
# del hrs
# new_hrs = OrderedDict(new_hrs)
# new_hrs_ds_train = Dataset.from_list(new_hrs["train"])
# new_hrs_ds_test = Dataset.from_list(new_hrs["test"])
# del new_hrs

In [None]:
# hrs_encoded_train = new_hrs_ds_train.map(
#     preprocess_function, remove_columns=["audio"], batched=True, num_proc=1
# )
# del new_hrs_ds_train
# hrs_encoded_test = new_hrs_ds_test.map(
#     preprocess_function, remove_columns=["audio"], batched=True, num_proc=1
# )
# del new_hrs_ds_test
# hrs_encoded = {
#     "train": hrs_encoded_train,
#     "test": hrs_encoded_test
# }
# del hrs_encoded_train, hrs_encoded_test

In [None]:
# # TEMP: og dataset
# hrs_encoded = hrs_encoded.rename_column("genre", "label")

In [None]:
# Display the column schema of the encoded training split.
hrs_encoded["train"].features

In [None]:
# Integer id <-> class name lookups, built from the "label" feature of the
# encoded training split.
label_feature = hrs_encoded["train"].features["label"]
id2label = {
    idx: label_feature.int2str(idx) for idx in range(len(label_feature.names))
}
label2id = dict(zip(id2label.values(), id2label.keys()))

In [None]:
# Sanity check on the first encoded training example: print its class name,
# then hand its input values to the notebook audio widget.
first_example = hrs_encoded["train"][0]
print(id2label[first_example["label"]])
sample = first_example["input_values"]
ipy_audio(data=sample, rate=feature_extractor.sampling_rate)

Prepare Model for Finetuning

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, rather
# than failing outright on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained checkpoint as an audio classifier with one output per
# entry in id2label, passing both id/label lookups along.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
    # torch_dtype=torch.bfloat16,
).to(device)
# Last component of the checkpoint id; used later when naming the training run.
model_name = model_id.split("/")[-1]

In [None]:
# Display the device the model is on.
model.device

In [None]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a set of predictions with the accuracy metric.

    Takes the arg-max over axis 1 of ``eval_pred.predictions`` as the predicted
    class ids and compares them against ``eval_pred.label_ids``.
    """
    scores = eval_pred.predictions
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=eval_pred.label_ids)

In [None]:
## Train settings ##

batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 50

# Run name of the form "<model>-finetuned-<dataset>", passed as the first
# positional argument below.
output_dir = f"{model_name}-finetuned-{dataset_name}"

training_args = TrainingArguments(
    output_dir,
    # optimisation
    learning_rate=5e-5,
    warmup_ratio=0.1,
    num_train_epochs=num_train_epochs,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # fp16=True,
    # evaluate and checkpoint once per epoch; at the end, reload the best
    # checkpoint as judged by accuracy
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # logging / publishing
    logging_steps=5,
    push_to_hub=False,
)

### FineTune

In [None]:
# Wire the model, training arguments, data splits and metric into a Trainer.
# The feature extractor is handed over through the `tokenizer` argument.
train_split = hrs_encoded["train"]
eval_split = hrs_encoded["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Ask PyTorch to release its cached CUDA memory now that training has run.
torch.cuda.empty_cache()

### Plot

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Predict on the held-out split and tabulate predicted vs. true class ids.
test_split = hrs_encoded["test"]

preds = trainer.predict(test_split)
pred_labels = np.argmax(preds.predictions, axis=1)
true_labels = test_split["label"]
labels = test_split.features["label"].names

# One row/column per class id, in id order, including ids that never occur.
all_class_ids = [*range(len(labels))]
cm = confusion_matrix(true_labels, pred_labels, labels=all_class_ids)

In [None]:
# Plot only the first six classes of the confusion matrix.
shown = slice(0, 6)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm[shown, shown],
    display_labels=labels[shown],
)
disp.plot()
plt.show()

In [None]:
# Save the fine-tuned model together with the feature extractor's settings so
# the directory can be reloaded on its own; saving only the model leaves the
# preprocessing configuration out of it.
save_dir = "/content/drive/MyDrive/HuggingFace/distilhubert-finetuned-bairagi-bageshree-only"
model.save_pretrained(save_dir)
feature_extractor.save_pretrained(save_dir)

In [None]:
# ## Danger Zone ##
# kwargs = {
#     "dataset_tags": "marsyas/gtzan",
#     "dataset": "GTZAN",
#     "model_name": f"{model_name}-finetuned-gtzan",
#     "finetuned_from": model_id,
#     "tasks": "audio-classification",
# }
# trainer.push_to_hub(**kwargs)