## Installs

In [None]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install accelerate -U
!pip install gradio

## Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Audio as ipy_audio
import librosa
import librosa.display

In [None]:
import torch
from datasets import load_dataset
from datasets import Audio as hfd_audio
from transformers import pipeline
from transformers import (
    WhisperForConditionalGeneration, WhisperProcessor, AutoFeatureExtractor,
    AutoModelForAudioClassification, TrainingArguments, Trainer
)
import evaluate
from renumics import spotlight
# import gradio as gr

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

## Dataset

In [None]:
# Load a pinned revision of the dataset so reruns see the same data.
dataset_id = "neerajaabhyankar/hindustani-raag-small"
dataset_name = dataset_id.split("/")[-1]

hrs_full = load_dataset(
    dataset_id,
    revision="0dfb021e54e0e7489b90a47e23ef15f34fa740ec",
)

# Seeded, label-stratified 80/20 split; the held-out part lands under the "test" key.
hrs = hrs_full["train"].train_test_split(
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    seed=42,
    stratify_by_column="label",
)
del hrs_full

In [None]:
# Display the column schema of the training split.
hrs["train"].features

## DistilHubert

In [None]:
# Checkpoint id reused below for both the feature extractor and the classifier.
model_id = "ntu-spml/distilhubert"
# Load the matching feature extractor, requesting input normalization and attention masks.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

### Prepare Dataset for Model

In [None]:
sampling_rate = feature_extractor.sampling_rate  # 16000
max_duration = 90.0  # seconds of audio kept per clip before truncation


def preprocess_function(examples):
    """Convert a batch of decoded audio into feature-extractor inputs.

    Args:
        examples: batch dict whose "audio" entry is a list of decoded clips,
            each exposing the waveform under the "array" key.

    Returns:
        Whatever ``feature_extractor`` returns for the batch, with clips
        truncated to ``max_duration`` seconds and attention masks requested.
    """
    max_samples = int(feature_extractor.sampling_rate * max_duration)
    waveforms = [clip["array"] for clip in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=max_samples,
        truncation=True,
        return_attention_mask=True,
    )

In [None]:
# Display the column names of each split.
hrs.column_names

In [None]:
# Re-declare the audio column at the extractor's sampling rate, then run the
# extractor over every split, dropping the raw "audio" column from the result.
target_rate = feature_extractor.sampling_rate
hrs = hrs.cast_column("audio", hfd_audio(sampling_rate=target_rate))
hrs_encoded = hrs.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=["audio"],
)

In [None]:
# Display the schema of the encoded training split (after the map above).
hrs_encoded["train"].features

In [None]:
# Build both directions of the class-index <-> class-name mapping from the
# dataset's label feature.
label_names = hrs["train"].features["label"].names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Print the class name of the first training example.
print(id2label[hrs_encoded["train"][0]["label"]])
# `sample` is that example's `input_values` as produced by the feature extractor,
# i.e. the processed signal rather than the raw decoded audio.
sample = hrs_encoded["train"][0]["input_values"]
# Render an audio player for it at the extractor's sampling rate.
ipy_audio(data=sample, rate=feature_extractor.sampling_rate)

### Prepare Model for Finetuning

In [None]:
# Load the pretrained checkpoint with an audio-classification head sized to the
# number of classes in this dataset, and attach the label mappings to it.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
)
# Short model name (text after the last "/"), used later to name the output directory.
model_name = model_id.split("/")[-1]

In [None]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (reference class indices).

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

### Get only the encoder of the model

In [None]:
# # Convert the sample to a feature projection
# sample_tensor = torch.tensor(sample).unsqueeze(0)
# features = model.hubert.feature_extractor(sample_tensor)
# projected_features = model.hubert.feature_projection(features.transpose(1, 2))

In [None]:
def get_feature_projection(examples):
    """Attach HuBERT front-end features to a batch of encoded examples.

    Each entry of ``examples["input_values"]`` is turned into a tensor with a
    leading batch axis of length 1, passed through
    ``model.hubert.feature_extractor`` and then, after swapping axes 1 and 2,
    through ``model.hubert.feature_projection``. The per-example results are
    stored under ``examples["projected_features"]`` and keep that leading
    length-1 axis.

    The forward passes run under ``torch.no_grad()``: these features are only
    stored, never back-propagated through, so building an autograd graph for
    every clip would just waste memory.

    Args:
        examples: batch dict containing an "input_values" list.

    Returns:
        The same dict with the added "projected_features" list.
    """
    projected_features = []
    with torch.no_grad():
        for inputs in examples["input_values"]:
            conv_features = model.hubert.feature_extractor(torch.tensor(inputs).unsqueeze(0))
            projected_features.append(
                model.hubert.feature_projection(conv_features.transpose(1, 2))
            )
    examples["projected_features"] = projected_features
    return examples


hrs_encoded = hrs_encoded.map(get_feature_projection, batched=True, batch_size=8)

In [None]:
# Display the encoded dataset summary (splits and columns).
hrs_encoded

In [None]:
# Sizes of the first two axes of the stored features for training example 0:
# the leading axis added by unsqueeze(0) in get_feature_projection, then the axis below it.
len(hrs_encoded["train"][0]["projected_features"]), len(hrs_encoded["train"][0]["projected_features"][0])

In [None]:
# Same check for training example 1, to compare its second-axis length with example 0.
len(hrs_encoded["train"][1]["projected_features"]), len(hrs_encoded["train"][1]["projected_features"][0])

In [None]:
# # TSNE plot of projected feature per class labels
# import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE
# import numpy as np
# # Extract features and labels
# features = np.vstack(hrs_encoded["train"]["projected_features"])
# labels = np.array(hrs_encoded["train"]["label"])

# # Fit TSNE
# tsne = TSNE(n_components=2, random_state=42)
# tsne_results = tsne.fit_transform(features)

# # Plot TSNE
# plt.figure(figsize=(10, 10))
# for label in np.unique(labels):
#     indices = labels == label
#     plt.scatter(tsne_results[indices, 0], tsne_results[indices, 1], label=id2label[label], alpha=0.5)
# plt.legend()
# plt.title("TSNE plot of projected features")
# plt.xlabel("TSNE Component 1")
# plt.ylabel("TSNE Component 2")
# plt.show()

### FineTune

In [None]:
## Train settings ##

# Batch size used for both training and evaluation in the arguments below.
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; the install cell does not pin a version, so confirm this
# keyword is still accepted by the installed release.
# NOTE(review): `fp16=True` presumably requires a CUDA device — confirm the runtime.
training_args = TrainingArguments(
    f"{model_name}-finetuned-{dataset_name}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key produced by compute_metrics
    fp16=True,
    push_to_hub=False,
)

In [None]:
# Fine-tune the classification model on the encoded train split and evaluate on
# the held-out split stored under "test". The feature extractor is handed to the
# Trainer through its `tokenizer` argument.
trainer = Trainer(
    model,
    training_args,
    train_dataset=hrs_encoded["train"],
    eval_dataset=hrs_encoded["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Runs the full training schedule configured in `training_args`.
trainer.train()

## Simple LSTM Classifier on Projected Features

In [None]:
import torch.nn as nn

class SimpleLSTM(nn.Module):
    """Single-layer LSTM followed by a small fully connected classification head.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the LSTM state and of both hidden linear layers.
        num_classes: number of output scores per sequence.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
        )
        self.fc1 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.classifier_head = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return class scores of shape (batch, num_classes) for x of shape (batch, time, input_size)."""
        sequence_out, _state = self.lstm(x)
        # Only the output at the final time step feeds the head.
        final_step = sequence_out[:, -1, :]
        hidden = torch.relu(self.fc1(final_step))
        # No activation between fc2 and the classification layer.
        return self.classifier_head(self.fc2(hidden))

In [None]:
# Sizes for the SimpleLSTM classifier that consumes the stored projected features.
input_dim = 768  # Dimension of the projected features
hidden_dim = 32  # Hidden layer dimension
num_classes = len(id2label)  # Number of classes

# Freshly initialized (untrained) classifier.
classifier = SimpleLSTM(input_dim, hidden_dim, num_classes)

In [None]:
# Training settings for the SimpleLSTM classifier.
# Note: `batch_size` rebinds the name already used by the Trainer settings above.
num_epochs = 1
learning_rate = 1e-5
batch_size = 8

In [None]:
# Multi-class loss over raw class scores.
criterion = nn.CrossEntropyLoss()
# Optimize the SimpleLSTM `classifier` that the training loops below run forward and
# backward through. Building the optimizer over `model` (the HuBERT network) would
# leave the classifier's weights untouched by every optimizer.step().
optimizer = torch.optim.Adam(classifier.parameters(), lr=learning_rate)

In [None]:
# from torch.utils.data import DataLoader
# train_data_loader = DataLoader(hrs_encoded["train"], batch_size=1, shuffle=True)
# test_data_loader  = DataLoader(hrs_encoded["test"], batch_size=1, shuffle=True)

In [None]:
# max_dim1 = max([len(x["projected_features"][0]) for x in hrs_encoded["train"]])
# max_dim2 = max([len(x["projected_features"][0]) for x in hrs_encoded["test"]])
# max_dim = max(max_dim1, max_dim2)
# Fixed target length that do_padding brings every feature matrix to.
# NOTE(review): hard-coded rather than measured (see the commented-out lines above);
# get_padding returns negative pad amounts for inputs longer than this — confirm
# that behavior is intended for long clips.
max_dim = 2500

In [None]:
from torch.utils.data import DataLoader, TensorDataset

def get_padding(x_shape, max_dim):
    """Split the padding needed to grow a length to ``max_dim`` into two sides.

    Args:
        x_shape: current length (an integer, despite the name).
        max_dim: target length.

    Returns:
        ``(0, 0, before, after)`` where ``before + after == max_dim - x_shape``
        and ``before`` is the floor half of the difference.
    """
    total_pad = max_dim - x_shape
    before = int(total_pad // 2)
    after = total_pad - before
    padding_shape = (0, 0, before, after)
    assert sum(padding_shape) == total_pad
    return padding_shape

def do_padding(x_var):
    """Zero-pad a 2-D feature matrix along its first axis to ``max_dim`` rows.

    Args:
        x_var: nested sequence convertible by ``torch.tensor``; ``len(x_var)``
            is the number of rows.

    Returns:
        A tensor whose second-to-last dimension equals the notebook-level ``max_dim``.
    """
    n_rows = len(x_var)
    pad_spec = get_padding(n_rows, max_dim)
    padder = nn.ConstantPad2d(pad_spec, 0)
    # The two row-axis pad amounts plus the current rows must reach the target.
    assert padder.padding[-1] + padder.padding[-2] + n_rows == max_dim
    padded = padder(torch.tensor(x_var))
    assert padded.shape[-2] == max_dim
    return padded

In [None]:
# Pad every training feature matrix (dropping its leading length-1 axis) and
# bundle the stacked features with their labels into a shuffled loader.
train_features = hrs_encoded["train"]["projected_features"]
X_train = [do_padding(feat[0]) for feat in train_features]
train_labels = torch.tensor(hrs_encoded["train"]["label"])
train_dataset = TensorDataset(torch.stack(X_train), train_labels)
train_data_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Same preparation for the held-out split.
test_features = hrs_encoded["test"]["projected_features"]
X_test = [do_padding(feat[0]) for feat in test_features]
test_labels = torch.tensor(hrs_encoded["test"]["label"])
test_dataset = TensorDataset(torch.stack(X_test), test_labels)
test_data_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Training loop: one optimization pass over the train loader per epoch,
# followed by a loss-only pass over the held-out loader.
for epoch in range(num_epochs):
    classifier.train()
    for batch_features, batch_labels in train_data_loader:
        # Forward pass. The loader already yields tensors, so they are cast
        # directly instead of being re-wrapped with torch.tensor (which copies
        # and emits a copy-construct warning).
        outputs = classifier(batch_features.float())
        loss = criterion(outputs, batch_labels.long())
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    classifier.eval()
    eval_losses = []
    with torch.no_grad():  # evaluation does not need an autograd graph
        for batch_features, batch_labels in test_data_loader:
            outputs = classifier(batch_features.float())
            loss = criterion(outputs, batch_labels.long())
            eval_losses.append(loss.item())

    print(f'Epoch [{epoch+1}/{num_epochs}], Eval Loss: {np.mean(eval_losses):.4f}')


In [None]:
# Continue training the same classifier for a fixed number of additional epochs.
extra_epochs = 10
for epoch in range(extra_epochs):
    classifier.train()
    for batch_features, batch_labels in train_data_loader:
        # Forward pass (loader batches are already tensors; cast, don't re-wrap)
        outputs = classifier(batch_features.float())
        loss = criterion(outputs, batch_labels.long())
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    classifier.eval()
    eval_losses = []
    with torch.no_grad():  # evaluation does not need an autograd graph
        for batch_features, batch_labels in test_data_loader:
            outputs = classifier(batch_features.float())
            loss = criterion(outputs, batch_labels.long())
            eval_losses.append(loss.item())

    # Report progress against this loop's own epoch count, not `num_epochs`.
    print(f'Epoch [{epoch+1}/{extra_epochs}], Eval Loss: {np.mean(eval_losses):.4f}')


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [None]:
# Set model to evaluation mode
classifier.eval()
all_predictions = []
all_targets = []

with torch.no_grad():
    for batch_features, batch_labels in test_data_loader:
        # Forward pass; the batch is already a tensor, so cast it directly
        # rather than copy-constructing it through torch.tensor.
        outputs = classifier(batch_features.float())
        predictions = torch.argmax(outputs, dim=1)

        all_predictions.extend(predictions.numpy())
        all_targets.extend(batch_labels.numpy())

# Compute confusion matrix (first argument: true labels, second: predicted labels)
conf_matrix = confusion_matrix(all_targets, all_predictions)


In [None]:
# Plot the confusion matrix as a heat map with one tick per class index.

fig, ax = plt.subplots(figsize=(10, 10))
heatmap = ax.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
ax.set_title('Confusion Matrix')
fig.colorbar(heatmap, ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
class_ticks = np.arange(num_classes)
ax.set_xticks(class_ticks)
ax.set_yticks(class_ticks)
plt.show()

## LSTM

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Decoded waveforms for the MFCC pipeline (row access decodes each clip).
X_train = [inp["audio"]["array"] for inp in hrs["train"]]
X_test = [inp["audio"]["array"] for inp in hrs["test"]]
# Read the labels straight from the label column. Iterating over rows to pick
# out the label would decode every audio clip a second time for nothing.
y_train = list(hrs["train"]["label"])
y_test = list(hrs["test"]["label"])

In [None]:
# Sampling rate as reported by the first decoded training clip.
# Note: rebinds the `sampling_rate` name set earlier from the feature extractor.
sampling_rate = hrs["train"][0]["audio"]["sampling_rate"]

In [None]:
# Extract MFCC features from audio

def extract_mfcc(audio_inputs, sampling_rate=sampling_rate, num_mfcc=13):
    """Compute MFCCs for one waveform with librosa.

    Args:
        audio_inputs: waveform samples (converted with ``np.array``).
        sampling_rate: sample rate passed to librosa; the default is the value
            of the notebook-level ``sampling_rate`` at definition time.
        num_mfcc: number of coefficients requested.

    Returns:
        The array returned by ``librosa.feature.mfcc``.
    """
    return librosa.feature.mfcc(y=np.array(audio_inputs), sr=sampling_rate, n_mfcc=num_mfcc)


def _mfcc_tensors(waveforms):
    """Return one float32 tensor per waveform: transposed MFCCs with a leading axis of 1."""
    return [
        torch.tensor(np.array(extract_mfcc(wave)).T, dtype=torch.float32).unsqueeze(0)
        for wave in waveforms
    ]


X_train_mfcc_tensor = _mfcc_tensors(X_train)
# Targets are class indices: store them as int64, the dtype CrossEntropyLoss
# takes for index targets, instead of float32 that must be cast on every batch.
y_train_targets = torch.tensor(y_train, dtype=torch.long)
del y_train

X_test_mfcc_tensor = _mfcc_tensors(X_test)
y_test_targets = torch.tensor(y_test, dtype=torch.long)
del y_test

In [None]:
# Hyperparameters
# Note: this cell rebinds num_classes, batch_size, learning_rate and num_epochs,
# which earlier sections of the notebook also set.

# Features per time step: last axis of the MFCC tensors.
input_size = X_train_mfcc_tensor[0].shape[2]
# Take the class count from the dataset's label feature instead of a
# hard-coded constant, so the head always matches the data.
num_classes = hrs["train"].features["label"].num_classes

hidden_size = 64
num_layers = 1
output_size = 32

batch_size = 8
learning_rate = 0.001
num_epochs = 10

In [None]:
# Model

class AudioLSTM(nn.Module):
    """LSTM over a feature sequence followed by a small classification head.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden state width.
        num_layers: number of stacked LSTM layers.
        output_size: width of the two fully connected layers before the head.
        n_classes: number of output classes. When omitted, the notebook-level
            ``num_classes`` is used, which is what the original class always did.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, n_classes=None):
        super(AudioLSTM, self).__init__()
        if n_classes is None:
            # Backward-compatible fallback to the notebook-level setting.
            n_classes = num_classes
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, output_size)
        self.fc2 = nn.Linear(output_size, output_size)
        self.classifier_head = nn.Linear(output_size, n_classes)

    def forward(self, x):
        """Return class scores of shape (batch, n_classes) for x of shape (batch, time, input_size)."""
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Take the output of the last time step
        out = torch.relu(self.fc1(lstm_out))
        out = self.fc2(out)
        out = self.classifier_head(out)
        return out

In [None]:
# TensorDataset + DataLoader

def get_pad(x_shape, max_dim):
    """Build a 6-value pad spec that grows axis 1 of a 3-D shape to ``max_dim``.

    Args:
        x_shape: shape whose element at index 1 is the current length.
        max_dim: target length for that axis.

    Returns:
        ``(0, 0, before, after, 0, 0)`` with ``before`` the floor half of the
        missing length and ``after`` the remainder.
    """
    missing = max_dim - x_shape[1]
    before = int(missing // 2)
    after = missing - before
    return (0, 0, before, after, 0, 0)

# Longest sequence (axis 1) across both splits; every tensor is zero-padded to it.
# Note: rebinds the `max_dim` name used by the projected-features section.
max_dim = max(feat.shape[1] for feat in X_train_mfcc_tensor + X_test_mfcc_tensor)

X_train_mfcc_tensor = [
    nn.functional.pad(feat, get_pad(feat.shape, max_dim), mode="constant", value=0)
    for feat in X_train_mfcc_tensor
]
train_dataset = TensorDataset(torch.stack(X_train_mfcc_tensor), y_train_targets)
train_data_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

X_test_mfcc_tensor = [
    nn.functional.pad(feat, get_pad(feat.shape, max_dim), mode="constant", value=0)
    for feat in X_test_mfcc_tensor
]
test_dataset = TensorDataset(torch.stack(X_test_mfcc_tensor), y_test_targets)
test_data_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Init model, loss function, optimizer
# Note: `model` is rebound here from the HuBERT classifier loaded earlier to the
# MFCC-based AudioLSTM; `criterion` and `optimizer` are rebound as well.
model = AudioLSTM(input_size, hidden_size, num_layers, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Training loop: optimize on the train loader, then report the mean loss on
# the held-out loader after each epoch.

for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_targets in train_data_loader:
        # Forward pass (squeeze drops the length-1 axis added per example)
        outputs = model(batch_inputs.squeeze(1))
        loss = criterion(outputs, batch_targets.long())

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate on the held-out split. Iterating the train loader here would
    # report training loss under the "Eval Loss" label.
    model.eval()
    eval_losses = []
    with torch.no_grad():  # evaluation does not need an autograd graph
        for batch_inputs, batch_targets in test_data_loader:
            outputs = model(batch_inputs.squeeze(1))
            loss = criterion(outputs, batch_targets.long())
            eval_losses.append(loss.item())

    print(f'Epoch [{epoch+1}/{num_epochs}], Eval Loss: {np.mean(eval_losses):.4f}')


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [None]:
# Gather predictions and reference labels over the held-out loader.
model.eval()
all_predictions, all_targets = [], []

with torch.no_grad():
    for batch_inputs, batch_targets in test_data_loader:
        # Scores for the batch, then the highest-scoring class per row
        outputs = model(batch_inputs.squeeze(1))
        predictions = torch.argmax(outputs, dim=1)

        all_predictions += list(predictions.numpy())
        all_targets += list(batch_targets.numpy())

# Confusion matrix (first argument: true labels, second: predicted labels)
conf_matrix = confusion_matrix(all_targets, all_predictions)


In [None]:
# Plot the confusion matrix as a heat map with one tick per class index.

fig, ax = plt.subplots(figsize=(10, 10))
heatmap = ax.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
ax.set_title('Confusion Matrix')
fig.colorbar(heatmap, ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
class_ticks = np.arange(num_classes)
ax.set_xticks(class_ticks)
ax.set_yticks(class_ticks)
plt.show()

In [None]:
import os

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the weights.
os.makedirs("models", exist_ok=True)
torch.save(model.state_dict(), "models/LSTM1")

In [None]:
# model = AudioLSTM(input_size, hidden_size, num_layers, output_size)
# model.load_state_dict(torch.load("models/LSTM1"))
# model.eval()

Dimension Debugging

In [None]:
# for batch_inputs, batch_targets in train_data_loader:
#     break
# outputs = model(batch_inputs.squeeze(1))
# print(outputs.shape)
# loss = criterion(outputs, batch_targets.long())
# print(loss)

In [None]:
# # Backward pass and optimization
# optimizer.zero_grad()
# loss.backward()
# optimizer.step()

In [None]:
# ## Danger Zone ##
# kwargs = {
#     "dataset_tags": "marsyas/gtzan",
#     "dataset": "GTZAN",
#     "model_name": f"{model_name}-finetuned-gtzan",
#     "finetuned_from": model_id,
#     "tasks": "audio-classification",
# }
# trainer.push_to_hub(**kwargs)