<a href="https://colab.research.google.com/github/naidu199/LLM-workshop/blob/main/Sound_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install soundata
# sudo apt-get install -y libasound2-dev



In [None]:
import soundata
import torch
import torchaudio
from torch.utils.data import DataLoader
from torchvision import models
from torch import nn, optim
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import Dataset
import torch.nn.functional as F

In [None]:
# Storage location first, then the soundata loader to use
dataset_path = './data'  # choose your own path
dataset_name = 'urbansound8k'

# Build the loader object and fetch its files under dataset_path
dataset = soundata.initialize(dataset_name, data_home=dataset_path)
dataset.download()

5.61GB [27:47, 3.61MB/s]                            


In [None]:
# Pick one clip at random and pull out its samples, sample rate and annotations
example_clip = dataset.choice_clip()
y, sr = example_clip.audio
tags = example_clip.tags

In [None]:
from IPython.display import Audio

# Render an inline player for the example clip (last expression of the cell is displayed)
Audio(data=y, rate=sr)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Plot waveform
plt.figure(figsize=(10, 4))
plt.plot(y)
plt.title('Waveform')
plt.show()

In [None]:
# MFCC with torchaudio
# torchaudio transforms take tensors, so wrap the numpy samples first
waveform = torch.from_numpy(y)

# 40 cepstral coefficients, computed at the example clip's own sample rate
mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sr, n_mfcc=40)
mfcc = (
    mfcc_transform(waveform)
    .detach()
    .squeeze()
    .numpy()
)

# Show the MFCC matrix of the example clip as an image
fig, ax = plt.subplots(figsize=(10, 4))
ax.imshow(mfcc, cmap='cool', origin='lower')
ax.set_title('MFCC')
plt.show()

In [None]:
# Load every clip of the dataset; later cells iterate the result with `.items()`
# and read `.fold`, `.audio` and `.class_id` from its values.
clips = dataset.load_clips()

In [None]:
class UrbanSoundDataset(Dataset):
    """Map-style dataset that turns audio clips into ``(mfcc, label)`` pairs.

    Args:
        clips: Indexable, sized collection of clip objects. Each clip must
            expose ``audio`` (a ``(samples, sample_rate)`` pair whose samples
            are a numpy array) and ``class_id``.
        sample_rate: Sample rate handed to the MFCC transform. Defaults to
            44100, the value this notebook used originally.
        n_mfcc: Number of cepstral coefficients to compute. Defaults to 40.
    """

    def __init__(self, clips, sample_rate=44100, n_mfcc=40):
        self.clips = clips
        # One transform instance is shared by all items.
        self.transform = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc)  # MFCC Transform

    def __len__(self):
        """Return the number of clips."""
        return len(self.clips)

    def __getitem__(self, index):
        """Return ``(mfcc, label)`` for the clip at ``index``.

        ``mfcc`` is whatever ``self.transform`` returns for the float32
        waveform; ``label`` is the clip's ``class_id``, passed through as-is.
        """
        clip = self.clips[index]
        # Read `clip.audio` exactly once: it was previously accessed twice per item,
        # which repeats the work if the attribute decodes the file on access
        # (presumably the case for soundata clips - confirm).
        samples, _clip_sample_rate = clip.audio
        # NOTE(review): the clip's own sample rate is not compared with the
        # transform's; this assumes every clip matches `sample_rate` - confirm.
        audio = torch.from_numpy(samples).float()
        label = clip.class_id
        mfcc = self.transform(audio)

        return mfcc, label


In [None]:
# Subset the clips data according to the fold attribute
train_clips = [v for k,v in clips.items() if v.fold in (1,2,3,4,5,6,7,8)]
val_clips = [v for k,v in clips.items() if v.fold == 9]
test_clips = [v for k,v in clips.items() if v.fold == 10]

In [None]:
# Create Datasets
train_dataset = UrbanSoundDataset(train_clips)
val_dataset = UrbanSoundDataset(val_clips)
test_dataset = UrbanSoundDataset(test_clips)

In [None]:
def collate_fn(batch):
    """Pad variable-length feature tensors to a common length and stack them.

    Args:
        batch: Sequence of ``(features, label)`` pairs. All ``features``
            tensors must agree on every dimension except the last one.

    Returns:
        ``(sequences_tensor, labels_tensor)``: the features zero-padded on the
        right of their last axis and stacked along a new leading axis, and the
        labels as an int64 tensor. Both are ordered longest sequence first.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError('collate_fn received an empty batch')

    # sorted() returns a new list, so the caller's batch is left untouched
    # (list.sort() used to reorder it in place).
    ordered = sorted(batch, key=lambda item: item[0].shape[-1], reverse=True)

    # Separate sequences and labels
    sequences, labels = zip(*ordered)

    # Longest sequence comes first after the descending sort
    max_length = sequences[0].shape[-1]

    # F.pad with a 2-tuple pads the LAST axis, so lengths are measured on
    # shape[-1]; for 2-D features this is the same axis as shape[1].
    sequences_padded = [
        F.pad(sequence, (0, max_length - sequence.shape[-1]))
        for sequence in sequences
    ]

    # Stack sequences and labels
    sequences_tensor = torch.stack(sequences_padded)
    labels_tensor = torch.tensor(labels, dtype=torch.long)

    return sequences_tensor, labels_tensor

In [None]:
# Create DataLoaders
batch_size = 256
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=2)

In [None]:
# Define a convolutional neural network
model = models.resnet18(pretrained=True)
print(model)

In [None]:
# Swap the stem for a 1-input-channel convolution with the same kernel/stride/padding
# (the new layer is freshly initialised).
model.conv1 = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
# Read the classifier's input width from the existing head instead of assuming 512;
# 10 outputs = one per class (assumes the 10 UrbanSound8K classes - confirm).
model.fc = nn.Linear(model.fc.in_features, 10)

In [None]:
# Choose a device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [None]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [None]:
from tqdm.notebook import tqdm

# Training loop
num_epochs = 3  # customize according to your needs

# Per-epoch loss history. These are the names the plotting cell further down reads.
train_losses = []
test_losses = []  # filled from val_loader; the plotting cell labels it "Validation loss"

for epoch in tqdm(range(num_epochs), desc='Epochs'):
    # ---- training pass ----
    loss_train = 0.0
    model.train()

    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length and show real progress.
    for inputs, labels in tqdm(train_loader, desc='Train', leave=False):
        # unsqueeze(1) adds the single channel axis expected by the replaced conv1
        inputs = inputs.unsqueeze(1).to(device)
        labels = labels.to(device=device, dtype=torch.long)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight by batch size for a per-sample sum
        loss_train += loss.item() * inputs.size(0)

    train_losses.append(loss_train / len(train_loader.dataset))

    # ---- validation pass (no gradients, eval-mode layers) ----
    loss_val = 0.0
    model.eval()
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.unsqueeze(1).to(device)
            labels = labels.to(device=device, dtype=torch.long)
            loss_val += criterion(model(inputs), labels).item() * inputs.size(0)

    test_losses.append(loss_val / len(val_loader.dataset))

    print(f'Epoch : {epoch+1}, Training Loss: {loss_train / len(train_loader.dataset)}')
    print(f'Epoch : {epoch+1}, Validation Loss: {test_losses[-1]}')

In [None]:
# TODO: compute the loss on the test set and print it

In [None]:
# Plotting
# Reads `train_losses` and `test_losses`, which must have been filled in by an earlier cell.
# The x-axis is the list index; the training cell aggregates loss once per epoch, hence 'Epoch'.
fig, ax = plt.subplots()
ax.plot(train_losses, label='Training loss')
ax.plot(test_losses, label='Validation loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
def accuracy(outputs, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    Args:
        outputs: 2-D score tensor, one row per sample.
        labels: 1-D tensor of target column indices, same length as ``outputs``.

    Returns:
        A 0-dim float tensor holding ``correct / len(outputs)``.
    """
    preds = outputs.max(dim=1).indices
    n_correct = (preds == labels).sum().item()
    return torch.tensor(n_correct / len(preds))


def evaluate_model(model, test_loader):
    """Compute and print the sample-weighted accuracy of ``model`` on ``test_loader``.

    Args:
        model: ``torch.nn.Module`` mapping a ``(batch, 1, ...)`` input to
            per-class scores of shape ``(batch, classes)``.
        test_loader: Iterable of ``(features, labels)`` batches; ``features``
            gets a channel axis inserted at position 1 before the forward pass.

    Returns:
        The accuracy as a Python float in ``[0, 1]`` (also printed).

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()  # set the model to evaluation mode

    # Run on whatever device the model already lives on rather than relying on a global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    correct = 0
    total = 0
    # Single pass over the loader, without building autograd graphs.
    with torch.no_grad():
        for mfcc, label in test_loader:
            logits = model(mfcc.unsqueeze(1).to(model_device))
            preds = logits.argmax(dim=1)
            correct += (preds == label.to(preds.device)).sum().item()
            total += label.size(0)

    if total == 0:
        raise ValueError('test_loader yielded no samples')

    # Count per sample, not per batch: averaging batch accuracies over-weights a short last batch.
    average_accuracy = correct / total

    print(f'Test Accuracy: {average_accuracy:.2f}')
    return average_accuracy

# Report accuracy on the held-out test split
evaluate_model(model=model, test_loader=test_loader)