In [None]:
!pip install soundata

In [None]:
import torch

from torch.utils.data import random_split, Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import torch.nn.functional as F

import torchaudio
import librosa
import soundata

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import time

from tqdm import tqdm
import torchvision.models as models

In [None]:
# Create the download directory; exist_ok keeps this cell safe to re-run.
os.makedirs('/content/data', exist_ok=True)

In [None]:
# Fetch UrbanSound8K through soundata and run soundata's validation on the
# local copy. The handle has its own name so that re-running this cell never
# clobbers the `dataset` variable that later holds the PyTorch dataset.
soundata_dataset = soundata.initialize('urbansound8k', data_home='/content/data')
soundata_dataset.download()  # download the dataset
soundata_dataset.validate()

5.61GB [08:08, 12.3MB/s]                            
1.15MB [00:01, 672kB/s]                           
100%|██████████| 1/1 [00:00<00:00, 397.19it/s]
100%|██████████| 8732/8732 [00:52<00:00, 167.18it/s]


({'metadata': {}, 'clips': {}}, {'metadata': {}, 'clips': {}})

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
class UrbanSoundDataset(Dataset):
  """Audio clips served as fixed-width, dB-scaled mel spectrograms.

  Every item is a ``(spectrogram, label)`` pair. Without a shape-changing
  ``transform`` the spectrogram is a float32 tensor of shape
  ``(1, 128, max_length)`` (channel, mel bins, time frames); the label is a
  scalar ``torch.long`` tensor with the integer-encoded class.

  Args:
    metadata_file: Path to the metadata CSV. The columns ``class``, ``fold``
      and ``slice_file_name`` are read from it.
    audio_dir: Directory that contains the ``fold<N>`` sub-folders.
    transform: Optional callable applied to the padded/truncated spectrogram
      (a NumPy array) before it is converted to a tensor.
    max_length: Number of time frames each spectrogram is forced to.
  """

  def __init__(self, metadata_file, audio_dir, transform=None, max_length = 128):
    self.metadata = pd.read_csv(metadata_file)
    self.audio_dir = audio_dir
    self.transform = transform
    self.max_length = max_length

    # Map the string class names to integer ids; the fitted encoder is kept so
    # callers can recover the class names (``label_encoder.classes_``).
    self.label_encoder = LabelEncoder()
    self.metadata['encoded_label'] = self.label_encoder.fit_transform(self.metadata['class'])

  def __len__(self):
    """Return the number of rows in the metadata table."""
    return len(self.metadata)

  def pad_or_truncate(self, mel_spec, pad_value=0.0):
    """Force axis 1 (time frames) of ``mel_spec`` to ``self.max_length``.

    Args:
      mel_spec: 2-D array whose second axis is cropped or extended.
      pad_value: Constant written into the appended frames. The default of
        ``0.0`` keeps the historical behaviour of this method.

    Returns:
      The first ``max_length`` frames when the input is longer, otherwise the
      input right-padded with ``pad_value`` up to ``max_length`` frames.
    """
    n_frames = mel_spec.shape[1]
    if n_frames > self.max_length:
      return mel_spec[:, :self.max_length]

    missing = self.max_length - n_frames
    return np.pad(
        mel_spec,
        pad_width=((0, 0), (0, missing)),
        mode='constant',
        constant_values=pad_value,
      )

  def __getitem__(self, idx):
    """Load clip ``idx`` and return ``(spectrogram_tensor, label_tensor)``.

    Raises:
      RuntimeError: If the audio file cannot be loaded or converted. The
        original exception is chained and the file path is in the message.
    """
    row = self.metadata.iloc[idx]
    audio_path = os.path.join(self.audio_dir, f"fold{row['fold']}", row['slice_file_name'])
    label = row['encoded_label']

    try:
      signal, sr = librosa.load(audio_path, sr=22050)
      mel_spec = librosa.feature.melspectrogram(
          y=signal,
          sr=sr,
          n_mels=128,
          hop_length=512
        )
      mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
      # With ref=np.max the loudest bin sits at 0 dB, so zero-padding would
      # append frames at peak level. Pad with the clip's quietest value instead.
      mel_spec_db = self.pad_or_truncate(mel_spec_db, pad_value=mel_spec_db.min())

    except Exception as e:
      # Returning (None, None) here would only postpone the failure to an
      # opaque collate error inside the DataLoader, so fail with context.
      raise RuntimeError(f"Error processing {audio_path}: {e}") from e

    if self.transform:
      mel_spec_db = self.transform(mel_spec_db)
    # Add a leading channel axis so the item looks like a 1-channel image.
    mel_spec_db = torch.tensor(mel_spec_db, dtype=torch.float32).unsqueeze(0)
    return mel_spec_db, torch.tensor(label, dtype=torch.long)

In [None]:
# Locations handed to UrbanSoundDataset: the folder holding the per-fold audio
# sub-directories and the metadata CSV.
audio_path = '/content/data/audio'
metadata_path = '/content/data/metadata/UrbanSound8K.csv'

In [None]:
# Build the full dataset and reserve 20% of the clips for validation.
dataset = UrbanSoundDataset(metadata_path, audio_path)
train_size = int(0.8 * len(dataset))  # 80% of the dataset for training
val_size = len(dataset) - train_size  # Remaining 20% for validation

# Split the dataset with a fixed seed so that train/validation membership (and
# therefore the reported metrics) is reproducible across kernel restarts.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)
# NOTE(review): the metadata carries a `fold` column; a random clip-level split
# ignores it. Confirm whether fold-based evaluation is the intended protocol.

In [None]:
batch_size = 32

# Shuffle only the training data; validation keeps a fixed order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

# Pull a single training batch as a shape sanity check. `features` and
# `labels` stay defined afterwards for the cells that follow.
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    features, labels = first_batch
    print(f"Feature batch shape: {features.shape}")
    print(f"Label batch shape: {labels.shape}")


Feature batch shape: torch.Size([32, 1, 128, 128])
Label batch shape: torch.Size([32])


In [None]:
class AudioClassifier(nn.Module):
    """Small two-block CNN that classifies single-channel spectrograms.

    Architecture: two ``Conv(3x3) -> BatchNorm -> ReLU -> MaxPool(2)`` stages
    (32 then 64 channels) followed by ``Linear(256) -> ReLU -> Dropout(0.3) ->
    Linear(num_classes)``.

    Args:
        num_classes: Number of output logits.
        input_size: ``(height, width)`` of the input spectrograms. Defaults to
            ``(128, 128)``, which reproduces the original fixed layer sizes.

    Raises:
        ValueError: If either input dimension is smaller than 4, i.e. nothing
            would be left after the two 2x poolings.
    """

    def __init__(self, num_classes, input_size=(128, 128)):
        super().__init__()
        height, width = input_size
        if height < 4 or width < 4:
            raise ValueError(f"input_size must be at least (4, 4), got {input_size}")

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)  # (1, H, W) -> (32, H, W)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)  # (32, H/2, W/2) -> (64, H/2, W/2)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool = nn.MaxPool2d(2, 2)  # Halves each spatial dimension (floor)
        self.dropout = nn.Dropout(0.3)
        # The padded 3x3 convolutions keep the spatial size and the pool is
        # applied twice, so the flattened map holds 64 * (H // 4) * (W // 4) values.
        flat_features = 64 * (height // 4) * (width // 4)
        self.fc1 = nn.Linear(flat_features, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return raw class logits of shape ``(batch, num_classes)``."""
        x = self.pool(F.relu(self.bn1(self.conv1(x))))  # Conv -> BN -> ReLU -> Pool
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = x.flatten(start_dim=1)  # Keep the batch axis, flatten the rest
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)  # Final output (logits)

In [None]:
class ResNet18AudioClassifier(nn.Module):
    """ResNet-18 adapted to single-channel spectrogram input.

    The torchvision ResNet-18 is built with randomly initialised weights. Its
    first convolution is replaced by a 1-channel variant and its final fully
    connected layer by one with ``num_classes`` outputs.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Load ResNet-18 without pretrained weights. `weights=None` is the
        # current torchvision spelling of the deprecated `pretrained=False`.
        self.resnet18 = models.resnet18(weights=None)

        # Modify the first convolutional layer to accept 1 channel input (grayscale)
        self.resnet18.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

        # Modify the fully connected layer to match the number of output classes
        self.resnet18.fc = nn.Linear(self.resnet18.fc.in_features, num_classes)

    def forward(self, x):
        """Return the logits produced by the wrapped ResNet-18."""
        return self.resnet18(x)

In [None]:
# One output logit per class known to the dataset's label encoder.
num_classes = len(dataset.label_encoder.classes_)
model = ResNet18AudioClassifier(num_classes).to(device)

# Smoke test on the batch fetched earlier: only the output shape matters, so
# skip building an autograd graph for this forward pass.
with torch.no_grad():
    print(model(features.to(device)).shape)



torch.Size([32, 10])


In [None]:
# Adam over every model parameter (learning rate 1e-3), trained against a
# cross-entropy objective computed from the raw logits.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [None]:
# Number of full passes over the training data made by the training loop.
num_epochs = 15

In [None]:
# Train for `num_epochs` epochs; after each one, report training statistics and
# evaluate on the validation loader. Losses are accumulated per sample (batch
# mean * batch size) so that a smaller final batch does not skew the epoch
# average. This relies on `criterion` returning the batch mean, which is the
# default reduction of nn.CrossEntropyLoss.
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0  # Sum of per-sample losses seen this epoch
    correct_predictions = 0
    total_predictions = 0
    start_time = time.time()

    # Training phase with TQDM for batches
    with tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch') as batch_tqdm:
        for features, labels in batch_tqdm:
            features, labels = features.to(device), labels.to(device)
            batch_count = labels.size(0)

            optimizer.zero_grad()  # Zero the parameter gradients
            outputs = model(features)  # Forward pass
            loss = criterion(outputs, labels)  # Compute the loss
            loss.backward()  # Backward pass
            optimizer.step()  # Optimize model parameters

            # Read the scalar once; it is reused for the total and the bar.
            batch_loss = loss.item()
            running_loss += batch_loss * batch_count

            # Calculate accuracy
            predicted = outputs.argmax(dim=1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += batch_count

            # Update batch progress bar
            batch_tqdm.set_postfix(loss=batch_loss, accuracy=correct_predictions/total_predictions)

    # Calculate training statistics (max(..., 1) guards an empty loader)
    epoch_loss = running_loss / max(total_predictions, 1)
    epoch_accuracy = correct_predictions / max(total_predictions, 1) * 100
    epoch_time = time.time() - start_time

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%, Time: {epoch_time:.2f}s")

    # Validation phase
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0  # Sum of per-sample validation losses
    val_correct_predictions = 0
    val_total_predictions = 0

    with torch.no_grad():  # Disable gradient computation for validation
        for features, labels in val_dataloader:
            features, labels = features.to(device), labels.to(device)
            batch_count = labels.size(0)

            outputs = model(features)  # Forward pass
            loss = criterion(outputs, labels)  # Compute the loss
            val_loss += loss.item() * batch_count

            # Calculate accuracy
            predicted = outputs.argmax(dim=1)
            val_correct_predictions += (predicted == labels).sum().item()
            val_total_predictions += batch_count

    # Calculate validation accuracy and loss
    val_loss = val_loss / max(val_total_predictions, 1)
    val_accuracy = val_correct_predictions / max(val_total_predictions, 1) * 100

    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

Epoch 1/15: 100%|██████████| 219/219 [02:46<00:00,  1.32batch/s, accuracy=0.553, loss=0.374]


Epoch [1/15], Loss: 1.2737, Accuracy: 55.29%, Time: 166.18s
Validation Loss: 1.0881, Validation Accuracy: 63.02%


Epoch 2/15: 100%|██████████| 219/219 [02:40<00:00,  1.37batch/s, accuracy=0.769, loss=1.21]


Epoch [2/15], Loss: 0.6879, Accuracy: 76.92%, Time: 160.28s
Validation Loss: 1.2664, Validation Accuracy: 62.97%


Epoch 3/15: 100%|██████████| 219/219 [02:40<00:00,  1.36batch/s, accuracy=0.827, loss=0.797]


Epoch [3/15], Loss: 0.5041, Accuracy: 82.69%, Time: 160.78s
Validation Loss: 1.4200, Validation Accuracy: 61.93%


Epoch 4/15: 100%|██████████| 219/219 [02:32<00:00,  1.43batch/s, accuracy=0.879, loss=0.324]


Epoch [4/15], Loss: 0.3729, Accuracy: 87.89%, Time: 152.88s
Validation Loss: 0.9498, Validation Accuracy: 74.64%


Epoch 5/15: 100%|██████████| 219/219 [02:33<00:00,  1.43batch/s, accuracy=0.905, loss=1.65]


Epoch [5/15], Loss: 0.3024, Accuracy: 90.51%, Time: 153.29s
Validation Loss: 0.5584, Validation Accuracy: 81.11%


Epoch 6/15: 100%|██████████| 219/219 [02:32<00:00,  1.44batch/s, accuracy=0.92, loss=0.197]


Epoch [6/15], Loss: 0.2490, Accuracy: 92.04%, Time: 152.11s
Validation Loss: 0.4950, Validation Accuracy: 84.89%


Epoch 7/15: 100%|██████████| 219/219 [02:32<00:00,  1.44batch/s, accuracy=0.939, loss=0.461]


Epoch [7/15], Loss: 0.1908, Accuracy: 93.86%, Time: 152.19s
Validation Loss: 0.5066, Validation Accuracy: 84.60%


Epoch 8/15: 100%|██████████| 219/219 [02:34<00:00,  1.42batch/s, accuracy=0.943, loss=0.0817]


Epoch [8/15], Loss: 0.1601, Accuracy: 94.29%, Time: 154.08s
Validation Loss: 0.9699, Validation Accuracy: 74.59%


Epoch 9/15: 100%|██████████| 219/219 [02:33<00:00,  1.43batch/s, accuracy=0.948, loss=0.356]


Epoch [9/15], Loss: 0.1436, Accuracy: 94.80%, Time: 153.51s
Validation Loss: 0.3541, Validation Accuracy: 88.90%


Epoch 10/15: 100%|██████████| 219/219 [02:33<00:00,  1.42batch/s, accuracy=0.954, loss=0.0226]


Epoch [10/15], Loss: 0.1200, Accuracy: 95.45%, Time: 153.72s
Validation Loss: 0.4482, Validation Accuracy: 87.98%


Epoch 11/15: 100%|██████████| 219/219 [02:35<00:00,  1.41batch/s, accuracy=0.97, loss=0.15]


Epoch [11/15], Loss: 0.0866, Accuracy: 96.95%, Time: 155.06s
Validation Loss: 0.5398, Validation Accuracy: 86.26%


Epoch 12/15: 100%|██████████| 219/219 [02:31<00:00,  1.44batch/s, accuracy=0.96, loss=0.359]


Epoch [12/15], Loss: 0.1048, Accuracy: 96.03%, Time: 151.58s
Validation Loss: 0.9068, Validation Accuracy: 75.50%


Epoch 13/15: 100%|██████████| 219/219 [02:33<00:00,  1.43batch/s, accuracy=0.966, loss=0.0222]


Epoch [13/15], Loss: 0.0954, Accuracy: 96.61%, Time: 153.02s
Validation Loss: 0.4506, Validation Accuracy: 86.49%


Epoch 14/15: 100%|██████████| 219/219 [02:33<00:00,  1.43batch/s, accuracy=0.977, loss=0.0703]


Epoch [14/15], Loss: 0.0664, Accuracy: 97.67%, Time: 153.13s
Validation Loss: 0.5176, Validation Accuracy: 87.18%


Epoch 15/15: 100%|██████████| 219/219 [02:31<00:00,  1.45batch/s, accuracy=0.971, loss=0.034]


Epoch [15/15], Loss: 0.0886, Accuracy: 97.05%, Time: 151.30s
Validation Loss: 0.5175, Validation Accuracy: 86.09%
