# Audio Effects Classifier Example

To demonstrate how to use the [IDMT-SMT-Audio Effects](https://mirdata.readthedocs.io/en/stable/source/mirdata.html#module-mirdata.datasets.idmt_smt_audio_effects) data loader available in mirdata, this Jupyter notebook goes through the process of training a simple audio effects classifier using the [PyTorch](https://pytorch.org/) framework.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/francescopapaleo/mirdata-notebooks/blob/master/idmt_smt_audio_effects/audio_effects_classifier.ipynb)

In [1]:
#@title Imports
import mirdata
import os
import shutil
import torch
import numpy as np

print(mirdata.__version__)

0.3.8


In [2]:
#@title Helper functions

def print_directory_tree(startpath):
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, "").count(os.sep)
        indent = " " * 4 * level
        print(f"{indent}{os.path.basename(root)}/")
        subindent = " " * 4 * (level + 1)
        # Using a list comprehension to filter out .wav and .xml files
        files_to_print = [f for f in files if not f.endswith((".wav", ".xml", "."))]
        for f in files_to_print:
            print(f"{subindent}{f}")

In [3]:
#@title Initialize the dataset
idmt_smt_audio_effects = mirdata.initialize("idmt_smt_audio_effects")
print(idmt_smt_audio_effects.data_home)

/Users/francescopapaleo/mir_datasets/idmt_smt_audio_effects


In [5]:
#@title Download the dataset
# Uncomment the line below to download the dataset
# idmt_smt_audio_effects.download(force_overwrite=False, cleanup=True)

# Print the directory tree
# print_directory_tree(idmt_smt_audio_effects.data_home)

In [4]:
#@title Validate the dataset
idmt_smt_audio_effects.validate()

100%|██████████| 792/792 [00:00<00:00, 1212.01it/s]
100%|██████████| 55044/55044 [01:44<00:00, 527.16it/s]
INFO: Success: the dataset is complete and all files are valid.
INFO: --------------------


({'metadata': {}, 'tracks': {}}, {'metadata': {}, 'tracks': {}})

In [8]:
from torch.utils.data import Dataset


class IDMTSMTAudioEffectsDataset(Dataset):
    """
    Custom dataset class for the IDMT-SMT-AUDIO-EFFECTS dataset.
    """

    def __init__(
        self,
        transform=None,
    ):
        """
        Args:
            data_home (str): The path to the folder where the dataset is stored.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.dataset = mirdata.initialize("idmt_smt_audio_effects")
        self.transform = transform
        self.tracks = self.dataset.load_tracks()  # Load all track metadata

        self.monophonic_tracks = {
            k: v for k, v in self.tracks.items() if 'polyphon' not in os.path.split(v.audio_path)[0]
        }


    def __getitem__(self, idx):
        """
        Args:
            idx (int): Index of the item to fetch.

        Returns:
            (Tensor, int): Tuple containing (audio waveform as a tensor, label as an integer)
        """
        track_id = list(self.monophonic_tracks.keys())[idx]
        track = self.monophonic_tracks[track_id]

        # Load the audio file
        audio, sr = track.audio

        # Apply preprocessing if needed
        if self.transform:
            audio = self.transform(audio)

        # Convert the audio array to a PyTorch tensor
        audio_tensor = torch.tensor(audio, dtype=torch.float32)

        label_map = {
            11: 0,  # No effect
            12: 1,  # No Effect, amplifier simulation
            21: 2,  # Feedback Delay
            22: 3,  # Slapback Delay
            23: 4,  # Reverb
            31: 5,  # Chorus
            32: 6,  # Flanger
            33: 7,  # Phaser
            34: 8,  # Tremolo
            35: 9,  # Vibrato
            41: 10,  # Distortion
            42: 11,  # Overdrive
        }

        label = label_map.get(
            track.fx_type, -1
        )  # Default to -1 if fx_type is not found

        return audio_tensor, label

    def __len__(self):
        """
        Returns:
            int: The size of the dataset.
        """
        return len(self.monophonic_tracks)

In [9]:
dataset = IDMTSMTAudioEffectsDataset()
print(len(dataset))  # Prints the number of items in the dataset
audio, label = dataset[0]  # Gets the first sample
print(audio.shape)  # Prints the shape of the audio tensor
print(label)  # Prints the label

41184
torch.Size([88201])
8


In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AudioEffectClassifier(nn.Module):
    def __init__(self, audio_length, num_classes=12):
        super(AudioEffectClassifier, self).__init__()

        # Define the layers of the network
        self.conv1 = nn.Conv1d(
            in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1
        )
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv1d(
            in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1
        )
        
        # Calculate the size of the flattened features after the conv and pool layers
        feature_size = self._calculate_feature_size(audio_length)
        self.fc1 = nn.Linear(feature_size, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, self.num_flat_features(x))  # Flatten the tensor for the fully connected layer
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def _calculate_feature_size(self, audio_length):
        # Simulate the forward pass for the convolutional and pooling layers to calculate the feature size
        x = torch.rand(1, 1, audio_length)
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        return x.numel()

    def num_flat_features(self, x):
        # This method is not needed if _calculate_feature_size is used
        pass


In [11]:
import torch.optim as optim

audio_length = 88201  # The length of the audio samples in the dataset
model = AudioEffectClassifier(audio_length)

# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer (Adam is a good default choice)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [12]:
from torchinfo import summary

# Assuming `AudioEffectClassifier` is your model class and `audio_length` is defined
model = AudioEffectClassifier(audio_length=audio_length, num_classes=12)

# Replace `dummy_input` with the actual size of the input data
# For example, if your input data is 1 channel, 88200 samples long:
dummy_input = (1, 1, 88200)

# Print the summary of the model
summary(model, input_size=dummy_input)


RuntimeError: Failed to run torchinfo. See above stack traces for more details. Executed layers up to: [Conv1d: 1, MaxPool1d: 1, Conv1d: 1, MaxPool1d: 1]

In [None]:
from torch.utils.data import DataLoader

# DataLoader setup
batch_size = 1
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader, 0):
        # Transfer Data to GPU if available
        if torch.cuda.is_available():
            inputs, labels = inputs.cuda(), labels.cuda()

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
        if i % 100 == 99:    # print every 100 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0

print('Finished Training')
