# Initialisation

In [6]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms
from torchvision.utils import make_grid

import numpy as np
import matplotlib.pyplot as plt

import librosa
import pandas as pd

# Dataset
## Loading the genre label of each track

In [7]:
# Read the per-track metadata; the CSV is parsed with a two-row column header
path_of_fma_metadata = "../fma_metadata/tracks.csv"
fma_metadata = pd.read_csv(path_of_fma_metadata, index_col=0, header=[0, 1])
# Collapse every (top, sub) column pair into one "top__sub" label
fma_metadata.columns = [
    "__".join(level_pair).strip() for level_pair in fma_metadata.columns.values
]

# Retain only the track id and its top-level genre, exposed as "music_genre"
# NOTE(review): the printed frame shows NaN genres for some tracks; presumably
# these rows must be filtered out before training - confirm.
track_genre_df = (
    fma_metadata
    .reset_index()
    .loc[:, ["track_id", "track__genre_top"]]
    .rename(columns={"track__genre_top": "music_genre"})
)

# Function to get the audio file path from track_id
def get_audio_path(track_id):
    """Build the relative mp3 path of a track under the ``fma_small`` folder.

    The file name is the track id zero-padded to six characters; the
    sub-folder is the id integer-divided by 1000, zero-padded to three.

    Args:
        track_id: Numeric identifier of the track.

    Returns:
        A string of the form ``fma_small/<folder>/<file>.mp3``.
    """
    padded_id = str(track_id).zfill(6)
    # Tracks are bucketed by thousands: ids 0-999 -> "000", 1000-1999 -> "001", ...
    subdir = str(track_id // 1000).zfill(3)
    return "/".join(("fma_small", subdir, padded_id + ".mp3"))

# Derive each track's audio file path from its id, then show the table
track_genre_df["path"] = track_genre_df["track_id"].map(get_audio_path)
print(track_genre_df)

        track_id music_genre                      path
0              2     Hip-Hop  fma_small/000/000002.mp3
1              3     Hip-Hop  fma_small/000/000003.mp3
2              5     Hip-Hop  fma_small/000/000005.mp3
3             10         Pop  fma_small/000/000010.mp3
4             20         NaN  fma_small/000/000020.mp3
...          ...         ...                       ...
106569    155316        Rock  fma_small/155/155316.mp3
106570    155317        Rock  fma_small/155/155317.mp3
106571    155318        Rock  fma_small/155/155318.mp3
106572    155319        Rock  fma_small/155/155319.mp3
106573    155320         NaN  fma_small/155/155320.mp3

[106574 rows x 3 columns]


## Preprocessing the data with Librosa

In [8]:
class MusicGenreDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(normalized mel-spectrogram, genre index)`` pairs.

    Items are computed on access: the audio file at the row's ``path`` is
    decoded with librosa, converted to a dB-scaled mel-spectrogram and
    standardized per track.

    Args:
        track_genre_df: DataFrame with at least the columns ``path`` and
            ``music_genre``. Rows whose ``music_genre`` is missing are
            dropped because they carry no usable label.
        transform: Optional callable applied to the normalized
            mel-spectrogram before it is returned.

    Attributes:
        track_genre_df: The labelled rows backing the dataset.
        transform: The optional transform passed at construction.
        genre_to_idx: Mapping from genre name to integer class index, in
            order of first appearance in the frame.
    """

    def __init__(self, track_genre_df, transform=None):
        # Unlabelled rows (NaN genre) would otherwise become a bogus class
        # and make the label lookup in __getitem__ unreliable.
        self.track_genre_df = track_genre_df.dropna(subset=['music_genre'])
        self.transform = transform
        self.genre_to_idx = {
            genre: idx
            for idx, genre in enumerate(self.track_genre_df['music_genre'].unique())
        }

    def __len__(self):
        """Return the number of labelled tracks."""
        return len(self.track_genre_df)

    # Helper methods: declared static because they use no instance state,
    # which makes them callable both as self.method(...) and Class.method(...).

    @staticmethod
    def load_audio(path):
        """Decode the audio file at ``path`` as a mono waveform.

        The sample rate returned by librosa is discarded.
        """
        audio, _ = librosa.load(path, mono=True)
        return audio

    @staticmethod
    def audio_to_mel(audio):
        """Convert a waveform to a 128-band mel-spectrogram in dB (float32)."""
        mel = librosa.feature.melspectrogram(y=audio, n_mels=128)
        mel_db = librosa.power_to_db(mel)
        return mel_db.astype(np.float32)

    @staticmethod
    def track_normalization(mel):
        """Standardize a spectrogram to zero mean and unit variance.

        A (near-)constant spectrogram has a standard deviation close to zero;
        in that case the divisor falls back to 1.0 to avoid dividing by ~0.
        """
        mean = mel.mean()
        std = mel.std()
        if std < 1e-6:
            std = 1.0
        return (mel - mean) / std

    def __getitem__(self, idx):
        """Return ``(mel_normalized, genre_idx)`` for the row at position ``idx``.

        NOTE(review): the spectrogram is returned without a channel axis;
        presumably ``transform`` adds one when feeding a Conv2d model - confirm.
        """
        row = self.track_genre_df.iloc[idx]
        audio_path = row['path']
        genre = row['music_genre']
        genre_idx = self.genre_to_idx[genre]

        # Load audio and convert to a normalized mel-spectrogram
        audio = self.load_audio(audio_path)
        mel = self.audio_to_mel(audio)
        mel_normalized = self.track_normalization(mel)

        if self.transform:
            mel_normalized = self.transform(mel_normalized)

        return mel_normalized, genre_idx

# Baseline Implementation
## Model : 
Small (2D) CNN with 3 convolutional blocks (Conv → ReLU → MaxPool →
Dropout), followed by a fully connected layer that outputs class logits; the softmax is applied inside the cross-entropy loss.

In [None]:
# Our baseline model
class CNN_base(nn.Module):
    """Baseline 2D CNN: three Conv -> ReLU -> MaxPool -> Dropout blocks and a linear head.

    A global average pool collapses the spatial dimensions before the linear
    layer, so the head always receives one value per channel regardless of
    the input height and width.

    Args:
        num_classes: Number of output classes (size of the logit vector).
        dropout: Drop probability used by every ``nn.Dropout2d`` layer.
    """

    def __init__(self, num_classes=8, dropout=0.25):
        super().__init__()
        channels = [1, 32, 64, 128]
        layers = []
        for in_ch, out_ch in zip(channels[:-1], channels[1:]):
            layers += [
                nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Dropout2d(dropout),
            ]
        self.features = nn.Sequential(*layers)
        # Without this pool, Flatten would produce 128 * (H/8) * (W/8) features
        # and the Linear(128, ...) head would fail with a shape mismatch.
        # Kept outside `classifier` so the head's state_dict keys stay unchanged.
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(channels[-1], num_classes),
        )

    def forward(self, x):
        """Map a ``(batch, 1, H, W)`` input to ``(batch, num_classes)`` logits."""
        return self.classifier(self.pool(self.features(x)))
    

 • Loss: Cross-entropy.

 • Optimizer: Adam.

In [None]:
# Cross-entropy loss for the 8-way genre classification
criterion = nn.CrossEntropyLoss()
# Baseline network and its Adam optimizer
model = CNN_base(dropout=0.25, num_classes=8)
opt = optim.Adam(params=model.parameters(), lr=1e-4)


 • Metric: Test accuracy

In [None]:
def test_model(model, test_dl, device, writer=None, global_step=None):
    correct = 0
    total = 0
    with torch.no_grad():
        for imgs, labels in test_dl:
            mel = mel.to(device)
            label = label.to(device)
            preds = model(mel).argmax(1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    if writer is not None and global_step is not None:
        writer.add_scalar("test/accuracy", 100 * correct / total, global_step)
    print(f"Test accuracy: {100 * correct / total:.2f}%")