# Initialisation

In [10]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Subset
# from torch.utils.tensorboard import SummaryWriter
from torchvision.utils import make_grid

import numpy as np
import numpy.random as rd
import os

import librosa
import pandas as pd

# Dataset
## Loading the genre label of each track

In [11]:
# Loading the information about each track (metadata)
path_of_fma_metadata = "../fma_metadata/tracks.csv"
fma_metadata = pd.read_csv(path_of_fma_metadata, header=[0, 1], index_col=0)
# Flatten columns multi-index
fma_metadata.columns = ['__'.join(col).strip() for col in fma_metadata.columns.values]

# Keeping only the genre column
track_genre_df = fma_metadata.reset_index()[['track_id', 'track__genre_top']]
track_genre_df = track_genre_df.rename(columns={"track__genre_top": "music_genre"})
track_genre_df = track_genre_df.dropna(subset=["music_genre"])


# Function to get the audio file path from track_id
def get_audio_path(track_id):
    # There are 1000 tracks per folder
    folder = str(track_id // 1000).zfill(3)
    filename = f"{str(track_id).zfill(6)}.mp3"
    return "../fma_small" + f"/{folder}/{filename}"

track_genre_df["path"] = track_genre_df["track_id"].apply(get_audio_path)
track_genre_df = track_genre_df[track_genre_df["path"].apply(os.path.isfile)]
track_genre_df = track_genre_df.reset_index(drop=True) 
music_genres = track_genre_df["music_genre"].unique().tolist()
nbr_music_genres = len(music_genres)
genre_to_idx = {g: i for i, g in enumerate(music_genres)}
track_genre_df["label"] = track_genre_df["music_genre"].map(genre_to_idx)
print(track_genre_df)

      track_id music_genre                         path  label
0            2     Hip-Hop  ../fma_small/000/000002.mp3      0
1            5     Hip-Hop  ../fma_small/000/000005.mp3      0
2           10         Pop  ../fma_small/000/000010.mp3      1
3          140        Folk  ../fma_small/000/000140.mp3      2
4          141        Folk  ../fma_small/000/000141.mp3      2
...        ...         ...                          ...    ...
7995    154308     Hip-Hop  ../fma_small/154/154308.mp3      0
7996    154309     Hip-Hop  ../fma_small/154/154309.mp3      0
7997    154413         Pop  ../fma_small/154/154413.mp3      1
7998    154414         Pop  ../fma_small/154/154414.mp3      1
7999    155066     Hip-Hop  ../fma_small/155/155066.mp3      0

[8000 rows x 4 columns]


## Preprocessing the data with Librosa

In [12]:
class MusicGenreDataset(torch.utils.data.Dataset):
    MAX_LEN = 1300   
    def __init__(self, track_genre_df, transform=None):
        self.track_genre_df = track_genre_df
        self.transform = transform
        self.genre_to_idx = {g:i for i,g in enumerate(track_genre_df["music_genre"].unique())}

    def __len__(self):
        return len(self.track_genre_df)

    def load_audio(self, path):
        try:
            audio, _ = librosa.load(path, sr=None, mono=True)
            return audio
        except:
            return np.zeros(22050, dtype=np.float32)

    
    def audio_to_mel(self, audio):
        mel = librosa.feature.melspectrogram(y=audio, n_mels=128)
        mel_db = librosa.power_to_db(mel)
        return mel_db.astype(np.float32)

    def track_normalization(self, mel):
        mean = mel.mean()
        std = mel.std() if mel.std() > 1e-6 else 1.0
        mel_normalized = (mel - mean) / std
        T = mel_normalized.shape[1]
        if T < self.MAX_LEN:
            mel_croped = np.pad(mel_normalized, ((0, 0), (0, self.MAX_LEN - T)), mode="constant")
        else:
            mel_croped = mel_normalized[:, :self.MAX_LEN]
        return mel_croped


    def __getitem__(self, idx):
        row = self.track_genre_df.iloc[idx]
        audio = self.load_audio(row["path"])
        mel = self.audio_to_mel(audio)
        mel = self.track_normalization(mel)

        # shape -> (1, 128, time)
        mel = np.asarray(mel, dtype=np.float32)           
        mel = np.nan_to_num(mel) 
        mel = torch.tensor(mel, dtype=torch.float32).unsqueeze(0)


        label = self.genre_to_idx[row["music_genre"]]
        return mel, label

## Spliting the Dataset into Train/Test subsets

In [13]:
indices = track_genre_df.index[
    track_genre_df["path"].apply(lambda p: os.path.isfile(p))
].tolist()


rd.shuffle(indices)

split = int(0.8 * len(indices))
train_indices = indices[:split]
test_indices = indices[split:]

train_dataset = Subset(MusicGenreDataset(track_genre_df), train_indices)
test_dataset = Subset(MusicGenreDataset(track_genre_df), test_indices)

# Baseline Implementation
## Model : 
Small (2D) CNN with 3 convolutional layers (Conv → ReLU → MaxPool →
Dropout), followed by a fully connected layer and softmax output.

In [14]:
# Our baseline model
class CNN_base(nn.Module):
    def __init__(self, num_classes=nbr_music_genres, dropout=0.25):
        super().__init__()
        channels = [1, 32, 64, 128]
        layers = []
        for i in range(3):
            layers += [
                nn.Conv2d(channels[i], channels[i + 1], kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Dropout2d(dropout)
            ]
           
        self.features = nn.Sequential(*layers)
        self.gap = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, num_classes),
        )

    
    def forward(self, x):
        x = self.features(x)
        x = self.gap(x)   
        x = self.classifier(x)
        return x

 • Loss: Cross-entropy.

 • Optimizer: Adam.

In [15]:
model = CNN_base(num_classes=nbr_music_genres, dropout=0.25)
opt = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()


 • Metric: Test accuracy

In [16]:
def test_model(model, test_dl, device, writer=None, global_step=None):
    correct = 0
    total = 0
    with torch.no_grad():
        for mel,label in test_dl:
            mel = mel.to(device)
            label = label.to(device)
            preds = model(mel).argmax(1)
            correct += (preds == label).sum().item()
            total += label.size(0)
    if writer is not None and global_step is not None:
        writer.add_scalar("test/accuracy", 100 * correct / total, global_step)
    print(f"Test accuracy: {100 * correct / total:.2f}%")

# Run the training loop

Use 
[tensorboard]
( https://docs.pytorch.org/tutorials/recipes/recipes/tensorboard_with_pytorch.html) 
to monitor training.

In [17]:
device = torch.device("cpu")

train_dl = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dl = DataLoader(test_dataset, batch_size=4)


# log a small batch of images and the model graph (if possible)
imgs_sample, labels_sample = next(iter(train_dl))
imgs_sample = imgs_sample.to(device)
grid = make_grid(imgs_sample[:16], nrow=4, normalize=True, scale_each=True)


In [18]:
# provide a global step counter that you can increment in the training loop if desired
global_step = 0

test_model(model, test_dl, device, global_step=global_step)
for epoch in range(3):
    for mel, labels in train_dl:
        global_step += 1
        mel, labels = mel.to(device), labels.to(device)
        opt.zero_grad()
        logits = model(mel)
        loss = criterion(logits, labels)
        loss.backward()
        opt.step()
    print(f"Epoch {epoch + 1}: train loss = {loss.item():.4f}")
    test_model(model, test_dl, device, global_step=global_step)


  audio, _ = librosa.load(path, sr=None, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Test accuracy: 14.44%
Epoch 1: train loss = 1.9360
Test accuracy: 14.25%
Epoch 2: train loss = 2.0027
Test accuracy: 21.25%
Epoch 3: train loss = 1.7514
Test accuracy: 25.00%
