# Initialisation

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Subset
# from torch.utils.tensorboard import SummaryWriter
from torchvision.utils import make_grid
from tqdm import tqdm

import numpy as np
import numpy.random as rd
import os

import librosa
import pandas as pd

# Dataset
## Loading the genre label of each track

In [48]:
# Loading the information about each track (metadata)
path_of_fma_metadata = "../fma_metadata/tracks.csv"
fma_metadata = pd.read_csv(path_of_fma_metadata, header=[0, 1], index_col=0)
# Flatten columns multi-index
fma_metadata.columns = ['__'.join(col).strip() for col in fma_metadata.columns.values]

# Keeping only the genre column
track_genre_df = fma_metadata.reset_index()[['track_id', 'track__genre_top']]
track_genre_df = track_genre_df.rename(columns={"track__genre_top": "music_genre"})
track_genre_df = track_genre_df.dropna(subset=["music_genre"])


# Function to get the audio file path from track_id
def get_audio_path(track_id):
    # There are 1000 tracks per folder
    folder = str(track_id // 1000).zfill(3)
    filename = f"{str(track_id).zfill(6)}.mp3"
    return "../fma_small" + f"/{folder}/{filename}"

def is_loadable_audio(path):
    try:
        librosa.load(path, sr=None, mono=True)
        return True
    except Exception:
        return False

track_genre_df["path"] = track_genre_df["track_id"].apply(get_audio_path)
track_genre_df = track_genre_df[track_genre_df["path"].apply(os.path.isfile)]
track_genre_df["valid"] = track_genre_df["path"].apply(is_loadable_audio)
track_genre_df = track_genre_df[track_genre_df["valid"]].reset_index(drop=True)


music_genres = track_genre_df["music_genre"].unique().tolist()
nbr_music_genres = len(music_genres)
genre_to_idx = {g: i for i, g in enumerate(music_genres)}
track_genre_df["label"] = track_genre_df["music_genre"].map(genre_to_idx)
print(track_genre_df)


KeyboardInterrupt: 

## Preprocessing the data with Librosa

In [None]:
#Precomputing mel-spectrogram and puting in cache
CACHE_DIR = "./mel_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

MAX_LEN = 1300

def compute_mel(path):
    audio, _ = librosa.load(path, sr=None, mono=True)
    mel = librosa.feature.melspectrogram(y=audio, n_mels=128)
    mel = librosa.power_to_db(mel).astype(np.float32)

    # Normalize
    mel = (mel - mel.mean()) / (mel.std() + 1e-6)
    T = mel.shape[1]
    if T < MAX_LEN:
        mel = np.pad(mel, ((0,0),(0,MAX_LEN-T)))
    else:
        mel = mel[:, :MAX_LEN]

    return mel

print("Precomputing mel-spectrogram and puting in cache")
cached_paths = []
for i, row in tqdm(track_genre_df.iterrows(), total=len(track_genre_df)):
    track_id = row["track_id"]
    audio_path = row["path"]

    cache_path = f"{CACHE_DIR}/{track_id}.npy"
    cached_paths.append(cache_path)

    if os.path.exists(cache_path):
        continue

    try:
        mel = compute_mel(audio_path)
        np.save(cache_path, mel)
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")

track_genre_df["mel_path"] = cached_paths

In [None]:
class MusicGenreDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df
        self.genre_to_idx = {g:i for i,g in enumerate(df["music_genre"].unique())}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        mel = np.load(row["mel_path"])
        mel = torch.tensor(mel, dtype=torch.float32).unsqueeze(0)

        label = self.genre_to_idx[row["music_genre"]]
        return mel, label

## Spliting the Dataset into Train/Test subsets

In [None]:
indices = track_genre_df.index[
    track_genre_df["path"].apply(lambda p: os.path.isfile(p))
].tolist()


rd.shuffle(indices)

split = int(0.8 * len(indices))
train_indices = indices[:split]
test_indices = indices[split:]

train_dataset = Subset(MusicGenreDataset(track_genre_df), train_indices)
test_dataset = Subset(MusicGenreDataset(track_genre_df), test_indices)

# Baseline Implementation
## Model : 
Small (2D) CNN with 3 convolutional layers (Conv → ReLU → MaxPool →
Dropout), followed by a fully connected layer and softmax output.

In [None]:
# Our baseline model
class CNN_base(nn.Module):
    def __init__(self, num_classes=nbr_music_genres, dropout=0.25):
        super().__init__()
        channels = [1, 32, 64, 128]
        layers = []
        for i in range(3):
            layers += [
                nn.Conv2d(channels[i], channels[i + 1], kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Dropout2d(dropout)
            ]
           
        self.features = nn.Sequential(*layers)
        self.gap = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, num_classes),
        )

    
    def forward(self, x):
        x = self.features(x)
        x = self.gap(x)   
        x = self.classifier(x)
        return x

 • Loss: Cross-entropy.

 • Optimizer: Adam.

In [None]:
model = CNN_base(num_classes=nbr_music_genres, dropout=0.25)
opt = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()


 • Metric: Test accuracy

In [None]:
def test_model(model, test_dl, device, writer=None, global_step=None):
    correct = 0
    total = 0
    with torch.no_grad():
        for mel,label in test_dl:
            mel = mel.to(device)
            label = label.to(device)
            preds = model(mel).argmax(1)
            correct += (preds == label).sum().item()
            total += label.size(0)
    if writer is not None and global_step is not None:
        writer.add_scalar("test/accuracy", 100 * correct / total, global_step)
    print(f"Test accuracy: {100 * correct / total:.2f}%")

# Run the training loop

Use 
[tensorboard]
( https://docs.pytorch.org/tutorials/recipes/recipes/tensorboard_with_pytorch.html) 
to monitor training.

In [None]:
device = torch.device("cpu")

train_dl = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dl = DataLoader(test_dataset, batch_size=4)


# log a small batch of images and the model graph (if possible)
imgs_sample, labels_sample = next(iter(train_dl))
imgs_sample = imgs_sample.to(device)
grid = make_grid(imgs_sample[:16], nrow=4, normalize=True, scale_each=True)


In [None]:
# provide a global step counter that you can increment in the training loop if desired
global_step = 0

test_model(model, test_dl, device, global_step=global_step)
for epoch in range(3):
    for mel, labels in train_dl:
        mel, labels = mel.to(device), labels.to(device)

        opt.zero_grad()
        logits = model(mel)
        loss = criterion(logits, labels)
        loss.backward()
        opt.step()

    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")
    test_model(model, test_dl, device)


  audio, _ = librosa.load(path, sr=None, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Test accuracy: 11.81%
Epoch 1: train loss = 2.0889
Test accuracy: 15.69%
Epoch 2: train loss = 2.0334
Test accuracy: 20.69%
Epoch 3: train loss = 1.8320
Test accuracy: 25.88%
