In [1]:
import numpy as np
import pandas as pd
from soundfile import read as sf_read
import librosa
from pathlib import Path
import os
from tqdm import tqdm

First, define a function to make a mel spectrogram for us:

In [2]:
def make_mel_spectrogram(fpath, n_fft=1024, hop_length=512, sr=44100):
    """Load one audio file and return its min-max normalised log-mel spectrogram.

    Parameters
    ----------
    fpath : str
        Track path relative to ``./data/fma_medium``. It is appended to that
        prefix verbatim, so it must start with a path separator.
    n_fft : int
        FFT window size forwarded to ``librosa.feature.melspectrogram``.
    hop_length : int
        Hop length forwarded to ``librosa.feature.melspectrogram``.
    sr : int
        Sample rate reported to librosa. The rate stored in the file is NOT
        used and no resampling is done here.
        NOTE(review): confirm every input file really is sampled at this rate.

    Returns
    -------
    numpy.ndarray or int
        A float32 array scaled to [0, 1] on success, or the sentinel ``-1``
        when the file cannot be read or carries no usable signal (a message
        naming the file is printed in both cases).
    """
    fpath = './data/fma_medium' + fpath

    # Best effort: an unreadable file is reported and skipped. Catching
    # Exception (not a bare except) lets Ctrl-C still interrupt a long run.
    try:
        audio, _ = sf_read(fpath)
    except Exception:
        print(f"Unable to read: {fpath}")
        return -1

    # Down-mix any multi-channel signal by averaging over the channel axis;
    # a 1-D (mono) signal is left untouched whatever its length.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=-1)

    # An empty or perfectly constant signal has nothing to analyse.
    if audio.size == 0 or np.min(audio) == np.max(audio):
        print(f"No data: {fpath}")
        return -1

    mel_power = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    # Min-max normalisation over the whole spectrogram.
    lowest = np.min(mel_db)
    span = np.max(mel_db) - lowest
    if span == 0:
        # A flat spectrogram would divide by zero and yield NaNs.
        print(f"No data: {fpath}")
        return -1

    return ((mel_db - lowest) / span).astype(np.float32)

Then, loop through all the files to generate spectrograms for them:

In [8]:
# Folder that receives one .npy mel spectrogram per track
SPEC_DIR = "./data/fma_medium/mel_spectrograms/"
Path(SPEC_DIR).mkdir(parents=True, exist_ok=True)

tracks = pd.read_csv('./data/processed_genres.csv')

# Collect the replacement path for every row and assign the column once at the
# end, instead of scanning the whole column with .loc on every iteration.
updated_fpaths = []
for track in tqdm(tracks['fpath'].tolist()):
    # File name without directories or extension; "\" and "/" are both accepted
    # as separators so the CSV can be processed on any OS.
    stem = os.path.splitext(track.replace('\\', '/').rsplit('/', 1)[-1])[0]
    npy_path = SPEC_DIR + stem + '.npy'

    # Spectrograms already on disk are reused, so re-running the cell is cheap.
    if not os.path.exists(npy_path):
        spec = make_mel_spectrogram(track)
        # Failure is signalled by a non-array sentinel. `spec == -1` must not be
        # used: on an ndarray it compares elementwise and `if` then raises.
        if not isinstance(spec, np.ndarray):
            # Keep the original audio path so this row can be filtered out later.
            updated_fpaths.append(track)
            continue
        np.save(npy_path, spec)

    updated_fpaths.append(os.sep + 'mel_spectrograms' + os.sep + stem + '.npy')

tracks['fpath'] = updated_fpaths

  0%|          | 0/24927 [00:00<?, ?it/s]

 28%|██▊       | 6990/24927 [00:18<01:00, 294.85it/s]

No data: ./data/fma_medium\044\044374.mp3


 63%|██████▎   | 15632/24927 [00:44<00:23, 390.89it/s]

Unable to read: ./data/fma_medium\098\098566.mp3
Unable to read: ./data/fma_medium\098\098568.mp3


 67%|██████▋   | 16617/24927 [00:46<00:25, 326.88it/s]

No data: ./data/fma_medium\107\107535.mp3


100%|██████████| 24927/24927 [01:08<00:00, 363.56it/s]


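Drop the tracks whose spectrogram could not be generated (unreadable or silent files); their `fpath` still points at the original audio file rather than at `mel_spectrograms`: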

In [9]:
# Keep only the rows whose fpath mentions the spectrogram folder
tracks = tracks.loc[lambda frame: frame['fpath'].str.contains('mel_spectrograms')]

Finally, save the new CSV with updated mel spectrogram file paths:

In [10]:
# Write the filtered table without the dataframe index column
tracks.to_csv(path_or_buf='./data/processed_genres_mel.csv', index=False)