In [1]:
import os
import pickle
import librosa
import librosa.display
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [24]:
class Loader:
    """Loader is responsible for loading an audio file."""

    def __init__(self, sample_rate, duration, mono):
        self.sample_rate = sample_rate
        self.duration = duration
        self.mono = mono

    def load(self, file_path):
        signal = librosa.load(file_path,
                              sr=None,
                              mono=self.mono)[0]
        return signal

In [25]:
class Padder:
    """Padder is responsible to apply padding to an array."""

    def __init__(self, mode="constant"):
        self.mode = mode

    def left_pad(self, array, num_missing_items):
        padded_array = np.pad(array,
                              (num_missing_items, 0),
                              mode=self.mode)
        return padded_array

    def right_pad(self, array, num_missing_items):
        padded_array = np.pad(array,
                              (0, num_missing_items),
                              mode=self.mode)
        return padded_array

In [26]:
class LogSpectrogramExtractor:
    """LogSpectrogramExtractor extracts log spectrograms (in dB) from a
    time-series signal.
    """

    def __init__(self, sr, fmin, n_bins, hop_length):
        self.hop_length = hop_length
        self.sr = sr
        self.fmin = fmin
        self.n_bins = n_bins

    def extract(self, signal):
        cqt = librosa.cqt(
            signal, 
            self.sr, 
            hop_length=self.hop_length, 
            window="hann", 
            fmin=self.fmin, 
            n_bins=self.n_bins
        )
        spectrogram = np.abs(cqt)
        log_spectrogram = librosa.amplitude_to_db(spectrogram)
        return log_spectrogram

In [27]:
class MinMaxNormaliser:
    """MinMaxNormaliser applies min max normalisation to an array."""

    def __init__(self, min_val, max_val):
        self.min = min_val
        self.max = max_val

    def normalise(self, array):
        norm_array = (array - array.min()) / (array.max() - array.min())
        norm_array = norm_array * (self.max - self.min) + self.min
        return norm_array

    def denormalise(self, norm_array, original_min, original_max):
        array = (norm_array - self.min) / (self.max - self.min)
        array = array * (original_max - original_min) + original_min
        return array

In [28]:
class Saver:
    """saver is responsible to save features, and the min max values."""

    def __init__(self, feature_save_dir, min_max_values_save_dir):
        self.feature_save_dir = feature_save_dir
        self.min_max_values_save_dir = min_max_values_save_dir

    def save_feature(self, feature, file_path):
        save_path = self._generate_save_path(file_path)
        np.save(save_path, feature)

    def save_min_max_values(self, min_max_values):
        save_path = os.path.join(self.min_max_values_save_dir,
                                 "min_max_values.pkl")
        self._save(min_max_values, save_path)

    @staticmethod
    def _save(data, save_path):
        with open(save_path, "wb") as f:
            pickle.dump(data, f)

    def _generate_save_path(self, file_path):
        file_name = os.path.split(file_path)[1]
        save_path = os.path.join(self.feature_save_dir, file_name + ".npy")
        return save_path

In [47]:
class PreprocessingPipeline:
    """PreprocessingPipeline processes audio files in a directory, applying
    the following steps to each file:
        1- load a file
        2- pad the signal (if necessary)
        3- extracting log spectrogram from signal
        4- normalise spectrogram
        5- save the normalised spectrogram
    Storing the min max values for all the log spectrograms.
    """

    def __init__(self):
        self.padder = None
        self.extractor = None
        self.normaliser = None
        self.saver = None
        self.min_max_values = {}
        self._loader = None
        self._num_expected_samples = None

    @property
    def loader(self):
        return self._loader

    @loader.setter
    def loader(self, loader):
        self._loader = loader
        self._num_expected_samples = int(loader.sample_rate * loader.duration)

    def process(self, audio_files_dir):
        for root, _, files in os.walk(audio_files_dir):
            for file in files:
                file_path = os.path.join(root, file)
                print(file_path)
                self._process_file(file_path)
                print(f"Processed file {file_path}")
        self.saver.save_min_max_values(self.min_max_values)

    def _process_file(self, file_path):
        signal = self.loader.load(file_path)
        if self._is_padding_necessary(signal):
            signal = self._apply_padding(signal)
        feature = self.extractor.extract(signal)
        norm_feature = self.normaliser.normalise(feature)
        save_path = self.saver.save_feature(norm_feature, file_path)
        self._store_min_max_value(save_path, feature.min(), feature.max())

    def _is_padding_necessary(self, signal):
        if len(signal) < self._num_expected_samples:
            return True
        return False

    def _apply_padding(self, signal):
        num_missing_samples = self._num_expected_samples - len(signal)
        padded_signal = self.padder.right_pad(signal, num_missing_samples)
        return padded_signal

    def _store_min_max_value(self, save_path, min_val, max_val):
        self.min_max_values[save_path] = {
            "min": min_val,
            "max": max_val
        }

In [48]:
def extract(signal, sr, hop_length, fmin, n_bins):
    cqt = librosa.cqt(
        signal, 
        sr, 
        hop_length=hop_length, 
        window="hann", 
        fmin=fmin, 
        n_bins=n_bins
    )
    spectrogram = np.abs(cqt)
    log_spectrogram = librosa.amplitude_to_db(spectrogram)
    return log_spectrogram

In [49]:
class Loader:
    """Loader is responsible for loading an audio file."""

    def __init__(self, sample_rate, duration, mono):
        self.sample_rate = sample_rate
        self.duration = duration
        self.mono = mono

    def load(self, file_path):
        signal = librosa.load(file_path,
                              sr=self.sample_rate,
                              mono=self.mono)[0]
        return signal

In [50]:
HOP_LENGTH = 512
N_BINS = 64
FMIN = librosa.note_to_hz('C0')
DURATION = 2.72  # in seconds
SAMPLE_RATE = 48000
MONO = True

SPECTROGRAMS_SAVE_DIR = "spectrograms/"
MIN_MAX_VALUES_SAVE_DIR = "fsdd/"
FILES_DIR = "audios/"

# instantiate all objects
loader = Loader(SAMPLE_RATE, DURATION, MONO)
padder = Padder()
log_spectrogram_extractor = LogSpectrogramExtractor(SAMPLE_RATE, FMIN, N_BINS, HOP_LENGTH)
min_max_normaliser = MinMaxNormaliser(0, 1)
saver = Saver(SPECTROGRAMS_SAVE_DIR, MIN_MAX_VALUES_SAVE_DIR)

preprocessing_pipeline = PreprocessingPipeline()
preprocessing_pipeline.loader = loader
preprocessing_pipeline.padder = padder
preprocessing_pipeline.extractor = log_spectrogram_extractor
preprocessing_pipeline.normaliser = min_max_normaliser
preprocessing_pipeline.saver = saver

preprocessing_pipeline.process(FILES_DIR)

audios/track_0.wav
Processed file audios/track_0.wav
audios/track_1.wav
Processed file audios/track_1.wav
audios/track_10.wav
Processed file audios/track_10.wav
audios/track_100.wav
Processed file audios/track_100.wav
audios/track_101.wav
Processed file audios/track_101.wav
audios/track_102.wav
Processed file audios/track_102.wav
audios/track_103.wav
Processed file audios/track_103.wav
audios/track_104.wav
Processed file audios/track_104.wav
audios/track_105.wav
Processed file audios/track_105.wav
audios/track_106.wav
Processed file audios/track_106.wav
audios/track_107.wav
Processed file audios/track_107.wav
audios/track_108.wav
Processed file audios/track_108.wav
audios/track_109.wav
Processed file audios/track_109.wav
audios/track_11.wav
Processed file audios/track_11.wav
audios/track_110.wav
Processed file audios/track_110.wav
audios/track_111.wav
Processed file audios/track_111.wav
audios/track_112.wav
Processed file audios/track_112.wav
audios/track_113.wav
Processed file audios/

  if __name__ == '__main__':


Processed file audios/track_882.wav
audios/track_89.wav
Processed file audios/track_89.wav
audios/track_9.wav
Processed file audios/track_9.wav
audios/track_90.wav
Processed file audios/track_90.wav
audios/track_91.wav
Processed file audios/track_91.wav
audios/track_92.wav
Processed file audios/track_92.wav
audios/track_93.wav
Processed file audios/track_93.wav
audios/track_94.wav
Processed file audios/track_94.wav
audios/track_95.wav
Processed file audios/track_95.wav
audios/track_96.wav
Processed file audios/track_96.wav
audios/track_97.wav
Processed file audios/track_97.wav
audios/track_98.wav
Processed file audios/track_98.wav
audios/track_99.wav
Processed file audios/track_99.wav


In [None]:
librosa.display.specshow(np.load("spectrograms/opm.mp3.npy"))

In [121]:
np.load("spectrograms/track_0.wav.npy").shape

(72, 216)

## Cut audios

In [180]:
import soundfile as sf

In [181]:
y, sr = librosa.load("topNCS_merged.wav", mono=True, sr=None)

In [202]:
DURATION = 2.72
HOP_LENGTH = int(DURATION * sr)

In [203]:
frames = librosa.util.frame(y, frame_length=int(DURATION * sr), hop_length=HOP_LENGTH)

In [204]:
for index, frame in enumerate(frames.T):
    sf.write(f'audios/track_{index}.wav', frame, 44100, 'PCM_24')

In [82]:
def load_fsdd(spectrograms_path):
    x_train = []
    for root, _, file_names in os.walk(spectrograms_path):
        for file_name in file_names:
            file_path = os.path.join(root, file_name)
            spectrogram = np.load(file_path) # (n_bins, frames, 1)
            if np.any(np.isnan(spectrogram)):
                print(file_name)
            x_train.append(spectrogram)
    x_train = np.array(x_train)
    return x_train

In [84]:
os.remove("spectrograms/.DS_Store")

In [85]:
x_train = load_fsdd("spectrograms/")

In [71]:
for i, j in enumerate(x_train):
    if np.any(np.isnan(j)):
        print("NAN")
        print(i)

NAN
867
NAN
868


In [57]:

assert not 