In [1]:
from glob import glob
from multiprocessing import Pool, cpu_count

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import tensorflow as tf
from tqdm import tqdm

In [2]:
SAMPLE_FILE = "/app/_data/competition_data/train_short_audio/acafly/XC580740.ogg"
TARGET_LEN_S = 5

In [11]:
x, sr = librosa.load(SAMPLE_FILE)
x = x[: TARGET_LEN_S * sr]
print(f"{x.shape=}\n{x.dtype=}\n{sr=}")

x.shape=(110250,)
x.dtype=dtype('float32')
sr=22050


In [None]:
def get_mfccs(
    signal=None,
    sample_rate=22500,
    num_mfccs=13,
    frame_length=1024,
    frame_step=512,
    fft_length=1024,
    fmax=8000,
    fmin=80,
    num_mel_bins=64,
):
    """Compute the MFCCs for audio file

    Keyword Arguments:
        signals {tensor} -- input signals as tensor or np.array in float32 type (default: {None})
        sample_rate {int} -- sampling rate (default: {44100})
        num_mfccs {int} -- number of mfccs to keep (default: {13})
        frame_length {int} -- frame length to compute STFT (default: {1024})
        frame_step {int} -- frame step to compute STFT (default: {512})
        fft_length {int} -- FFT length to compute STFT (default: {1024})
        fmax {int} -- Top edge of the highest frequency band (default: {8000})
        fmin {int} -- Lower bound on the frequencies to be included in the mel spectrum (default: {80})

    Returns:
        Tensor -- mfccs as tf.Tensor
    """

    #     if len(signal.shape) == 1:
    #         signal = tf.reshape(signal, [1, -1])

    #     # only support mono audio
    #     assert signal.shape[0] == 1

    # Step 1 : signals->stfts
    # `stfts` is a complex64 Tensor representing the Short-time Fourier Transform of
    
    # each signal in `signals`. Its shape is [batch_size, ?, fft_unique_bins]
    # where fft_unique_bins = fft_length // 2 + 1 = 513

    stfts = tf.signal.stft(
        signal,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )

    # Step2 : stfts->magnitude_spectrograms
    # An energy spectrogram is the magnitude of the complex-valued STFT.

    # A float32 Tensor of shape [batch_size, ?, 513].

    magnitude_spectrograms = tf.abs(stfts)

    # Step 3 : magnitude_spectrograms->mel_spectrograms
    # Warp the linear-scale, magnitude spectrograms into the mel-scale.
    num_spectrogram_bins = magnitude_spectrograms.shape[-1]

    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=num_mel_bins,
        num_spectrogram_bins=num_spectrogram_bins,
        sample_rate=sample_rate,
        lower_edge_hertz=fmin,
        upper_edge_hertz=fmax,
    )

    mel_spectrograms = tf.tensordot(
        magnitude_spectrograms, linear_to_mel_weight_matrix, 1
    )

    # Step 4 : mel_spectrograms->log_mel_spectrograms
    log_offset = 1e-6
    log_mel_spectrograms = tf.math.log(mel_spectrograms + log_offset)

    # Step 5 : log_mel_spectrograms->mfccs
    # Keep the first `num_mfccs` MFCCs.
    mfccs = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrograms)[
        ..., :num_mfccs
    ]

    return mfccs.numpy()

In [9]:
msg = get_mfccs(
    signal=x,
    sample_rate=sr,
    num_mfccs=256,
    frame_length=1024,
    frame_step=427,
    fft_length=1024,
    fmax=8000,
    fmin=80,
    num_mel_bins=256,
)

# msg.numpy().shape

In [12]:
x = tf.constant(x)

In [7]:
for i in tqdm(range(1000)):
    msg = get_mfccs(
        signal=x,
        sample_rate=sr,
        num_mfccs=256,
        frame_length=1024,
        frame_step=427,
        fft_length=1024,
        fmax=8000,
        fmin=80,
        num_mel_bins=256,
    )

100%|██████████| 1000/1000 [00:17<00:00, 57.28it/s]


In [14]:
def _map(_):
    msg = get_mfccs(
        signal=x,
        sample_rate=sr,
        num_mfccs=256,
        frame_length=1024,
        frame_step=427,
        fft_length=1024,
        fmax=8000,
        fmin=80,
        num_mel_bins=256,
    )


with Pool(cpu_count() * 0 + 2) as pool:
    _ = list(
        tqdm(
            pool.imap(
                _map,
                range(1000),
            ),
            total=1000,
            smoothing=0,
        )
    )

  0%|          | 0/1000 [00:22<?, ?it/s]


KeyboardInterrupt: 