# In [None]:
# -*- coding: utf-8 -*-

from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
import librosa

import librosa.display
import sys, os

def detect_pitch(y, sr, t, pitches=None, magnitudes=None):
    """Return the pitch of the strongest spectral peak in frame ``t``.

    Parameters
    ----------
    y
        Audio time series. Only used when the pitch track has to be computed.
    sr
        Sampling rate of ``y``. Only used together with ``y``.
    t
        Index of the frame (second axis) to inspect.
    pitches, magnitudes
        Optional precomputed 2-D arrays indexed as ``[bin, frame]``, e.g. the
        pair returned by ``librosa.piptrack``. Pass them when querying many
        frames so the pitch track is computed only once. If either is
        missing, both are computed from ``y`` and ``sr``.

    Returns
    -------
    The entry of ``pitches`` at the bin where ``magnitudes[:, t]`` is largest.
    """
    if pitches is None or magnitudes is None:
        # Compute the pitch track here so the function is self-contained and
        # does not depend on module-level arrays being defined elsewhere.
        pitches, magnitudes = librosa.piptrack(y=y, sr=sr)

    # Bin with the largest magnitude in the requested frame.
    index = magnitudes[:, t].argmax()
    return pitches[index, t]

#############################################
# Load an example with vocals.
# The audio file to process is given as the first command-line argument.
if len(sys.argv) < 2:
    # Fail with a readable usage message instead of a bare IndexError.
    sys.exit("usage: {} <audio-file>".format(os.path.basename(sys.argv[0])))

path = sys.argv[1]
# y holds the decoded samples, sr the sampling rate reported by librosa.
y, sr = librosa.load(path)


# Short-time Fourier transform of the loaded signal
D = librosa.stft(y)

# Separate the spectrogram into its harmonic (H) and percussive (P) parts
harmonic_percussive = librosa.decompose.hpss(D, margin=2.0)
H, P = harmonic_percussive

# Only the harmonic part is processed further: split it into
# magnitude (S_full) and phase
magnitude_phase = librosa.magphase(H)
S_full, phase = magnitude_phase

# Frames are compared with cosine similarity, and each frame is replaced by
# the (per-frequency) median over the frames most similar to it.
#
# Similar frames are required to lie at least 2 seconds apart, so the result
# is not biased by local continuity.
#
# This suppresses sparse/non-repetitive deviations from the average spectrum,
# and works well to discard vocal elements.
min_separation_frames = int(librosa.time_to_frames(2, sr=sr))

S_filter = librosa.decompose.nn_filter(
    S_full,
    aggregate=np.median,
    metric='cosine',
    width=min_separation_frames,
)

# Assuming the signals are additive, the filter output should never exceed
# the input. Taking the pointwise minimum with the input spectrum
# enforces this.
S_filter = np.minimum(S_full, S_filter)


##############################################
# The raw filter output could be used as a mask directly,
# but soft-masking sounds better.

# A margin on each mask reduces bleed between the vocals and instrumentation.
# Note: foreground and background margins do not have to be equal.
margin_i = 2
margin_v = 10
power = 2

# Part of the input spectrum that the filtered spectrum does not account for.
S_residual = S_full - S_filter

mask_v = librosa.util.softmask(S_residual, margin_v * S_filter, power=power)
mask_i = librosa.util.softmask(S_filter, margin_i * S_residual, power=power)

# With the masks available, multiplying each one with the input spectrum
# separates the components.
S_background = mask_i * S_full
S_foreground = mask_v * S_full

# Regenerate audio using these masks.
# length=len(y) makes the inverse STFT pad/trim its output so the rendered
# signals have exactly as many samples as the input; without it the result
# can come out shorter than the input.
n_samples = len(y)
y_fore = librosa.core.istft(S_foreground * phase, length=n_samples)
y_back = librosa.core.istft(S_background * phase, length=n_samples)

# Only the background estimate is written, next to the input file.
basepath = os.path.splitext(path)[0]
# NOTE(review): librosa.output is not available in newer librosa releases;
# confirm the pinned librosa version before upgrading.
librosa.output.write_wav(basepath + "_bass.wav", y_back, sr)

