In this notebook we define and test a function that separates the "foreground" and "background" of a song (as defined in the included paper).

[Examples of algorithm results linked by authors in paper](https://interactiveaudiolab.github.io/demos/2dft)

In [3]:
import os

import numpy as np
from scipy.fft import fft2, ifft2
from scipy.io import wavfile
from scipy.ndimage import maximum_filter1d, minimum_filter1d
from scipy.signal import stft, istft
"""
song_cut returns the "foreground" and "background" tracks. 
sound_filename, the song to be processed, should be a .wav file. 
radius is the radius of the neighborhood we use to define a peak. Larger radius values (close to 100) may cause background 
to leak into the foreground track, smaller radius values (close to 15) may cause the opposite.

In addition, this program assumes mono-track inputs with very little metadata attached. Such preprocessing can be performed 
in the free program Audacity.
"""
def song_cut(sound_filename,radius):
    samplerate, sound_data = wavfile.read(sound_filename)
    
    #the short time fourier transform turns our 1-dimensional sound information into a two dimension fequency/time plot.
    #nperseg and noverlap must be preserved so we can invert this short time transform later.
    nperseg_choice = 8192
    noverlap_choice = nperseg_choice/2
    sample_frequencies, segment_times, sound_stft = stft(sound_data, nperseg = nperseg_choice, noverlap = noverlap_choice)

    #the sound-rate spectrogram is generated by taking the 2D fourier transform of the magnitude of sound_stft
    spectrogram = fft2(np.absolute(sound_stft))

    #Now we want to find any peaks in the magnitude of spectrogram along the rate axis.
    spec_mag = np.absolute(spectrogram)
    threshold = np.std(spec_mag)
    dims = spec_mag.shape

    #iterate through all points in spec_mag, finding and marking peaks in background_mask
    background_mask = np.zeros(dims)
    for i in range(dims[0]):
        for j in range(dims[1]):
            #"""
            #if we get too close to the edges we will make the neighborhoods slightly smaller
            if j<radius and dims[1]-j < radius:
                neighborhood = spec_mag[i][0:dims[1]]
            elif j<radius:
                neighborhood = spec_mag[i][0:j+radius+1]
            elif dims[1]-j < radius:
                neighborhood = spec_mag[i][j-radius:dims[1]]
            else:
                neighborhood = spec_mag[i][j-radius: j+radius+1]
            nbhd_range = np.amax(neighborhood)-np.amin(neighborhood)
            #if nbhd_range is less than threshold we do not care
            if nbhd_range > threshold and spec_mag[i][j]== np.amax(neighborhood):
                background_mask[i][j] = 1
    #Here we separate the foreground and background portions of the signal
    foreground_mask = np.ones(dims)- background_mask
    foreground_spetrogram_magnitude = ifft2(np.multiply(foreground_mask,spectrogram))
    background_spetrogram_magnitude = ifft2(np.multiply(background_mask,spectrogram))
    time_background_mask = np.greater(background_spetrogram_magnitude, foreground_spetrogram_magnitude)
    time_foreground_mask = np.ones(time_background_mask.shape) - time_background_mask
    foreground_track = istft(np.multiply(time_foreground_mask,sound_stft), nperseg = nperseg_choice, noverlap = noverlap_choice) 
    background_track = istft(np.multiply(time_background_mask,sound_stft), nperseg = nperseg_choice, noverlap = noverlap_choice)
    #now turned back into 1d signals, we save as WAV files
    foreground_filename = 'foreground_'+sound_filename
    background_filename = 'background_'+sound_filename
    #want output file to have the same data type as input audio
    wavfile.write(foreground_filename, samplerate, foreground_track[1].astype('int16'))
    wavfile.write(background_filename, samplerate, background_track[1].astype('int16'))
    print('Separation Complete.')

In [5]:
# Phone recordings of songs being played back. It is unclear how much (if at
# all) the added ambient air noise affects the results.
phone_recordings = (
    'blood.wav',
    'burningForYou.wav',
    'iCouldBeYours.wav',
    'magentaMountain.wav',
)

# Mixtures taken from the MUSDB18 dataset, used to have .WAV inputs that are
# not phone recordings:
# https://sigsep.github.io/datasets/musdb.html#musdb18-compressed-stems
musdb_mixtures = ('mixture.wav', 'mixture2.wav')

for track_name in phone_recordings + musdb_mixtures:
    song_cut(track_name, 30)

Separation Complete.
Separation Complete.
Separation Complete.
Separation Complete.
Separation Complete.
Separation Complete.
