In [None]:
import os
from glob import glob

import audiofile as af
import musdb
import museval
import numpy as np
import torch

from utils import separate_from_audio,load_unet_spleeter

In [None]:
def estimate_and_evaluate(track):
    """Score a trivial "mixture as estimate" baseline for one track.

    The unprocessed mixture is used as the estimate for both targets, which
    gives a reference point to compare real separation models against.

    Args:
        track: object exposing the mixture samples as ``track.audio``
            (presumably a ``musdb`` track -- it is also handed to
            ``museval.eval_mus_track`` as the reference).

    Returns:
        Whatever ``museval.eval_mus_track`` returns for this track. The same
        object is printed, and museval is given ``../results/`` as its
        output directory.
    """
    # The estimates must be audio signals, not the track object itself:
    # use the mixture samples as the estimate for both targets.
    mixture = track.audio
    estimates = {
        'vocals': mixture,
        'accompaniment': mixture
    }

    # Evaluate using museval
    scores = museval.eval_mus_track(
        track, estimates, output_dir="../results/"
    )

    # print nicely formatted and aggregated scores
    print(scores)
    # Hand the scores back so callers can aggregate them instead of only
    # seeing the printed text.
    return scores
def to_mono(audio):
    """Collapse a two-channel signal to one channel by summing the channels.

    Args:
        audio: channel-first signal, indexed as ``audio[channel]``. A signal
            that is already one-dimensional is treated as mono.

    Returns:
        ``audio[0] + audio[1]`` for multi-channel input (a plain sum, not an
        average); the input unchanged when it is one-dimensional.
    """
    # Guard for mono input: indexing a 1-D signal would add its first two
    # *samples* and silently return a scalar instead of a waveform.
    if np.ndim(audio) == 1:
        return audio
    return audio[0]+audio[1]

In [None]:
# Load every track folder of the MUSDB18-HQ test split as mono signals.
# glob() returns matches in arbitrary order; sorting keeps the track order
# stable between runs, which matters because later cells address tracks by
# position (paths[i] / mix_audios[i] / pred_vocals[i]).
paths = sorted(glob("/nfs/home/pedro.lopes/data/dataset/musdb18hq/test/*"))
mix_audios = []
vocal_audios = []
acc_audios = []
for path in paths:
    # af.read yields a (signal, sampling_rate) pair; `sr` is overwritten on
    # every read, so it ends up holding the rate of the last file read.
    audio_mix, sr = af.read(path + '/mixture.wav' )
    audio_vocal, sr = af.read(path + '/vocals.wav' )
    audio_bass, sr = af.read(path + '/bass.wav' )
    audio_drums, sr = af.read(path + '/drums.wav' )
    audio_other, sr = af.read(path + '/other.wav' )
    audio_mix = to_mono(audio_mix)
    audio_vocal = to_mono(audio_vocal)
    # Accompaniment reference = bass + drums + other. It is stored divided
    # by 3; the evaluation cells multiply it by 3 again before scoring.
    audio_acc = to_mono((audio_bass+audio_drums+audio_other)/3)
    mix_audios.append(audio_mix)
    vocal_audios.append(audio_vocal)
    acc_audios.append(audio_acc)

In [None]:
# Build the separation model from a saved checkpoint file.
# NOTE(review): the meaning of the (6, 4) argument is defined by
# utils.load_unet_spleeter, which is not visible here -- confirm there.
weights_path = '../checkpoints/best_4-6.hdf5'
model = load_unet_spleeter((6,4),weights_path)

In [None]:
# Inspect the shape of `audio_acc`, i.e. the accompaniment signal left over
# from the last iteration of the dataset-loading loop.
audio_acc = np.squeeze(audio_acc)
audio_acc.shape

In [None]:
from tqdm import tqdm

# Run the loaded model over every mixture. The accompaniment estimate is the
# residual: mixture (cut to the length of the vocal estimate) minus vocals.
pred_vocals, pred_accs = [], []
for i, audio_mix in enumerate(tqdm(mix_audios)):
    audio_vocal_pred = separate_from_audio(np.squeeze(audio_mix), 44100, model)
    n_pred = len(audio_vocal_pred)
    audio_acc_pred = audio_mix[:n_pred] - audio_vocal_pred
    pred_vocals.append(audio_vocal_pred)
    pred_accs.append(audio_acc_pred)
    
#     audio_vocal = np.expand_dims(audio_vocal,axis=-1)
#     audio_acc = np.expand_dims(audio_acc*3,axis=-1)
#     audio_vocal_pred = np.expand_dims(audio_vocal_pred,axis=-1)
#     audio_acc_pred = np.expand_dims(audio_acc_pred,axis=-1)
#     reference_sources = np.array([audio_vocal,audio_acc])
#     estimated_sources = np.array([audio_vocal_pred,audio_acc_pred])


In [None]:
# Score every track with museval's bss_eval. Sources are stacked as
# [vocals, accompaniment] and given a trailing axis of length 1.
results = []
for i in tqdm(range(len(pred_vocals))):
    n_samples = len(pred_vocals[i])
    estimated_sources = np.vstack([pred_vocals[i], pred_accs[i]])[..., np.newaxis]
    # References are cut to the estimate length; the accompaniment is
    # multiplied by 3 to undo the /3 applied when the dataset was loaded.
    reference_sources = np.vstack([
        vocal_audios[i][:n_samples],
        acc_audios[i][:n_samples] * 3,
    ])[..., np.newaxis]
    sdr, isr, sir, sar, perm = museval.metrics.bss_eval(
        reference_sources, estimated_sources, np.inf
    )
    track_scores = [sdr, isr, sir, sar]
    results.append(track_scores)
    print(track_scores)

In [None]:
# Median over axis 0 (one entry per evaluated track) of the collected
# [sdr, isr, sir, sar] scores. nanmedian skips NaN entries, so a single
# track with an undefined metric does not turn the whole aggregate into NaN;
# this also matches how the Open-Unmix results are aggregated further down.
np.nanmedian(np.array(results),axis=0)

In [None]:
from IPython.display import Audio
# Listen to `audio_vocal_pred`, i.e. the vocal estimate left over from the
# last iteration of the most recently run prediction loop.
Audio(np.squeeze(audio_vocal_pred),rate=44100)

In [None]:
# Total number of samples across all loaded mixtures.
full_len = sum(len(item) for item in mix_audios)
full_len

In [None]:
# Total duration in hours: samples / 44100 samples-per-second / 3600 s-per-hour.
full_len/3600/44100

In [None]:
# Listen to `audio_vocal` (whatever the most recently executed cell bound to it).
Audio(np.squeeze(audio_vocal),rate=44100)

In [None]:
# Re-run bss_eval on the reference/estimate arrays left over from the last
# evaluated track; np.inf is passed as the third positional argument
# (presumably the evaluation window, i.e. one window for the whole track -- confirm).
sdr,isr,sir,sar,perm=museval.metrics.bss_eval(reference_sources,estimated_sources,np.inf)

In [None]:
# Peek at one of the track folder paths.
paths[2]

In [None]:

# Save the current estimates as "<track>_vocals.wav" / "<track>_acc.wav",
# one pair per test track, at 44100 Hz.
base_path = '/nfs/home/pedro.lopes/data/results/u_net_4_6/'
# Create the output folder up front so this cell does not depend on a
# separate mkdir cell having been run beforehand.
os.makedirs(base_path, exist_ok=True)
for i in tqdm(range(len(paths))):
    # Track folder name = last component of the path.
    basename = paths[i].rsplit('/',1)[1]
    af.write(base_path + basename + '_vocals.wav',pred_vocals[i], 44100)
    af.write(base_path + basename + '_acc.wav',pred_accs[i], 44100)

In [None]:
# -p makes the cell safe to re-run: no error if the folder already exists,
# and missing parent folders are created as well.
!mkdir -p /nfs/home/pedro.lopes/data/results/u_net_4_6


In [None]:
def calc_sdr(references, estimates):
    """Signal-to-distortion ratio in dB, one value per source of one song.

    Energies are summed over axes 1 and 2, so both inputs are expected to be
    3-D arrays of the same shape with the source index on axis 0.

    Args:
        references: ground-truth signals.
        estimates: separated signals, same shape as ``references``.

    Returns:
        ``10 * log10(reference_energy / error_energy)`` for each source.
    """
    eps = 1e-7  # keeps both energies strictly positive
    signal_energy = np.square(references).sum(axis=(1, 2))
    error_energy = np.square(references - estimates).sum(axis=(1, 2))
    signal_energy += eps
    error_energy += eps
    ratio = signal_energy / error_energy
    return 10 * np.log10(ratio)
# Energy-ratio SDR for the reference/estimate arrays left over from the last
# evaluated track.
calc_sdr(reference_sources,estimated_sources)

In [None]:
# Median of the strictly positive SDR values only (from the last bss_eval call).
np.median(sdr[sdr>0],axis=-1)

In [None]:
# Medians along the last axis of each metric from the last bss_eval call,
# shown as a (sdr, isr, sir, sar) tuple.
np.median(sdr,axis=-1),np.median(isr,axis=-1),np.median(sir,axis=-1),np.median(sar,axis=-1)

In [None]:
# %pip installs into the environment of the running kernel; a shell-level
# `!pip` may resolve to a different Python installation.
%pip install openunmix

In [None]:
from openunmix import utils
import openunmix

In [None]:
def separate(
    audio,
    rate=None,
    model_str_or_path="umxhq",
    targets=None,
    niter=1,
    residual=False,
    wiener_win_len=300,
    aggregate_dict=None,
    separator=None,
    device=None,
    filterbank="torch",
):
    """
    Open Unmix functional interface.

    Separates a torch.Tensor into target estimates. If a separator is
    provided, it is used for inference. If not, one is created with
    `utils.load_separator` from the remaining arguments and used afterwards.

    Args:
        audio: torch Tensor to process, shape (channels, length).
        rate (int): sample rate of `audio`. Required; it is checked before
            any separator is loaded so that a missing rate fails immediately.
        model_str_or_path: the pretrained model to use.
        targets: the targets for the source to be separated, a list
            including: ['vocals', 'drums', 'bass', 'other'].
            If you don't pick them all, you probably want to
            activate the `residual=True` option.
            Defaults to all available targets per model.
        niter (int): the number of post-processing iterations, defaults to 1.
        residual (bool): if True, a "garbage" target is created.
        wiener_win_len (int): the number of frames to use when batching
            the post-processing step.
        aggregate_dict: forwarded unchanged to `separator.to_dict`.
            Describes a dictionary with keys as output target names and
            values a list of targets that are used to build it. For instance:
            {"vocals": ["vocals"], "accompaniment": ["drums", "bass", "other"]}
        separator: if provided, the model.Separator object that will be used
            to perform separation. The model-loading arguments above are
            ignored in that case.
        device (str): selects device to be used for inference.
        filterbank (str): filterbank implementation method.
            Supported are `['torch', 'asteroid']`. `torch` is about 30% faster
            compared to `asteroid` on large FFT sizes such as 4096. However,
            asteroid's stft can be exported to onnx, which makes it practical
            for deployment.

    Returns:
        The result of `separator.to_dict` applied to the separated signals.

    Raises:
        ValueError: if `rate` is None.
    """
    # Validate first: loading a separator is expensive and pointless when the
    # call is going to be rejected anyway.
    if rate is None:
        raise ValueError("`rate` must be provided.")

    if separator is None:
        separator = utils.load_separator(
            model_str_or_path=model_str_or_path,
            targets=targets,
            niter=niter,
            residual=residual,
            wiener_win_len=wiener_win_len,
            device=device,
            pretrained=True,
            filterbank=filterbank,
        )
        separator.freeze()
        if device:
            separator.to(device)

    if device:
        audio = audio.to(device)
    # Bring the input to the separator's own sample rate.
    audio = utils.preprocess(audio, rate, separator.sample_rate)

    # getting the separated signals
    estimates = separator(audio)
    estimates = separator.to_dict(estimates, aggregate_dict=aggregate_dict)
    return estimates

In [None]:
# Try the Open-Unmix checkpoint on the first mixture only. With
# residual=True and a single 'vocals' target, separate() is asked to also
# produce a residual target next to the vocals.
audio_mix = torch.Tensor(mix_audios[0])
result = separate(audio_mix,rate=44100,model_str_or_path="../checkpoints/model6",targets=['vocals'],residual=True)

In [None]:
# Take element [0][0] of the 'vocals' estimate as a NumPy array
# (presumably first batch item, first channel -- confirm against the separator output).
audio_vocal = np.array(result['vocals'][0][0])
# Accompaniment as the residual of the first mixture.
# NOTE(review): this subtraction assumes both arrays have the same length -- confirm.
audio_acc = mix_audios[0] - audio_vocal

In [None]:
# Listen to the vocal estimate currently bound to `audio_vocal`.
Audio(audio_vocal,rate=44100)

In [None]:
# Listen to the accompaniment signal currently bound to `audio_acc`.
Audio(audio_acc,rate=44100)

In [None]:
# Shape of the vocal estimate left over from the most recent prediction loop.
audio_vocal_pred.shape

In [None]:
from tqdm import tqdm

# Load the Open-Unmix separator once and reuse it for every track; calling
# separate() without `separator` would reload the checkpoint on each
# iteration. The arguments mirror what separate() passes when it builds the
# separator itself (niter, wiener_win_len and filterbank are its defaults).
umx_separator = utils.load_separator(
    model_str_or_path="../checkpoints/model6",
    targets=['vocals'],
    niter=1,
    residual=True,
    wiener_win_len=300,
    device=None,
    pretrained=True,
    filterbank="torch",
)
umx_separator.freeze()

pred_vocals = []
pred_accs = []
for i in tqdm(range(len(mix_audios))):
    mix_np = mix_audios[i]
    audio_mix = torch.Tensor(mix_np)
    result = separate(audio_mix, rate=44100, separator=umx_separator)
    audio_vocal_pred = np.array(result['vocals'][0][0])
    # Accompaniment = mixture minus vocals over their common length. The
    # subtraction is done on the NumPy mixture so it does not depend on
    # implicit torch/NumPy conversion and always yields an ndarray.
    n_common = min(len(mix_np), len(audio_vocal_pred))
    audio_acc_pred = mix_np[:n_common] - audio_vocal_pred[:n_common]
    pred_vocals.append(audio_vocal_pred)
    pred_accs.append(audio_acc_pred)

In [None]:
# Save the current estimates as "<track>_vocals.wav" / "<track>_acc.wav",
# one pair per test track, at 44100 Hz.
base_path = '/nfs/home/pedro.lopes/data/results/blstm2/'
# Create the output folder up front so this cell does not depend on a
# separate mkdir cell having been run beforehand.
os.makedirs(base_path, exist_ok=True)
for i in tqdm(range(len(paths))):
    # Track folder name = last component of the path.
    basename = paths[i].rsplit('/',1)[1]
    af.write(base_path + basename + '_vocals.wav',pred_vocals[i], 44100)
    af.write(base_path + basename + '_acc.wav',pred_accs[i], 44100)

In [None]:
# -p makes the cell safe to re-run: no error if the folder already exists,
# and missing parent folders are created as well.
!mkdir -p /nfs/home/pedro.lopes/data/results/blstm2/

In [None]:
# Score every track with museval's bss_eval. Sources are stacked as
# [vocals, accompaniment] and given a trailing axis of length 1.
results = []
for i in tqdm(range(len(pred_vocals))):
    # Raw lengths, printed to make length mismatches visible.
    print(len(pred_vocals[i]),len(pred_accs[i]),len(vocal_audios[i]),len(acc_audios[i]))
    # Cut estimates and references to one common length so both stacks are
    # guaranteed to have the same shape, whichever signal is the shortest.
    n_samples = min(
        len(pred_vocals[i]), len(pred_accs[i]),
        len(vocal_audios[i]), len(acc_audios[i]),
    )
    estimated_sources = np.vstack([pred_vocals[i][:n_samples],pred_accs[i][:n_samples]])
    estimated_sources = np.expand_dims(estimated_sources,axis=-1)
    # The accompaniment reference is multiplied by 3 to undo the /3 applied
    # when the dataset was loaded.
    reference_sources = np.vstack([vocal_audios[i][:n_samples],acc_audios[i][:n_samples]*3])
    reference_sources = np.expand_dims(reference_sources,axis=-1)
    sdr,isr,sir,sar,perm=museval.metrics.bss_eval(reference_sources,estimated_sources,np.inf)
    results.append([sdr,isr,sir,sar])
    print([sdr,isr,sir,sar])

In [None]:
# Listen to the vocal estimate of the second track.
Audio(pred_vocals[1],rate=44100)

In [None]:
# NaN-ignoring median over axis 0 (one entry per evaluated track) of the
# collected [sdr, isr, sir, sar] scores.
np.nanmedian(np.array(results),axis=0)

In [None]:
from scipy import signal


In [None]:
sample_rate=44100
# Listen to an excerpt of `audio_mix` between two sample offsets computed as
# start_time*sample_rate and end_time*sample_rate.
# NOTE(review): start_time and end_time are not assigned anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel unless
# they are defined elsewhere -- confirm. Presumably they are integer offsets
# in seconds.
Audio(audio_mix[start_time*sample_rate:end_time*sample_rate],rate=44100)

In [None]:
import os

# Folder name (last path component) of every test track, in `paths` order.
basepaths = [os.path.basename(path) for path in paths]
basepaths[0]

In [None]:
# Reload one saved pair of separated stems for a given results folder/track.
# NOTE(review): `model` is rebound here from the loaded network object to a
# folder-name string; re-run the model-loading cell before separating again.
model = 'u_net_5_5'
rootdir ='/nfs/home/pedro.lopes/data/results/' + model + '/'
number= 0
# af.read yields a (signal, sampling_rate) pair; unpack it so audio_vocal and
# audio_acc hold the sample arrays, as in the dataset-loading cell.
audio_vocal, sr = af.read(rootdir + basepaths[number] + '_vocals.wav')
audio_acc, sr = af.read(rootdir + basepaths[number] + '_acc.wav')

