<a href="https://colab.research.google.com/github/mrpep/music-source-separation-4all/blob/main/Music_Source_Separation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Install dependencies 
%%capture
%%bash

# torch/torchaudio are pinned to the CUDA 11.1 builds from the PyTorch wheel index.
pip install torchaudio==0.10.0+cu111 torch==1.10.0+cu111 -f https://download.pytorch.org/whl/cu111/torch_stable.html
pip install demucs
pip install youtube-dl
pip install ffmpeg-python
pip install openunmix
pip install typer
# Quoted so the shell never treats the [http2] extra as a glob pattern.
pip install "httpx[http2]==0.19.0"
# spleeter is installed without its dependencies so it cannot replace the packages installed above.
pip install --no-deps spleeter
# Meta-TasNet source code and checkpoint; both steps are skipped when the
# target already exists so the cell can be re-run safely.
[ -d meta-tasnet ] || git clone https://github.com/pfnet-research/meta-tasnet
wget -nc "https://www.dropbox.com/s/zw6zgt3edd88v87/best_model.pt"

In [2]:
#@title Enter nerd mode
from demucs.pretrained import get_model
from demucs.apply import apply_model
import torch
from youtube_dl import YoutubeDL
import ffmpeg
import numpy as np
import soundfile as sf
from pathlib import Path
import sys
sys.path.append("/content/meta-tasnet")

from model.tasnet import MultiTasNet
import librosa

from IPython.display import Audio

def download_from_youtube(url,start,end,sr=44100):
  """Stream the audio of a YouTube video through ffmpeg and return it as a float array.

  Args:
    url: video URL or id, passed as-is to youtube_dl.
    start: start offset in seconds; values <= 0 mean "from the beginning".
    end: end offset in seconds; values <= 0 mean "until the end of the stream".
    sr: sampling rate ffmpeg resamples the audio to.

  Returns:
    numpy array of shape (n_samples, 2): the 16-bit stereo PCM produced by
    ffmpeg divided by 2**15 - 1.

  Raises:
    ValueError: if ``end`` is positive but not greater than ``start``.
  """
  with YoutubeDL(dict(format='bestaudio')) as ydl:
    info = ydl.extract_info(url, download=False)
  # youtube_dl copies the fields of the format picked by the 'bestaudio'
  # selector into the top-level info dict. Prefer that URL; the first entry of
  # 'formats' is just the first listed format, not the selected one.
  video_url = info.get('url') or info['formats'][0]['url']
  kwargs = {}
  if start > 0:
    kwargs['ss'] = start
  if end > 0:
    if end <= start:
      raise ValueError('end ({}) must be greater than start ({})'.format(end, start))
    kwargs['t'] = end - start  # ffmpeg's -t takes a duration, not an end time
  out,_ = ffmpeg.input(video_url,**kwargs).output('-', format='s16le', acodec='pcm_s16le', ac=2, ar=sr).overwrite_output().run(capture_stdout=True)
  y = np.frombuffer(out,dtype='int16')
  # Samples are interleaved L/R; drop a dangling sample if the stream was cut mid-frame.
  y = y[:len(y) - len(y) % 2]
  y = np.reshape(y,(len(y)//2,2))/(2**15 - 1)
  return y

def demucs_separate(x, shifts=1, models=None):
  """Separate a mixture with the pretrained Demucs 'mdx_extra_q' model.

  Args:
    x: numpy audio array; it is transposed and given a leading batch axis
      before being fed to the model (presumably (n_samples, n_channels) --
      TODO confirm against callers).
    shifts: forwarded to ``apply_model`` as its ``shifts`` argument.
    models: optional dict used as a cache of loaded models; the Demucs model
      is stored under the key 'mdx_extra_q'. When omitted, the model is loaded
      for this call only.

  Returns:
    numpy array holding the first batch item returned by ``apply_model``,
    rescaled back to the input's mean and standard deviation.
  """
  if models is None:
    # The default used to crash on the membership test below.
    models = {}
  if 'mdx_extra_q' in models:
    model = models['mdx_extra_q']
  else:
    model = get_model(name='mdx_extra_q')
    models['mdx_extra_q'] = model
  device = "cuda" if torch.cuda.is_available() else "cpu"
  model.to(device)
  model.eval()
  x = x.T
  x = np.expand_dims(x,0)
  x = torch.from_numpy(x)
  x = x.to(device, dtype=torch.float32)
  # The batch axis has size 1, so this mean only drops it; `ref` supplies the
  # global mean/std used to normalise the input and de-normalise the output.
  ref = x.mean(0)
  x = (x - ref.mean())/ref.std()
  sources = apply_model(model,x,shifts=shifts,split=True,overlap=0.25,progress=True)[0]
  sources = sources * ref.std() + ref.mean()
  sources = sources.detach().to('cpu').numpy()

  return sources

def openumx_separate(x,split_size=30, split_overlap=1, models=None):
  """Separate a mixture with the Open-Unmix 'umxl' separator, one chunk at a time.

  The signal is processed in chunks of ``split_size`` seconds so that long
  inputs do not have to go through the model in a single pass. Every sample
  of the input is covered: the last, shorter chunk is zero-padded for the
  model and its output trimmed back, instead of being dropped.

  Args:
    x: numpy array of shape (n_samples, n_channels), assumed to be sampled at
      44100 Hz (the rate is hard-coded below).
    split_size: chunk length in seconds.
    split_overlap: hop between consecutive chunks as a fraction of the chunk
      length. 1 means back-to-back chunks; smaller values make chunks overlap,
      in which case only the first hop of each chunk's output is kept.
    models: optional dict used as a cache of loaded models; the separator is
      stored under the key 'umxl'.

  Returns:
    numpy array with the chunk outputs concatenated along the last (time)
    axis; the leading axis is the batch axis of size 1.

  Raises:
    ValueError: if the input has no samples or the chunk/hop length is not positive.
  """
  if models is None:
    models = {}
  device = "cuda" if torch.cuda.is_available() else "cpu"
  if 'umxl' in models:
    separator = models['umxl']
  else:
    # Load on the detected device rather than unconditionally on CUDA.
    separator = torch.hub.load('sigsep/open-unmix-pytorch', 'umxl', device=device)
    models['umxl'] = separator
  separator.to(device)
  fs = 44100
  x = x.T
  x = x[np.newaxis,:,:]  # (1, n_channels, n_samples)
  n_frames = x.shape[-1]
  if n_frames == 0:
    raise ValueError('cannot separate an empty signal')
  split_frames = int(split_size*fs)
  hop_frames = int(split_frames*split_overlap)
  if split_frames <= 0 or hop_frames <= 0:
    raise ValueError('split_size and split_overlap must give a positive number of frames')
  sources_splits = []
  with torch.no_grad():  # inference only; avoids keeping autograd buffers
    for start in range(0, n_frames, hop_frames):
      split = x[:,:,start:start+split_frames]
      valid_frames = split.shape[-1]
      if valid_frames < split_frames:
        # Zero-pad the tail so the model always sees a full-length chunk.
        split = np.pad(split, ((0,0),(0,0),(0,split_frames-valid_frames)))
      sources = separator(torch.tensor(split).to(device,dtype=torch.float32))
      # Keep only the frames that belong to this hop so that overlapping
      # chunks do not duplicate audio in the concatenated result.
      keep_frames = min(hop_frames, valid_frames)
      sources_splits.append(sources.detach().cpu().numpy()[...,:keep_frames])
  sources = np.concatenate(sources_splits,axis = -1)
  return sources

def metatasnet_separate(x,models=None):
  """Separate a single-channel mixture with the pretrained Meta-TasNet model.

  Args:
    x: numpy audio array sampled at 44100 Hz (the rate is hard-coded below).
      The signal is flattened into one channel before entering the network, so
      callers are expected to pass one channel at a time.
    models: optional dict used as a cache of loaded models; the network is
      stored under the key 'metatasnet' so the checkpoint is read only once.

  Returns:
    numpy array of shape (T', n_stems) at 32000 Hz: the separated stems,
    each scaled by a least-squares coefficient so that their sum approximates
    the (normalised) input mix.
  """
  if models is None:
    models = {}
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  rate=44100
  if 'metatasnet' in models:
    model = models['metatasnet']
  else:
    # NOTE(review): torch.load unpickles the file -- only use a checkpoint from a trusted source.
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
    state = torch.load("best_model.pt", map_location=device)  # load checkpoint
    model = MultiTasNet(state["args"]).to(device)  # initialize the model
    model.load_state_dict(state['state_dict'])  # load weights from the checkpoint
    models['metatasnet'] = model  # cache it; otherwise it is reloaded for every channel

  def resample(audio, target_rate):
    # Keyword arguments: newer librosa releases no longer accept the rates positionally.
    return librosa.core.resample(audio, orig_sr=rate, target_sr=target_rate, res_type='kaiser_best', fix=False)

  x = x.astype('float32')  # match the type with the type of the weights in the network
  x = x.T
  mix = [resample(x, s) for s in[8000, 16000, 32000]]  # resample to different sampling rates for the three stages
  mix = [librosa.util.fix_length(m, size=(mix[0].shape[-1]+1)*(2**i)) for i,m in enumerate(mix)]  # align all three signals so that their lengths are exact multiples of each other
  mix = [torch.from_numpy(s).float().to(device).view(1, 1, -1) for s in mix]  # cast to tensor with shape: [1, 1, T']
  mix = [s / s.std(dim=-1, keepdim=True) for s in mix]  # normalize by the standard deviation

  model.eval()
  # One chunk per 30 s of audio, but never zero chunks for clips shorter than
  # that (presumably model.inference divides the signal by n_chunks -- verify).
  n_chunks = max(1, x.shape[0]//(30*44100))
  with torch.no_grad():
    sources = model.inference(mix, n_chunks=n_chunks)[-1]  # call the network to obtain the separated audio with shape [1, 4, 1, T']

  # normalize the amplitudes by computing the least squares
  # -> we try to scale the separated stems so that their sum is equal to the input mix
  a = sources[0,:,0,:].cpu().numpy().T  # separated stems
  b = mix[-1][0,0,:].cpu().numpy()  # input mix
  sol = np.linalg.lstsq(a, b, rcond=None)[0]  # scaling coefficients that minimize the MSE
  sources = a * sol  # scale the separated stems

  return sources

def spleeter_separate(x):
  """Placeholder for an in-process Spleeter backend; does nothing and returns None.

  The notebook currently runs Spleeter through its command-line interface
  in the separation cell instead of calling this function.
  """
  return None

# Directory where audio downloaded from YouTube is cached as WAV files.
yt_cache_path = Path('youtube_cache')
yt_cache_path.mkdir(parents=True, exist_ok=True)


In [9]:
#@title Separate!
model = "demucs" #@param ["demucs", "open-umx","meta-tasnet","spleeter-2stems","spleeter-4stems","spleeter-5stems"]
youtube_link = "i7AbCva8e7o" #@param {type:"string"}
youtube_start = 40 #@param {type:"integer"}
youtube_end =  120#@param {type:"integer"}
quality = 1 #@param {type: "integer"}

sampling_rate=44100
models = {}

from pathlib import Path

if youtube_link == "":
  from google.colab import files 
  uploaded = files.upload()
  mix, fs = librosa.core.load(uploaded,sr=sampling_rate,mono=False)
  mix = mix.T
  youtube_link = Path(uploaded).stem
else:
  video_cache_path = Path(yt_cache_path,'{}.wav'.format(youtube_link))
  if video_cache_path.exists():
    print('Cacheando video...')
    mix, _ = sf.read(str(video_cache_path.absolute()))
  else:
    print('Bajando de youtube...')
    mix = download_from_youtube(youtube_link, youtube_start,youtube_end,sr=sampling_rate)
    sf.write(video_cache_path,mix,samplerate=sampling_rate)
if model == 'demucs':
  sources = demucs_separate(mix,models=models,shifts=quality)
  source_names = ['Drums','Bass','Other','Vocals']
  source_fs=44100
elif model == 'open-umx':
  sources = openumx_separate(mix,models=models)[0]
  source_names = ['Vocals','Drums','Bass','Other']
  source_fs=44100
elif model == 'meta-tasnet':
  if mix.ndim == 2:
    ch_sources = []
    for ch in mix.T:
      ch_i = metatasnet_separate(ch,models)
      ch_sources.append(np.expand_dims(ch_i,0))
    sources = np.concatenate(ch_sources,axis=0)
    sources = np.transpose(sources,(2,0,1))
  else:
    sources = metatasnet_separate(ch,models)
  source_names = ['Bass','Drums','Vocals','Other']
  source_fs=32000
elif model == 'spleeter-2stems':
  command = "spleeter separate -p spleeter:2stems -o outputs/spleeter-2stems youtube_cache/{}.wav".format(youtube_link)
  !$command
elif model == 'spleeter-4stems':
  command = "spleeter separate -p spleeter:4stems -o outputs/spleeter-4stems youtube_cache/{}.wav".format(youtube_link)
  !$command
elif model == 'spleeter-5stems':
  command = "spleeter separate -p spleeter:5stems -o outputs/spleeter-5stems youtube_cache/{}.wav".format(youtube_link)
  !$command

if not model.startswith('spleeter'):
  if not Path('outputs/{}/{}'.format(model,youtube_link)).exists():
    Path('outputs/{}/{}'.format(model,youtube_link)).mkdir(parents=True)
  for source_name, source in zip(source_names,sources):
    sf.write('outputs/{}/{}/{}.wav'.format(model,youtube_link,source_name),source.T,source_fs)
  #print(source_name)
  #display(Audio(source,rate=44100))

Bajando de youtube...
[youtube] i7AbCva8e7o: Downloading webpage


100%|██████████████████████████████████████████████████████████████████████████| 99.0/99.0 [00:04<00:00, 20.58seconds/s]
100%|██████████████████████████████████████████████████████████████████████████| 99.0/99.0 [00:04<00:00, 20.13seconds/s]
100%|██████████████████████████████████████████████████████████████████████████| 99.0/99.0 [00:04<00:00, 20.13seconds/s]
100%|██████████████████████████████████████████████████████████████████████████| 99.0/99.0 [00:04<00:00, 20.88seconds/s]


In [None]:
# Play the drums stem written by the "Separate!" cell for the currently selected model and track.
Audio(filename='outputs/{}/{}/Drums.wav'.format(model, youtube_link))

In [None]:
# Play the bass stem written by the "Separate!" cell for the currently selected model and track.
Audio(filename='outputs/{}/{}/Bass.wav'.format(model, youtube_link))

In [None]:
# Play the "other" stem written by the "Separate!" cell for the currently selected model and track.
Audio(filename='outputs/{}/{}/Other.wav'.format(model, youtube_link))

In [None]:
# Play the vocals stem written by the "Separate!" cell for the currently selected model and track.
Audio(filename='outputs/{}/{}/Vocals.wav'.format(model, youtube_link))

### Transcripción multipista

In [None]:
%%capture
%%bash

# omnizart is installed without its declared dependencies; the packages it is
# used with here are installed explicitly below.
pip install --no-deps omnizart
pip install pretty_midi
pip install mido
pip install madmom
# Download the checkpoints only after every package above is in place, so the
# omnizart command line can import whatever it needs from them.
omnizart download-checkpoints

In [None]:
from omnizart.music import app as mapp
from omnizart.drum import app as dapp

# Transcribe the stems written by the "Separate!" cell for the currently
# selected model and track, rather than a hard-coded track id that this
# notebook never separates.
stems_dir = 'outputs/{}/{}'.format(model, youtube_link)
mapp.transcribe('{}/Bass.wav'.format(stems_dir))
dapp.transcribe('{}/Drums.wav'.format(stems_dir))
