In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [4]:
import os
import librosa as lb
from librosa.core import load,stft,istft,magphase
import numpy as np
from scipy.io import wavfile
import soundfile as sf

In [None]:
#installed
!pip install mir_eval

**Load and Pre-process MIR-1K Dataset**

In [None]:
#skip if unzipped
!unzip MIR-1K.zip

In [None]:
!tree -d MIR-1K

In [5]:
#Audio processing parameters
Sample_rate = 8192
Window_size = 1024
Hop = 768
Patch = 128
Stride = 10

In [6]:
#Function to extract magnitude spectrogram
def mag_phase_spectrogram(wav):
    wav = wav.astype(np.float32)
    spec = stft(wav,n_fft=Window_size,hop_length=Hop,win_length=Window_size)
    mag_spec,phase_spec = magphase(spec)
    return mag_spec.astype(np.float32),phase_spec

In [None]:
curr = 'MIR-1K/Wavfile'
out = 'MIR-1K_Processed'

if not os.path.exists(out):
        os.makedirs(out)
        
for f in os.listdir(curr):
    print("Processing {}".format(f))
    path = os.path.join(curr,f)
    _,audio = wavfile.read(path)
    # print(audio.shape)
    acc_wav = audio[:,0]
    voc_wav = audio[:,1]
    mix_wav = np.sum(audio,axis=-1) #take sum over last axis
    
    #extract magnitude spectrogram of wav files
    acc_mag,_= mag_phase_spectrogram(acc_wav)
    voc_mag,_ = mag_phase_spectrogram(voc_wav)
    mix_mag,mix_phase = mag_phase_spectrogram(mix_wav)
    
    #normalise magnitude spectrograms
    peak = mix_mag.max()
    acc_norm = acc_mag/peak
    voc_norm = voc_mag/peak
    mix_norm = mix_mag/peak  
    
    #save as .npz files
    print("Saving {}".format(f))
    f_ = f.split('.')[0]
    np.savez_compressed(f'{out}/{f_}.npz',mix_mag=mix_norm,mix_phase=mix_phase,vocal=voc_norm,acc=acc_norm)

**Data Augmentation**

In [None]:
def add_noise(audio,noise_factor):
    noise = np.random.randn(audio.shape[0],audio.shape[1])
    aug_audio = noise + noise_factor*noise
    return aug_audio.astype(np.float32)

In [None]:
def change_pitch(audio,sr,pitch_factor):
    return lb.effects.pitch_shift(audio,sr,pitch_factor)

In [None]:
curr = 'MIR-1K_Processed'
out = 'MIR-1K_Augmentated'

if not os.path.exists(out):
        os.makedirs(out)

print("Data augmentation processing...")

for npz_file in os.listdir(curr):
    f = npz_file.split('.')[0]
    path = os.path.join(curr,npz_file)
    files = np.load(path)
    mix_phase = files['mix_phase']
    
    #add noise
    voc_ns = add_noise(files['vocal'],0.05)
    mix_ns = add_noise(files['mix_mag'],0.05)    
    acc_ns = add_noise(files['acc'],0.05)
    
    #normalise noisy data
    peak = mix_ns.max()
    acc_norm = acc_ns/peak
    voc_norm = voc_ns/peak
    mix_norm = mix_ns/peak
    
    print("Saving noisy {}".format(f))
    np.savez_compressed(f'{out}/{f}_noisy.npz',mix_mag=mix_norm,mix_phase=mix_phase,vocal=voc_norm,acc=acc_norm)
    
print("Process complete...")

In [None]:
curr = 'MIR-1K/Wavfile'
out = 'MIR-1K_Augmentated'

if not os.path.exists(out):
        os.makedirs(out)
        
print("Data Augmentation processing...")

for f in os.listdir(curr):
    path = os.path.join(curr,f)
    _,audio = wavfile.read(path)
    # print(audio.shape)
    acc_wav = audio[:,0].astype(np.float32)
    voc_wav = audio[:,1].astype(np.float32)
    mix_wav = np.sum(audio,axis=-1).astype(np.float32) #take sum over last axis

    #change pitch
    voc_pc = change_pitch(voc_wav,Sample_rate,0.02)
    acc_pc = change_pitch(acc_wav,Sample_rate,0.02)
    mix_pc = change_pitch(mix_wav,Sample_rate,0.02)
    
    #extract magnitude spectrogram of wav files
    acc_mag,_= mag_phase_spectrogram(acc_pc)
    voc_mag,_ = mag_phase_spectrogram(voc_pc)
    mix_mag,mix_phase = mag_phase_spectrogram(mix_pc)
    
    #normalise magnitude spectrograms
    peak = mix_mag.max()
    acc_norm = acc_mag/peak
    voc_norm = voc_mag/peak
    mix_norm = mix_mag/peak  
    
    print("Saving pitch {}".format(f))
    f_ = f.split('.')[0]
    np.savez_compressed(f'{out}/{f_}_pitch.npz',mix_mag=mix_norm,mix_phase=mix_phase,vocal=voc_norm,acc=acc_norm)
    
print("Processing complete...")

**Create Train,Validation & Test Dataset**

In [7]:
!unzip drive/MyDrive/source_sep/MIR_processed.zip

unzip:  cannot find or open drive/MyDrive/source_sep/MIR_processed.zip, drive/MyDrive/source_sep/MIR_processed.zip.zip or drive/MyDrive/source_sep/MIR_processed.zip.ZIP.


In [None]:
!unzip drive/MyDrive/source_sep/MIR_augment.zip

In [8]:
#loads target npy files
def load_npy(npz_file, target = "mix_mag"):
  npy_files = np.load(npz_file)
  file = f'{target}.npy'
  target_file = npy_files[file]
  mix_mag_file = npy_files['mix_mag.npy']
  mix_phase_file = npy_files['mix_phase.npy']
  print(f"{target}_shape : {target_file.shape} & mix_mag_shape : {mix_mag_file.shape}")
  return mix_mag_file,target_file, mix_phase_file

In [9]:
#computes patches of data and adds them to train data
def sample_patches(mix_mag_spec, target_spec, phase,X,y,X_phase):
  #generate random start points for data patches
  if mix_mag_spec.shape[1]<Patch:
    npad = ((0,0),(0,Patch-mix_mag_spec.shape[1]))
    mix_mag_spec = np.pad(mix_mag_spec, pad_width=npad,mode = 'mean')
    target_spec = np.pad(target_spec,pad_width=npad,mode = 'mean')
    phase = np.pad(phase,pad_width=npad,mode = 'mean')
    X.append(mix_mag_spec[1:,:,np.newaxis])
    y.append(target_spec[1:,:,np.newaxis])
    X_phase.append(phase[1:,:,np.newaxis])

  else:      
    extra = np.random.randint(0,mix_mag_spec.shape[1]-Patch,(mix_mag_spec.shape[1]-Patch)//Stride) 
    #compute patches & add to training data
    for i in extra:
      j = i + Patch
      X.append(mix_mag_spec[1:,i:j,np.newaxis])
      y.append(target_spec[1:,i:j,np.newaxis])
      X_phase.append(phase[1:,i:j,np.newaxis])

**New dataset**

In [10]:
#train dataset
mix_train_X = []
vocal_train_y = []
train_phase = []

#validation dataset
mix_val_X = []
vocal_val_y = []
val_phase = []

#test dataset
mix_test_X = []
vocal_test_y = []
test_phase = []

* With Augmentation



In [11]:
#create dataset including augmented dataset
print("Creating dataset...")

directory = ['MIR-1K_Processed','MIR-1K_Augmentated']

for dir in directory:
  print(f'Processing directory {dir}...')

  for f in os.listdir(dir):
    path = os.path.join(dir,f)
    id = f.split('_')[1]
    mix_mag,voc,phase  = load_npy(path,target="vocal")

    if id == '4':    
      sample_patches(mix_mag,voc,phase,mix_val_X,vocal_val_y,val_phase)
    
    elif id == '5':
      sample_patches(mix_mag,voc,phase,mix_test_X,vocal_test_y,test_phase)
    
    else:
      sample_patches(mix_mag,voc,phase,mix_train_X,vocal_train_y,train_phase)
    
  print(f'Directory {dir} processing complete.')

print("Datasets created successfully.")

Creating dataset...
Processing directory MIR-1K_Processed...
vocal_shape : (513, 180) & mix_mag_shape : (513, 180)
vocal_shape : (513, 157) & mix_mag_shape : (513, 157)
vocal_shape : (513, 188) & mix_mag_shape : (513, 188)
vocal_shape : (513, 205) & mix_mag_shape : (513, 205)
vocal_shape : (513, 168) & mix_mag_shape : (513, 168)
vocal_shape : (513, 163) & mix_mag_shape : (513, 163)
vocal_shape : (513, 181) & mix_mag_shape : (513, 181)
vocal_shape : (513, 157) & mix_mag_shape : (513, 157)
vocal_shape : (513, 159) & mix_mag_shape : (513, 159)
vocal_shape : (513, 177) & mix_mag_shape : (513, 177)
vocal_shape : (513, 148) & mix_mag_shape : (513, 148)
vocal_shape : (513, 154) & mix_mag_shape : (513, 154)
vocal_shape : (513, 239) & mix_mag_shape : (513, 239)
vocal_shape : (513, 136) & mix_mag_shape : (513, 136)
vocal_shape : (513, 131) & mix_mag_shape : (513, 131)
vocal_shape : (513, 127) & mix_mag_shape : (513, 127)
vocal_shape : (513, 245) & mix_mag_shape : (513, 245)
vocal_shape : (513, 1

vocal_shape : (513, 128) & mix_mag_shape : (513, 128)
vocal_shape : (513, 144) & mix_mag_shape : (513, 144)
vocal_shape : (513, 162) & mix_mag_shape : (513, 162)
vocal_shape : (513, 134) & mix_mag_shape : (513, 134)
vocal_shape : (513, 183) & mix_mag_shape : (513, 183)
vocal_shape : (513, 151) & mix_mag_shape : (513, 151)
vocal_shape : (513, 216) & mix_mag_shape : (513, 216)
vocal_shape : (513, 149) & mix_mag_shape : (513, 149)
vocal_shape : (513, 135) & mix_mag_shape : (513, 135)
vocal_shape : (513, 113) & mix_mag_shape : (513, 113)
vocal_shape : (513, 200) & mix_mag_shape : (513, 200)
vocal_shape : (513, 250) & mix_mag_shape : (513, 250)
vocal_shape : (513, 117) & mix_mag_shape : (513, 117)
vocal_shape : (513, 219) & mix_mag_shape : (513, 219)
vocal_shape : (513, 221) & mix_mag_shape : (513, 221)
vocal_shape : (513, 138) & mix_mag_shape : (513, 138)
vocal_shape : (513, 180) & mix_mag_shape : (513, 180)
vocal_shape : (513, 229) & mix_mag_shape : (513, 229)
vocal_shape : (513, 146) & m

vocal_shape : (513, 242) & mix_mag_shape : (513, 242)
vocal_shape : (513, 156) & mix_mag_shape : (513, 156)
vocal_shape : (513, 223) & mix_mag_shape : (513, 223)
vocal_shape : (513, 237) & mix_mag_shape : (513, 237)
vocal_shape : (513, 203) & mix_mag_shape : (513, 203)
vocal_shape : (513, 139) & mix_mag_shape : (513, 139)
vocal_shape : (513, 153) & mix_mag_shape : (513, 153)
vocal_shape : (513, 229) & mix_mag_shape : (513, 229)
vocal_shape : (513, 133) & mix_mag_shape : (513, 133)
vocal_shape : (513, 216) & mix_mag_shape : (513, 216)
vocal_shape : (513, 212) & mix_mag_shape : (513, 212)
vocal_shape : (513, 135) & mix_mag_shape : (513, 135)
vocal_shape : (513, 233) & mix_mag_shape : (513, 233)
vocal_shape : (513, 152) & mix_mag_shape : (513, 152)
vocal_shape : (513, 161) & mix_mag_shape : (513, 161)
vocal_shape : (513, 182) & mix_mag_shape : (513, 182)
vocal_shape : (513, 156) & mix_mag_shape : (513, 156)
vocal_shape : (513, 208) & mix_mag_shape : (513, 208)
vocal_shape : (513, 106) & m

vocal_shape : (513, 120) & mix_mag_shape : (513, 120)
vocal_shape : (513, 137) & mix_mag_shape : (513, 137)
vocal_shape : (513, 135) & mix_mag_shape : (513, 135)
vocal_shape : (513, 227) & mix_mag_shape : (513, 227)
vocal_shape : (513, 190) & mix_mag_shape : (513, 190)
vocal_shape : (513, 159) & mix_mag_shape : (513, 159)
vocal_shape : (513, 138) & mix_mag_shape : (513, 138)
vocal_shape : (513, 197) & mix_mag_shape : (513, 197)
vocal_shape : (513, 245) & mix_mag_shape : (513, 245)
vocal_shape : (513, 169) & mix_mag_shape : (513, 169)
vocal_shape : (513, 183) & mix_mag_shape : (513, 183)
vocal_shape : (513, 147) & mix_mag_shape : (513, 147)
vocal_shape : (513, 173) & mix_mag_shape : (513, 173)
vocal_shape : (513, 111) & mix_mag_shape : (513, 111)
vocal_shape : (513, 201) & mix_mag_shape : (513, 201)
vocal_shape : (513, 249) & mix_mag_shape : (513, 249)
vocal_shape : (513, 183) & mix_mag_shape : (513, 183)
vocal_shape : (513, 149) & mix_mag_shape : (513, 149)
vocal_shape : (513, 147) & m

vocal_shape : (513, 129) & mix_mag_shape : (513, 129)
vocal_shape : (513, 180) & mix_mag_shape : (513, 180)
vocal_shape : (513, 190) & mix_mag_shape : (513, 190)
vocal_shape : (513, 117) & mix_mag_shape : (513, 117)
vocal_shape : (513, 137) & mix_mag_shape : (513, 137)
vocal_shape : (513, 170) & mix_mag_shape : (513, 170)
vocal_shape : (513, 126) & mix_mag_shape : (513, 126)
vocal_shape : (513, 139) & mix_mag_shape : (513, 139)
vocal_shape : (513, 128) & mix_mag_shape : (513, 128)
vocal_shape : (513, 153) & mix_mag_shape : (513, 153)
vocal_shape : (513, 155) & mix_mag_shape : (513, 155)
vocal_shape : (513, 156) & mix_mag_shape : (513, 156)
vocal_shape : (513, 135) & mix_mag_shape : (513, 135)
vocal_shape : (513, 151) & mix_mag_shape : (513, 151)
vocal_shape : (513, 141) & mix_mag_shape : (513, 141)
vocal_shape : (513, 191) & mix_mag_shape : (513, 191)
vocal_shape : (513, 145) & mix_mag_shape : (513, 145)
vocal_shape : (513, 169) & mix_mag_shape : (513, 169)
vocal_shape : (513, 141) & m

vocal_shape : (513, 143) & mix_mag_shape : (513, 143)
vocal_shape : (513, 127) & mix_mag_shape : (513, 127)
vocal_shape : (513, 155) & mix_mag_shape : (513, 155)
vocal_shape : (513, 116) & mix_mag_shape : (513, 116)
vocal_shape : (513, 172) & mix_mag_shape : (513, 172)
vocal_shape : (513, 167) & mix_mag_shape : (513, 167)
vocal_shape : (513, 137) & mix_mag_shape : (513, 137)
vocal_shape : (513, 137) & mix_mag_shape : (513, 137)
vocal_shape : (513, 216) & mix_mag_shape : (513, 216)
vocal_shape : (513, 136) & mix_mag_shape : (513, 136)
vocal_shape : (513, 169) & mix_mag_shape : (513, 169)
vocal_shape : (513, 129) & mix_mag_shape : (513, 129)
vocal_shape : (513, 131) & mix_mag_shape : (513, 131)
vocal_shape : (513, 118) & mix_mag_shape : (513, 118)
vocal_shape : (513, 165) & mix_mag_shape : (513, 165)
vocal_shape : (513, 239) & mix_mag_shape : (513, 239)
vocal_shape : (513, 221) & mix_mag_shape : (513, 221)
vocal_shape : (513, 179) & mix_mag_shape : (513, 179)
vocal_shape : (513, 149) & m

vocal_shape : (513, 237) & mix_mag_shape : (513, 237)
vocal_shape : (513, 144) & mix_mag_shape : (513, 144)
vocal_shape : (513, 141) & mix_mag_shape : (513, 141)
vocal_shape : (513, 239) & mix_mag_shape : (513, 239)
vocal_shape : (513, 139) & mix_mag_shape : (513, 139)
vocal_shape : (513, 198) & mix_mag_shape : (513, 198)
vocal_shape : (513, 201) & mix_mag_shape : (513, 201)
vocal_shape : (513, 154) & mix_mag_shape : (513, 154)
vocal_shape : (513, 223) & mix_mag_shape : (513, 223)
vocal_shape : (513, 178) & mix_mag_shape : (513, 178)
vocal_shape : (513, 140) & mix_mag_shape : (513, 140)
vocal_shape : (513, 145) & mix_mag_shape : (513, 145)
vocal_shape : (513, 207) & mix_mag_shape : (513, 207)
vocal_shape : (513, 139) & mix_mag_shape : (513, 139)
vocal_shape : (513, 191) & mix_mag_shape : (513, 191)
vocal_shape : (513, 173) & mix_mag_shape : (513, 173)
vocal_shape : (513, 145) & mix_mag_shape : (513, 145)
vocal_shape : (513, 130) & mix_mag_shape : (513, 130)
vocal_shape : (513, 171) & m

vocal_shape : (513, 138) & mix_mag_shape : (513, 138)
vocal_shape : (513, 209) & mix_mag_shape : (513, 209)
vocal_shape : (513, 141) & mix_mag_shape : (513, 141)
vocal_shape : (513, 209) & mix_mag_shape : (513, 209)
vocal_shape : (513, 224) & mix_mag_shape : (513, 224)
vocal_shape : (513, 133) & mix_mag_shape : (513, 133)
vocal_shape : (513, 170) & mix_mag_shape : (513, 170)
vocal_shape : (513, 212) & mix_mag_shape : (513, 212)
vocal_shape : (513, 142) & mix_mag_shape : (513, 142)
vocal_shape : (513, 138) & mix_mag_shape : (513, 138)
vocal_shape : (513, 190) & mix_mag_shape : (513, 190)
vocal_shape : (513, 125) & mix_mag_shape : (513, 125)
vocal_shape : (513, 131) & mix_mag_shape : (513, 131)
vocal_shape : (513, 151) & mix_mag_shape : (513, 151)
vocal_shape : (513, 173) & mix_mag_shape : (513, 173)
vocal_shape : (513, 141) & mix_mag_shape : (513, 141)
vocal_shape : (513, 217) & mix_mag_shape : (513, 217)
vocal_shape : (513, 183) & mix_mag_shape : (513, 183)
vocal_shape : (513, 131) & m

vocal_shape : (513, 185) & mix_mag_shape : (513, 185)
vocal_shape : (513, 156) & mix_mag_shape : (513, 156)
vocal_shape : (513, 151) & mix_mag_shape : (513, 151)
vocal_shape : (513, 142) & mix_mag_shape : (513, 142)
vocal_shape : (513, 165) & mix_mag_shape : (513, 165)
vocal_shape : (513, 141) & mix_mag_shape : (513, 141)
vocal_shape : (513, 167) & mix_mag_shape : (513, 167)
vocal_shape : (513, 174) & mix_mag_shape : (513, 174)
vocal_shape : (513, 165) & mix_mag_shape : (513, 165)
vocal_shape : (513, 212) & mix_mag_shape : (513, 212)
vocal_shape : (513, 170) & mix_mag_shape : (513, 170)
vocal_shape : (513, 91) & mix_mag_shape : (513, 91)
vocal_shape : (513, 143) & mix_mag_shape : (513, 143)
vocal_shape : (513, 142) & mix_mag_shape : (513, 142)
vocal_shape : (513, 169) & mix_mag_shape : (513, 169)
vocal_shape : (513, 146) & mix_mag_shape : (513, 146)
vocal_shape : (513, 133) & mix_mag_shape : (513, 133)
vocal_shape : (513, 115) & mix_mag_shape : (513, 115)
vocal_shape : (513, 172) & mix

vocal_shape : (513, 127) & mix_mag_shape : (513, 127)
vocal_shape : (513, 216) & mix_mag_shape : (513, 216)
vocal_shape : (513, 170) & mix_mag_shape : (513, 170)
vocal_shape : (513, 206) & mix_mag_shape : (513, 206)
vocal_shape : (513, 131) & mix_mag_shape : (513, 131)
vocal_shape : (513, 176) & mix_mag_shape : (513, 176)
vocal_shape : (513, 153) & mix_mag_shape : (513, 153)
vocal_shape : (513, 139) & mix_mag_shape : (513, 139)
vocal_shape : (513, 239) & mix_mag_shape : (513, 239)
vocal_shape : (513, 132) & mix_mag_shape : (513, 132)
vocal_shape : (513, 145) & mix_mag_shape : (513, 145)
vocal_shape : (513, 132) & mix_mag_shape : (513, 132)
vocal_shape : (513, 153) & mix_mag_shape : (513, 153)
vocal_shape : (513, 159) & mix_mag_shape : (513, 159)
vocal_shape : (513, 140) & mix_mag_shape : (513, 140)
vocal_shape : (513, 125) & mix_mag_shape : (513, 125)
vocal_shape : (513, 237) & mix_mag_shape : (513, 237)
vocal_shape : (513, 184) & mix_mag_shape : (513, 184)
vocal_shape : (513, 138) & m

vocal_shape : (513, 127) & mix_mag_shape : (513, 127)
vocal_shape : (513, 184) & mix_mag_shape : (513, 184)
vocal_shape : (513, 139) & mix_mag_shape : (513, 139)
vocal_shape : (513, 116) & mix_mag_shape : (513, 116)
vocal_shape : (513, 149) & mix_mag_shape : (513, 149)
vocal_shape : (513, 177) & mix_mag_shape : (513, 177)
vocal_shape : (513, 130) & mix_mag_shape : (513, 130)
vocal_shape : (513, 162) & mix_mag_shape : (513, 162)
vocal_shape : (513, 168) & mix_mag_shape : (513, 168)
vocal_shape : (513, 131) & mix_mag_shape : (513, 131)
vocal_shape : (513, 134) & mix_mag_shape : (513, 134)
vocal_shape : (513, 173) & mix_mag_shape : (513, 173)
vocal_shape : (513, 188) & mix_mag_shape : (513, 188)
vocal_shape : (513, 205) & mix_mag_shape : (513, 205)
vocal_shape : (513, 171) & mix_mag_shape : (513, 171)
vocal_shape : (513, 127) & mix_mag_shape : (513, 127)
vocal_shape : (513, 131) & mix_mag_shape : (513, 131)
vocal_shape : (513, 129) & mix_mag_shape : (513, 129)
vocal_shape : (513, 129) & m

vocal_shape : (513, 141) & mix_mag_shape : (513, 141)
vocal_shape : (513, 131) & mix_mag_shape : (513, 131)
vocal_shape : (513, 212) & mix_mag_shape : (513, 212)
vocal_shape : (513, 156) & mix_mag_shape : (513, 156)
vocal_shape : (513, 160) & mix_mag_shape : (513, 160)
vocal_shape : (513, 192) & mix_mag_shape : (513, 192)
vocal_shape : (513, 157) & mix_mag_shape : (513, 157)
vocal_shape : (513, 159) & mix_mag_shape : (513, 159)
vocal_shape : (513, 219) & mix_mag_shape : (513, 219)
vocal_shape : (513, 221) & mix_mag_shape : (513, 221)
vocal_shape : (513, 147) & mix_mag_shape : (513, 147)
vocal_shape : (513, 188) & mix_mag_shape : (513, 188)
vocal_shape : (513, 179) & mix_mag_shape : (513, 179)
vocal_shape : (513, 223) & mix_mag_shape : (513, 223)
vocal_shape : (513, 183) & mix_mag_shape : (513, 183)
vocal_shape : (513, 127) & mix_mag_shape : (513, 127)
vocal_shape : (513, 234) & mix_mag_shape : (513, 234)
vocal_shape : (513, 111) & mix_mag_shape : (513, 111)
vocal_shape : (513, 193) & m

vocal_shape : (513, 210) & mix_mag_shape : (513, 210)
vocal_shape : (513, 213) & mix_mag_shape : (513, 213)
vocal_shape : (513, 152) & mix_mag_shape : (513, 152)
vocal_shape : (513, 141) & mix_mag_shape : (513, 141)
vocal_shape : (513, 146) & mix_mag_shape : (513, 146)
vocal_shape : (513, 249) & mix_mag_shape : (513, 249)
vocal_shape : (513, 157) & mix_mag_shape : (513, 157)
vocal_shape : (513, 162) & mix_mag_shape : (513, 162)
vocal_shape : (513, 143) & mix_mag_shape : (513, 143)
vocal_shape : (513, 221) & mix_mag_shape : (513, 221)
vocal_shape : (513, 168) & mix_mag_shape : (513, 168)
vocal_shape : (513, 120) & mix_mag_shape : (513, 120)
vocal_shape : (513, 188) & mix_mag_shape : (513, 188)
vocal_shape : (513, 239) & mix_mag_shape : (513, 239)
vocal_shape : (513, 199) & mix_mag_shape : (513, 199)
vocal_shape : (513, 133) & mix_mag_shape : (513, 133)
vocal_shape : (513, 153) & mix_mag_shape : (513, 153)
vocal_shape : (513, 137) & mix_mag_shape : (513, 137)
vocal_shape : (513, 154) & m

vocal_shape : (513, 181) & mix_mag_shape : (513, 181)
vocal_shape : (513, 202) & mix_mag_shape : (513, 202)
vocal_shape : (513, 129) & mix_mag_shape : (513, 129)
vocal_shape : (513, 190) & mix_mag_shape : (513, 190)
vocal_shape : (513, 127) & mix_mag_shape : (513, 127)
vocal_shape : (513, 231) & mix_mag_shape : (513, 231)
vocal_shape : (513, 139) & mix_mag_shape : (513, 139)
vocal_shape : (513, 243) & mix_mag_shape : (513, 243)
vocal_shape : (513, 148) & mix_mag_shape : (513, 148)
vocal_shape : (513, 233) & mix_mag_shape : (513, 233)
vocal_shape : (513, 151) & mix_mag_shape : (513, 151)
vocal_shape : (513, 218) & mix_mag_shape : (513, 218)
vocal_shape : (513, 201) & mix_mag_shape : (513, 201)
vocal_shape : (513, 213) & mix_mag_shape : (513, 213)
vocal_shape : (513, 145) & mix_mag_shape : (513, 145)
vocal_shape : (513, 118) & mix_mag_shape : (513, 118)
vocal_shape : (513, 149) & mix_mag_shape : (513, 149)
vocal_shape : (513, 161) & mix_mag_shape : (513, 161)
vocal_shape : (513, 234) & m

vocal_shape : (513, 230) & mix_mag_shape : (513, 230)
vocal_shape : (513, 182) & mix_mag_shape : (513, 182)
vocal_shape : (513, 224) & mix_mag_shape : (513, 224)
vocal_shape : (513, 135) & mix_mag_shape : (513, 135)
vocal_shape : (513, 207) & mix_mag_shape : (513, 207)
vocal_shape : (513, 220) & mix_mag_shape : (513, 220)
vocal_shape : (513, 139) & mix_mag_shape : (513, 139)
vocal_shape : (513, 137) & mix_mag_shape : (513, 137)
vocal_shape : (513, 161) & mix_mag_shape : (513, 161)
vocal_shape : (513, 183) & mix_mag_shape : (513, 183)
vocal_shape : (513, 212) & mix_mag_shape : (513, 212)
vocal_shape : (513, 227) & mix_mag_shape : (513, 227)
vocal_shape : (513, 138) & mix_mag_shape : (513, 138)
vocal_shape : (513, 116) & mix_mag_shape : (513, 116)
vocal_shape : (513, 129) & mix_mag_shape : (513, 129)
vocal_shape : (513, 224) & mix_mag_shape : (513, 224)
vocal_shape : (513, 109) & mix_mag_shape : (513, 109)
vocal_shape : (513, 151) & mix_mag_shape : (513, 151)
vocal_shape : (513, 174) & m

vocal_shape : (513, 168) & mix_mag_shape : (513, 168)
vocal_shape : (513, 136) & mix_mag_shape : (513, 136)
vocal_shape : (513, 141) & mix_mag_shape : (513, 141)
vocal_shape : (513, 167) & mix_mag_shape : (513, 167)
vocal_shape : (513, 129) & mix_mag_shape : (513, 129)
vocal_shape : (513, 174) & mix_mag_shape : (513, 174)
vocal_shape : (513, 137) & mix_mag_shape : (513, 137)
vocal_shape : (513, 176) & mix_mag_shape : (513, 176)
vocal_shape : (513, 131) & mix_mag_shape : (513, 131)
vocal_shape : (513, 221) & mix_mag_shape : (513, 221)
vocal_shape : (513, 135) & mix_mag_shape : (513, 135)
vocal_shape : (513, 193) & mix_mag_shape : (513, 193)
vocal_shape : (513, 149) & mix_mag_shape : (513, 149)
vocal_shape : (513, 213) & mix_mag_shape : (513, 213)
vocal_shape : (513, 173) & mix_mag_shape : (513, 173)
vocal_shape : (513, 151) & mix_mag_shape : (513, 151)
vocal_shape : (513, 236) & mix_mag_shape : (513, 236)
vocal_shape : (513, 219) & mix_mag_shape : (513, 219)
vocal_shape : (513, 141) & m

vocal_shape : (513, 145) & mix_mag_shape : (513, 145)
vocal_shape : (513, 167) & mix_mag_shape : (513, 167)
vocal_shape : (513, 152) & mix_mag_shape : (513, 152)
vocal_shape : (513, 213) & mix_mag_shape : (513, 213)
vocal_shape : (513, 139) & mix_mag_shape : (513, 139)
vocal_shape : (513, 145) & mix_mag_shape : (513, 145)
vocal_shape : (513, 231) & mix_mag_shape : (513, 231)
vocal_shape : (513, 216) & mix_mag_shape : (513, 216)
vocal_shape : (513, 159) & mix_mag_shape : (513, 159)
vocal_shape : (513, 237) & mix_mag_shape : (513, 237)
vocal_shape : (513, 175) & mix_mag_shape : (513, 175)
vocal_shape : (513, 159) & mix_mag_shape : (513, 159)
vocal_shape : (513, 133) & mix_mag_shape : (513, 133)
vocal_shape : (513, 141) & mix_mag_shape : (513, 141)
vocal_shape : (513, 120) & mix_mag_shape : (513, 120)
vocal_shape : (513, 179) & mix_mag_shape : (513, 179)
vocal_shape : (513, 156) & mix_mag_shape : (513, 156)
vocal_shape : (513, 143) & mix_mag_shape : (513, 143)
vocal_shape : (513, 127) & m

vocal_shape : (513, 106) & mix_mag_shape : (513, 106)
vocal_shape : (513, 235) & mix_mag_shape : (513, 235)
vocal_shape : (513, 138) & mix_mag_shape : (513, 138)
vocal_shape : (513, 127) & mix_mag_shape : (513, 127)
vocal_shape : (513, 131) & mix_mag_shape : (513, 131)
vocal_shape : (513, 201) & mix_mag_shape : (513, 201)
vocal_shape : (513, 138) & mix_mag_shape : (513, 138)
vocal_shape : (513, 152) & mix_mag_shape : (513, 152)
vocal_shape : (513, 131) & mix_mag_shape : (513, 131)
vocal_shape : (513, 109) & mix_mag_shape : (513, 109)
vocal_shape : (513, 163) & mix_mag_shape : (513, 163)
vocal_shape : (513, 106) & mix_mag_shape : (513, 106)
vocal_shape : (513, 132) & mix_mag_shape : (513, 132)
vocal_shape : (513, 138) & mix_mag_shape : (513, 138)
vocal_shape : (513, 211) & mix_mag_shape : (513, 211)
vocal_shape : (513, 146) & mix_mag_shape : (513, 146)
vocal_shape : (513, 153) & mix_mag_shape : (513, 153)
vocal_shape : (513, 211) & mix_mag_shape : (513, 211)
vocal_shape : (513, 106) & m

vocal_shape : (513, 178) & mix_mag_shape : (513, 178)
vocal_shape : (513, 176) & mix_mag_shape : (513, 176)
vocal_shape : (513, 155) & mix_mag_shape : (513, 155)
vocal_shape : (513, 191) & mix_mag_shape : (513, 191)
vocal_shape : (513, 211) & mix_mag_shape : (513, 211)
vocal_shape : (513, 137) & mix_mag_shape : (513, 137)
vocal_shape : (513, 129) & mix_mag_shape : (513, 129)
vocal_shape : (513, 197) & mix_mag_shape : (513, 197)
vocal_shape : (513, 212) & mix_mag_shape : (513, 212)
vocal_shape : (513, 166) & mix_mag_shape : (513, 166)
vocal_shape : (513, 138) & mix_mag_shape : (513, 138)
vocal_shape : (513, 155) & mix_mag_shape : (513, 155)
vocal_shape : (513, 163) & mix_mag_shape : (513, 163)
vocal_shape : (513, 183) & mix_mag_shape : (513, 183)
vocal_shape : (513, 136) & mix_mag_shape : (513, 136)
vocal_shape : (513, 205) & mix_mag_shape : (513, 205)
vocal_shape : (513, 215) & mix_mag_shape : (513, 215)
vocal_shape : (513, 159) & mix_mag_shape : (513, 159)
vocal_shape : (513, 140) & m

vocal_shape : (513, 182) & mix_mag_shape : (513, 182)
vocal_shape : (513, 106) & mix_mag_shape : (513, 106)
vocal_shape : (513, 200) & mix_mag_shape : (513, 200)
vocal_shape : (513, 180) & mix_mag_shape : (513, 180)
vocal_shape : (513, 154) & mix_mag_shape : (513, 154)
vocal_shape : (513, 143) & mix_mag_shape : (513, 143)
vocal_shape : (513, 141) & mix_mag_shape : (513, 141)
vocal_shape : (513, 190) & mix_mag_shape : (513, 190)
vocal_shape : (513, 215) & mix_mag_shape : (513, 215)
vocal_shape : (513, 131) & mix_mag_shape : (513, 131)
vocal_shape : (513, 174) & mix_mag_shape : (513, 174)
vocal_shape : (513, 155) & mix_mag_shape : (513, 155)
vocal_shape : (513, 123) & mix_mag_shape : (513, 123)
vocal_shape : (513, 132) & mix_mag_shape : (513, 132)
vocal_shape : (513, 151) & mix_mag_shape : (513, 151)
vocal_shape : (513, 203) & mix_mag_shape : (513, 203)
vocal_shape : (513, 154) & mix_mag_shape : (513, 154)
vocal_shape : (513, 135) & mix_mag_shape : (513, 135)
vocal_shape : (513, 177) & m

In [12]:
#change dataset to numpy array
mix_train_X = np.asarray(mix_train_X,dtype=np.float32)
vocal_train_y = np.asarray(vocal_train_y,dtype=np.float32)

mix_val_X = np.asarray(mix_val_X,dtype=np.float32)
vocal_val_y = np.asarray(vocal_val_y,dtype=np.float32)

mix_test_X = np.asarray(mix_test_X,dtype=np.float32)
vocal_test_y = np.asarray(vocal_test_y,dtype=np.float32)

In [None]:
#RAM overuse
train_phase = np.asarray(train_phase)
val_phase = np.asarray(val_phase)
test_phase = np.asarray(test_phase)

In [13]:
print("Training Dataset Size :")
print(f'mix shape : {mix_train_X.shape}, voc shape : {vocal_train_y.shape}')

print("Validation Dataset Size :")
print(f'mix shape : {mix_val_X.shape}, voc shape : {vocal_val_y.shape}')

print("Testing Dataset Size :")
print(f'mix shape : {mix_test_X.shape}, voc shape : {vocal_test_y.shape}')

Training Dataset Size :
mix shape : (7647, 512, 128, 1), voc shape : (7647, 512, 128, 1)
Validation Dataset Size :
mix shape : (1929, 512, 128, 1), voc shape : (1929, 512, 128, 1)
Testing Dataset Size :
mix shape : (1743, 512, 128, 1), voc shape : (1743, 512, 128, 1)


* Without Augmentation

In [None]:
#create dataset without augmented dataset
print("Creating dataset...")

dir = 'MIR-1K_Processed'

for f in os.listdir(dir):
  path = os.path.join(dir,f)
  id = f.split('_')[1]
  mix_mag,voc,phase  = load_npy(path,target="vocal")

  if id == '4':    
    sample_patches(mix_mag,voc,phase,mix_val_X,vocal_val_y,val_phase)
  
  elif id == '5':
    sample_patches(mix_mag,voc,phase,mix_test_X,vocal_test_y,test_phase)
  
  else:
    sample_patches(mix_mag,voc,phase,mix_train_X,vocal_train_y,train_phase)
  
print("Datasets created successfully.")

**Old dataset**

In [None]:
dir = 'MIR-1K_Processed'

print("Creating training dataset ... ")
#global variables: training data after computing patches
X = []
y = []
X_phase = []

for f in os.listdir(dir):
  path = os.path.join(dir,f)
  print(f)
  mix_mag,voc,phase  = load_npy(path,target="vocal")
  sample_patches(mix_mag,voc,phase)

vocal_X = np.asarray(X,dtype=np.float32)
vocal_y = np.asarray(y,dtype=np.float32)
Mix_phase = np.asarray(X_phase)

print("Dataset created successfully.")
print(f'vocal_X_shape : {vocal_X.shape} & vocal_y.shape : {vocal_y.shape} & phase.shape : {Mix_phase.shape}')

In [None]:
#split total dataset into training and validation dataset
np.random.seed(0)
index = np.random.permutation(vocal_X.shape[0])
train_size = int(0.7*vocal_X.shape[0])
test_size = int(0.1*vocal_X.shape[0])
train_idx, val_idx, test_idx = index[:train_size], index[train_size:-test_size], index[-test_size:]
vocal_train_X, vocal_val_X, vocal_test_X = vocal_X[train_idx], vocal_X[val_idx], vocal_X[test_idx]
vocal_train_y, vocal_val_y, vocal_test_y = vocal_y[train_idx], vocal_y[val_idx],vocal_y[test_idx]
mix_test_phase = Mix_phase[test_idx]
print("Train & Validation dataset created successfully.")
print(f'Train set shape : {vocal_train_X.shape}, {vocal_train_y.shape}')
print(f'Val set shape : {vocal_val_X.shape}, {vocal_val_y.shape}')
print(f'Test set shape : {vocal_test_X.shape}, {vocal_test_y.shape}')
print(f'Phase set shape : {mix_test_phase.shape}')

**Define U-Net Model**

In [14]:
import numpy as np
import tensorflow as tf
from keras.models import load_model
from tensorflow.keras.layers import Conv2D,Concatenate,Activation,Dropout
from tensorflow.keras.layers import LeakyReLU as L_ReLU
from tensorflow.keras.layers import BatchNormalization as BatchNorm
from tensorflow.keras.layers import Conv2DTranspose as DeConv2D
from tensorflow.keras import Input,Model
from tensorflow import keras

2022-04-22 04:36:50.324278: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.4/lib64:/home/pratikhya/catkin_ws/devel/lib:/opt/ros/noetic/lib:/opt/ros/noetic/lib/x86_64-linux-gnu
2022-04-22 04:36:50.324342: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [15]:
def model_u_net(input=Input((512, 128, 1))):
    conv_1 = Conv2D(16, 5, strides=2, padding='same')(input)
    conv_1 = BatchNorm()(conv_1)
    conv_1 = L_ReLU(alpha=0.2)(conv_1)
    print(conv_1.shape)

    conv_2 = Conv2D(32, 5, strides=2, padding='same')(conv_1)
    conv_2 = BatchNorm()(conv_2)
    conv_2 = L_ReLU(alpha=0.2)(conv_2)
    print(conv_2.shape)

    conv_3 = Conv2D(64, 5, strides=2, padding='same')(conv_2)
    conv_3 = BatchNorm()(conv_3)
    conv_3 = L_ReLU(alpha=0.2)(conv_3)
    print(conv_3.shape)

    conv_4 = Conv2D(128, 5, strides=2, padding='same')(conv_3)
    conv_4 = BatchNorm()(conv_4)
    conv_4 = L_ReLU(alpha=0.2)(conv_4)
    print(conv_4.shape)

    conv_5 = Conv2D(256, 5, strides=2, padding='same')(conv_4)
    conv_5 = BatchNorm()(conv_5)
    conv_5 = L_ReLU(alpha=0.2)(conv_5)
    print(conv_5.shape)

    conv_6 = Conv2D(512, 5, strides=2, padding='same')(conv_5)
    conv_6 = BatchNorm()(conv_6)
    conv_6 = L_ReLU(alpha=0.2)(conv_6)
    print(conv_6.shape)

    deconv_7 = DeConv2D(256, 5, strides=2, padding='same')(conv_6)
    deconv_7 = BatchNorm()(deconv_7)
    deconv_7 = Dropout(0.5)(deconv_7)
    deconv_7 = Activation('relu')(deconv_7)
    print(deconv_7.shape)

    deconv_8 = Concatenate(axis=3)([deconv_7, conv_5])
    deconv_8 = DeConv2D(128, 5, strides=2, padding='same')(deconv_8)
    deconv_8 = BatchNorm()(deconv_8)
    deconv_8 = Dropout(0.5)(deconv_8)
    deconv_8 = Activation('relu')(deconv_8)
    print(deconv_8.shape)

    deconv_9 = Concatenate(axis=3)([deconv_8, conv_4])
    deconv_9 = DeConv2D(64, 5, strides=2, padding='same')(deconv_9)
    deconv_9 = BatchNorm()(deconv_9)
    deconv_9 = Dropout(0.5)(deconv_9)
    deconv_9 = Activation('relu')(deconv_9)
    print(deconv_9.shape)

    deconv_10 = Concatenate(axis=3)([deconv_9, conv_3])
    deconv_10 = DeConv2D(32, 5, strides=2, padding='same')(deconv_10)
    deconv_10 = BatchNorm()(deconv_10)
    deconv_10 = Activation('relu')(deconv_10)
    print(deconv_10.shape)

    deconv_11 = Concatenate(axis=3)([deconv_10, conv_2])
    deconv_11 = DeConv2D(16, 5, strides=2, padding='same')(deconv_11)
    deconv_11 = BatchNorm()(deconv_11)
    deconv_11 = Activation('relu')(deconv_11)
    print(deconv_11.shape)

    deconv_12 = Concatenate(axis=3)([deconv_11, conv_1])
    deconv_12 = DeConv2D(1, 5, strides=2, padding='same')(deconv_12)
    deconv_12 = Activation('sigmoid')(deconv_12)
    print(deconv_12.shape)

    #compute output from mask
    mask = deconv_12
    out = mask*input
    print(out.shape)

    model = Model(inputs=input, outputs=out)
    model.summary()
    return model

**Training**

In [16]:
#create model
model = model_u_net()

2022-04-22 04:37:23.993351: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2022-04-22 04:37:24.106385: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-22 04:37:24.107111: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce MX250 computeCapability: 6.1
coreClock: 1.582GHz coreCount: 3 deviceMemorySize: 1.96GiB deviceMemoryBandwidth: 52.21GiB/s
2022-04-22 04:37:24.107416: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.4/lib64:/home/pratikhya/catkin_ws/devel/lib:/opt/ros/noetic/lib:/opt/ros/noetic/lib/x86_

(None, 256, 64, 16)
(None, 128, 32, 32)
(None, 64, 16, 64)
(None, 32, 8, 128)
(None, 16, 4, 256)
(None, 8, 2, 512)
(None, 16, 4, 256)
(None, 32, 8, 128)
(None, 64, 16, 64)
(None, 128, 32, 32)
(None, 256, 64, 16)
(None, 512, 128, 1)
(None, 512, 128, 1)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 512, 128, 1) 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 256, 64, 16)  416         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 256, 64, 16)  64          conv2d[0][0]                     
________________________________________

In [None]:
#define optimizer for model
model.compile(optimizer ='adam', loss='mean_absolute_error')

#define call_back functions
my_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=5,monitor="val_loss"),
    tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss",patience=3),
]

In [None]:
#train model
Epoch = 30
Batch = 16

history = model.fit(mix_train_X,vocal_train_y,epochs=Epoch,batch_size=Batch,verbose=1,validation_data=(mix_val_X,vocal_val_y),callbacks=my_callbacks)
print("Model training complete.")

In [None]:
#save trained model
final_model = 'drive/MyDrive/source_sep/model_1'
model.save(final_model)

In [None]:
model = load_model('drive/MyDrive/source_sep/model_1')

**Test Model**

In [None]:
#test model
Batch = 16
test_pred = model.predict(vocal_test_X, batch_size = Batch)

In [None]:
test_pred.shape

In [None]:
#predict on test data and store
#can modify to store raw test data for accuracy check purpose
print("Processing test data ...")
save_dir = 'drive/MyDrive/source_sep/test_data_pred'

for i in range(test_pred.shape[0]):
  pred_i = np.vstack((np.zeros((128)), test_pred[i].reshape(512, 128)))
  mix_phase_i = np.vstack((np.zeros((128)), mix_test_phase[i].reshape(512, 128)))
  y_i = pred_i * mix_phase_i
  print(f'Test data {i} pred shape : {y_i.shape}')
  audio_i = istft(y_i,hop_length=Hop,win_length=Window_size)
  sf.write(f'{save_dir}/audio_{i}.wav',audio_i,22050)

print("Test data prediction complete.")

In [None]:
def test_audio(filename,target_filename):
    voice,sr = lb.load(filename)
    mag, pha = mag_phase_spectrogram(voice)
    mag = mag[np.newaxis,1:,:128,np.newaxis]
    pred = model.predict(mag)
    pred = np.vstack((np.zeros((128)), pred[0].reshape(512, 128)))
    pha = pha[:,:128]
    iso_voice = pred*pha
    iso_voice = istft(iso_voice,hop_length=Hop,win_length=Window_size)
    sf.write(target_filename,iso_voice,sr)

**Evaluation**

In [None]:
from mir_eval import separation

In [None]:
#mix_data = vocal_test_X
#vocal_data = vocal_test_y
#pred = test_pred
#phase = mix_test_phase

#function to make (nsrc,nsample) for each audio reference & estimate and use bss_eval_sources
def compute_metric(mix_data, vocal_data, pred, phase):
  sdr_arr = []
  sir_arr = []
  sar_arr = []
  mix_audio = mix_data*phase
  vocal_audio = vocal_data*phase
  pred_audio = pred*phase

  for i in range(mix_audio.shape[0]):
    #prepare arrays for istft
    mix_i = mix_audio[i].reshape((512,128))
    vocal_i = vocal_audio[i].reshape((512,128))
    acc_i = mix_i - vocal_i
    pred_voc_i = pred_audio[i].reshape((512,128))
    pred_acc_i = mix_i - pred_voc_i

    vocal_i = np.vstack((np.zeros(128),vocal_i))
    acc_i = np.vstack((np.zeros(128),acc_i))
    pred_voc_i = np.vstack((np.zeros(128),pred_voc_i))
    pred_acc_i = np.vstack((np.zeros(128),pred_acc_i))

    #istft output for each source
    ref_voc = istft(vocal_i,hop_length=Hop,win_length=Window_size)
    ref_acc = istft(acc_i,hop_length=Hop,win_length=Window_size)
    est_voc = istft(pred_voc_i,hop_length=Hop,win_length=Window_size)
    est_acc = istft(pred_acc_i,hop_length=Hop,win_length=Window_size)

    #stack ref_src & est_src
    ref = np.vstack((ref_voc,ref_acc))
    est = np.vstack((est_voc,est_acc))
    # print(ref.shape,est.shape)

    sdr,sir,sar,_ = separation.bss_eval_sources(ref,est,compute_permutation=False)
    sdr_arr.append(sdr)
    sir_arr.append(sir)
    sar_arr.append(sar)

  return np.asarray(sdr_arr), np.asarray(sir_arr), np.asarray(sar_arr)    

In [None]:
sdr,sir,sar = compute_metric(vocal_test_X,vocal_test_y,test_pred,mix_test_phase)

In [None]:
print(sdr.shape,sir.shape,sar.shape)

In [None]:
print('SDR:')
print(sdr[:5])
print('SIR:')
print(sir[:5])
print('SAR:')
print(sar[:5])

In [None]:
SDR_vocal = np.mean(sdr[:,0])
SDR_acc = np.mean(sdr[:,1])
SIR_vocal = np.mean(sir[:,0])
SIR_acc = np.mean(sir[:,1])
SAR_vocal = np.mean(sar[:,0])
SAR_acc = np.mean(sar[:,1])

In [None]:
print(f'SDR vocal : {SDR_vocal}')
print(f'SDR acc : {SDR_acc}')
print(f'SIR vocal : {SIR_vocal}')
print(f'SIR acc : {SIR_acc}')
print(f'SAR vocal : {SAR_vocal}')
print(f'SAR acc : {SAR_acc}')