## Parsing MedleyDB

In [290]:
import librosa
import soundfile as sf
import os
import glob
import shutil
import icecream as ic
import numpy as np
import itertools
import zipfile
import yaml
import re

### Copy yaml metadata from metadata folder into songs folder

In [332]:
def parse_file(metadata_path, songs_path):
    """Copy each ``<song>_METADATA.yaml`` file into its matching song folder.

    Args:
        metadata_path: Folder that holds the ``*_METADATA.yaml`` files.
        songs_path: Folder that holds one sub-folder per song, named after the song.

    Metadata files without a matching song folder are ignored, and a metadata
    file that is already present in the song folder is never overwritten.
    """
    metadata_suffix = "_METADATA"

    for file_name in sorted(os.listdir(metadata_path)):
        # Only visible YAML files are considered (hidden files such as macOS
        # "._*" companions are skipped).
        if file_name.startswith('.') or not file_name.endswith('.yaml'):
            continue

        # splitext strips only the final extension, so song names that contain
        # dots are handled correctly (split('.')[0] would truncate them).
        base_name, _ = os.path.splitext(file_name)
        if not base_name.endswith(metadata_suffix):
            continue

        song_name = base_name[:-len(metadata_suffix)]
        if not song_name:
            continue

        song_folder = os.path.join(songs_path, song_name)
        if not os.path.isdir(song_folder):
            continue

        destination = os.path.join(song_folder, file_name)
        if not os.path.exists(destination):
            shutil.copy(os.path.join(metadata_path, file_name), destination)



### Run
First argument: metadata folder (path relative to the directory containing the notebook; do not prefix it with `./`).

Second argument: songs folder (same rules).

In [333]:
# Copy the metadata YAML files from 'Metadata' into the matching song folders under 'V2'.
# The commented-out calls are the same step pointed at other locations.
# parse_file('metadata', 'songs')
# parse_file('/Volumes/Lexar/medley_processed/Metadata', '/Volumes/Lexar/medley_processed/V2')
parse_file('Metadata', 'V2')

### Sort stems into instrument folders

In [334]:
def sort_stems(songs_folder):
    """Rename every stem after its instrument and copy it into an instrument folder.

    For each song folder in ``songs_folder`` the ``<song>_METADATA.yaml`` file is
    read, and every stem whose filename matches ``*STEM_<n>.wav`` is

    1. renamed inside ``<song>_STEMS`` to ``<instrument>_<k>.wav``, where ``k`` is
       the number of entries already in ``<instrument>_INSTR``;
    2. copied to ``<instrument>_INSTR/<instrument>_<k>.wav``;
    3. recorded under its new filename in the metadata, which is written back
       after every processed stem so an interrupted run stays consistent.

    Instruments ending in ``singer`` and the instrument ``vocalists`` are merged
    into ``vocals``. Song folders without a metadata file, and stems whose audio
    file cannot be found, are reported and skipped instead of aborting the run.

    Args:
        songs_folder: Folder that holds one sub-folder per song.
    """
    stem_pattern = re.compile(r'.*STEM_\d+\.wav$')

    for song in os.listdir(songs_folder):
        song_dir = os.path.join(songs_folder, song)
        if not os.path.isdir(song_dir):
            continue

        metadata_path = os.path.join(song_dir, f"{song}_METADATA.yaml")
        if not os.path.isfile(metadata_path):
            # parse_file leaves folders without metadata untouched, so do the same here.
            print(f"Skipping {song}: no metadata file")
            continue

        with open(metadata_path) as metadata_file:
            metadata = yaml.safe_load(metadata_file)

        stems_dir = os.path.join(song_dir, f"{song}_STEMS")
        stem_entries = (metadata or {}).get('stems') or {}

        for stem_info in stem_entries.values():
            instrument = stem_info['instrument'].replace("/", "_")
            if instrument.endswith("singer") or instrument == "vocalists":
                instrument = "vocals"
                stem_info['instrument'] = "vocals"
            # Folder and file names use the instrument name without spaces.
            instrument = instrument.replace(" ", "")

            filename = stem_info['filename']
            if not stem_pattern.search(filename):
                print(f"Skipping {filename}")
                continue

            instr_dir = os.path.join(song_dir, f"{instrument}_INSTR")
            stem_index = len(os.listdir(instr_dir)) if os.path.isdir(instr_dir) else 0
            new_name = f"{instrument}_{stem_index}.wav"

            original_path = os.path.join(stems_dir, filename)
            renamed_path = os.path.join(stems_dir, new_name)

            if os.path.exists(original_path):
                # Rename stems to their instrument name + number
                os.rename(original_path, renamed_path)
            elif not os.path.exists(renamed_path):
                # Neither the original nor an already-renamed file exists:
                # nothing to copy, and no empty instrument folder is created.
                print(f"Skipping {filename}: not found in {stems_dir}")
                continue
            # else: a previous run renamed the stem but stopped before copying it.

            # Update metadata with new name
            stem_info['filename'] = new_name

            # Sort into instrument folders
            os.makedirs(instr_dir, exist_ok=True)
            shutil.copy(renamed_path, os.path.join(instr_dir, new_name))

            with open(metadata_path, 'w') as metadata_file:
                yaml.safe_dump(metadata, metadata_file)
                

In [335]:
# Sort the stems of every song folder under 'V2' into per-instrument folders.
# The commented-out calls are the same step pointed at other locations.
# sort_stems('songs')
# sort_stems('/Volumes/Lexar/medley_processed/V2')
sort_stems('V2')

In [336]:
SAMPLE_RATE = 44100

def sum_stems(songs_folder):
    """Mix all stems of each instrument folder into one ``<instrument>_INSTR_SUM.wav``.

    For every ``*_INSTR`` folder of every song, the ``.wav`` stems are loaded at
    ``SAMPLE_RATE``, added sample by sample, written as a 24-bit PCM file named
    ``<folder>_SUM.wav`` inside that folder, and the individual stems are then
    deleted.

    The function is safe to re-run: an existing ``_SUM.wav`` is never used as an
    input (and therefore never deleted), and folders with no stems left are
    skipped.

    Args:
        songs_folder: Folder that holds one sub-folder per song.
    """
    for song in os.listdir(songs_folder):
        song_dir = os.path.join(songs_folder, song)
        if not os.path.isdir(song_dir):
            continue

        instruments = [
            entry for entry in os.listdir(song_dir)
            if entry.endswith("_INSTR") and os.path.isdir(os.path.join(song_dir, entry))
        ]
        ic.ic(instruments)

        for instrument in instruments:
            instr_dir = os.path.join(song_dir, instrument)
            sum_name = f"{instrument}_SUM.wav"

            # Skip macOS "._" companion files and the output of a previous run;
            # treating the sum as a stem would delete it in the cleanup below.
            stems = [
                stem for stem in os.listdir(instr_dir)
                if stem.endswith(".wav") and not stem.startswith("._") and stem != sum_name
            ]
            if not stems:
                continue

            inst_stem = None
            for stem in stems:
                ic.ic(stem)
                y_stem, _ = librosa.load(os.path.join(instr_dir, stem), mono=False, sr=SAMPLE_RATE)
                if inst_stem is None:
                    inst_stem = y_stem
                else:
                    # Accumulate in place to avoid allocating a new array per stem.
                    np.add(inst_stem, y_stem, out=inst_stem)

            # Write sum to file; the array is transposed before it is handed to soundfile.
            sf.write(os.path.join(instr_dir, sum_name), inst_stem.T, SAMPLE_RATE, subtype='PCM_24')

            # The individual stems are no longer needed once the sum exists.
            for stem in stems:
                os.remove(os.path.join(instr_dir, stem))

In [337]:
# Mix the stems of each instrument folder under 'V2' into a single *_SUM.wav file.
# The commented-out calls are the same step pointed at other locations.
# sum_stems('songs')
# sum_stems('/Volumes/Lexar/medley_processed/V2')
sum_stems('V2')

ic| instruments: ['piano_INSTR',
                  'drumset_INSTR',
                  'tenorsaxophone_INSTR',
                  

'trumpet_INSTR',
                  'doublebass_INSTR',
                  'trombone_INSTR',
                  'bassclarinet_INSTR']
ic| stem: 'piano_0.wav'
ic| stem: 'drumset_0.wav'
ic| stem: 'tenorsaxophone_0.wav'
ic| stem: 'trumpet_0.wav'
ic| stem: 'doublebass_0.wav'
ic| stem: 'trombone_0.wav'
ic| stem: 'bassclarinet_0.wav'
ic| instruments: ['drumset_INSTR',
                  'vocals_INSTR',
                  'electricbass_INSTR',
                  'distortedelectricguitar_INSTR']
ic| stem: 'drumset_0.wav'
ic| stem: 'vocals_0.wav'
ic| stem: 'vocals_1.wav'
ic| stem: 'electricbass_0.wav'
ic| stem: 'distortedelectricguitar_2.wav'
ic| stem: 'distortedelectricguitar_1.wav'
ic| stem: 'distortedelectricguitar_0.wav'
ic| instruments: ['piano_INSTR',
                  'drumset_INSTR',
                  'tenorsaxophone_INSTR',
                  'doublebass_INSTR',
                  'trombonesection_INSTR',
                  'cleanelectricguitar_INSTR',
                  'trumpetsection_INSTR']


### generate_perms
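Writes one mixed WAV per stem combination of every song. The step that zips the entire song folder at the end (`shutil.make_archive` / `zipfile`) is currently commented out, so no archive is produced.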

In [338]:
SAMPLE_RATE = 44100

import tqdm as tqdm

def generate_perms(songs_folder):
    """Write one mixed WAV per combination of instrument sums for every song.

    Steps performed for each song folder in ``songs_folder``:

    1. Copy every ``<instr>/<instr>_SUM.wav`` into ``<song>_STEMS`` and delete
       everything else in that folder.
    2. Build the stem combinations: sizes ``n-2`` and ``n-1`` when the song has
       more than three stems ``n``, otherwise sizes ``2 .. n-1``. The full mix of
       all ``n`` stems is never generated.
    3. For each combination create ``<song>_<names>/<song>_<names>.wav`` holding
       the sample-wise sum of its stems.

    Songs for which ``<song>.zip`` already exists are skipped. Combinations whose
    output WAV already exists are skipped as well, so an interrupted run can be
    resumed; a folder left empty by an interrupted run is filled in.

    Args:
        songs_folder: Folder that holds one sub-folder per song, each with its
            ``*_INSTR`` folders already summed and a ``<song>_STEMS`` folder.
    """
    songs = os.listdir(songs_folder)

    for song in tqdm.tqdm(songs, total=len(songs), desc="Songs"):
        song_folder = songs_folder + '/' + song

        # if there is no song folder, skip
        if not os.path.isdir(song_folder):
            continue

        # if the song has already been processed and zipped, skip
        if os.path.exists(song_folder + '.zip'):
            print(f"Skipping {song_folder}")
            continue

        stems_folder = song_folder + '/' + song + '_STEMS'

        instr_folders = [
            folder for folder in os.listdir(song_folder)
            if folder.endswith("_INSTR") and not folder.startswith("._")
        ]
        for instr in instr_folders:
            shutil.copy(f"{song_folder}/{instr}/{instr}_SUM.wav", f"{stems_folder}/{instr}_SUM.wav")

        # Keep only the instrument sums in the stems folder.
        for entry in os.listdir(stems_folder):
            if not entry.endswith('_SUM.wav') or entry.startswith("._"):
                os.remove(f"{stems_folder}/{entry}")

        # Sorted so that combination names do not depend on directory listing
        # order; hidden files are ignored.
        stems = sorted(
            stem for stem in os.listdir(stems_folder)
            if stem.endswith('.wav') and not stem.startswith('.')
        )
        num_stems = len(stems)

        # Combination sizes: num_stems-2 and num_stems-1 for more than three
        # stems, otherwise 2 up to num_stems-1 (the upper bound is exclusive).
        smallest_size = num_stems - 2 if num_stems > 3 else 2
        perms = []
        for size in range(smallest_size, num_stems):
            perms += itertools.combinations(stems, size)

        # Each stem is decoded at most once per song and reused by every
        # combination that contains it (one array per stem is held in memory).
        audio_cache = {}

        for perm in tqdm.tqdm(perms, leave=False, total=len(perms), desc="Permutations"):
            # combine the names of all of the stems in the combination
            perm_instruments = "_".join([p.split('.')[0] for p in perm])

            # if the name is longer than 200 characters and the combination holds
            # more stems than it leaves out, name it after the excluded stems
            if len(perm_instruments) > 200 and len(perm) > num_stems - len(perm):
                perm_instruments = "_".join([p.split('.')[0] for p in stems if p not in perm])
                perm_instruments = perm_instruments + "_EXCL"  # mark it as an exclusion-based name

            perm_folder = song_folder + '/' + song + '_' + perm_instruments
            output_path = perm_folder + '/' + song + '_' + perm_instruments + '.wav'

            # Decide on the output file, not the folder: a folder without its
            # WAV means an earlier run stopped before writing.
            os.makedirs(perm_folder, exist_ok=True)
            if os.path.exists(output_path):
                continue

            stems_arr = []
            for stem in perm:
                if stem not in audio_cache:
                    stem_audio, _ = librosa.load(stems_folder + '/' + stem, mono=False, sr=SAMPLE_RATE)
                    # Arrays whose first axis is not 2 are transposed before summing.
                    if stem_audio.shape[0] != 2:
                        stem_audio = stem_audio.T
                    audio_cache[stem] = stem_audio
                stems_arr.append(audio_cache[stem])

            perm_sum = np.sum(stems_arr, axis=0)
            # NOTE(review): no subtype is passed here, whereas sum_stems writes
            # PCM_24 - confirm the library default is the intended bit depth.
            sf.write(output_path, perm_sum.T, SAMPLE_RATE)

        # Archiving of the finished song folder is currently disabled:
        # with zipfile.ZipFile(song_folder + '.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
        #     for root, _, files in os.walk(f'{song_folder}'):
        #         for file in files:
        #             zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), f'{song_folder}'))

### Run
Argument: songs folder that already contains the stems and the YAML metadata (path relative to the directory containing the notebook; do not prefix it with `./`).

In [339]:
# Generate the stem-combination mixes for every song folder under 'V2'.
# The commented-out calls are the same step pointed at other locations.
# generate_perms('songs')
# generate_perms('/Volumes/Lexar/medley_processed/V2')
generate_perms('V2')

Songs: 100%|██████████| 74/74 [15:51<00:00, 12.85s/it]
