## Parsing MedleyDB

In [111]:
import librosa
import soundfile as sf
import os
import glob
import shutil
import icecream as ic
import numpy as np
import itertools
import zipfile
import yaml
import re

### Copy yaml metadata from metadata folder into songs folder

In [112]:
def parse_file(metadata_path, songs_path):
    songs_folder = os.listdir(songs_path)

    # filters out non yaml files
    metadata = glob.glob('*.yaml', root_dir=metadata_path)

    for file in metadata:

        yaml = file.split('.')[0]

        if yaml.endswith("_METADATA") == False:
            continue

        song_name = yaml[:-9]

        song_folder = songs_path + '/' + song_name
        assert os.path.isdir(song_folder), "Song folder does not exist"

        if song_name in songs_folder and not os.path.exists(song_folder + '/' + yaml + '.yaml'):
            shutil.copy(metadata_path + '/' + file, song_folder + '/' + file)



### Run
First argument: path to the metadata folder, relative to the directory that contains this notebook (do not prefix it with `./`).

Second argument: path to the songs folder, also relative to the directory that contains this notebook (do not prefix it with `./`).

In [116]:
# Copy every <song>_METADATA.yaml from metadata/ into its matching song folder under songs/.
parse_file('metadata', 'songs')

### Sort stems into instrument folders

In [126]:
def sort_stems(songs_folder: str) -> None:
    """Rename each stem after its instrument and copy it into an instrument folder.

    For every sub-folder ``<song>`` of ``songs_folder`` the file
    ``<song>/<song>_METADATA.yaml`` is loaded. Each entry under ``stems`` whose
    ``filename`` still ends in ``STEM_<digits>.wav`` is then:

    1. renamed inside ``<song>/<song>_STEMS`` to ``<instrument>_<n>.wav``,
       where ``<instrument>`` is the entry's instrument with ``/`` replaced by
       ``_`` and spaces removed, and ``<n>`` is the number of entries already
       present in ``<song>/<instrument>``;
    2. copied to ``<song>/<instrument>/<instrument>_<n>.wav``;
    3. recorded under its new name in the metadata file, which is rewritten
       after every processed stem.

    Entries whose filename does not match the pattern are reported and
    skipped, as are entries whose audio file cannot be found in the stems
    folder.

    Args:
        songs_folder: Folder holding one sub-folder per song.

    Raises:
        FileNotFoundError: If a song folder has no ``<song>_METADATA.yaml``.
    """
    stem_pattern = re.compile(r'.*STEM_\d+\.wav$')

    songs = [song for song in os.listdir(songs_folder)
             if os.path.isdir(songs_folder + '/' + song)]

    for song in songs:
        song_dir = f"{songs_folder}/{song}"
        stems_dir = f"{song_dir}/{song}_STEMS"
        metadata_path = f"{song_dir}/{song}_METADATA.yaml"

        with open(metadata_path) as metadata_file:
            metadata = yaml.safe_load(metadata_file)

        # Each value is the same dict object stored in `metadata`, so updating
        # it below also updates what gets written back to disk.
        for stem_info in metadata['stems'].values():
            # Assumes the instrument is stored as a single string - TODO confirm.
            # "/" would otherwise be read as a path separator in the folder name.
            instrument = stem_info['instrument'].replace("/", "_").replace(" ", "")
            stem_filename = stem_info['filename']

            if not stem_pattern.search(stem_filename):
                print(f"Skipping {stem_filename}")
                continue

            instrument_dir = f"{song_dir}/{instrument}"
            # Number the stem by how many entries the instrument folder holds.
            if os.path.isdir(instrument_dir):
                num_instr_stems = len(os.listdir(instrument_dir))
            else:
                num_instr_stems = 0

            new_filename = f"{instrument}_{num_instr_stems}.wav"
            original_path = f"{stems_dir}/{stem_filename}"
            renamed_path = f"{stems_dir}/{new_filename}"

            if os.path.exists(original_path):
                # Rename the stem to its instrument name + number and keep the
                # in-memory metadata pointing at the new name.
                os.rename(original_path, renamed_path)
                stem_info['filename'] = new_filename
            elif not os.path.exists(renamed_path):
                # Nothing to rename and nothing to copy: skip this stem rather
                # than failing in the copy and abandoning every remaining song.
                print(f"Skipping {stem_filename} (not found in {stems_dir})")
                continue

            # Create the instrument folder only once there is a file to put in it.
            os.makedirs(instrument_dir, exist_ok=True)
            shutil.copy(renamed_path, f"{instrument_dir}/{new_filename}")

            # Rewrite the metadata after each stem so it tracks the renames on disk.
            with open(metadata_path, 'w') as metadata_file:
                yaml.safe_dump(metadata, metadata_file)
                

In [127]:
# Rename and sort the stems of every song under songs/ (this rewrites each song's metadata file).
sort_stems('songs')

### generate_perms
Zips the entire song folder at the end with `zipfile.ZipFile`.

In [136]:
SAMPLE_RATE = 44100

def generate_perms(songs_folder):
    songs = os.listdir(songs_folder)

    for song in songs:
        song_folder = songs_folder + '/' + song
        
        # if there is no song folder, skip
        if os.path.isdir(songs_folder + '/' + song) == False:
            continue

        # if the song has already been processed and zipped, skip
        if os.path.exists(song_folder + '.zip'):
            print(f"Skipping {song_folder}")
            continue

        stems_folder = song_folder + '/' + song + '_STEMS'

        # get all the stems
        stems = glob.glob('*.wav', root_dir=stems_folder)
        num_stems = len(stems)

        # generate all possible permutations of the stems of lengths num_stems-2 to num_stems, or 2 to num_stems if num_stems <= 3
        perms = []
        if num_stems > 3:
            for i in range(num_stems-2, num_stems):
                perms += itertools.combinations(stems, i)
        else:
            for i in range(2, num_stems):
                perms += itertools.combinations(stems, i)
            
        # ic.ic(perms)

        # for each permutation, create a new folder and copy the stems into it
        for i, perm in enumerate(perms):
            # combine the names of all of the stems in the permutation
            perm_instruments = "_".join([p.split('.')[0] for p in perm])

            # perm_instruments = [p[:len(p)/2].join(p[:-2]) for p in perm_instruments] cut in half (backup plan)

            # if perm_instruments is greater than 225 characters, name it after the stems not in the permutation
            if len(perm_instruments) > 200 and len(perm) > num_stems - len(perm):
                perm_instruments = "_".join([p.split('.')[0] for p in stems if p not in perm])
                perm_instruments = perm_instruments + "_EXCL" # mark it as an exclusive permutation name

            perm_folder = song_folder + '/' + song + '_' + perm_instruments
            if(os.path.isdir(perm_folder) == False):
                os.mkdir(perm_folder)
            else:
                continue

            stems_arr = []

            for stem in perm:
                shutil.copy(stems_folder + '/' + stem, perm_folder + '/' + stem)

                stem_audio, sr = librosa.load(perm_folder + '/' + stem, mono=False, sr=SAMPLE_RATE)
                stems_arr.append(stem_audio)
                # ic.ic(stem_audio)

            perm_sum = np.sum(stems_arr, axis=0)
            # sf.write(perm_folder + '/' + song + '_PERM' + str(i) + '.wav', perm_sum.T, SAMPLE_RATE)
            # sf.write(perm_folder + '/' + song + '_PERM' + str(i) + '.wav', perm_sum.T, SAMPLE_RATE)
            sf.write(perm_folder + '/' + song + '_' + perm_instruments + '.wav', perm_sum.T, SAMPLE_RATE)
            
        # shutil.make_archive(song_folder, format='zip', root_dir=song_folder)
        with zipfile.ZipFile(song_folder + '.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(f'{song_folder}'):
                for file in files:
                    # zipf.write(song_folder, os.path.basename(song_folder))
                    zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), f'{song_folder}'))

### Run
Argument: path to the songs folder, which must already contain each song's stems and YAML metadata. The path is relative to the directory that contains this notebook (do not prefix it with `./`).

In [137]:
# Build the stem mixes and the zip archive for every song under songs/ that has no archive yet.
generate_perms('songs')

Skipping songs/MidnightBlue_StarsAreScreaming
Skipping songs/CassandraJenkins_PerfectDay
Skipping songs/Verdi_IlTrovatore
Skipping songs/FennelCartwright_FlowerDrumSong
Skipping songs/Karachacha_Volamos
Skipping songs/TheKitchenettes_Alive
Skipping songs/Sweat_Tact
Skipping songs/Plasma_GoodShout
Skipping songs/Allegria_MendelssohnMovement1
Skipping songs/SlowGhost_Peanut
Skipping songs/LittleTybee_TheAlchemist
Skipping songs/Cayetana_MissThing
