In [12]:
# Raw files data engineering
import os
import shutil

import pandas as pd
import pretty_midi

In [13]:
def generate_midi(df_rows, file_name, save_path="dataset/midi/"):
    """
    Write the "Sound" annotation rows of one recording to a MIDI file.

    Every row whose " Type" value (after stripping whitespace) equals "Sound"
    becomes one note on a single instrument track. Rows of any other type
    are skipped.

    :param df_rows: DataFrame with the columns " Type", " Start time",
        " End time" and " Estimated MIDI code" (column names include the
        leading space).
    :param file_name: base name of the output file, without extension.
    :param save_path: output directory; created if it does not exist. A
        trailing slash is optional.
    :return: path of the written ".mid" file.
    """
    # Create PrettyMIDI object
    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)  # piano=0, voice_oohs=53, lead_6=85
    velocity = 100  # every note gets the same fixed velocity

    for _, row in df_rows.iterrows():
        if str(row[" Type"]).strip() != "Sound":
            continue

        start = float(row[" Start time"])
        end = float(row[" End time"])
        # The estimated pitch is used here; the annotation file also has a
        # " Ground Truth MIDI code" column that could be used instead.
        pitch = int(row[" Estimated MIDI code"])

        instrument.notes.append(
            pretty_midi.Note(velocity=velocity, pitch=pitch, start=start, end=end)
        )

    midi.instruments.append(instrument)

    # Join path components instead of concatenating strings so that
    # save_path works with or without a trailing separator.
    midi_path = os.path.join(save_path, file_name + ".mid")

    # Make sure the target directory exists before writing.
    out_dir = os.path.dirname(midi_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save to MIDI file
    midi.write(midi_path)

    return midi_path

In [14]:
# Copy wav files to dataset path
def copy_wav_files(file_name, root="/Users/martinoywa/Downloads/VocalSet/FULL/", save_path="dataset/wav/"):
    """
    Copy one "straight" excerpt wav into the dataset folder.

    The first underscore-separated token of ``file_name`` selects the source
    folder: a token starting with "m" maps to "male" + the rest of the token,
    anything else maps to "female" + the rest of the token. The source file is
    looked up at ``<root>/<folder>/excerpts/straight/<file_name>.wav``.

    :param file_name: base name of the wav file, without extension.
    :param root: root folder of the source recordings. "~" is expanded, and
        a trailing slash is optional.
        TODO: the default is still a machine-specific absolute path; pass
        e.g. "~/Downloads/VocalSet/FULL" until it is made configurable.
    :param save_path: destination directory; created if it does not exist.
    :return: path of the copied wav file.
    :raises FileNotFoundError: if the source wav does not exist.
    """
    singer_id = file_name.split("_")[0]

    # Prefix test rather than substring test: only the leading letter
    # distinguishes the two folder families.
    gender = "male" if singer_id.startswith("m") else "female"
    parent = gender + singer_id[1:]

    old_path = os.path.join(
        os.path.expanduser(root), parent, "excerpts", "straight", file_name + ".wav"
    )
    new_path = os.path.join(save_path, file_name + ".wav")

    # Create the destination up front so that a FileNotFoundError from the
    # copy can only mean the source wav is missing.
    dest_dir = os.path.dirname(new_path)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    shutil.copy(old_path, new_path)

    return new_path

In [15]:
def generate_dataset(annotated_dataset_path):
    """
    Build the paired wav/midi dataset from one annotation CSV.

    Only files whose name contains "caro_straight", "dona_straight" or
    "row_straight" (case-insensitive) are processed. For each such file the
    wav is copied first and the MIDI is generated afterwards, so a recording
    whose wav is missing does not leave a MIDI file behind.

    :param annotated_dataset_path: path to the annotation CSV; it must
        contain a 'File Name' column.
    :return: three parallel lists ``(filename, midi, wav)`` holding, for each
        successfully processed file, its name, the generated MIDI path and
        the copied wav path.
    """
    # metadata
    filename, midi, wav = [], [], []

    df = pd.read_csv(annotated_dataset_path, index_col=False)

    # filter for straight only
    is_straight = df['File Name'].str.contains(
        "caro_straight|dona_straight|row_straight", case=False, na=False
    )
    df_straights_filtered = df[is_straight]

    # One pass over the frame; sort=False keeps files in order of first
    # appearance, matching the order of Series.unique().
    for file_name, rows in df_straights_filtered.groupby('File Name', sort=False):
        try:
            # Copy the wav before generating the MIDI: if the wav is missing
            # we bail out here and no orphan .mid file is written.
            wav_path = copy_wav_files(file_name)
            midi_path = generate_midi(rows, file_name)
        except FileNotFoundError:
            print(f"File {file_name} not found")
            continue

        if midi_path is not None:
            filename.append(file_name)
            midi.append(midi_path)
            wav.append(wav_path)

    return filename, midi, wav

In [16]:
# Annotation CSVs to process; each one contributes rows to the metadata table.
annotated_datasets = ["dataset/raw/extended_2_all_files.csv"]
metadata_columns = ["filename", "midi", "wav"]
metadata = {column: [] for column in metadata_columns}

for dataset in annotated_datasets:
    # generate_dataset returns three parallel lists in (filename, midi, wav) order.
    filename, midi, wav = generate_dataset(annotated_dataset_path=dataset)
    for column, values in zip(metadata_columns, (filename, midi, wav)):
        metadata[column].extend(values)

# Turn the accumulated lists into a table and write it out without the index column.
metadata = pd.DataFrame(metadata, columns=metadata_columns)
metadata.to_csv("dataset/raw/metadata.csv", index=False)