# 处理[URMP](https://labsites.rochester.edu/air/projects/URMP.html)数据集

In [None]:
import os
import subprocess
import shutil
import mido
import numpy as np
import sys
sys.path.append("..")
from utils.midiarray import midi_merge, annotation2midi, midi2numpy

# Input: root of the URMP dataset (one sub-folder per ensemble piece).
dataset_folder = "../data/URMP/Dataset"
# Output: root under which the processed per-part / mixed folders are written.
output_folder = "./URMP_processed"
# exist_ok avoids the check-then-create race and still raises if the path
# exists but is not a directory.
os.makedirs(output_folder, exist_ok=True)

# Target audio parameters
fs = 22050              # target sample rate in Hz (passed to ffmpeg's -ar)
hop = 256               # hop size in samples
time_step = hop / fs    # seconds per hop; used as the time step for midi2numpy

# Instrument mapping: URMP instrument abbreviation -> number passed as
# `instrument` to annotation2midi. The values match 0-based General MIDI
# program numbers (e.g. 40 = violin, 73 = flute).
urmp_instruments = {
    "vn": 40,
    "va": 41,
    "vc": 42,
    "db": 43,
    "fl": 73,
    "ob": 68,
    "cl": 71,
    "sax": 66,
    "bn": 70,
    "tpt": 56,
    "hn": 60,
    "tbn": 57,
    "tba": 58,
}

生成的文件夹名称：

{name}@{i}：name 是作品（重奏文件夹）的名字，i 表示第几个声部（从 1 开始）；i 为 0 表示所有声部合并后的混合版本。

In [None]:
def _resample_audio(src_path, dst_path, sample_rate):
    """Resample the audio file ``src_path`` to ``sample_rate`` Hz using ffmpeg.

    The result is written to ``dst_path``. A non-zero ffmpeg exit status is
    reported on stdout and otherwise ignored, so one bad file does not abort
    the whole run.
    """
    try:
        # -y: overwrite an existing output instead of prompting/aborting, so
        # the cell can be re-run over a partially processed output folder.
        subprocess.run(
            ['ffmpeg', '-y', '-i', src_path, '-ar', str(sample_rate), dst_path],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error occurred while processing {src_path}: {e}")


# Walk every ensemble folder of the dataset. For each one, write
#   {folder_name}@{k}  (k >= 1): one separated part -> resampled wav, midi, npy
#   {folder_name}@0            : the mix            -> resampled wav, merged midi, npy
for folder_name in sorted(os.listdir(dataset_folder)):
    folder_path = os.path.join(dataset_folder, folder_name)
    if not os.path.isdir(folder_path):
        continue  # only ensemble folders are processed

    print(f"Processing folder: {folder_name}")

    # Collect the audio files and note annotations of this ensemble.
    sepAudios = []   # "AuSep*.wav": separated per-part recordings
    mixAudio = None  # "AuMix*.wav": the mixed recording
    sepNotes = {}    # "Notes*.txt" keyed by the name with 6-char prefix and extension removed
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".wav"):
            if file_name.startswith("AuMix"):
                mixAudio = file_name
            elif file_name.startswith("AuSep"):
                sepAudios.append(file_name)
        elif file_name.endswith(".txt") and file_name.startswith("Notes"):
            sepNotes[file_name[6:-4]] = file_name

    if mixAudio is None:
        print(f"Mix audio not found in {folder_name}")
        continue

    # os.listdir returns entries in arbitrary order; sort so the "@{i}" index
    # is reproducible and follows the leading field of the file name
    # (presumably the part number in URMP's naming -- TODO confirm).
    sepAudios.sort()

    midis = []
    for i, file in enumerate(sepAudios):
        # Strip the "AuSep_" prefix and ".wav" extension; the remainder is the
        # key shared with the matching "Notes_" annotation file.
        raw_name = file[6:-4]
        print(f"\tProcessing {raw_name}")

        note_file = sepNotes.get(raw_name)
        if note_file is None:
            # Without the annotation there is nothing to pair the audio with;
            # skip this part instead of crashing with a bare KeyError.
            print(f"\tNotes annotation not found for {raw_name}, skipping")
            continue

        audio_absolute_path = os.path.join(folder_path, file)
        note_absolute_path = os.path.join(folder_path, note_file)

        output_folder_path = os.path.join(output_folder, f"{folder_name}@{i+1}")
        os.makedirs(output_folder_path, exist_ok=True)

        # Audio: resample with ffmpeg to the target rate.
        print("\t\tProcessing audio")
        output_audio_path = os.path.join(output_folder_path, f"{raw_name}.wav")
        _resample_audio(audio_absolute_path, output_audio_path, fs)

        # MIDI: build from the note annotation; the instrument abbreviation is
        # the second "_"-separated field of raw_name.
        print("\t\tProcessing midi")
        output_midi_path = os.path.join(output_folder_path, f"{raw_name}.mid")
        mid = annotation2midi(
            note_absolute_path,
            cols=["onset", "freq", "dur"],
            instrument=urmp_instruments[raw_name.split('_')[1]],
        )
        mid.save(output_midi_path)
        midis.append(mid)

        # Numpy array derived from the saved MIDI file at the hop resolution.
        print("\t\tProcessing midiarray")
        arr = midi2numpy(output_midi_path, time_step)
        np.save(os.path.join(output_folder_path, f"{raw_name}.npy"), arr)

    # Finally handle the mixed version, stored under index 0.
    raw_name = mixAudio[6:-4]
    print(f"\tProcessing {raw_name}")
    output_folder_path = os.path.join(output_folder, f"{folder_name}@0")
    os.makedirs(output_folder_path, exist_ok=True)

    print("\t\tProcessing audio")
    audio_absolute_path = os.path.join(folder_path, mixAudio)
    output_audio_path = os.path.join(output_folder_path, f"{raw_name}.wav")
    _resample_audio(audio_absolute_path, output_audio_path, fs)

    # Merge the per-part MIDI objects collected above into one file.
    print("\t\tProcessing midi")
    output_midi_path = os.path.join(output_folder_path, f"{raw_name}.mid")
    midi_merge(midis).save(output_midi_path)

    # For timbre-agnostic models (e.g. basicamt) the tracks are not kept separate.
    print("\t\tProcessing midiarray")
    arr = midi2numpy(output_midi_path, time_step, track_separate=False)
    np.save(os.path.join(output_folder_path, f"{raw_name}.npy"), arr)