# 处理[Bach10数据集](https://labsites.rochester.edu/air/datasets/Bach10%20Dataset_v1.0.pdf)

要求在`根目录/data/`下有`Bach10_v1.1-main`文件夹，处理后的数据在`./BACH10_processed`。

In [None]:
import os
import subprocess
import sys
sys.path.append('..')
from utils.midiarray import annotation2midi, midi_merge, midi2numpy
from scipy.io import loadmat
import numpy as np

# Input dataset and output locations. Built with os.path.join so the paths are
# valid on every OS (a backslash is only a path separator on Windows; on
# Windows the resulting strings are identical to the previous literals).
dataset_folder = os.path.join("..", "data", "Bach10_v1.1-main")
output_folder = os.path.join(".", "BACH10_processed")
os.makedirs(output_folder, exist_ok=True)

# Target audio format
fs = 22050             # sample rate in Hz requested from ffmpeg (-ar)
hop = 256              # hop size in samples
time_step = hop / fs   # duration of one hop in seconds

# Parallel lists: instruments_id[k] belongs to instruments_name[k].
# The ids are passed as `instrument=` to annotation2midi (presumably General
# MIDI program numbers -- confirm against utils.midiarray).
instruments_id = [40, 71, 66, 70]
# NOTE: "saxphone" is intentional. The main loop matches these names against
# the suffix of the dataset's wav file names, so do not "fix" the spelling.
instruments_name = ["violin", "clarinet", "saxphone", "bassoon"]
# Name -> id lookup derived from the two lists above so they cannot drift apart.
# (The misspelt variable name is kept because other cells reference it.)
imstruments_map = dict(zip(instruments_name, instruments_id))

def mat2text(mat_file_path, output_folder):
    """
    Export the ground-truth notes of one piece to per-instrument text files.

    Reads the ``GTNotes`` variable of the .mat file and writes one text file
    per instrument into ``output_folder``, named after ``instruments_name``
    (violin.txt, clarinet.txt, saxphone.txt, bassoon.txt). Existing files are
    overwritten. Each line describes one note as "onset offset pitch".

    Args:
        mat_file_path: path to the ``*GTNotes.mat`` annotation file.
        output_folder: existing directory that receives the text files.

    Notes:
        onset/offset are the first and last entries of the note's first row
        (frame indices, per the annotation layout described below); pitch is
        the mean of the second row rounded to the nearest integer.
    """
    # ravel() instead of squeeze(): squeeze would turn a cell array holding a
    # single element into a 0-d array, which cannot be iterated.
    tracks = loadmat(mat_file_path)['GTNotes'].ravel()
    # expected shape: (4,), one entry per instrument
    for i, track in enumerate(tracks):
        notes = np.ravel(track)
        with open(os.path.join(output_folder, f"{instruments_name[i]}.txt"), 'w') as f:
            for note in notes:
                # note.shape = (2, note_len): row 0 holds frame indices, row 1
                # holds the MIDI pitch per frame. The pitch is fractional
                # (presumably derived from a log-frequency estimate), so it is
                # averaged and rounded to the nearest integer pitch.
                onset = note[0, 0]
                offset = note[0, -1]
                pitch = int(round(np.mean(note[1])))
                f.write(f"{onset} {offset} {pitch}\n")
        

输出文件夹的命名方式与URMP数据集的处理相同：`{name}@{i}`，其中name是作品的名字，i表示第几个声部（从1开始）；i为0表示各声部合并后的混合版本。

In [2]:
def _resample_audio(src_path, dst_path):
    """Resample src_path to `fs` Hz with ffmpeg, writing dst_path.

    A failing ffmpeg run is reported and skipped so the remaining files are
    still processed.
    """
    try:
        # -y overwrites an existing output instead of prompting, so the cell
        # can be re-run (the midi/npy outputs are overwritten as well).
        subprocess.run(['ffmpeg', '-y', '-i', src_path, '-ar', str(fs), dst_path], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error occurred while processing {src_path}: {e}")


# Each sub-folder of the dataset is one piece. For every piece this writes
#   {piece}@1 .. {piece}@4 : one folder per instrument (midi, wav, npy)
#   {piece}@0              : the mix of all instruments (midi, wav, npy)
for piece_name in os.listdir(dataset_folder):
    print(f"processing {piece_name}")
    piece_folder_path = os.path.join(dataset_folder, piece_name)
    if not os.path.isdir(piece_folder_path):
        continue

    # Locate the GTNotes.mat annotation file and all audio files of the piece
    mat_file_path = None
    mixed_audio = ''    # path of the mixed recording
    audios = {}         # instrument name -> path of its solo recording
    for file in os.listdir(piece_folder_path):
        if file.endswith('GTNotes.mat'):
            mat_file_path = os.path.join(piece_folder_path, file)
        elif file.endswith('.wav'):
            filename = file[:-4]
            if filename == piece_name:
                # "<piece>.wav" is the mixed recording
                mixed_audio = os.path.join(piece_folder_path, file)
            else:
                # "<...>-<instrument>.wav" is a single-instrument recording
                instr = filename.split('-')[-1]
                if instr not in instruments_name:
                    raise ValueError(f"Unknown instrument name {instr}")
                audios[instr] = os.path.join(piece_folder_path, file)
    if mat_file_path is None:
        # No annotation available: nothing to process for this folder
        continue

    # Writes <instrument>.txt note lists into output_folder; they are read
    # back below and overwritten by the next piece.
    mat2text(mat_file_path, output_folder)
    midis = []
    for part_index, instr in enumerate(instruments_name, start=1):
        print(f"\tprocessing {instr}")
        # Create the output folder of this part
        output_folder_path = os.path.join(output_folder, f"{piece_name}@{part_index}")
        os.makedirs(output_folder_path, exist_ok=True)

        print("\t\tProcessing midi")
        # The dataset documentation states a 46 ms window and a 10 ms hop,
        # hence time_unit=0.01 for the frame-indexed onsets/offsets.
        m = annotation2midi(os.path.join(output_folder, f"{instr}.txt"), ["onset", "offset", "note"], time_unit=0.01, instrument=imstruments_map[instr])
        midis.append(m)
        midi_path = os.path.join(output_folder_path, f"{instr}.mid")
        m.save(midi_path)

        # Downsample the audio with ffmpeg (which applies anti-aliasing)
        print("\t\tProcessing audio")
        _resample_audio(audios[instr], os.path.join(output_folder_path, f"{instr}.wav"))

        # Convert the midi to a numpy array
        print("\t\tProcessing numpy")
        np.save(os.path.join(output_folder_path, f"{instr}.npy"), midi2numpy(midi_path, time_step))

    # Process the mix of all parts
    print("\tprocessing mixed")
    output_folder_path = os.path.join(output_folder, f"{piece_name}@0")
    os.makedirs(output_folder_path, exist_ok=True)

    print("\t\tProcessing midi")
    mid_all = midi_merge(midis)
    midi_path = os.path.join(output_folder_path, f"{piece_name}.mid")
    mid_all.save(midi_path)

    # Downsample the audio with ffmpeg (which applies anti-aliasing).
    # Errors are reported against the mixed file itself.
    print("\t\tProcessing audio")
    _resample_audio(mixed_audio, os.path.join(output_folder_path, f"{piece_name}.wav"))

    # Convert the midi to a numpy array
    print("\t\tProcessing numpy")
    np.save(os.path.join(output_folder_path, f"{piece_name}.npy"), midi2numpy(midi_path, time_step, track_separate=False))

processing 01-AchGottundHerr
	processing violin
		Processing midi
		Processing audio
		Processing numpy
	processing clarinet
		Processing midi
		Processing audio
		Processing numpy
	processing saxphone
		Processing midi
		Processing audio
		Processing numpy
	processing bassoon
		Processing midi
		Processing audio
		Processing numpy
	processing mixed
		Processing midi
		Processing audio
		Processing numpy
processing 02-AchLiebenChristen
	processing violin
		Processing midi
		Processing audio
		Processing numpy
	processing clarinet
		Processing midi
		Processing audio
		Processing numpy
	processing saxphone
		Processing midi
		Processing audio
		Processing numpy
	processing bassoon
		Processing midi
		Processing audio
		Processing numpy
	processing mixed
		Processing midi
		Processing audio
		Processing numpy
processing 03-ChristederdubistTagundLicht
	processing violin
		Processing midi
		Processing audio
		Processing numpy
	processing clarinet
		Processing midi
		Processing audio
		Proc