# "音色无关转录"模型评估

## 生成转录结果并保存
经过 process 处理后，每个文件夹里的文件仅后缀不同，后缀分别为 "npy"、"wav"、"mid"；其中 wav 的采样率已经是 22050 Hz。

In [None]:
import torch
import torchaudio
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
sys.path.append('..')

# Seconds per analysis frame, assuming a hop of 256 samples at 22050 Hz.
# NOTE: this value is overwritten a few lines below by the one derived from config.toml.
s_per_frame = 256 / 22050

# Preprocessed dataset folders; each one is passed to amt_dataset when the model is run.
dataset_folders = ["BACH10_processed", "URMP_processed", "PHENICX_processed"]

# CQT configuration: the CQT is built here, outside the model, because BasicPitch's CQT does not take part in training
import tomllib
with open('../model/config.toml', 'br') as f:
    CQTconfig = tomllib.load(f)['CQT']
# Frame duration recomputed from the config (presumably hop in samples / sampling rate in Hz).
s_per_frame = CQTconfig['hop'] / CQTconfig['fs']

from model.CQT import CQTsmall

# Shared CQT front end used by amt_one; constructed with requires_grad=False.
cqt = CQTsmall(
    CQTconfig['fs'],
    fmin=CQTconfig['fmin'],
    octaves=CQTconfig['octaves'],
    bins_per_octave=CQTconfig['bins_per_octave'],
    hop=CQTconfig['hop'],
    filter_scale=CQTconfig['filter_scale'],
    requires_grad=False
)

In [None]:
def amt_one(model, file):
    """
    Transcribe a single audio file with ``model``.

    The audio is loaded with torchaudio, given a leading batch dimension,
    passed through the module-level ``cqt`` front end and then through the model.

    model: callable returning an (onset, note) pair of tensors
    file: path of the audio file to load
    return: (onset, note) as numpy arrays with the batch dimension removed
    """
    audio, _sample_rate = torchaudio.load(file)   # the sample rate is not used
    batch = audio.unsqueeze(0)                    # add the batch dimension
    onset, note = model(cqt(batch))
    # Move to CPU, convert to numpy and keep the only item of the batch.
    return onset.cpu().numpy()[0], note.cpu().numpy()[0]

def amt_piece(model, piece_folder):
    """
    Transcribe the audio of one piece and load its reference roll, aligned in time.

    The folder is expected to contain files that share one stem and differ only
    in extension. The ``.wav`` file is located explicitly (instead of taking
    whatever ``os.listdir`` returns first and slicing off three characters), so
    stray files or extensions of another length do not break the path handling.

    model: model forwarded to amt_one
    piece_folder: folder holding ``<stem>.wav`` and ``<stem>.npy``
    return: (onset, note, midi); midi is zero-padded or truncated along axis 1
            so that it has as many frames as ``note``
    raises: FileNotFoundError if the folder contains no ``.wav`` file
    """
    wav_names = sorted(name for name in os.listdir(piece_folder) if name.endswith(".wav"))
    if not wav_names:
        raise FileNotFoundError(f"no .wav file found in {piece_folder!r}")
    wav_path = os.path.join(piece_folder, wav_names[0])
    stem = os.path.splitext(wav_path)[0]    # path without the extension

    onset, note = amt_one(model, wav_path)
    midi = np.load(stem + ".npy")

    # Match the reference length to the number of frames of the model output.
    freqs, times = note.shape
    if midi.shape[1] < times:
        padding = np.zeros((freqs, times - midi.shape[1]))
        midi = np.concatenate((midi, padding), axis=1)
    elif midi.shape[1] > times:
        midi = midi[:, :times]
    return onset, note, midi

def amt_dataset(model, dataset_folder, output_folder = './'):
    """
    Run the model on every piece of a dataset and save one ``.npy`` per piece.

    The results go to ``<output_folder>/<NAME>_eval`` where ``NAME`` is the part
    of the dataset folder name before the first underscore. Each saved array is
    the stack (axis 0) of the three arrays returned by amt_piece:
    onset, note, midi.

    model: model forwarded to amt_piece
    dataset_folder: folder whose sub-folders each hold one piece
    output_folder: parent folder in which the ``*_eval`` folder is created
    """
    # normpath drops a trailing separator, which would otherwise make
    # basename() return "" and send every result to a folder named "_eval".
    folder_name = os.path.basename(os.path.normpath(dataset_folder))
    print(f"processing {folder_name}")
    output_folder_name = folder_name.split("_")[0] + "_eval"
    output_path = os.path.join(output_folder, output_folder_name)
    os.makedirs(output_path, exist_ok=True)

    # Sorted so that the processing order (and the log) is deterministic.
    for piece_folder in sorted(os.listdir(dataset_folder)):
        piece_path = os.path.join(dataset_folder, piece_folder)
        if not os.path.isdir(piece_path):
            continue    # only sub-folders are pieces
        result = amt_piece(model, piece_path)
        np.save(os.path.join(output_path, piece_folder+".npy"), np.stack(result, axis=0))
        print(f"\tFinish {piece_folder}")


In [None]:
# Frame-level evaluation metrics
from utils.midiarray import freq_map, roll2evalarray
from utils.postprocess import min_len_
import mir_eval

# Seconds per frame used to build the time axes in frame_eval.
# NOTE(review): hard-coded again here; if an earlier cell derived s_per_frame from
# config.toml, this line overrides it - confirm that hop/fs in the config equal 256/22050.
s_per_frame = 256 / 22050
# Frequency table passed to roll2evalarray.
# Presumably MIDI notes 24..107 with A4 = 440 Hz - TODO confirm against utils.midiarray.freq_map.
freqmap = freq_map((24, 107), 440)

def frame_eval(note, midi, threshold = 0.5):
    """
    Compute frame-level multi-pitch metrics for one piece.

    The prediction is binarised with ``threshold``, post-processed with
    ``min_len_(..., 3)`` (intended to remove short notes), converted to the
    format required by mir_eval, and compared with the reference.

    note: (freqs, times) model output
    midi: (freqs, times) reference roll
    return: the result of mir_eval.multipitch.evaluate
            (see https://github.com/mir-evaluation/mir_eval/blob/main/mir_eval/multipitch.py)
    """
    thresholded = (note > threshold).astype(int)
    # min_len_ works in place and modifies its input; `thresholded` is a fresh
    # array, so the caller's `note` is left untouched.
    cleaned = min_len_(thresholded, 3)
    est_pitch = roll2evalarray(cleaned, freqmap)
    ref_pitch = roll2evalarray(midi, freqmap)
    # One time stamp per frame for the estimate and for the reference.
    est_time = np.arange(len(est_pitch)) * s_per_frame
    ref_time = np.arange(len(ref_pitch)) * s_per_frame
    return mir_eval.multipitch.evaluate(ref_time, ref_pitch, est_time, est_pitch)


def evaluate_frame_dataset(npy_pathes, threshold = 0.5, log = True):
    """
    Evaluate every npy file in ``npy_pathes`` with one common threshold.

    npy_pathes: paths of npy files, each one a result saved from amt_piece,
                shape (3, freqs, times): onset, note, midi
    threshold: binarisation threshold forwarded to frame_eval
    log: when true, print one markdown table row
         (threshold, Acc, P, R, F1)
    return: (ACC, P, R, F1), each the mean over the files; F1 is computed per
            file (0 when precision + recall is 0) and then averaged
    """
    per_file = []   # one (accuracy, precision, recall, f1) tuple per file
    for npy_file in npy_pathes:
        stacked = np.load(npy_file)
        scores = frame_eval(stacked[1], stacked[2], threshold)
        precision = scores['Precision']
        recall = scores['Recall']
        if precision + recall > 0:
            f1 = 2*precision*recall/(precision+recall)
        else:
            f1 = 0
        per_file.append((scores['Accuracy'], precision, recall, f1))

    ACC = np.mean([row[0] for row in per_file])
    P = np.mean([row[1] for row in per_file])
    R = np.mean([row[2] for row in per_file])
    F1 = np.mean([row[3] for row in per_file])
    if log:
        print(f"| {threshold:.5f} | {ACC:.5f} | {P:.5f} | {R:.5f} | {F1:.5f} |")
    return ACC, P, R, F1


def find_best_threshold(npy_pathes, origin_range = (0.1, 0.9), step_num = 10, generation = 4, log = True):
    """
    Coarse-to-fine search for the threshold with the highest mean frame-level F1.

    Every generation scans ``np.r_[start:end:step]`` from left to right with
    evaluate_frame_dataset and stops as soon as F1 drops (F1 is assumed to be
    concave in the threshold). The range is then narrowed around the best
    threshold of that scan and the next generation starts.

    Because the concavity assumption does not hold exactly, a later (finer)
    generation can end on a threshold that is worse than one evaluated earlier.
    The best result over ALL generations is therefore tracked separately and
    is the value returned; the search trajectory itself is unchanged.

    npy_pathes: npy files forwarded to evaluate_frame_dataset
    origin_range: (start, end) of the first scan; end is exclusive
    step_num: number of steps each range is divided into
    generation: number of refinement rounds
    log: when true, print a markdown table of every evaluation, the best of
         each generation and the overall best
    return: the evaluated threshold with the highest F1 (-1 if no evaluation
            produced an F1 greater than -1)
    """
    if log:
        print("| threshold | Acc | P | R | F1 |")
        print("| --------- | --- |---|---|----|")

    start = origin_range[0]
    end = origin_range[1]
    step = (end - start) / step_num

    # State that drives the narrowing of the range (may be reset between generations).
    best_thre = -1
    max_f1 = -1
    best_thre_idx = -1

    # Best evaluation seen so far over all generations; never reset.
    overall_thre = -1
    overall_f1 = -1

    for _ in range(generation):
        lastF1 = -1
        thresholds = np.r_[start:end:step]
        for idx, thre in enumerate(thresholds):
            ACC, P, R, F1 = evaluate_frame_dataset(npy_pathes, thre, log)
            if F1 > max_f1:
                max_f1 = F1
                best_thre_idx = idx
                best_thre = thre
            if F1 > overall_f1:
                overall_f1 = F1
                overall_thre = thre
            if F1 < lastF1: # F1 is assumed concave: once it starts to fall the scan can stop
                break
            lastF1 = F1
        if log:
            print(f"| Best threshold | {best_thre} | ~ | ~ | F1: {max_f1} |")

        # On an edge the next range does not contain the best point itself, so
        # max_f1 is carried over to the next generation; otherwise it is cleared.
        if best_thre_idx == -1: # nothing beat the carried-over best: the optimum is still to the left
            best_thre_idx = -1  # -1 means the maximum lies outside, to the left
            start = best_thre
            end = thresholds[0]
            step = (end - start) / step_num
            start += step
        elif best_thre_idx == 0:    # the best is the leftmost scanned threshold
            best_thre_idx = -1
            start = best_thre
            end = thresholds[1]
            step = (end - start) / step_num
            start += step
        elif best_thre_idx == -2:   # nothing beat the carried-over best: the optimum is still to the right
            best_thre_idx = -2      # -2 means the maximum lies outside, to the right
            start = thresholds[-1]
            end = best_thre
            step = (end - start) / step_num
            start += step
        elif best_thre_idx == len(thresholds) - 1:  # the best is the rightmost scanned threshold
            best_thre_idx = -2
            start = thresholds[-2]
            end = best_thre
            step = (end - start) / step_num
            start += step
        else:
            start = thresholds[best_thre_idx-1]
            end = thresholds[best_thre_idx+1]
            step = (end - start) / step_num
            start += step   # skip the left neighbour, it has already been evaluated
            max_f1 = -1     # cleared: the maximum is expected inside the new range

    if log:
        print(f"| Overall best | {overall_thre} | ~ | ~ | F1: {overall_f1} |")
    return overall_thre

### 运行模型

In [None]:
# Folder of the model under evaluation; also reused as the name of the output folder.
model_folder_name = "basicpitch"
# Add the model's folder to the import path (presumably so that the classes
# referenced by the pickled model can be imported - confirm).
sys.path.append(f'../{model_folder_name}')
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
# NOTE(review): no map_location is given; newer PyTorch releases default to
# weights_only=True, which may refuse a fully pickled model - verify with the installed version.
model = torch.load(f"../{model_folder_name}/basicpitch_model.pth")
# Switch the model to evaluation mode.
model.eval()

In [None]:
# Run the model on every dataset and save the results
# (gradients are disabled; outputs go to ./<model_folder_name>/<NAME>_eval).
with torch.no_grad():
    for dataset_folder in dataset_folders:
        amt_dataset(model, dataset_folder, f"./{model_folder_name}")

### 计算帧级指标，并寻找最好阈值

In [None]:
# Only the ensemble results of BACH10
# (selected as the files whose name ends in "0.npy" - presumably the ensemble mixes; confirm against the preprocessing naming).
npyfolder = f"{model_folder_name}/BACH10_eval"
npys = [os.path.join(npyfolder, f) for f in os.listdir(npyfolder) if f.endswith("0.npy")]
find_best_threshold(npys, (0.1, 0.5), step_num=10, generation=4, log=True)

- BACH10合奏: Best threshold: 0.212, F1: 0.7868884110956198

| threshold | Acc | P | R | F1 |
| --------- | --- |---|---|----|
| 0.10000 | 0.07300 | 0.07319 | 0.96681 | 0.13605 |
| 0.14000 | 0.58421 | 0.63114 | 0.88663 | 0.73717 |
| 0.18000 | 0.64204 | 0.73711 | 0.83230 | 0.78167 |
| 0.22000 | 0.64897 | 0.78176 | 0.79222 | 0.78681 |
| 0.26000 | 0.64278 | 0.81081 | 0.75598 | 0.78230 |
| Best threshold | 0.22 | ~ | ~ | F1: 0.7868131183876813 |
| 0.18800 | 0.64509 | 0.74837 | 0.82334 | 0.78393 |
| 0.19600 | 0.64703 | 0.75785 | 0.81525 | 0.78536 |
| 0.20400 | 0.64826 | 0.76664 | 0.80720 | 0.78627 |
| 0.21200 | 0.64909 | 0.77487 | 0.79957 | 0.78689 |
| 0.22000 | 0.64897 | 0.78176 | 0.79222 | 0.78681 |
| Best threshold | 0.212 | ~ | ~ | F1: 0.7868884110956198 |
| 0.20560 | 0.64838 | 0.76811 | 0.80578 | 0.78636 |
| 0.20720 | 0.64836 | 0.76972 | 0.80400 | 0.78636 |
| Best threshold | 0.2056 | ~ | ~ | F1: 0.7863636740318917 |
| 0.20576 | 0.64844 | 0.76830 | 0.80568 | 0.78641 |
| 0.20592 | 0.64834 | 0.76839 | 0.80542 | 0.78634 |
| Best threshold | 0.20576 | ~ | ~ | F1: 0.7864134262655582 |

In [None]:
# Results for all BACH10 audio (solo + ensemble): every .npy file in the folder is used.
npyfolder = f"{model_folder_name}/BACH10_eval"
npys = [os.path.join(npyfolder, f) for f in os.listdir(npyfolder) if f.endswith(".npy")]
find_best_threshold(npys, (0.25, 0.7), step_num=10, generation=4, log=True)

- BACH10所有音频（独奏+合奏）: Best threshold: 0.385, F1: 0.8791347943492717

| threshold | Acc | P | R | F1 |
| --------- | --- |---|---|----|
| 0.25000 | 0.76695 | 0.82187 | 0.91765 | 0.86378 |
| 0.29500 | 0.78126 | 0.84995 | 0.90416 | 0.87294 |
| 0.34000 | 0.78959 | 0.87222 | 0.89069 | 0.87788 |
| 0.38500 | 0.79255 | 0.88961 | 0.87691 | 0.87913 |
| 0.43000 | 0.79237 | 0.90400 | 0.86317 | 0.87822 |
| Best threshold | 0.385 | ~ | ~ | F1: 0.8791347943492717 |
| 0.34900 | 0.79008 | 0.87553 | 0.88793 | 0.87810 |
| 0.35800 | 0.79094 | 0.87919 | 0.88524 | 0.87851 |
| 0.36700 | 0.79173 | 0.88286 | 0.88253 | 0.87889 |
| 0.37600 | 0.79218 | 0.88629 | 0.87972 | 0.87904 |
| 0.38500 | 0.79255 | 0.88961 | 0.87691 | 0.87913 |
| 0.39400 | 0.79246 | 0.89232 | 0.87420 | 0.87893 |
| Best threshold | 0.385 | ~ | ~ | F1: 0.8791347943492717 |
| 0.37780 | 0.79237 | 0.88706 | 0.87915 | 0.87912 |
| 0.37960 | 0.79239 | 0.88765 | 0.87860 | 0.87911 |
| Best threshold | 0.3778 | ~ | ~ | F1: 0.8791189471612734 |
| 0.37798 | 0.79235 | 0.88711 | 0.87909 | 0.87911 |
| 0.37816 | 0.79238 | 0.88718 | 0.87905 | 0.87912 |
| 0.37834 | 0.79235 | 0.88722 | 0.87897 | 0.87910 |
| Best threshold | 0.37816 | ~ | ~ | F1: 0.8791211356037368 |

In [None]:
# Results for the PHENICX ensemble audio: every .npy file in the folder is used.
npyfolder = f"{model_folder_name}/PHENICX_eval"
npys = [os.path.join(npyfolder, f) for f in os.listdir(npyfolder) if f.endswith(".npy")]
find_best_threshold(npys, (0.1, 0.4), step_num=10, generation=4, log=True)

- PHENICX合奏音频: Best threshold: 0.13912, F1: 0.5030662912096693

| threshold | Acc | P | R | F1 |
| --------- | --- |---|---|----|
| 0.10000 | 0.06430 | 0.06534 | 0.80577 | 0.12063 |
| 0.13000 | 0.32715 | 0.47392 | 0.51946 | 0.49196 |
| 0.16000 | 0.33276 | 0.58757 | 0.43712 | 0.49662 |
| 0.19000 | 0.31276 | 0.63190 | 0.38397 | 0.47271 |
| Best threshold | 0.16 | ~ | ~ | F1: 0.4966192700691109 |
| 0.13600 | 0.33624 | 0.51354 | 0.49875 | 0.50181 |
| 0.14200 | 0.33905 | 0.54113 | 0.48055 | 0.50458 |
| 0.14800 | 0.33799 | 0.55964 | 0.46445 | 0.50304 |
| Best threshold | 0.142 | ~ | ~ | F1: 0.5045805958796423 |
| 0.13720 | 0.33691 | 0.51905 | 0.49508 | 0.50248 |
| 0.13840 | 0.33743 | 0.52410 | 0.49160 | 0.50300 |
| 0.13960 | 0.33748 | 0.52861 | 0.48775 | 0.50298 |
| Best threshold | 0.1384 | ~ | ~ | F1: 0.5030033494037806 |
| 0.13744 | 0.33703 | 0.52013 | 0.49437 | 0.50261 |
| 0.13768 | 0.33719 | 0.52121 | 0.49373 | 0.50278 |
| 0.13792 | 0.33725 | 0.52212 | 0.49304 | 0.50284 |
| 0.13816 | 0.33732 | 0.52307 | 0.49232 | 0.50290 |
| 0.13840 | 0.33743 | 0.52410 | 0.49160 | 0.50300 |
| 0.13864 | 0.33748 | 0.52499 | 0.49090 | 0.50303 |
| 0.13888 | 0.33750 | 0.52591 | 0.49010 | 0.50304 |
| 0.13912 | 0.33753 | 0.52686 | 0.48936 | 0.50307 |
| 0.13936 | 0.33745 | 0.52765 | 0.48851 | 0.50296 |
| Best threshold | 0.13912 | ~ | ~ | F1: 0.5030662912096693 |

In [None]:
# Only the ensemble results of URMP
# (selected as the files whose name ends in "0.npy" - presumably the ensemble mixes; confirm against the preprocessing naming).
npyfolder = f"{model_folder_name}/URMP_eval"
npys = [os.path.join(npyfolder, f) for f in os.listdir(npyfolder) if f.endswith("0.npy")]
find_best_threshold(npys, (0.1, 0.5), step_num=10, generation=4, log=True)

- URMP合奏: Best threshold: 0.19984, F1: 0.6805771669043961

| threshold | Acc | P | R | F1 |
| --------- | --- |---|---|----|
| 0.10000 | 0.03940 | 0.03954 | 0.92550 | 0.07554 |
| 0.14000 | 0.46465 | 0.53562 | 0.77944 | 0.63012 |
| 0.18000 | 0.51735 | 0.65618 | 0.71011 | 0.67769 |
| 0.22000 | 0.51936 | 0.70875 | 0.66068 | 0.67918 |
| 0.26000 | 0.50967 | 0.74236 | 0.61969 | 0.67021 |
| Best threshold | 0.22 | ~ | ~ | F1: 0.6791830385081563 |
| 0.18800 | 0.51967 | 0.66952 | 0.69929 | 0.67969 |
| 0.19600 | 0.52056 | 0.68085 | 0.68888 | 0.68043 |
| 0.20400 | 0.52061 | 0.69097 | 0.67895 | 0.68043 |
| 0.21200 | 0.52040 | 0.70045 | 0.66975 | 0.68016 |
| Best threshold | 0.204 | ~ | ~ | F1: 0.6804304427093903 |
| 0.19760 | 0.52061 | 0.68298 | 0.68682 | 0.68047 |
| 0.19920 | 0.52066 | 0.68509 | 0.68479 | 0.68050 |
| 0.20080 | 0.52069 | 0.68716 | 0.68279 | 0.68051 |
| 0.20240 | 0.52067 | 0.68916 | 0.68082 | 0.68050 |
| Best threshold | 0.2008 | ~ | ~ | F1: 0.6805145758360888 |
| 0.19952 | 0.52071 | 0.68554 | 0.68441 | 0.68054 |
| 0.19984 | 0.52076 | 0.68602 | 0.68401 | 0.68058 |
| 0.20016 | 0.52071 | 0.68638 | 0.68360 | 0.68054 |
| Best threshold | 0.19984 | ~ | ~ | F1: 0.6805771669043961 |

In [None]:
# Only the solo results of URMP
# (every file whose name does NOT end in "0.npy"; note that this filter does not
# require the ".npy" extension, so any other file in the folder would be included too).
npyfolder = f"{model_folder_name}/URMP_eval"
npys = [os.path.join(npyfolder, f) for f in os.listdir(npyfolder) if not f.endswith("0.npy")]
find_best_threshold(npys, (0.35, 0.6), step_num=10, generation=4, log=True)

- URMP独奏: Best threshold: 0.3848, F1: 0.7961901690780004

| threshold | Acc | P | R | F1 |
| --------- | --- |---|---|----|
| 0.35000 | 0.66955 | 0.80760 | 0.79104 | 0.79439 |
| 0.37500 | 0.67200 | 0.82350 | 0.78073 | 0.79594 |
| 0.40000 | 0.67278 | 0.83754 | 0.77053 | 0.79622 |
| 0.42500 | 0.67249 | 0.85055 | 0.76028 | 0.79570 |
| Best threshold | 0.4 | ~ | ~ | F1: 0.7962197492483829 |
| 0.38000 | 0.67229 | 0.82648 | 0.77872 | 0.79610 |
| 0.38500 | 0.67248 | 0.82929 | 0.77669 | 0.79619 |
| 0.39000 | 0.67254 | 0.83205 | 0.77453 | 0.79615 |
| Best threshold | 0.385 | ~ | ~ | F1: 0.7961863006077127 |
| 0.38100 | 0.67234 | 0.82703 | 0.77834 | 0.79612 |
| 0.38200 | 0.67241 | 0.82761 | 0.77796 | 0.79617 |
| 0.38300 | 0.67243 | 0.82820 | 0.77752 | 0.79617 |
| 0.38400 | 0.67246 | 0.82874 | 0.77710 | 0.79618 |
| 0.38500 | 0.67248 | 0.82929 | 0.77669 | 0.79619 |
| 0.38600 | 0.67247 | 0.82979 | 0.77626 | 0.79617 |
| Best threshold | 0.385 | ~ | ~ | F1: 0.7961863006077127 |
| 0.38420 | 0.67246 | 0.82888 | 0.77700 | 0.79618 |
| 0.38440 | 0.67247 | 0.82899 | 0.77691 | 0.79618 |
| 0.38460 | 0.67247 | 0.82910 | 0.77684 | 0.79619 |
| 0.38480 | 0.67248 | 0.82921 | 0.77676 | 0.79619 |
| 0.38500 | 0.67248 | 0.82929 | 0.77669 | 0.79619 |
| Best threshold | 0.3848 | ~ | ~ | F1: 0.7961901690780004 |

In [14]:
# Report the number of parameters
def count_parameters(model):
    """Return the total number of elements over all parameters of ``model`` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Print the number of parameters of the loaded model that require gradients.
print(count_parameters(model))

56517
