# "音色无关转录"模型评估
本文件记录了V4模型的评估结果
## 生成转录结果并保存
经过process后每个文件夹里的文件只有后缀不同，且后缀为"npy" "wav" "mid"。wav采样率已经是22050Hz。

In [1]:
import torch
import torchaudio
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
sys.path.append('..')

# Duration of one frame in seconds: 256 samples at 22050 Hz
# (presumably 256 is the model's hop size — TODO confirm against the model code)
s_per_frame = 256 / 22050

# Processed dataset folders to transcribe (paths relative to the working directory)
dataset_folders = ["BACH10_processed", "URMP_processed", "PHENICX_processed"]

In [3]:
def amt_one(model, file):
    """
    Run the transcription model on a single audio file.

    model: callable that takes the batched waveform and returns ``(onset, note)`` tensors
    file: path of the audio file, loaded with ``torchaudio.load``
    return: ``(onset, note)`` as numpy arrays, each being the first item along
            the leading (batch) axis of the corresponding model output
    """
    # The sample rate returned by torchaudio is not used here.
    waveform, _ = torchaudio.load(file)
    batch = waveform.unsqueeze(0)   # add a leading batch axis
    # Inference only: without no_grad an autograd graph is built for the whole
    # piece and .numpy() raises on outputs that require grad.
    with torch.no_grad():
        onset, note = model(batch)
    onset = onset.cpu().numpy()[0]
    note = note.cpu().numpy()[0]
    return onset, note

def amt_piece(model, piece_folder):
    """
    Transcribe one piece and align its reference roll to the model output.

    The folder is expected to hold files that share one name and differ only
    in extension; the ``.wav`` file is transcribed and the ``.npy`` file with
    the same name is loaded as the reference.

    model: transcription model forwarded to ``amt_one``
    piece_folder: folder containing the piece's ``.wav`` and ``.npy`` files
    return: ``(onset, note, midi)``; ``midi`` is zero-padded or truncated along
            axis 1 so that it has as many frames as ``note``
    raises: FileNotFoundError if the folder contains no ``.wav`` file
    """
    # Pick the wav explicitly instead of relying on the first directory entry,
    # so stray entries (hidden files, other extensions) cannot break the path.
    wav_names = sorted(f for f in os.listdir(piece_folder) if f.endswith(".wav"))
    if not wav_names:
        raise FileNotFoundError(f"no .wav file found in {piece_folder!r}")
    wav_path = os.path.join(piece_folder, wav_names[0])
    npy_path = wav_path[:-len(".wav")] + ".npy"

    onset, note = amt_one(model, wav_path)
    midi = np.load(npy_path)

    # Match the reference's time length to the model output
    freqs, times = note.shape
    ref_frames = midi.shape[1]
    if ref_frames < times:
        padding = np.zeros((freqs, times - ref_frames))
        midi = np.concatenate((midi, padding), axis=1)
    elif ref_frames > times:
        midi = midi[:, :times]
    return onset, note, midi

def amt_dataset(model, dataset_folder, output_folder = './'):
    """
    Transcribe every piece of a dataset and save one ``.npy`` file per piece.

    Each sub-directory of ``dataset_folder`` is treated as one piece and passed
    to ``amt_piece``; the arrays it returns are stacked along a new first axis
    and saved as ``<piece_folder>.npy``.

    model: transcription model forwarded to ``amt_piece``
    dataset_folder: folder whose sub-directories are the pieces; the part of
                    its name before the first "_" plus "_eval" names the
                    output folder
    output_folder: parent folder in which the "<name>_eval" folder is created
    """
    # normpath drops a trailing separator, which would otherwise make
    # basename return "" and send the output to a folder called "_eval".
    folder_name = os.path.basename(os.path.normpath(dataset_folder))
    print(f"processing {folder_name}")
    output_path = os.path.join(output_folder, folder_name.split("_")[0] + "_eval")
    os.makedirs(output_path, exist_ok=True)

    # Sorted so that processing order (and the log) is reproducible.
    for piece_folder in sorted(os.listdir(dataset_folder)):
        piece_path = os.path.join(dataset_folder, piece_folder)
        if not os.path.isdir(piece_path):
            continue
        result = amt_piece(model, piece_path)
        np.save(os.path.join(output_path, piece_folder + ".npy"), np.stack(result, axis=0))
        print(f"\tFinish {piece_folder}")


In [4]:
# Compute frame-level evaluation metrics
from utils.midiarray import freq_map, roll2evalarray
from utils.postprocess import min_len_
import mir_eval

# Duration of one frame in seconds (256 samples at 22050 Hz); same value as in the first cell
s_per_frame = 256 / 22050
# presumably maps note numbers 24..107 to frequencies with A4 = 440 Hz — verify against utils.midiarray.freq_map
freqmap = freq_map((24, 107), 440)

def frame_eval(note, midi, threshold = 0.5):
    """
    Compute frame-level multi-pitch metrics for one piece.

    The prediction is binarised at ``threshold``, short notes are removed with
    ``min_len_`` (length argument 3), and the result is converted with
    ``roll2evalarray`` into the form ``mir_eval`` expects. The reference roll
    is converted the same way, without binarisation or note removal.

    note: (freqs, times) model output
    midi: (freqs, times) reference roll
    return: the result of ``mir_eval.multipitch.evaluate``, see
            https://github.com/mir-evaluation/mir_eval/blob/main/mir_eval/multipitch.py
    """
    est_roll = (note > threshold).astype(int)
    # min_len_ works in place and modifies the array it is given
    est_pitch = roll2evalarray(min_len_(est_roll, 3), freqmap)
    ref_pitch = roll2evalarray(midi, freqmap)
    # One time stamp per frame, spaced s_per_frame seconds apart
    est_time = np.arange(len(est_pitch)) * s_per_frame
    ref_time = np.arange(len(ref_pitch)) * s_per_frame
    return mir_eval.multipitch.evaluate(ref_time, ref_pitch, est_time, est_pitch)


def evaluate_frame_dataset(npy_pathes, threshold = 0.5, log = True):
    """
    Evaluate every npy file in ``npy_pathes`` with one shared threshold.

    npy_pathes: paths of npy files, each the saved result of amt_piece with
                shape (3, freqs, times): onset, note, midi
    threshold: binarisation threshold forwarded to ``frame_eval``
    log: when true, print one markdown table row with the threshold and the means
    return: (ACC, P, R, F1), each the mean over all files; F1 is computed per
            file from that file's precision and recall before averaging
    """
    per_file = {"acc": [], "p": [], "r": [], "f1": []}
    for path in npy_pathes:
        stacked = np.load(path)
        metrics = frame_eval(stacked[1], stacked[2], threshold)
        accuracy = metrics['Accuracy']
        precision = metrics['Precision']
        recall = metrics['Recall']
        # F1 is defined as 0 when both precision and recall are 0
        if precision + recall > 0:
            f1 = 2*precision*recall/(precision+recall)
        else:
            f1 = 0
        per_file["acc"].append(accuracy)
        per_file["p"].append(precision)
        per_file["r"].append(recall)
        per_file["f1"].append(f1)

    ACC = np.mean(per_file["acc"])
    P = np.mean(per_file["p"])
    R = np.mean(per_file["r"])
    F1 = np.mean(per_file["f1"])
    if log:
        # | threshold | Acc | P | R | F1 |
        print(f"| {threshold:.5f} | {ACC:.5f} | {P:.5f} | {R:.5f} | {F1:.5f} |")
    return ACC, P, R, F1


def find_best_threshold(npy_pathes, origin_range = (0.1, 0.9), step_num = 10, generation = 4, log = True):
    """
    Search for the threshold with the highest mean F1 by repeated grid refinement.

    Each generation evaluates ``evaluate_frame_dataset`` on an evenly spaced
    grid of thresholds, stops scanning as soon as F1 drops, and then narrows
    the search interval around the best threshold found for the next generation.

    npy_pathes: npy files forwarded to ``evaluate_frame_dataset``
    origin_range: (start, end) of the first generation's grid; end is excluded
    step_num: number of steps each search interval is divided into
    generation: number of refinement rounds
    log: when true, print a markdown table of every evaluation and each round's best
    return: the best threshold recorded when the last generation finishes

    NOTE(review): after an interior optimum ``max_f1`` is reset, so the returned
    threshold is the best of the later rounds and is not guaranteed to be the
    best over every threshold that was evaluated.
    """
    if log:
        print("| threshold | Acc | P | R | F1 |")
        print("| --------- | --- |---|---|----|")
    
    start = origin_range[0]
    end = origin_range[1]
    step = (end - start) / step_num
    
    best_thre = -1
    max_f1 = -1
    best_thre_idx = -1

    for g in range(generation):
        lastF1 = -1
        thresholds = np.r_[start:end:step]
        for idx, thre in enumerate(thresholds):
            ACC, P, R, F1 = evaluate_frame_dataset(npy_pathes, thre, log)
            if F1 > max_f1:
                max_f1 = F1
                best_thre_idx = idx
                best_thre = thre
            if F1 < lastF1: # assume F1 has a single peak over the threshold, so stop as soon as it starts to drop
                break
            lastF1 = F1
        if log:
            print(f"| Best threshold | {best_thre} | ~ | ~ | F1: {max_f1} |")
        
        # If the best sits on an edge, the next round's grid will not cover it, so the best is carried over; otherwise it is cleared
        if best_thre_idx == -1: # the optimum is still to the left of this round's grid
            best_thre_idx = -1  # -1 means the maximum lies outside the grid, on the left
            start = best_thre
            end = thresholds[0]
            step = (end - start) / step_num
            start += step
        elif best_thre_idx == 0:    # the best is the leftmost grid point
            best_thre_idx = -1
            start = best_thre
            end = thresholds[1]
            step = (end - start) / step_num
            start += step
        elif best_thre_idx == -2:   # the carried-over best beyond the right edge is still the maximum, so the optimum is still to the right
            best_thre_idx = -2      # -2 means the maximum lies outside the grid, on the right
            start = thresholds[-1]
            end = best_thre
            step = (end - start) / step_num
            start += step
        elif best_thre_idx == len(thresholds) - 1:  # the best is the rightmost grid point
            best_thre_idx = -2
            start = thresholds[-2]
            end = best_thre
            step = (end - start) / step_num
            start += step
        else:
            start = thresholds[best_thre_idx-1]
            end = thresholds[best_thre_idx+1]
            step = (end - start) / step_num
            start += step   # skip the left end point, it was already evaluated
            max_f1 = -1     # reset the maximum, because the optimum must lie inside the new interval

    return best_thre

### 运行模型

In [None]:
# Folder holding the model code and checkpoint; also used below as the output folder name
model_folder_name = "basicamt"
# Make the model folder importable (presumably so torch.load can resolve the pickled model class — TODO confirm)
sys.path.append(f'../{model_folder_name}')
# NOTE(review): torch.load unpickles the file and can execute arbitrary code; only load trusted checkpoints.
# NOTE(review): recent PyTorch releases default to weights_only=True, under which loading a whole
# pickled model like this may fail — confirm against the installed version.
model = torch.load(f"../{model_folder_name}/basicamt_model.pth")
# Switch the model to evaluation mode
model.eval()

In [None]:
# Run the model on every dataset and save all results under ./<model_folder_name>
with torch.no_grad():
    for dataset_folder in dataset_folders:
        amt_dataset(model, dataset_folder, f"./{model_folder_name}")

### 计算帧级指标，并寻找最好阈值

In [None]:
# BACH10: ensemble results only
# (selected as the files whose name ends in "0.npy" — presumably the ensemble mixes; verify against the dataset naming)
npyfolder = f"{model_folder_name}/BACH10_eval"
npys = [os.path.join(npyfolder, f) for f in os.listdir(npyfolder) if f.endswith("0.npy")]
find_best_threshold(npys, (0.1, 0.5), step_num=10, generation=4, log=True)

- BACH10的合奏: Best threshold: 0.14224, F1: 0.7962386620882139

| threshold | Acc | P | R | F1 |
| --------- | --- |---|---|----|
| 0.10000 | 0.64832 | 0.73652 | 0.84408 | 0.78628 |
| 0.14000 | 0.66175 | 0.79144 | 0.80163 | 0.79614 |
| 0.18000 | 0.65707 | 0.82674 | 0.76222 | 0.79281 |
| Best threshold | 0.14 | ~ | ~ | F1: 0.7961447685583972 |
| 0.10800 | 0.65361 | 0.75003 | 0.83557 | 0.79017 |
| 0.11600 | 0.65786 | 0.76242 | 0.82750 | 0.79330 |
| 0.12400 | 0.66064 | 0.77375 | 0.81891 | 0.79532 |
| 0.13200 | 0.66135 | 0.78300 | 0.80993 | 0.79586 |
| 0.14000 | 0.66175 | 0.79144 | 0.80163 | 0.79614 |
| 0.14800 | 0.66180 | 0.79972 | 0.79342 | 0.79620 |
| 0.15600 | 0.66103 | 0.80714 | 0.78518 | 0.79566 |
| Best threshold | 0.148 | ~ | ~ | F1: 0.7961999935719664 |
| 0.14160 | 0.66176 | 0.79300 | 0.80005 | 0.79616 |
| 0.14320 | 0.66167 | 0.79462 | 0.79829 | 0.79609 |
| Best threshold | 0.1416 | ~ | ~ | F1: 0.7961552781664798 |
| 0.14176 | 0.66175 | 0.79316 | 0.79987 | 0.79615 |
| 0.14192 | 0.66178 | 0.79340 | 0.79968 | 0.79617 |
| 0.14208 | 0.66180 | 0.79353 | 0.79957 | 0.79618 |
| 0.14224 | 0.66188 | 0.79374 | 0.79947 | 0.79624 |
| 0.14240 | 0.66182 | 0.79388 | 0.79926 | 0.79620 |
| Best threshold | 0.14224 | ~ | ~ | F1: 0.7962386620882139 |

In [None]:
# BACH10: results over all audio (solo + ensemble), i.e. every .npy file in the folder
npyfolder = f"{model_folder_name}/BACH10_eval"
npys = [os.path.join(npyfolder, f) for f in os.listdir(npyfolder) if f.endswith(".npy")]
find_best_threshold(npys, (0.1, 0.7), step_num=10, generation=4, log=True)

- BACH10所有音频: Best threshold: 0.29728, F1: 0.8550319205664922

| threshold | Acc | P | R | F1 |
| --------- | --- |---|---|----|
| 0.10000 | 0.67358 | 0.71401 | 0.92169 | 0.79670 |
| 0.16000 | 0.72860 | 0.79166 | 0.90021 | 0.83610 |
| 0.22000 | 0.75024 | 0.83295 | 0.88194 | 0.85058 |
| 0.28000 | 0.75735 | 0.85808 | 0.86499 | 0.85482 |
| 0.34000 | 0.75803 | 0.87536 | 0.84937 | 0.85451 |
| Best threshold | 0.28 | ~ | ~ | F1: 0.854823299821161 |
| 0.23200 | 0.75219 | 0.83847 | 0.87851 | 0.85181 |
| 0.24400 | 0.75433 | 0.84439 | 0.87501 | 0.85315 |
| 0.25600 | 0.75561 | 0.84935 | 0.87151 | 0.85389 |
| 0.26800 | 0.75650 | 0.85366 | 0.86827 | 0.85436 |
| 0.28000 | 0.75735 | 0.85808 | 0.86499 | 0.85482 |
| 0.29200 | 0.75766 | 0.86176 | 0.86175 | 0.85486 |
| 0.30400 | 0.75804 | 0.86545 | 0.85874 | 0.85500 |
| 0.31600 | 0.75822 | 0.86894 | 0.85562 | 0.85495 |
| Best threshold | 0.304 | ~ | ~ | F1: 0.8549980720124114 |
| 0.29440 | 0.75786 | 0.86261 | 0.86120 | 0.85497 |
| 0.29680 | 0.75796 | 0.86342 | 0.86055 | 0.85502 |
| 0.29920 | 0.75795 | 0.86408 | 0.85991 | 0.85500 |
| Best threshold | 0.2968 | ~ | ~ | F1: 0.8550222647514515 |
| 0.29488 | 0.75788 | 0.86278 | 0.86107 | 0.85499 |
| 0.29536 | 0.75789 | 0.86293 | 0.86095 | 0.85499 |
| 0.29584 | 0.75790 | 0.86306 | 0.86083 | 0.85499 |
| 0.29632 | 0.75792 | 0.86323 | 0.86069 | 0.85500 |
| 0.29680 | 0.75796 | 0.86342 | 0.86055 | 0.85502 |
| 0.29728 | 0.75797 | 0.86354 | 0.86045 | 0.85503 |
| 0.29776 | 0.75797 | 0.86367 | 0.86032 | 0.85503 |
| Best threshold | 0.29728 | ~ | ~ | F1: 0.8550319205664922 |

In [None]:
# PHENICX: results on the ensemble audio (every .npy file in the folder)
npyfolder = f"{model_folder_name}/PHENICX_eval"
npys = [os.path.join(npyfolder, f) for f in os.listdir(npyfolder) if f.endswith(".npy")]
find_best_threshold(npys, (0.01, 0.3), step_num=10, generation=4, log=True)

- PHENICX合奏: Best threshold: 0.059416, F1: 0.6014269470574202

| threshold | Acc | P | R | F1 |
| --------- | --- |---|---|----|
| 0.01000 | 0.21776 | 0.22678 | 0.84575 | 0.35745 |
| 0.03900 | 0.40028 | 0.49052 | 0.68578 | 0.57129 |
| 0.06800 | 0.43015 | 0.60474 | 0.59709 | 0.60011 |
| 0.09700 | 0.41780 | 0.65481 | 0.53374 | 0.58724 |
| Best threshold | 0.068 | ~ | ~ | F1: 0.6001100338083282 |
| 0.04480 | 0.42036 | 0.53312 | 0.66540 | 0.59123 |
| 0.05060 | 0.42773 | 0.55813 | 0.64650 | 0.59831 |
| 0.05640 | 0.43032 | 0.57620 | 0.62899 | 0.60066 |
| 0.06220 | 0.43121 | 0.59188 | 0.61271 | 0.60134 |
| 0.06800 | 0.43015 | 0.60474 | 0.59709 | 0.60011 |
| Best threshold | 0.0622 | ~ | ~ | F1: 0.6013360502611242 |
| 0.05756 | 0.43076 | 0.57970 | 0.62572 | 0.60106 |
| 0.05872 | 0.43106 | 0.58294 | 0.62253 | 0.60131 |
| 0.05988 | 0.43121 | 0.58609 | 0.61924 | 0.60142 |
| 0.06104 | 0.43110 | 0.58890 | 0.61582 | 0.60127 |
| Best threshold | 0.05988 | ~ | ~ | F1: 0.6014200578203976 |
| 0.05895 | 0.43112 | 0.58355 | 0.62196 | 0.60136 |
| 0.05918 | 0.43119 | 0.58425 | 0.62131 | 0.60143 |
| 0.05942 | 0.43120 | 0.58486 | 0.62062 | 0.60143 |
| 0.05965 | 0.43116 | 0.58542 | 0.61990 | 0.60138 |
| Best threshold | 0.059416 | ~ | ~ | F1: 0.6014269470574202 |

In [None]:
# URMP: ensemble results only
# (selected as the files whose name ends in "0.npy" — presumably the ensemble mixes; verify against the dataset naming)
npyfolder = f"{model_folder_name}/URMP_eval"
npys = [os.path.join(npyfolder, f) for f in os.listdir(npyfolder) if f.endswith("0.npy")]
find_best_threshold(npys, (0.05, 0.5), step_num=10, generation=4, log=True)

- URMP的合奏: Best threshold: 0.12632, F1: 0.7297977379242105

| threshold | Acc | P | R | F1 |
| --------- | --- |---|---|----|
| 0.05000 | 0.50216 | 0.56511 | 0.81476 | 0.66458 |
| 0.09500 | 0.57162 | 0.70343 | 0.74968 | 0.72360 |
| 0.14000 | 0.57882 | 0.76295 | 0.70303 | 0.72920 |
| 0.18500 | 0.57076 | 0.79811 | 0.66499 | 0.72230 |
| Best threshold | 0.14 | ~ | ~ | F1: 0.7292048718603422 |
| 0.10400 | 0.57553 | 0.71885 | 0.73940 | 0.72675 |
| 0.11300 | 0.57778 | 0.73203 | 0.72956 | 0.72851 |
| 0.12200 | 0.57914 | 0.74378 | 0.72044 | 0.72956 |
| 0.13100 | 0.57943 | 0.75406 | 0.71159 | 0.72976 |
| 0.14000 | 0.57882 | 0.76295 | 0.70303 | 0.72920 |
| Best threshold | 0.131 | ~ | ~ | F1: 0.7297645740182008 |
| 0.12380 | 0.57931 | 0.74594 | 0.71867 | 0.72968 |
| 0.12560 | 0.57939 | 0.74811 | 0.71682 | 0.72973 |
| 0.12740 | 0.57947 | 0.75013 | 0.71513 | 0.72980 |
| 0.12920 | 0.57947 | 0.75214 | 0.71332 | 0.72979 |
| Best threshold | 0.1274 | ~ | ~ | F1: 0.7298005275017999 |
| 0.12596 | 0.57945 | 0.74853 | 0.71652 | 0.72978 |
| 0.12632 | 0.57947 | 0.74894 | 0.71619 | 0.72980 |
| 0.12668 | 0.57945 | 0.74933 | 0.71582 | 0.72978 |
| Best threshold | 0.12632 | ~ | ~ | F1: 0.7297977379242105 |

In [None]:
# URMP: solo results only
# (every entry whose name does not end in "0.npy"; NOTE(review): this filter also keeps non-.npy entries if any exist in the folder)
npyfolder = f"{model_folder_name}/URMP_eval"
npys = [os.path.join(npyfolder, f) for f in os.listdir(npyfolder) if not f.endswith("0.npy")]
find_best_threshold(npys, (0.05, 0.5), step_num=10, generation=4, log=True)

- URMP的独奏: Best threshold: 0.34268, F1: 0.8084040173010774

| threshold | Acc | P | R | F1 |
| --------- | --- |---|---|----|
| 0.05000 | 0.42883 | 0.45577 | 0.88274 | 0.59044 |
| 0.09500 | 0.57598 | 0.63627 | 0.85576 | 0.72139 |
| 0.14000 | 0.63258 | 0.71822 | 0.83655 | 0.76641 |
| 0.18500 | 0.66138 | 0.76817 | 0.82096 | 0.78825 |
| 0.23000 | 0.67659 | 0.80193 | 0.80714 | 0.79941 |
| 0.27500 | 0.68468 | 0.82680 | 0.79459 | 0.80516 |
| 0.32000 | 0.68876 | 0.84700 | 0.78246 | 0.80792 |
| 0.36500 | 0.68952 | 0.86298 | 0.77077 | 0.80827 |
| 0.41000 | 0.68773 | 0.87553 | 0.75920 | 0.80674 |
| Best threshold | 0.365 | ~ | ~ | F1: 0.8082664075250976 |
| 0.32900 | 0.68929 | 0.85063 | 0.78017 | 0.80826 |
| 0.33800 | 0.68947 | 0.85394 | 0.77774 | 0.80836 |
| 0.34700 | 0.68954 | 0.85712 | 0.77532 | 0.80837 |
| 0.35600 | 0.68952 | 0.86014 | 0.77294 | 0.80830 |
| Best threshold | 0.347 | ~ | ~ | F1: 0.8083715965125569 |
| 0.33980 | 0.68946 | 0.85458 | 0.77723 | 0.80834 |
| 0.34160 | 0.68951 | 0.85529 | 0.77674 | 0.80836 |
| 0.34340 | 0.68954 | 0.85593 | 0.77627 | 0.80839 |
| 0.34520 | 0.68953 | 0.85653 | 0.77579 | 0.80838 |
| Best threshold | 0.3434 | ~ | ~ | F1: 0.8083862848260331 |
| 0.34196 | 0.68951 | 0.85546 | 0.77663 | 0.80837 |
| 0.34232 | 0.68954 | 0.85559 | 0.77655 | 0.80839 |
| 0.34268 | 0.68957 | 0.85574 | 0.77646 | 0.80840 |
| 0.34304 | 0.68955 | 0.85583 | 0.77638 | 0.80839 |
| Best threshold | 0.34268 | ~ | ~ | F1: 0.8084040173010774 |

In [3]:
# Print the number of parameters
def count_parameters(model):
    """Return the total element count of the parameters of ``model`` that require grad."""
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue    # frozen parameters are not counted
        total += param.numel()
    return total
# Trainable parameter count of the model.cqt submodule, then of the whole model
print(count_parameters(model.cqt))
print(count_parameters(model))

19944
61584
