# "音色分离转录"模型评估

## 生成转录结果并保存
经过process后每个文件夹里的文件只有后缀不同，且后缀为"npy" "wav" "mid"。wav采样率已经是22050Hz。

In [None]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import os
from timbre_sepatate_eval_utils import *
import sys
sys.path.append('..')

# Seconds covered by one frame: 256 samples at the 22050 Hz sample rate
# (256 is presumably the hop length used when framing — TODO confirm).
s_per_frame = 256 / 22050

### 运行模型

In [None]:
# Name of the sibling folder (one level up) that holds the saved model.
model_folder_name = "septimbre"
# Make that folder importable before loading — presumably so the classes
# referenced by the pickled model can be resolved; TODO confirm.
sys.path.append(f'../{model_folder_name}')
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects,
# so only load checkpoints that come from a trusted source.
model = torch.load(f"../{model_folder_name}/sepamt_model.pth", weights_only=False)
# Switch the module to evaluation mode.
model.eval()

In [None]:
# Run the model on every mixture set evaluated below: BACH10 with 2, 3 and 4
# parts, and URMP with 2 and 3 parts. The last/second argument points at
# ./{model_folder_name} (presumably the output root — TODO confirm).
# Gradients are disabled because this is inference only.
with torch.no_grad():
    amt_mix_dataset(model, 'BACH10_processed', 2, f"./{model_folder_name}")
    amt_mix_dataset(model, 'BACH10_processed', 3, f"./{model_folder_name}")
    amt_mix_dataset(model, 'BACH10_processed', 4, f"./{model_folder_name}")
    amt_urmp(model, f"./{model_folder_name}", 2)
    amt_urmp(model, f"./{model_folder_name}", 3)

### 寻找最好阈值

In [None]:
# Metrics collected per experiment, keyed by experiment name. They are printed
# as a single row so the numbers can be pasted into Excel.
output_results = {}


def print_results(results):
    """Print every metric value of every experiment on one "|"-separated line.

    Args:
        results: mapping of experiment name -> mapping of metric name -> value.
            Values are emitted in insertion order; the names are not printed.
    """
    cells = (
        str(value)
        for detail in results.values()
        for value in detail.values()
    )
    print("|".join(cells))

In [None]:
# BACH10, 2-part mixtures: search the frame threshold, then the onset
# threshold, then evaluate the separated notes and record all metrics.
npyfolder = f"{model_folder_name}/BACH10_eval"
npys = [
    os.path.join(npyfolder, fname)
    for fname in os.listdir(npyfolder)
    if ifNmix(fname, 2)
]
best_thre, max_acc, max_p, max_r, max_f1 = find_best_threshold(
    npys, (0.3, 0.8), step_num=6, generation=5, log=True
)
print("note level evaluation at best threshold:")
(best_onset_thre, note_p, note_r, note_f, note_overlap) = find_best_onset_threshold(
    npys, best_thre, (0.3, 0.9), step_num=6, generation=5, log=True
)
sep_P, sep_R, sep_F, sep_Overlap = evaluate_sep_note_dataset(
    npyfolder,
    frame_thresh=best_thre,
    onset_thresh=best_onset_thre,
    mix=2,
    log=True,
)

# Key order matters: print_results emits the values in insertion order.
output_results["BACH10_2mix"] = dict(
    frame_thre=best_thre,
    frame_acc=max_acc,
    frame_p=max_p,
    frame_r=max_r,
    frame_f1=max_f1,
    onset_thre=best_onset_thre,
    note_p=note_p,
    note_r=note_r,
    note_f=note_f,
    note_overlap=note_overlap,
    sep_p=sep_P,
    sep_r=sep_R,
    sep_f=sep_F,
    sep_overlap=sep_Overlap,
)

In [None]:
# BACH10, 3-part mixtures: search the frame threshold, then the onset
# threshold, then evaluate the separated notes and record all metrics.
npyfolder = f"{model_folder_name}/BACH10_eval"
npys = [
    os.path.join(npyfolder, fname)
    for fname in os.listdir(npyfolder)
    if ifNmix(fname, 3)
]
best_thre, max_acc, max_p, max_r, max_f1 = find_best_threshold(
    npys, (0.1, 0.6), step_num=6, generation=5, log=True
)
print("note level evaluation at best threshold:")
(best_onset_thre, note_p, note_r, note_f, note_overlap) = find_best_onset_threshold(
    npys, best_thre, (0.2, 0.8), step_num=6, generation=5, log=True
)
sep_P, sep_R, sep_F, sep_Overlap = evaluate_sep_note_dataset(
    npyfolder,
    frame_thresh=best_thre,
    onset_thresh=best_onset_thre,
    mix=3,
    log=True,
)

# Key order matters: print_results emits the values in insertion order.
output_results["BACH10_3mix"] = dict(
    frame_thre=best_thre,
    frame_acc=max_acc,
    frame_p=max_p,
    frame_r=max_r,
    frame_f1=max_f1,
    onset_thre=best_onset_thre,
    note_p=note_p,
    note_r=note_r,
    note_f=note_f,
    note_overlap=note_overlap,
    sep_p=sep_P,
    sep_r=sep_R,
    sep_f=sep_F,
    sep_overlap=sep_Overlap,
)

In [None]:
# BACH10, 4-part mixtures: search the frame threshold, then the onset
# threshold, then evaluate the separated notes and record all metrics.
npyfolder = f"{model_folder_name}/BACH10_eval"
npys = [
    os.path.join(npyfolder, fname)
    for fname in os.listdir(npyfolder)
    if ifNmix(fname, 4)
]
best_thre, max_acc, max_p, max_r, max_f1 = find_best_threshold(
    npys, (0.1, 0.6), step_num=6, generation=5, log=True
)
print("note level evaluation at best threshold:")
(best_onset_thre, note_p, note_r, note_f, note_overlap) = find_best_onset_threshold(
    npys, best_thre, (0.2, 0.8), step_num=6, generation=5, log=True
)
sep_P, sep_R, sep_F, sep_Overlap = evaluate_sep_note_dataset(
    npyfolder,
    frame_thresh=best_thre,
    onset_thresh=best_onset_thre,
    mix=4,
    log=True,
)
# Key order matters: print_results emits the values in insertion order.
output_results["BACH10_4mix"] = dict(
    frame_thre=best_thre,
    frame_acc=max_acc,
    frame_p=max_p,
    frame_r=max_r,
    frame_f1=max_f1,
    onset_thre=best_onset_thre,
    note_p=note_p,
    note_r=note_r,
    note_f=note_f,
    note_overlap=note_overlap,
    sep_p=sep_P,
    sep_r=sep_R,
    sep_f=sep_F,
    sep_overlap=sep_Overlap,
)

In [None]:
# URMP, 2-part mixtures: search the frame threshold, then the onset
# threshold, then evaluate the separated notes and record all metrics.
npyfolder = f"{model_folder_name}/URMP_eval"
npys = [
    os.path.join(npyfolder, fname)
    for fname in os.listdir(npyfolder)
    if ifNmix(fname, 2)
]
best_thre, max_acc, max_p, max_r, max_f1 = find_best_threshold(
    npys, (0.1, 0.6), step_num=6, generation=5, log=True
)
print("note level evaluation at best threshold:")
(best_onset_thre, note_p, note_r, note_f, note_overlap) = find_best_onset_threshold(
    npys, best_thre, (0.2, 0.6), step_num=6, generation=5, log=True
)
sep_P, sep_R, sep_F, sep_Overlap = evaluate_sep_note_dataset(
    npyfolder,
    frame_thresh=best_thre,
    onset_thresh=best_onset_thre,
    mix=2,
    log=True,
)
# Key order matters: print_results emits the values in insertion order.
output_results["URMP_2mix"] = dict(
    frame_thre=best_thre,
    frame_acc=max_acc,
    frame_p=max_p,
    frame_r=max_r,
    frame_f1=max_f1,
    onset_thre=best_onset_thre,
    note_p=note_p,
    note_r=note_r,
    note_f=note_f,
    note_overlap=note_overlap,
    sep_p=sep_P,
    sep_r=sep_R,
    sep_f=sep_F,
    sep_overlap=sep_Overlap,
)

In [None]:
# URMP, 3-part mixtures: search the frame threshold, then the onset
# threshold, then evaluate the separated notes and record all metrics.
npyfolder = f"{model_folder_name}/URMP_eval"
npys = [
    os.path.join(npyfolder, fname)
    for fname in os.listdir(npyfolder)
    if ifNmix(fname, 3)
]
best_thre, max_acc, max_p, max_r, max_f1 = find_best_threshold(
    npys, (0.1, 0.6), step_num=6, generation=5, log=True
)
print("note level evaluation at best threshold:")
(best_onset_thre, note_p, note_r, note_f, note_overlap) = find_best_onset_threshold(
    npys, best_thre, (0.2, 0.6), step_num=6, generation=5, log=True
)
sep_P, sep_R, sep_F, sep_Overlap = evaluate_sep_note_dataset(
    npyfolder,
    frame_thresh=best_thre,
    onset_thresh=best_onset_thre,
    mix=3,
    log=True,
)
# Key order matters: print_results emits the values in insertion order.
output_results["URMP_3mix"] = dict(
    frame_thre=best_thre,
    frame_acc=max_acc,
    frame_p=max_p,
    frame_r=max_r,
    frame_f1=max_f1,
    onset_thre=best_onset_thre,
    note_p=note_p,
    note_r=note_r,
    note_f=note_f,
    note_overlap=note_overlap,
    sep_p=sep_P,
    sep_r=sep_R,
    sep_f=sep_F,
    sep_overlap=sep_Overlap,
)

In [None]:
# Emit every metric collected so far as a single "|"-separated row.
print_results(output_results)

In [None]:
# Report parameter counts.
def count_parameters(model):
    """Return the total element count of the parameters that require gradients.

    Args:
        model: any object exposing ``parameters()`` whose items provide
            ``requires_grad`` and ``numel()``.

    Returns:
        Sum of ``numel()`` over parameters with ``requires_grad`` set.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Trainable-parameter counts, in order: note branch, encoder, whole model.
print(count_parameters(model.note_branch))
print(count_parameters(model.encoder))
print(count_parameters(model))

## For fixed note_branch
/septimbre/sepamt_note_branch.pth

In [None]:
# BACH10, 2-part mixtures with the fixed note_branch: evaluate separated notes
# at hard-coded frame/onset thresholds instead of searching for them.
npyfolder = f"{model_folder_name}/BACH10_eval"
# NOTE: npys is not referenced again in this cell.
npys = [
    os.path.join(npyfolder, fname)
    for fname in os.listdir(npyfolder)
    if ifNmix(fname, 2)
]
best_thre, best_onset_thre = 0.391975565736498, 0.585714285714285
sep_P, sep_R, sep_F, sep_Overlap = evaluate_sep_note_dataset(
    npyfolder,
    frame_thresh=best_thre,
    onset_thresh=best_onset_thre,
    mix=2,
    log=True,
)
output_results["BACH10_2mix_fixed"] = dict(
    sep_p=sep_P,
    sep_r=sep_R,
    sep_f=sep_F,
    sep_overlap=sep_Overlap,
)

In [None]:
# BACH10, 3-part mixtures with the fixed note_branch: evaluate separated notes
# at hard-coded frame/onset thresholds instead of searching for them.
npyfolder = f"{model_folder_name}/BACH10_eval"
# NOTE: npys is not referenced again in this cell.
npys = [
    os.path.join(npyfolder, fname)
    for fname in os.listdir(npyfolder)
    if ifNmix(fname, 3)
]
best_thre, best_onset_thre = 0.334902124114952, 0.544897959183673
sep_P, sep_R, sep_F, sep_Overlap = evaluate_sep_note_dataset(
    npyfolder,
    frame_thresh=best_thre,
    onset_thresh=best_onset_thre,
    mix=3,
    log=True,
)
output_results["BACH10_3mix_fixed"] = dict(
    sep_p=sep_P,
    sep_r=sep_R,
    sep_f=sep_F,
    sep_overlap=sep_Overlap,
)

In [None]:
# BACH10, 4-part mixtures with the fixed note_branch: evaluate separated notes
# at hard-coded frame/onset thresholds instead of searching for them.
npyfolder = f"{model_folder_name}/BACH10_eval"
# NOTE: npys is not referenced again in this cell.
npys = [
    os.path.join(npyfolder, fname)
    for fname in os.listdir(npyfolder)
    if ifNmix(fname, 4)
]
best_thre, best_onset_thre = 0.270068027210884, 0.535276967930029
sep_P, sep_R, sep_F, sep_Overlap = evaluate_sep_note_dataset(
    npyfolder,
    frame_thresh=best_thre,
    onset_thresh=best_onset_thre,
    mix=4,
    log=True,
)
output_results["BACH10_4mix_fixed"] = dict(
    sep_p=sep_P,
    sep_r=sep_R,
    sep_f=sep_F,
    sep_overlap=sep_Overlap,
)

In [None]:
# URMP, 2-part mixtures with the fixed note_branch: evaluate separated notes
# at hard-coded frame/onset thresholds instead of searching for them.
npyfolder = f"{model_folder_name}/URMP_eval"
# NOTE: npys is not referenced again in this cell.
npys = [
    os.path.join(npyfolder, fname)
    for fname in os.listdir(npyfolder)
    if ifNmix(fname, 2)
]
best_thre, best_onset_thre = 0.314008052200472, 0.39047619047619
sep_P, sep_R, sep_F, sep_Overlap = evaluate_sep_note_dataset(
    npyfolder,
    frame_thresh=best_thre,
    onset_thresh=best_onset_thre,
    mix=2,
    log=True,
)
output_results["URMP_2mix_fixed"] = dict(
    sep_p=sep_P,
    sep_r=sep_R,
    sep_f=sep_F,
    sep_overlap=sep_Overlap,
)

In [None]:
# URMP, 3-part mixtures with the fixed note_branch.
npyfolder = f"{model_folder_name}/URMP_eval"
npys = [os.path.join(npyfolder, f) for f in os.listdir(npyfolder) if ifNmix(f, 3)]
# NOTE(review): this cell never assigns best_thre / best_onset_thre, so the call
# below uses whatever values the notebook session currently holds (the URMP
# 2-part constants if the cells are run top to bottom). Confirm this is
# intended, or set the 3-part thresholds explicitly here.
sep_P, sep_R, sep_F, sep_Overlap = evaluate_sep_note_dataset(npyfolder, frame_thresh=best_thre, onset_thresh=best_onset_thre, mix=3, log=True)
output_results["URMP_3mix_fixed"] = {
    "sep_p": sep_P,
    "sep_r": sep_R,
    "sep_f": sep_F,
    "sep_overlap": sep_Overlap,
}

## t-SNE可视化音色编码
并不好。只有2&4（clarinet & bassoon）分得开，其余都混在一起。

In [None]:
import sys
sys.path.append('..')
from utils.midiarray import array2notes
from sklearn.manifold import TSNE
import numpy as np
from matplotlib import pyplot as plt

# Alternative input (a BACH10 4-part mixture), kept for quick switching:
# npy_path = './septimbre/BACH10_eval/03-ChristederdubistTagundLicht_1&4&2&3.npz'
npy_path = "./septimbre/URMP_eval/18_Nocturne_vn_fl_tpt_1&2&3.npz"

# Saved output for one mixture; the 'emb', 'midi' and 'frame' entries are read
# from it further down.
result = np.load(npy_path)

In [None]:
# Frame-level t-SNE of the timbre embedding: one point per active cell of each
# track's piano roll.
emb = result['emb']    # indexed below as emb[:, i, j]; axis 0 is the embedding dimension D
midi = result['midi']  # one 2-D roll per track (each roll is indexed with two axes below)

emb_extracted = []
for track_roll in midi:
    # Positions where this track is active (value above 0.1).
    active = np.nonzero(track_roll > 0.1)
    # (num_active, D); the original note gave D = 18 — TODO confirm.
    emb_extracted.append(emb[:, active[0], active[1]].T)

tsne = TSNE(n_components=2, perplexity=100, random_state=3407)

# Stack the points of all tracks and label each point with its track index.
X_all = np.concatenate(emb_extracted, axis=0)
y_all = np.array([
    track_idx
    for track_idx, track_emb in enumerate(emb_extracted)
    for _ in range(len(track_emb))
])

X_tsne = tsne.fit_transform(X_all)
for idx, _ in enumerate(emb_extracted):
    selected = y_all == idx
    plt.scatter(
        X_tsne[selected, 0],
        X_tsne[selected, 1],
        label=f"mix {idx+1}",
        alpha=0.5,
    )
plt.legend()
plt.title(f"t-SNE visualization for {npy_path}")
plt.show()

In [None]:
# Note-level separation visualization: build one unit-length embedding per
# note (frame-weighted sum over the note's span) and project them with t-SNE.
emb = result['emb']      # indexed as emb[:, note, onset:offset]; axis 0 is the embedding dimension D
midi = result['midi']    # original note: (mix, F, T)
frame = result['frame']  # original note: (F, T); indexed as frame[note, onset:offset]
note_events = array2notes(midi)  # iterated as per-track lists of (onset, offset, note)

emb_dim = emb.shape[0]
emb_extracted = []
for track in note_events:
    emb_notes = []
    for onset, offset, note in track:
        frame_weight = frame[note, onset:offset]    # (duration,)
        emb_note = emb[:, note, onset:offset]       # (D, duration)
        emb_note = (emb_note * frame_weight).sum(axis=1)    # (D,)
        norm = np.linalg.norm(emb_note)
        # A zero or non-finite norm (e.g. an empty span or all-zero weights)
        # would turn the division into NaN/inf rows that t-SNE rejects, so
        # such notes are left out of the plot.
        if not (np.isfinite(norm) and norm > 0):
            continue
        emb_notes.append(emb_note / norm)
    emb_extracted.append(emb_notes)

tsne = TSNE(
    n_components=2,
    perplexity=10,
    random_state=42
)

# t-SNE visualization.
# Each track is reshaped to (num_notes, D) explicitly so that a track without
# any usable note contributes a (0, D) block instead of breaking concatenate.
X_all = np.concatenate(
    [
        np.asarray(track_notes).reshape(len(track_notes), emb_dim)
        for track_notes in emb_extracted
    ],
    axis=0,
)
y_all = []
for idx, emb_notes in enumerate(emb_extracted):
    y_all.extend([idx] * len(emb_notes))
y_all = np.array(y_all, dtype=int)

X_tsne = tsne.fit_transform(X_all)

for idx in range(len(emb_extracted)):
    plt.scatter(
        X_tsne[y_all == idx, 0],
        X_tsne[y_all == idx, 1],
        label=f"class {idx+1}",
        alpha=0.6
    )
plt.legend()
plt.title(f"Note-level t-SNE visualization for {npy_path}")
plt.show()