In [5]:
import os, numpy as np, pandas as pd

MODELS = ['FS_PCA_NN', 'RF', 'FS_PCA_QDA', 'FS_PCA_KNN', 'FS_PCA_LR']
PROBA_DIR = './output/probabilities'
N_FOLDS = 10

records = {}  # sample_id -> dict(label, votes[], max_p[], true_p[])

for mdl in MODELS:
    for fold in range(1, N_FOLDS+1):
        arr = np.load(f'{PROBA_DIR}/pred_dist_{mdl}_{fold}.npy')
        sid, probs = arr[:,0].astype(int), arr[:,1:]
        pred = probs.argmax(1) 
        maxp = probs.max(1)
        for s, p, m in zip(sid, pred, maxp):
            r = records.setdefault(s, {'votes':[], 'maxp':[], 'truep':[]})
            r['votes'].append(p)
            r['maxp'].append(m)
            r['probs'] = probs  # 留给后面算 true_p


In [6]:
df = pd.read_csv('Cancer2025exam.csv')
y_true = df.iloc[:,0].values
for s, r in records.items():
    label = y_true[s] - 1
    r['label'] = label
    # 用 NN 概率作为 true_p 示例；也可平均
    r['truep'] = r['probs'][r['votes'].index(r['votes'][0]), label]


In [10]:
candidates = []
for s, r in records.items():
    # a) 五个模型中 ≥4 票都不是真实标签
    wrong_votes = sum(v != r['label'] for v in r['votes'])
    # b) 平均最大置信度 ≥0.9
    mean_maxp = np.mean(r['maxp'])
    # c) 对真实标签的概率 ≤0.1
    if wrong_votes >= 4 and mean_maxp >= .9 and r['truep'] <= .1:
        candidates.append((s, r['label']+1, np.bincount(r['votes']).argmax()+1,
                           wrong_votes, mean_maxp))


In [14]:
cand_df = pd.DataFrame(candidates,
        columns=['sample_id','orig_label','consensus_pred',
                 'wrong_votes','mean_maxp'])
print('\n疑似错标样本（top 10）：')
print(cand_df.to_string(index=False))
print(cand_df.shape[0], '个样本疑似错标')



疑似错标样本（top 10）：
 sample_id  orig_label  consensus_pred  wrong_votes  mean_maxp
      1081           2               1            5   0.902863
       492           2               1            5   0.910863
      1019           2               1            5   0.927117
       587           2               1            5   0.906756
       803           2               1            5   0.900030
      1561           2               1            5   0.948526
       831           2               1            5   0.907399
      1076           2               1            5   0.916688
      1640           2               1            5   0.908648
9 个样本疑似错标


In [15]:
from sklearn.manifold import TSNE
X = df.iloc[:,1:].values
emb = TSNE(n_components=2, perplexity=30, random_state=0).fit_transform(X)
...  # 只用 plt.scatter 即可，无需保存


Ellipsis

In [16]:
from cleanlab.filter import find_label_issues
issues = find_label_issues(y=y_true, pred_probs=probs_all_folds, return_indices_ranked_by='self_confidence')
print('cleanlab 前 20 个问题索引:', issues[:20])


ModuleNotFoundError: No module named 'cleanlab'