In [14]:
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from glob import glob
from PIL import Image
import numpy as np
from torchvision import transforms
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
from PIL import Image
import os

In [15]:
# Data locations.
# The dataset root defaults to the original local folder but can be overridden
# with the MURA_DATA_ROOT environment variable, so the notebook is not tied to
# a single machine. Both CSV paths are derived from the root instead of
# repeating the absolute path.
data_root = os.environ.get("MURA_DATA_ROOT", r"C:\Users\Vivo\2025_medicalimage_and_AI")
train_csv_path = os.path.join(data_root, "MURA-v1.1", "train_labeled_studies.csv")
valid_csv_path = os.path.join(data_root, "MURA-v1.1", "valid_labeled_studies.csv")


In [16]:
# Preprocessing applied to every image before its pixel statistics are taken:
# resize to 224x224, then reduce to a single grayscale channel.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.Grayscale(num_output_channels=1),
    ]
)

# Collect per-study pixel statistics for every row of a labeled-studies CSV.
def get_patient_stats(csv_path, mode):
    """Compute mean/std pixel statistics for each study listed in ``csv_path``.

    The CSV is read without a header and must have two columns: a study
    directory path (joined onto the module-level ``data_root``) and a label.
    Every ``*.png`` in the study directory is opened, converted to grayscale,
    passed through the module-level ``transform`` and scaled to [0, 1].

    Args:
        csv_path: Path of the headerless ``path,label`` CSV.
        mode: Tag stored verbatim in the ``mode`` column of every output row
            (the notebook passes "train" or "valid").

    Returns:
        A ``pandas.DataFrame`` with one row per CSV row and the columns
        ``path``, ``body_part``, ``label``, ``mean``, ``std`` and ``mode``.
        ``mean`` is the average of the per-image means and ``std`` is the
        average of the per-image standard deviations (not a pooled standard
        deviation). Both are NaN when the study directory has no PNG files.

    Raises:
        IndexError: If a study path has fewer than three path components, so
            the body part (third component) cannot be extracted.
    """
    studies = pd.read_csv(csv_path, header=None, names=['path', 'label'])
    stats = []

    for _, row in tqdm(studies.iterrows(), total=len(studies)):
        study_rel_path, label = row['path'], row['label']
        study_path = os.path.join(data_root, study_rel_path)
        # glob order is unspecified; sorting keeps the float averages reproducible.
        image_paths = sorted(glob(os.path.join(study_path, "*.png")))

        pixel_means = []
        pixel_stds = []

        for img_path in image_paths:
            # The context manager closes the file handle as soon as the pixels
            # have been copied out, instead of leaking one handle per image.
            with Image.open(img_path) as raw:
                img = transform(raw.convert("L"))
            img_np = np.array(img).astype(np.float32) / 255.0
            pixel_means.append(img_np.mean())
            pixel_stds.append(img_np.std())

        if pixel_means:
            patient_mean = np.mean(pixel_means)
            patient_std = np.mean(pixel_stds)
        else:
            # No images found: record NaN explicitly rather than triggering
            # numpy's "mean of empty slice" RuntimeWarning.
            patient_mean = patient_std = np.nan

        # The third path component is the body part, e.g.
        # "MURA-v1.1/train/XR_SHOULDER/..." -> "XR_SHOULDER". Backslashes are
        # normalized so Windows-style paths are split the same way.
        body_part = study_rel_path.replace("\\", "/").split("/")[2]

        stats.append({
            "path": study_rel_path,
            "body_part": body_part,
            "label": label,
            "mean": patient_mean,
            "std": patient_std,
            "mode": mode
        })

    return pd.DataFrame(stats)


In [17]:

# Compute the per-study statistics for the original train and validation CSVs.
train_stats = get_patient_stats(csv_path=train_csv_path, mode="train")
valid_stats = get_patient_stats(csv_path=valid_csv_path, mode="valid")

# Stack both tables into one frame with a fresh 0..n-1 index.
df = pd.concat(objs=[train_stats, valid_stats], ignore_index=True)

# Quick sanity check: first rows and overall size of the combined table.
print(df.head(5))
print(df.shape)

100%|██████████| 13457/13457 [02:00<00:00, 111.86it/s]
100%|██████████| 1199/1199 [00:10<00:00, 116.21it/s]

                                                path    body_part  label  \
0  MURA-v1.1/train/XR_SHOULDER/patient00001/study...  XR_SHOULDER      1   
1  MURA-v1.1/train/XR_SHOULDER/patient00002/study...  XR_SHOULDER      1   
2  MURA-v1.1/train/XR_SHOULDER/patient00003/study...  XR_SHOULDER      1   
3  MURA-v1.1/train/XR_SHOULDER/patient00004/study...  XR_SHOULDER      1   
4  MURA-v1.1/train/XR_SHOULDER/patient00005/study...  XR_SHOULDER      1   

       mean       std   mode  
0  0.289677  0.138886  train  
1  0.172353  0.117897  train  
2  0.246644  0.115778  train  
3  0.287982  0.133889  train  
4  0.178952  0.121661  train  
(14656, 6)





In [20]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Step 1: re-split the combined studies into new train / validation sets,
# stratified by body part, with a roughly label-balanced validation set.
# Note: no PCA / KMeans clustering is performed in this cell; the split is a
# seeded random sample per body part and label. The output file names keep
# their original "cluster" wording so downstream code still finds them.
# NOTE(review): rows are studies and sampling is not grouped by patient, while
# the paths contain a patient directory (e.g. patient00001). Two studies of one
# patient could therefore land on both sides of the split - confirm whether
# that is acceptable.
VALID_FRACTION = 0.2  # share of each body part's rows targeted for validation
RATIO_JITTER = 0.05   # half-width of the random shift of the label balance
SPLIT_SEED = 42       # seeds both the jitter draw and the row sampling


def split_balanced_validation(stats_df, valid_fraction=VALID_FRACTION,
                              jitter=RATIO_JITTER, seed=SPLIT_SEED):
    """Split ``stats_df`` into (train, valid) frames, one body part at a time.

    For each ``body_part`` group the validation target is
    ``int(len(group) * valid_fraction)`` rows, aimed at half label 0 and half
    label 1. A single random factor ``r`` in ``[-jitter, jitter]`` shifts the
    two counts in opposite directions::

        n_valid_0 = int(valid_size / 2 + valid_size_0 * r)
        n_valid_1 = int(valid_size / 2 - valid_size_1 * r)

    where ``valid_size_0`` / ``valid_size_1`` are the validation sizes the
    original label proportions would give. This is algebraically the same as
    the earlier "ratio" formulation but has no division, so a body part that
    lacks one of the labels no longer raises ZeroDivisionError. Counts are
    clamped to the rows actually available so ``sample`` cannot fail.

    Args:
        stats_df: Frame with at least ``body_part`` and ``label`` columns.
            Rows whose label is neither 0 nor 1 end up in neither output.
        valid_fraction: Fraction of each body part targeted for validation.
        jitter: Half-width of the uniform random factor.
        seed: Seed for the jitter generator and for ``DataFrame.sample``.

    Returns:
        ``(train_df, valid_df)``, each re-indexed from 0, with all the
        columns of ``stats_df``.
    """
    # A local seeded generator makes the split identical on every run.
    rng = np.random.default_rng(seed)
    train_parts = []
    valid_parts = []

    for _, body_group in stats_df.groupby('body_part'):
        label_counts = body_group['label'].value_counts(normalize=True)
        label_ratio_0 = label_counts.get(0, 0)

        # Validation size for this body part, and how it would divide between
        # the labels if the original proportions were kept.
        valid_size = int(len(body_group) * valid_fraction)
        valid_size_0 = int(valid_size * label_ratio_0)
        valid_size_1 = valid_size - valid_size_0

        random_factor = rng.uniform(-jitter, jitter)

        label_group_0 = body_group[body_group['label'] == 0]
        label_group_1 = body_group[body_group['label'] == 1]

        n_valid_0 = int(valid_size / 2 + valid_size_0 * random_factor)
        n_valid_1 = int(valid_size / 2 - valid_size_1 * random_factor)

        # Never request more rows than the label group holds (or fewer than 0).
        n_valid_0 = min(max(n_valid_0, 0), len(label_group_0))
        n_valid_1 = min(max(n_valid_1, 0), len(label_group_1))

        label_group_valid_0 = label_group_0.sample(n=n_valid_0, random_state=seed)
        label_group_valid_1 = label_group_1.sample(n=n_valid_1, random_state=seed)

        # Whatever was not sampled for validation becomes training data.
        train_parts.append(label_group_0.drop(label_group_valid_0.index))
        train_parts.append(label_group_1.drop(label_group_valid_1.index))
        valid_parts.append(label_group_valid_0)
        valid_parts.append(label_group_valid_1)

    train_df = pd.concat(train_parts, ignore_index=True)
    valid_df = pd.concat(valid_parts, ignore_index=True)
    return train_df, valid_df


# Build the final train / validation frames.
df_train_final, df_valid_final = split_balanced_validation(df)

# Keep only the path and label columns for the exported CSVs.
df_train_final_c = df_train_final[['path', 'label']]
df_valid_final_c = df_valid_final[['path', 'label']]

# Write the new splits as headerless path,label CSVs.
df_train_final_c.to_csv('train_cluster_split.csv', index=False, header=False)
df_valid_final_c.to_csv('valid_cluster_split.csv', index=False, header=False)

# Label distribution report for both splits.
print("训练集分布：")
print(df_train_final_c['label'].value_counts())
print("\n验证集分布：")
print(df_valid_final_c['label'].value_counts())


训练集分布：
label
0    7462
1    4269
Name: count, dtype: int64

验证集分布：
label
0    1479
1    1446
Name: count, dtype: int64


In [21]:

# Print a distribution report.
def show_distribution(df_new, name):
    """Print a body_part x label count table for ``df_new``.

    Args:
        df_new: Frame with ``body_part`` and ``label`` columns.
        name: Text inserted into the printed heading.

    Returns:
        None. The heading, the table and a 40-dash rule go to stdout.
    """
    per_group_counts = df_new.groupby(["body_part", "label"]).size()
    # Pivot labels into columns; missing combinations show as 0.
    table = per_group_counts.unstack(fill_value=0)
    heading = f"\n📊 {name} Data Distribution:"
    rule = "-" * 40
    print(heading)
    print(table)
    print(rule)

# Per-body-part label tables for both splits, then their shapes.
for split_name, split_frame in (("Train", df_train_final), ("Valid", df_valid_final)):
    show_distribution(split_frame, split_name)

print("shape of train:", df_train_final.shape)
print("shape of valid:", df_valid_final.shape)
print("✅ Done! New train/valid csv saved.")


📊 Train Data Distribution:
label           0     1
body_part              
XR_ELBOW      989   540
XR_FINGER    1163   526
XR_FOREARM    562   248
XR_HAND      1391   365
XR_HUMERUS    321   263
XR_SHOULDER  1154  1259
XR_WRIST     1882  1068
----------------------------------------

📊 Valid Data Distribution:
label          0    1
body_part            
XR_ELBOW     197  186
XR_FINGER    209  212
XR_FOREARM    97  103
XR_HAND      207  222
XR_HUMERUS    68   75
XR_SHOULDER  309  293
XR_WRIST     392  355
----------------------------------------
shape of train: (11731, 6)
shape of valid: (2925, 6)
✅ Done! New train/valid csv saved.
