In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import pydicom
import scipy.io as io

In [3]:
# Dataset roots. These are machine-specific absolute locations; adjust them
# when the notebook is run on another machine.
ROOT = Path(r"C:\Users\이정민\Desktop\breast cancer")

# On Windows a bare "E:" is a *drive-relative* path: Path("E:") / "x" yields
# "E:x", which is resolved against the current directory of drive E instead
# of the drive root. Anchor at the root explicitly so the folders below are
# always "E:\<name>".
ROOT_E = Path("E:/")

# Inclusive (first, last) patient-index range -> folder holding that batch.
BASE_DIRS = {
    (1, 100):  ROOT / "1~100"  / "1~100_screen",
    (101, 200): ROOT_E / "101~200_screen",
    (201, 300): ROOT_E / "201~300_screen",
    (301, 400): ROOT_E / "301~400_screen",
    (401, 514): ROOT_E / "401~514_screen",
}


In [4]:
# One bounding-box workbook per patient batch, named after the batch's first index.
bbox_files = [
    ROOT / "boundarybox" / f"{first_idx}.xlsx"
    for first_idx in (1, 101, 201, 301, 401)
]

BBOX_COLUMNS = ["patient_idx", "x1", "x2", "y1", "y2", "z1", "z2"]


def _read_bbox_sheet(xlsx_path):
    """Read one bounding-box workbook and return only its valid patient rows."""
    # The sheets have no header row, so the first row is data as well.
    sheet = pd.read_excel(xlsx_path, header=None)

    # Only the first seven columns are used: patient index plus six corners.
    sheet = sheet.iloc[:, :7].copy()
    sheet.columns = BBOX_COLUMNS

    # Coerce the patient index to a number; anything unparsable becomes NaN ...
    sheet["patient_idx"] = pd.to_numeric(sheet["patient_idx"], errors="coerce")

    # ... and rows without a usable patient index are dropped.
    sheet = sheet.dropna(subset=["patient_idx"]).copy()
    sheet["patient_idx"] = sheet["patient_idx"].astype(int)
    return sheet


bbox_dfs = [_read_bbox_sheet(xlsx_path) for xlsx_path in bbox_files]
bbox_all = pd.concat(bbox_dfs, ignore_index=True)

print("bbox rows (유효한 환자 수):", len(bbox_all))
print(bbox_all.head())
print("patient_idx min/max:", bbox_all["patient_idx"].min(), bbox_all["patient_idx"].max())
print("patient_idx 예시:", sorted(bbox_all["patient_idx"].unique())[:20], "...")


bbox rows (유효한 환자 수): 514
   patient_idx     x1     x2     y1     y2     z1     z2
0            1  114.0  173.0  222.0  287.0   46.0   83.0
1            2  105.0  158.0  265.0  314.0  108.0  128.0
2            3  130.0  172.0  254.0  314.0   71.0   86.0
3            4  163.0  200.0  224.0  272.0   94.0  137.0
4            5  411.0  430.0  253.0  278.0   76.0   86.0
patient_idx min/max: 1 514
patient_idx 예시: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20)] ...


In [5]:
def _axis_extent(first_col, second_col):
    """Per-row box size along one axis, regardless of corner order."""
    corners = bbox_all[[first_col, second_col]]
    return corners.max(axis=1) - corners.min(axis=1)


# Box size along each axis for every patient.
x_size_all = _axis_extent("x1", "x2")
y_size_all = _axis_extent("y1", "y2")
z_size_all = _axis_extent("z1", "z2")

# Keep strictly positive sizes only.
valid_x = x_size_all.loc[x_size_all.gt(0)]
valid_y = y_size_all.loc[y_size_all.gt(0)]
valid_z = z_size_all.loc[z_size_all.gt(0)]

# Smallest box seen on each axis, and half of it (the crop half-width).
min_x_size, min_y_size, min_z_size = (
    int(sizes.min()) for sizes in (valid_x, valid_y, valid_z)
)
X, Y, Z = min_x_size // 2, min_y_size // 2, min_z_size // 2

print("전체 514명 기준 size 분포:")
print("x_size min/median/max:", valid_x.min(), valid_x.median(), valid_x.max())
print("y_size min/median/max:", valid_y.min(), valid_y.median(), valid_y.max())
print("z_size min/median/max:", valid_z.min(), valid_z.median(), valid_z.max())
print()
print("공통 최소 크기:", min_x_size, min_y_size, min_z_size)
print("half sizes (X,Y,Z):", X, Y, Z)


전체 514명 기준 size 분포:
x_size min/median/max: 14.0 41.0 181.0
y_size min/median/max: 12.0 43.5 211.0
z_size min/median/max: 2.0 20.0 120.0

공통 최소 크기: 14 12 2
half sizes (X,Y,Z): 7 6 1


In [6]:
# Patient-information workbooks, one per patient batch.
info_files = [
    ROOT / "exeldata" / f"{batch}.xlsx"
    for batch in ("1~100", "101~200", "201~300", "301~400", "401~514")
]

info_dfs = []
for xlsx_path in info_files:
    info_dfs.append(pd.read_excel(xlsx_path))
info_all = pd.concat(info_dfs, ignore_index=True)

mrn_col = "병록번호"
mol_col = "Molecular subtype(1 : luminal A, 2 : luminal B, 3: Her2 positive, 4: Triple negative)"

# Shift the 1..4 subtype codes to 0..3 class labels.
info_all["label"] = info_all[mol_col].astype(int).sub(1)

# Lookup table: medical record number (MRN) -> class label.
mrn_keys = info_all[mrn_col].astype(int)
label_values = info_all["label"].astype(int)
mrn_to_label = {mrn: label for mrn, label in zip(mrn_keys, label_values)}

print("총 환자 수(엑셀 기준):", len(info_all))
print("MRN→label 예시:", list(mrn_to_label.items())[:5])


총 환자 수(엑셀 기준): 514
MRN→label 예시: [(25563346, 0), (25553781, 3), (25515626, 2), (16207837, 1), (25590139, 0)]


In [7]:
def load_dicom_series_xyz(folder_path: "str | Path") -> np.ndarray:
    """Load all ``*.dcm`` files of a folder into a single (x, y, z) volume.

    Slices are ordered by their ``InstanceNumber``. A slice whose
    ``InstanceNumber`` is missing or empty falls back to its position in the
    name-sorted file list; ties keep the name-sorted order.

    Parameters
    ----------
    folder_path : str or Path
        Folder that directly contains the ``*.dcm`` files of one series.

    Returns
    -------
    numpy.ndarray
        The stacked ``pixel_array`` of every slice, with the axes of the
        (slice, row, column) stack reversed so the slice axis comes last.

    Raises
    ------
    FileNotFoundError
        If the folder contains no ``*.dcm`` file.
    """
    folder = Path(folder_path)
    files = sorted(folder.glob("*.dcm"))
    if not files:
        raise FileNotFoundError(f"No DICOM files found in {folder}")

    datasets = [pydicom.dcmread(f) for f in files]

    instance_numbers = []
    for position, ds in enumerate(datasets):
        raw = getattr(ds, "InstanceNumber", None)
        # The tag can exist with an empty value; int() would raise on that,
        # so treat it exactly like a missing tag.
        if raw is None or raw == "":
            instance_numbers.append(position)
        else:
            instance_numbers.append(int(raw))

    # sorted() is stable, so equal keys keep the name-sorted file order.
    order = sorted(range(len(datasets)), key=instance_numbers.__getitem__)

    vol_zyx = np.stack([datasets[i].pixel_array for i in order])  # (z, y, x)
    vol_xyz = np.transpose(vol_zyx, (2, 1, 0))                    # (x, y, z)

    return vol_xyz


In [8]:
def get_base_dir_for_patient(patient_idx: int) -> Path:
    """Return the batch folder whose inclusive index range contains the patient.

    Raises ValueError when no range in BASE_DIRS covers ``patient_idx``.
    """
    candidates = (
        folder
        for (first, last), folder in BASE_DIRS.items()
        if first <= patient_idx <= last
    )
    chosen = next(candidates, None)
    if chosen is None:
        raise ValueError(f"No BASE_DIR for patient_idx={patient_idx}")
    return chosen

def get_fastpeak_dir_and_mrn(patient_idx: int):
    """Locate a patient's fast-peak-enhancement series folder and read its MRN.

    Expected layout (``patient_idx`` is 1..514)::

        <base dir>/<zero-padded 3-digit index>/<index>-3 fast peak enhancement/<MRN>_.../
        e.g. .../1~100_screen/001/1-3 fast peak enhancement/<MRN>_.../
             .../101~200_screen/101/101-3 fast peak enhancement/<MRN>_.../

    Parameters
    ----------
    patient_idx : int
        Patient index used to pick the batch folder and build the folder names.

    Returns
    -------
    (Path, int)
        The series subfolder and the MRN parsed from the part of its name
        before the first underscore.

    Raises
    ------
    FileNotFoundError
        If the fast-peak folder is missing or contains no subfolder.
    ValueError
        If no batch folder covers ``patient_idx`` or the subfolder name does
        not start with an integer MRN.
    """
    base_dir = get_base_dir_for_patient(patient_idx)

    pid_str = f"{patient_idx:03d}"   # 1 -> '001', 101 -> '101'
    patient_dir = base_dir / pid_str

    fast_folder_name = f"{patient_idx}-3 fast peak enhancement"
    fast_root = patient_dir / fast_folder_name

    # is_dir() also rejects a plain file of that name, which iterdir() below
    # could not traverse.
    if not fast_root.is_dir():
        raise FileNotFoundError(f"No '{fast_folder_name}' in {patient_dir}")

    # iterdir() yields entries in arbitrary order; sort so the chosen
    # subfolder is deterministic when more than one exists.
    subdirs = sorted(d for d in fast_root.iterdir() if d.is_dir())
    if not subdirs:
        raise FileNotFoundError(f"No MRN subfolder inside {fast_root}")

    fastpeak_dir = subdirs[0]

    mrn_str = fastpeak_dir.name.split('_')[0]
    try:
        mrn = int(mrn_str)
    except ValueError:
        raise ValueError(
            f"Cannot parse MRN from folder name '{fastpeak_dir.name}' in {fast_root}"
        ) from None

    return fastpeak_dir, mrn


In [None]:
# Build the dataset: one fixed-size crop (2X, 2Y, 2Z) per patient, taken from
# the centre of that patient's bounding box, plus the matching class label.
vols = []
labels = []

# Shape every accepted crop must have; loop-invariant, so computed once.
expected_shape = (2*X, 2*Y, 2*Z)

for row in bbox_all.itertuples(index=False):
    patient_idx = int(row.patient_idx)   # 1..514

    # 1) Bounding-box corner coordinates
    x1, x2 = row.x1, row.x2
    y1, y2 = row.y1, row.y2
    z1, z2 = row.z1, row.z2

    if any(np.isnan(v) for v in [x1, x2, y1, y2, z1, z2]):
        print(f"[WARN] patient {patient_idx}: NaN in bbox, skipped.")
        continue

    # The two corners are not assumed to be ordered, so order them per axis.
    x_min, x_max = int(min(x1, x2)), int(max(x1, x2))
    y_min, y_max = int(min(y1, y2)), int(max(y1, y2))
    z_min, z_max = int(min(z1, z2)), int(max(z1, z2))

    x_size = x_max - x_min
    y_size = y_max - y_min
    z_size = z_max - z_min

    # Skip patients whose own box is smaller than the common crop size.
    if x_size < 2*X or y_size < 2*Y or z_size < 2*Z:
        print(f"[WARN] patient {patient_idx}: bbox too small "
              f"({x_size},{y_size},{z_size}) < ({2*X},{2*Y},{2*Z}), skipped.")
        continue

    # 2) Fast-peak folder + MRN
    try:
        fastpeak_dir, mrn = get_fastpeak_dir_and_mrn(patient_idx)
    except Exception as e:
        print(f"[WARN] patient {patient_idx}: fast peak dir not found - {e}")
        continue

    # 3) MRN -> label
    if mrn not in mrn_to_label:
        print(f"[WARN] patient {patient_idx}, MRN {mrn}: label not found, skipped.")
        continue

    label = int(mrn_to_label[mrn])

    # 4) Load the DICOM series
    try:
        vol_xyz = load_dicom_series_xyz(fastpeak_dir)
    except Exception as e:
        print(f"[WARN] patient {patient_idx}, MRN {mrn}: DICOM load failed - {e}")
        continue

    Xdim, Ydim, Zdim = vol_xyz.shape

    # 5) Cut out the bounding box first.
    # NOTE(review): the coordinates are used directly as 0-based,
    # end-exclusive slice bounds - confirm this matches how the boxes were
    # annotated.
    if x_min < 0 or x_max > Xdim or y_min < 0 or y_max > Ydim or z_min < 0 or z_max > Zdim:
        print(f"[WARN] patient {patient_idx}, MRN {mrn}: bbox out of volume; "
              f"vol={vol_xyz.shape}, bbox=({x_min}:{x_max},{y_min}:{y_max},{z_min}:{z_max})")
        continue

    bbox_vol = vol_xyz[x_min:x_max, y_min:y_max, z_min:z_max]
    bx, by, bz = bbox_vol.shape

    # Crop the common size (2X, 2Y, 2Z) around the centre of the box.
    cx, cy, cz = bx // 2, by // 2, bz // 2

    x_start, x_end = cx - X, cx + X
    y_start, y_end = cy - Y, cy + Y
    z_start, z_end = cz - Z, cz + Z

    vol_crop = bbox_vol[x_start:x_end, y_start:y_end, z_start:z_end]

    if vol_crop.shape != expected_shape:
        print(f"[WARN] patient {patient_idx}, MRN {mrn}: crop shape {vol_crop.shape} != {expected_shape}, skipped.")
        continue

    # vol_crop is a view into the full volume; storing the view would keep
    # every patient's whole volume alive until the final np.array() call.
    # Copy the small crop so the full volume can be freed each iteration.
    vols.append(vol_crop.copy())
    labels.append(label)

    print(f"[OK] patient {patient_idx}, MRN {mrn}: added {vol_crop.shape}, label={label}")

vols = np.array(vols)
labels = np.array(labels)

print("최종 vols.shape:", vols.shape)
print("최종 labels.shape:", labels.shape)


[OK] patient 1, MRN 25563346: added (14, 12, 2), label=0
[OK] patient 2, MRN 25553781: added (14, 12, 2), label=3
[OK] patient 3, MRN 25515626: added (14, 12, 2), label=2
[OK] patient 4, MRN 16207837: added (14, 12, 2), label=1
[OK] patient 5, MRN 25590139: added (14, 12, 2), label=0
[OK] patient 6, MRN 25541973: added (14, 12, 2), label=3
[OK] patient 7, MRN 25556052: added (14, 12, 2), label=0
[OK] patient 8, MRN 12936582: added (14, 12, 2), label=0
[OK] patient 9, MRN 25686117: added (14, 12, 2), label=1
[OK] patient 10, MRN 25613304: added (14, 12, 2), label=0
[OK] patient 11, MRN 25612116: added (14, 12, 2), label=2
[OK] patient 12, MRN 25598094: added (14, 12, 2), label=1
[OK] patient 13, MRN 24420617: added (14, 12, 2), label=2
[OK] patient 14, MRN 23623507: added (14, 12, 2), label=0
[OK] patient 15, MRN 23549957: added (14, 12, 2), label=1
[OK] patient 16, MRN 12016301: added (14, 12, 2), label=1
[OK] patient 17, MRN 25571710: added (14, 12, 2), label=2
[OK] patient 18, MRN 25