# 0. 구글 드라이브 마운트

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1. 데이터 재구성

In [6]:
# === 0) 기본 세팅 ============================================================
# 런타임: Python 3.x (Colab), 런타임 유형: 고사양(가능하면 High-RAM) 권장
import os, re, json, gc
import pandas as pd
import numpy as np

# from google.colab import drive
# drive.mount('/content/drive')

# === 1) 경로/옵션 =============================================================
HOSP_DIR   = f"/content/drive/MyDrive/DILAB/mimiciv_3.1/files/hosp"
ICU_DIR    = f"/content/drive/MyDrive/DILAB/mimiciv_3.1/files/icu"
NOTE_DIR   = f"/content/drive/MyDrive/DILAB/mimic-iv-note_2.2/files/note"

OUTPUT_DIR = "/content/drive/MyDrive/DILAB/mimic-iv_reconstructed" # 👈 결과 저장 경로
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 테스트/성능 옵션
HADM_LIMIT                = 3    # e.g., 2000  # 먼저 소량으로 검증 후 None으로 전체
USE_EMAR                  = False   # eMAR(투약 실행) 요약 포함 (매우 큼) 💊
USE_LABS                  = True    # labevents 집계 사용 (큼) 🧪
USE_MICROBIO             = True    # 미생물 이벤트 요약 🦠
USE_RADIOLOGY_TEXT        = True    # 영상 판독 텍스트 요약 🖼️
USE_ICU_LIGHT             = True    # ICUSTAYS(경량)만: 입출실+LOS 집계 (안전) 🛏️
USE_ICU_VENT_PRESSOR_HEAVY= False   # chartevents/inputevents로 VENT/바소프레서 요약 (매우 큼) 🔌

SAVE_PARQUET = True
SAVE_CSV     = False

# 유틸
def td(dt):  # 안전한 to_datetime
    return pd.to_datetime(dt, errors="coerce", utc=True)

def jdump(x):
    return json.dumps(x, ensure_ascii=False, separators=(",", ":"))

def safe_read_csv(path, usecols=None, dtype=None):
    return pd.read_csv(path, usecols=usecols, dtype=dtype, low_memory=False)

# === 2) 코어 테이블: 환자/입원 =================================================
print("📥 Loading patients/admissions ...")
patients = safe_read_csv(
    f"{HOSP_DIR}/patients.csv",
    usecols=["subject_id","gender","anchor_age","anchor_year","anchor_year_group"]
)

admissions = safe_read_csv(
    f"{HOSP_DIR}/admissions.csv",
    usecols=[
        "subject_id","hadm_id","admittime","dischtime","deathtime","admission_type",
        "admit_provider_id","admission_location","discharge_location","insurance",
        "edregtime","edouttime","hospital_expire_flag"
    ]
)

admissions["admittime"]  = td(admissions["admittime"])
admissions["dischtime"]  = td(admissions["dischtime"])
admissions["deathtime"]  = td(admissions["deathtime"])
admissions["edregtime"]  = td(admissions["edregtime"])
admissions["edouttime"]  = td(admissions["edouttime"])

core = admissions.merge(patients, on="subject_id", how="left")

# 선택적으로 적은 수의 HADM으로 제한(초기 검증용)
if HADM_LIMIT is not None:
    keep_hadm = core["hadm_id"].dropna().unique()[:int(HADM_LIMIT)]
    core = core[core["hadm_id"].isin(keep_hadm)].copy()

print(f"✅ core rows: {len(core):,}")

# === 3) 퇴원요약 섹션: discharge_detail 우선 + discharge 텍스트 백업 ===========
print("📥 Loading discharge notes ...")
# 3-1) 원문(백업 파서 용)
discharge = safe_read_csv(
    f"{NOTE_DIR}/discharge.csv",
    usecols=["note_id","subject_id","hadm_id","note_type","note_seq","charttime","storetime","text"]
)
discharge["charttime"] = td(discharge["charttime"])
discharge["storetime"] = td(discharge["storetime"])

# 동일 hadm_id 내 최신 note_seq만 선택(가장 최신본)
discharge = (discharge
             .sort_values(["hadm_id","note_seq"])
             .drop_duplicates(subset=["hadm_id"], keep="last"))

# 3-2) detail(구조화 섹션)
detail = safe_read_csv(
    f"{NOTE_DIR}/discharge_detail.csv",
    usecols=["note_id","subject_id","field_name","field_value","field_ordinal"]
)

# detail → (note_id, field_name)별 field_ordinal 순서대로 이어붙임
detail = detail.sort_values(["note_id","field_name","field_ordinal"])
agg_detail = (detail
              .groupby(["note_id","field_name"], as_index=False)["field_value"]
              .apply(lambda s: "\n".join([str(x) for x in s if pd.notna(x)]))
             )

# 표준화 키 매핑(필요 시 추가 동의어 확장)
FIELD_MAP = {
    "Chief Complaint": "chief_complaint",
    "History of Present Illness": "hpi",
    "Past Medical History": "pmh",
    "Family History": "fhx",
    "Social History": "shx",
    "Allergies": "allergies",
    "Physical Exam": "physical_exam",
    "Assessment": "assessment",
    "Impression": "assessment",  # 일부 기관에서 Impression로 표기
    "Hospital Course": "hospital_course",
    "Discharge Diagnosis": "discharge_diagnosis",
    "Plan": "plan",
    "Assessment and Plan": "assessment_plan",
    "Discharge Instructions": "discharge_instructions",
    "Medications on Discharge": "meds_on_discharge",
    "Discharge Medications": "meds_on_discharge",
}

agg_detail["std_key"] = agg_detail["field_name"].map(FIELD_MAP).fillna(agg_detail["field_name"])
# pivot: note_id x std_key
pivot_detail = agg_detail.pivot_table(index="note_id", columns="std_key", values="field_value", aggfunc="first").reset_index()

# discharge + pivot_detail
dis_with_detail = discharge.merge(pivot_detail, on="note_id", how="left")

# 텍스트 백업 파서(디테일 없는 경우에만 사용)
SECTION_PATTERNS = [
    # (표준키, 정규식 시작패턴(대소문 무시))
    ("chief_complaint", r"(?:^|\n)\s*(?:chief complaint|cc)\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
    ("hpi",              r"(?:^|\n)\s*(?:history of present illness|hpi)\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
    ("pmh",              r"(?:^|\n)\s*(?:past medical history|pmh)\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
    ("fhx",              r"(?:^|\n)\s*(?:family history)\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
    ("shx",              r"(?:^|\n)\s*(?:social history)\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
    ("allergies",        r"(?:^|\n)\s*(?:allergies?)\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
    ("physical_exam",    r"(?:^|\n)\s*(?:physical (?:exam|examination))\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
    ("assessment",       r"(?:^|\n)\s*(?:assessment|impression)\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
    ("hospital_course",  r"(?:^|\n)\s*(?:hospital course)\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
    ("discharge_diagnosis", r"(?:^|\n)\s*(?:discharge diagnosis(?:es)?)\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
    ("plan",             r"(?:^|\n)\s*(?:plan|discharge plan)\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
    ("discharge_instructions", r"(?:^|\n)\s*(?:discharge instructions?)\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
    ("meds_on_discharge",r"(?:^|\n)\s*(?:medications? on discharge|discharge medications?)\s*[:\-]\s*(.*?)(?=\n[A-Z][^\n]{0,60}\s*:|\Z)"),
]

def fallback_parse_sections(text):
    res = {}
    if not isinstance(text, str) or len(text) == 0:
        return res
    t = text.lower()
    for key, pat in SECTION_PATTERNS:
        m = re.search(pat, t, flags=re.S|re.I)
        if m:
            # 원문에서 해당 구간을 대략 추출(소문 대치했지만 원문 보존 어려움 → 소문으로 저장)
            val = m.group(1).strip()
            res[key] = val
    return res

# detail 없을 때만 백업 파싱 채움
section_cols = [
    "chief_complaint","hpi","pmh","fhx","shx","allergies","physical_exam",
    "assessment","hospital_course","discharge_diagnosis","plan","discharge_instructions","meds_on_discharge","assessment_plan"
]
for col in section_cols:
    if col not in dis_with_detail.columns:
        dis_with_detail[col] = np.nan

mask_need_parse = dis_with_detail[section_cols].isna().all(axis=1)

if mask_need_parse.any():
    # 1) 텍스트를 파싱해서 DataFrame으로
    parsed_list = dis_with_detail.loc[mask_need_parse, "text"].apply(fallback_parse_sections)
    parsed_df = pd.json_normalize(parsed_list).reindex(columns=section_cols)

    # 2) 인덱스를 원본과 정확히 맞추기 (매우 중요)
    parsed_df.index = dis_with_detail.index[mask_need_parse]

    # 3-A) where 로 NaN만 채우기 (권장)
    sub = dis_with_detail.loc[mask_need_parse, section_cols]
    sub = sub.where(sub.notna(), parsed_df)  # 같은 모양/인덱스 필요
    dis_with_detail.loc[mask_need_parse, section_cols] = sub

    # 3-B) (대안) combine_first도 가능
    # sub = dis_with_detail.loc[mask_need_parse, section_cols].combine_first(parsed_df)
    # dis_with_detail.loc[mask_need_parse, section_cols] = sub

# mask_need_parse = dis_with_detail[section_cols].isna().all(axis=1)
# parsed_list = dis_with_detail.loc[mask_need_parse, "text"].apply(fallback_parse_sections)
# parsed_df = pd.json_normalize(parsed_list).reindex(columns=section_cols)
# dis_with_detail.loc[mask_need_parse, section_cols] = dis_with_detail.loc[mask_need_parse, section_cols].fillna(parsed_df.values)


# hadm_id 기준으로 조인 준비(최신 note만 남아있음)
dis_sections = dis_with_detail[["hadm_id"] + section_cols].copy()

# === 4) 투약(처방/실행/메타) 요약 ===============================================
print("📥 Loading prescriptions ...")
presc_cols = [
    "subject_id","hadm_id","pharmacy_id","poe_id","poe_seq","order_provider_id",
    "starttime","stoptime","drug_type","drug","formulary_drug_cd","gsn","ndc",
    "prod_strength","form_rx","dose_val_rx","dose_unit_rx",
    "form_val_disp","form_unit_disp","doses_per_24_hrs","route"
]
presc = safe_read_csv(f"{HOSP_DIR}/prescriptions.csv", usecols=presc_cols)
presc["starttime"] = td(presc["starttime"])
presc["stoptime"]  = td(presc["stoptime"])

# 현재 core에 있는 hadm만
presc = presc[presc["hadm_id"].isin(core["hadm_id"])].copy()

# 입원창(admittime~dischtime)과 겹치는 처방만 (시간 정보 없으면 일단 포함)
adm_times = core.set_index("hadm_id")[["admittime","dischtime"]].to_dict("index")

def overlaps_adm(row):
    a = adm_times.get(row["hadm_id"])
    if a is None: return False
    s, e = row["starttime"], row["stoptime"]
    # 어떤 값이 NaT이면 보수적으로 포함
    if pd.isna(s) or pd.isna(e) or pd.isna(a["admittime"]) or pd.isna(a["dischtime"]):
        return True
    return not (e < a["admittime"] or s > a["dischtime"])

presc = presc[presc.apply(overlaps_adm, axis=1)]

def summarize_prescriptions(df):
    # hadm_id별 약물 리스트(JSON)
    def one_hadm(g):
        items = []
        for _, r in g.iterrows():
            items.append({
                "drug": r["drug"],
                "route": r["route"],
                "dose": (str(r["dose_val_rx"]) if pd.notna(r["dose_val_rx"]) else None),
                "dose_unit": r["dose_unit_rx"],
                "doses_per_24_hrs": r["doses_per_24_hrs"],
                "start": (r["starttime"].isoformat() if pd.notna(r["starttime"]) else None),
                "stop":  (r["stoptime"].isoformat() if pd.notna(r["stoptime"]) else None),
                "drug_type": r["drug_type"],
            })
        return jdump(items)
    return df.groupby("hadm_id").apply(one_hadm).rename("inpatient_med_summary_json").reset_index()

presc_summary = summarize_prescriptions(presc)

# eMAR(선택)
if USE_EMAR:
    print("📥 Loading eMAR (this may be large) ...")
    emar = safe_read_csv(
        f"{HOSP_DIR}/emar.csv",
        usecols=["subject_id","hadm_id","emar_id","emar_seq","poe_id","pharmacy_id","enter_provider_id","charttime","medication","event_txt","scheduletime","storetime"]
    )
    emar["charttime"] = td(emar["charttime"])
    emar = emar[emar["hadm_id"].isin(core["hadm_id"])].copy()

    def summarize_emar(df):
        def one_hadm(g):
            # 간단 요약: 약물별 event_txt 카운트 + 마지막 투약시각
            out = {}
            for med, gg in g.groupby("medication"):
                d = {
                    "n_events": int(len(gg)),
                    "n_given": int((gg["event_txt"].str.lower()=="given").sum()),
                    "last_charttime": (gg["charttime"].max().isoformat() if pd.notna(gg["charttime"].max()) else None),
                }
                out[med] = d
            return jdump(out)
        return df.groupby("hadm_id").apply(one_hadm).rename("emar_admin_summary_json").reset_index()

    emar_summary = summarize_emar(emar)
else:
    emar_summary = pd.DataFrame(columns=["hadm_id","emar_admin_summary_json"])

# === 5) 검사/미생물/영상 요약 ================================================
# 5-1) Labs (선택)
if USE_LABS:
    print("📥 Loading d_labitems & labevents (filtered) ...")
    dlab = safe_read_csv(f"{HOSP_DIR}/d_labitems.csv", usecols=["itemid","label","fluid","category"])
    # 대표 항목 라벨 정의(필요시 확장)
    TARGET_LABELS = {
        "WBC": "WBC",
        "Hemoglobin": "Hgb",
        "Platelet Count": "Plt",
        "Sodium": "Na",
        "Potassium": "K",
        "Chloride": "Cl",
        "Bicarbonate": "HCO3",
        "Creatinine": "Cr",
        "Urea Nitrogen": "BUN",
        "Glucose": "Glucose",
    }
    target_dlab = dlab[dlab["label"].isin(TARGET_LABELS.keys())].copy()
    target_dlab["short"] = target_dlab["label"].map(TARGET_LABELS)

    target_itemids = set(target_dlab["itemid"].tolist())
    target_map     = dict(zip(target_dlab["itemid"], target_dlab["short"]))

    # labevents는 청크로 읽으며 hadm_id + itemid 필터
    lab_summary_rows = []
    cols = ["labevent_id","subject_id","hadm_id","specimen_id","itemid","charttime","value","valuenum","valueuom","ref_range_lower","ref_range_upper","flag","priority"]
    chunk_iter = pd.read_csv(f"{HOSP_DIR}/labevents.csv", usecols=cols, chunksize=1_000_000, low_memory=False)
    keep_hadm_set = set(core["hadm_id"].dropna().unique())
    for chunk in chunk_iter:
        chunk = chunk[chunk["hadm_id"].isin(keep_hadm_set)]
        chunk = chunk[chunk["itemid"].isin(target_itemids)]
        if chunk.empty:
            continue
        chunk["charttime"] = td(chunk["charttime"])
        # 입원창과 겹치는 결과만(시간 없는 건 보수적으로 포함)
        def in_window(r):
            a = adm_times.get(r["hadm_id"])
            if a is None: return False
            ct = r["charttime"]
            if pd.isna(ct) or pd.isna(a["admittime"]) or pd.isna(a["dischtime"]):
                return True
            return (a["admittime"] <= ct) and (ct <= a["dischtime"])
        chunk = chunk[chunk.apply(in_window, axis=1)]
        if chunk.empty:
            continue

        # hadm_id, itemid 별 min/max/last(valuenum) 집계
        chunk["short"] = chunk["itemid"].map(target_map)
        agg = (chunk.sort_values("charttime")
                    .groupby(["hadm_id","short"])
                    .agg(min_val=("valuenum","min"),
                         max_val=("valuenum","max"),
                         last_val=("valuenum","last"),
                         unit=("valueuom","last"))
                    .reset_index())
        lab_summary_rows.append(agg)
        del chunk, agg
        gc.collect()

    if lab_summary_rows:
        labs_agg = pd.concat(lab_summary_rows, ignore_index=True)
        # hadm_id 별 JSON으로 직렬화
        def one_hadm(g):
            out = {}
            for _, r in g.iterrows():
                out[r["short"]] = {
                    "min": (None if pd.isna(r["min_val"]) else float(r["min_val"])),
                    "max": (None if pd.isna(r["max_val"]) else float(r["max_val"])),
                    "last":(None if pd.isna(r["last_val"]) else float(r["last_val"])),
                    "unit": r["unit"]
                }
            return jdump(out)
        labs_summary = labs_agg.groupby("hadm_id").apply(one_hadm).rename("lab_summary_json").reset_index()
    else:
        labs_summary = pd.DataFrame(columns=["hadm_id","lab_summary_json"])
else:
    labs_summary = pd.DataFrame(columns=["hadm_id","lab_summary_json"])

# 5-2) Microbiology (선택)
if USE_MICROBIO:
    print("📥 Loading microbiologyevents ...")
    micro = safe_read_csv(
        f"{HOSP_DIR}/microbiologyevents.csv",
        usecols=["microevent_id","subject_id","hadm_id","micro_specimen_id","chartdate","charttime",
                 "spec_itemid","spec_type_desc","test_itemid","test_name","org_itemid","org_name",
                 "isolate_num","quantity","ab_itemid","ab_name","dilution_text","dilution_comparison","dilution_value","interpretation","comments"]
    )
    micro["charttime"] = td(micro["charttime"])
    micro = micro[micro["hadm_id"].isin(core["hadm_id"])].copy()

    def summarize_micro(g):
        # 간단 요약: 균/검체별 해석 카운트
        res = []
        for (spec, org), gg in g.groupby(["spec_type_desc","org_name"], dropna=False):
            inter = gg["interpretation"].dropna().value_counts().to_dict()
            res.append({
                "specimen": spec,
                "organism": org,
                "n": int(len(gg)),
                "interpretation_cnt": inter
            })
        return jdump(res)
    micro_summary = micro.groupby("hadm_id").apply(summarize_micro).rename("microbio_summary_json").reset_index()
else:
    micro_summary = pd.DataFrame(columns=["hadm_id","microbio_summary_json"])

# 5-3) Radiology reports (선택)
if USE_RADIOLOGY_TEXT:
    print("📥 Loading radiology notes ...")
    radio = safe_read_csv(
        f"{NOTE_DIR}/radiology.csv",
        usecols=["note_id","subject_id","hadm_id","note_seq","charttime","storetime","text"]
    )
    radio["charttime"] = td(radio["charttime"])
    # 같은 검사에 대한 addendum이 있을 수 있으므로 hadm_id별 최신 n개만 단순 요약
    def summarize_radio(g):
        g = g.sort_values("charttime").tail(5)  # 최신 5건까지만
        # 간단히 첫 800자씩만 이어붙이기
        parts = []
        for _, r in g.iterrows():
            t = str(r["text"]) if pd.notna(r["text"]) else ""
            parts.append(t[:800])
        return jdump({"n_reports": int(len(g)), "snippets": parts})
    radio_summary = radio.groupby("hadm_id").apply(summarize_radio).rename("radiology_summary_text").reset_index()
else:
    radio_summary = pd.DataFrame(columns=["hadm_id","radiology_summary_text"])

# === 6) ICU 관련(경량/선택적 고급) ============================================
if USE_ICU_LIGHT:
    print("📥 Loading icustays (light) ...")
    icu = safe_read_csv(
        f"{ICU_DIR}/icustays.csv",
        usecols=["subject_id","hadm_id","stay_id","first_careunit","last_careunit","intime","outtime","los"]
    )
    icu["intime"] = td(icu["intime"])
    icu["outtime"]= td(icu["outtime"])

    # hadm_id별 ICU 체류 집계
    icu_agg = (icu.groupby("hadm_id", as_index=False)
                  .agg(icu_stay_count=("stay_id","count"),
                       icu_first_intime=("intime","min"),
                       icu_last_outtime=("outtime","max"),
                       icu_total_los_days=("los","sum")))
else:
    icu_agg = pd.DataFrame(columns=["hadm_id","icu_stay_count","icu_first_intime","icu_last_outtime","icu_total_los_days"])

# (매우 무거운 ventilator/pressor 요약은 필요 시 확장)
if USE_ICU_VENT_PRESSOR_HEAVY:
    print("⚠️ Heavy ICU streams are disabled by default. Set USE_ICU_VENT_PRESSOR_HEAVY=True to implement.")

# === 7) 조립: master wide 테이블 ==============================================
print("🧩 Assembling wide table ...")
wide = (core
    .merge(dis_sections, on="hadm_id", how="left")
    .merge(presc_summary, on="hadm_id", how="left")
    .merge(emar_summary, on="hadm_id", how="left")
    .merge(labs_summary, on="hadm_id", how="left")
    .merge(micro_summary, on="hadm_id", how="left")
    .merge(radio_summary, on="hadm_id", how="left")
    .merge(icu_agg, on="hadm_id", how="left")
)

# 날짜 문자열화(일부 모델 학습 전처리에 유리)
date_cols = ["admittime","dischtime","deathtime","edregtime","edouttime",
             "icu_first_intime","icu_last_outtime"]
for c in date_cols:
    if c in wide.columns:
        wide[c] = wide[c].dt.tz_convert(None).astype(str).replace("NaT","")

# === 8) 저장 ==================================================================
print(f"✅ Final rows: {len(wide):,}")
out_base = os.path.join(OUTPUT_DIR, "mimiciv_clinical_record_sheet")

if SAVE_PARQUET:
    wide.to_parquet(out_base + ".parquet", index=False)
    print(f"💾 Saved Parquet → {out_base}.parquet")

if SAVE_CSV:
    wide.to_csv(out_base + ".csv", index=False)
    print(f"💾 Saved CSV     → {out_base}.csv")

print("🎉 Done!")

📥 Loading patients/admissions ...
✅ core rows: 3
📥 Loading discharge notes ...


 '"i can\'t stop being moody and irritable and i\'ve been thinking \nmore about suicide."'
 nan 'morbid obesity']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  dis_with_detail.loc[mask_need_parse, section_cols] = sub
 'ms. ___ is a ___ year-old woman with pmh significant for \nchronic anemia, osteoporosis, hypertension, ataxia, and recent \nl5 fracture in the setting of recurrent falls who presents from \nhome with fatigue and generalized weakness and diarrhea.  \n the patient\'s recent history is notable for the follow:  \n - on ___, she presented with 4 days of lbp s/p fall from \nstanding at which time imaging revealed acute l5 fracture. she \nwas evaluated by spine team who recommended early mobilization, \npain control, but no brace required. she was evaluated by ___, \nand discharged to ___.  \n - she was discharged home with ___ on ___.  \n - on ___, she again presented to ___ s/p fall from \nstanding while trying to reach for a glas

📥 Loading prescriptions ...


  return df.groupby("hadm_id").apply(one_hadm).rename("inpatient_med_summary_json").reset_index()


📥 Loading d_labitems & labevents (filtered) ...


  labs_summary = labs_agg.groupby("hadm_id").apply(one_hadm).rename("lab_summary_json").reset_index()


📥 Loading microbiologyevents ...


  micro_summary = micro.groupby("hadm_id").apply(summarize_micro).rename("microbio_summary_json").reset_index()


📥 Loading radiology notes ...


  radio_summary = radio.groupby("hadm_id").apply(summarize_radio).rename("radiology_summary_text").reset_index()


📥 Loading icustays (light) ...
🧩 Assembling wide table ...
✅ Final rows: 3
💾 Saved Parquet → /content/drive/MyDrive/DILAB/mimic-iv_reconstructed/mimiciv_clinical_record_sheet.parquet
🎉 Done!
