# In [None]:
import os 
import pandas as pd
from tqdm import tqdm

# Input locations: process_events() looks for the diagnoses, procedures and
# prescriptions CSVs under HOSP_DIR and for chartevents.csv under ICU_DIR.
HOSP_DIR = "data/mimiciv/hosp"
ICU_DIR = "data/mimiciv/icu"
# Destination directory for the combined all_events.csv output.
OUTPUT_DIR = "data/processed_mimiciv"
# Runs at import time; exist_ok=True makes it a no-op when the directory
# is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def load_and_format_table(filepath, time_col, id_col, code_col, event_type_label, nrows=None):
    """Load one event CSV and normalise it to the shared event schema.

    Args:
        filepath: Path of the CSV file to read.
        time_col: Name of the column holding the event time. If the file
            has no such column, every row gets a ``NaT`` timestamp.
        id_col: Name of the column identifying the subject; it is renamed
            to ``subject_id`` in the result.
        code_col: Name of the column holding the event code.
        event_type_label: Constant stored in ``event_type`` for every row.
        nrows: Optional cap on the number of data rows read from the file.

    Returns:
        A DataFrame with exactly the columns ``subject_id``, ``timestamp``,
        ``event_type`` and ``event_code``. Rows whose id or code is missing
        are dropped, as are rows whose timestamp cannot be parsed (when
        ``time_col`` exists in the file).

    Raises:
        KeyError: If ``id_col`` or ``code_col`` is not a column of the file.
    """
    print(f"📄 Loading {os.path.basename(filepath)} ...")

    # Inspect only the header so we know which of the requested columns
    # exist. This avoids loading every column of a (possibly huge) file
    # just because the optional time column is absent.
    available = pd.read_csv(filepath, nrows=0).columns
    missing = [col for col in (id_col, code_col) if col not in available]
    if missing:
        raise KeyError(
            f"{missing} not found in {os.path.basename(filepath)}"
        )

    wanted = [id_col, code_col]
    if time_col in available:
        wanted.append(time_col)
    wanted = list(dict.fromkeys(wanted))  # de-duplicate, keep order

    # Codes are identifiers, not numbers: reading them as text keeps leading
    # zeros ("0389") and prevents float artefacts ("123.0") that appear when
    # a numeric-looking column contains blanks.
    df = pd.read_csv(
        filepath,
        usecols=wanted,
        dtype={code_col: str},
        low_memory=False,
        nrows=nrows,
    )

    if time_col in df.columns:
        # Unparseable times become NaT and those rows are discarded.
        df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
        df = df.dropna(subset=[time_col])
        df = df.rename(columns={time_col: "timestamp"})
    else:
        df["timestamp"] = pd.NaT

    df = df.dropna(subset=[id_col, code_col])
    df["event_type"] = event_type_label
    df["event_code"] = df[code_col].astype(str)
    df = df.rename(columns={id_col: "subject_id"})

    return df[["subject_id", "timestamp", "event_type", "event_code"]]


def process_events():
    """Merge every available source table into one sorted event file.

    Each source CSV that exists on disk is loaded through
    ``load_and_format_table``; the results are concatenated, rows lacking a
    subject id or event code are dropped, the events are ordered by subject
    and timestamp, and the result is written to ``all_events.csv`` inside
    ``OUTPUT_DIR``. If none of the source files exists, a message is printed
    and nothing is written.
    """
    # (directory, file name, time column, code column, event label, row cap)
    # Diagnoses/procedures are requested with "chartdate"; the loader falls
    # back to NaT timestamps when a file does not carry that column.
    # chartevents is very large, so only its first 1M rows are read.
    sources = (
        (HOSP_DIR, "diagnoses_icd.csv", "chartdate", "icd_code", "diagnosis", None),
        (HOSP_DIR, "procedures_icd.csv", "chartdate", "icd_code", "procedure", None),
        (HOSP_DIR, "prescriptions.csv", "starttime", "ndc", "medication", None),
        (ICU_DIR, "chartevents.csv", "charttime", "itemid", "chart", 1_000_000),
    )

    frames = []
    for base_dir, file_name, time_col, code_col, label, row_cap in sources:
        csv_path = os.path.join(base_dir, file_name)
        if not os.path.exists(csv_path):
            continue
        frames.append(
            load_and_format_table(
                csv_path, time_col, "subject_id", code_col, label, nrows=row_cap
            )
        )

    if len(frames) == 0:
        print("❌ No valid CSV files found. Please check your dataset paths.")
        return

    print("🔗 Concatenating and sorting all event data ...")
    events = (
        pd.concat(frames)
        .dropna(subset=["subject_id", "event_code"])
        .sort_values(by=["subject_id", "timestamp"])
    )

    out_path = os.path.join(OUTPUT_DIR, "all_events.csv")
    print(f"💾 Saving processed events to {OUTPUT_DIR}/all_events.csv")
    events.to_csv(out_path, index=False)
    print("✅ Done!")


# Run the full pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    process_events()