# NWICU Preprocess (standardized)

## 0) Setup

### 0.1 Libraries

In [None]:
from __future__ import annotations

import argparse
import gc
import os
import re
import shutil
import sys
import time
from pathlib import Path
from types import SimpleNamespace
from typing import List

import numpy as np
import pandas as pd
import tables
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### 0.2 Paths & environment

In [None]:
from pathlib import Path

# Adjust these two paths to the local environment.
BASE_DIR = Path(r"<DATA_DIR>/nwicu")  # root folder containing nw_icu/csv and nw_hosp/csv
OUT_DIR = Path(r"<OUTPUT_DIR>/nwicu")  # output directory

# Locations of the artefacts written by the pipeline steps, all under OUT_DIR/preprocess.
PREPROCESS_DIR = OUT_DIR / "preprocess"
IMPUTED_DIR = PREPROCESS_DIR / "imputed_full"
LABELS_PATH = PREPROCESS_DIR / "labels.parquet"
FEATURES_DIR = PREPROCESS_DIR / "batches" / "features_raw"
H5_PATH = PREPROCESS_DIR / "h5" / "dataset_features.h5"

# Create the output folders up front (idempotent; parents are created as needed).
PREPROCESS_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)

### 0.3 Defaults / mappings

In [None]:
# chartevents itemid -> feature column name; load_vitals keeps only these itemids.
VITALS_MAP = {320045: 'heart_rate', 320179: 'sbp', 320180: 'dbp', 320050: 'sbp_art', 320051: 'dbp_art', 320210: 'resp_rate', 320277: 'spo2', 323761: 'temperature_f', 326707: 'height_in', 326531: 'weight_oz', 300001: 'bmi'}
# labevents itemid -> feature column name; load_labs keeps only these itemids.
# NOTE(review): the lab_5xxxx names look like MIMIC-IV lab itemids — confirm against the mapping source.
LABS_MAP = {100001: 'glucose', 100013: 'lab_50803', 100010: 'lab_50983', 100011: 'lab_50971', 100015: 'lab_50970', 100044: 'lab_50808', 100031: 'lab_50813', 100020: 'lab_50885', 100049: 'lab_50883', 100042: 'lab_50861', 100032: 'lab_50878', 100053: 'lab_50889', 100021: 'lab_50862', 100006: 'lab_51638', 100007: 'lab_50855', 100079: 'lab_50852', 100349: 'lab_51223', 100014: 'platelet_count', 100030: 'lab_51274', 100046: 'lab_51275', 100048: 'lab_51214', 100145: 'lab_52144', 100359: 'lab_51647', 100085: 'lab_50910', 100285: 'lab_50908', 100059: 'lab_51002', 100057: 'lab_51003', 100071: 'lab_50963', 100002: 'lab_50912', 100004: 'lab_51006', 100009: 'lab_50960', 100209: 'lab_51099', 100075: 'lab_50915', 100221: 'lab_50966', 100216: 'lab_50967', 100223: 'lab_50975', 100052: 'lab_50924', 100123: 'lab_50935', 100150: 'lab_50805', 100134: 'lab_50856'}
# Alternation matched case-insensitively against prescriptions.drug by load_vaso_flags.
VASO_REGEX = 'norepinephrine|epinephrine|phenylephrine|vasopressin|dopamine|dobutamine|milrinone'
# label_main flags a row when mbp <= MAP_THRESHOLD (units presumably mmHg — confirm).
MAP_THRESHOLD = 65.0
# label_main flags a row when lab_50813 >= LACTATE_THRESHOLD (units presumably mmol/L — confirm).
LACTATE_THRESHOLD = 2.0
# Default width, in minutes, of the resampling grid used by the impute step.
GRID_MINUTES = 5

# Single namespace read by the pipeline functions below (config.VITALS_MAP, config.OUTPUT_DIR, ...).
config = SimpleNamespace(
    VITALS_MAP=VITALS_MAP,
    LABS_MAP=LABS_MAP,
    VASO_REGEX=VASO_REGEX,
    MAP_THRESHOLD=MAP_THRESHOLD,
    LACTATE_THRESHOLD=LACTATE_THRESHOLD,
    GRID_MINUTES=GRID_MINUTES,
    BASE_DIR=BASE_DIR,
    OUTPUT_DIR=OUT_DIR,
)

In [None]:
"""Pré-processamento: mapeia itemids, pivot vitais/labs e marca vasopressor."""

from __future__ import annotations

import argparse
import re
from pathlib import Path

import numpy as np
import pandas as pd



def load_vitals(chartevents_path: Path) -> pd.DataFrame:
    """Load charted vitals and pivot them to one row per (stay_id, charttime).

    The CSV is streamed in chunks of 500,000 rows. Only itemids present in
    ``config.VITALS_MAP`` are kept and renamed to their feature names.

    Args:
        chartevents_path: CSV with at least the columns ``stay_id``,
            ``charttime``, ``itemid`` and ``valuenum``.

    Returns:
        A wide frame sorted by ``stay_id``/``charttime`` with one column per
        feature, plus the derived columns ``mbp``, ``temperature``,
        ``weight_kg`` and ``height_cm`` when their inputs are present. When no
        row matches, an empty frame that still carries the ``stay_id`` and
        ``charttime`` key columns, so callers can merge on them.
    """
    use_ids = set(config.VITALS_MAP)
    chunks = []
    for chunk in pd.read_csv(
        chartevents_path,
        chunksize=500_000,
        parse_dates=["charttime"],
        usecols=["stay_id", "charttime", "itemid", "valuenum"],
    ):
        # Copy the filtered rows so the column assignments below write to an
        # independent frame instead of a slice of the parsed chunk.
        chunk = chunk[chunk["itemid"].isin(use_ids)].copy()
        if chunk.empty:
            continue
        # Drop sentinels / absurd magnitudes.
        chunk["valuenum"] = chunk["valuenum"].where(chunk["valuenum"].abs() < 1e6, np.nan)
        chunk["feature"] = chunk["itemid"].map(config.VITALS_MAP)
        chunk["charttime"] = pd.to_datetime(chunk["charttime"]).dt.tz_localize(None)
        chunks.append(chunk[["stay_id", "charttime", "feature", "valuenum"]])
    if not chunks:
        # Keep the merge keys so a downstream merge on them does not raise KeyError.
        return pd.DataFrame(
            {
                "stay_id": pd.Series(dtype="int64"),
                "charttime": pd.Series(dtype="datetime64[ns]"),
            }
        )
    df = pd.concat(chunks, ignore_index=True)
    # pivot_table uses its default aggregation (mean), so repeated readings of
    # one feature at the same timestamp are averaged.
    wide = (
        df.pivot_table(index=["stay_id", "charttime"], columns="feature", values="valuenum")
        .reset_index()
        .sort_values(["stay_id", "charttime"])
    )
    # Mean blood pressure from the cuff readings: (2 * diastolic + systolic) / 3.
    if "mbp" not in wide.columns and {"sbp", "dbp"}.issubset(wide.columns):
        wide["mbp"] = (2 * wide["dbp"] + wide["sbp"]) / 3.0
    # Fahrenheit -> Celsius.
    if "temperature_f" in wide.columns and "temperature" not in wide.columns:
        wide["temperature"] = (wide["temperature_f"] - 32.0) * 5.0 / 9.0
    # Anthropometric conversions: ounces -> kilograms, inches -> centimetres.
    if "weight_oz" in wide.columns and "weight_kg" not in wide.columns:
        wide["weight_kg"] = wide["weight_oz"] * 0.0283495
    if "height_in" in wide.columns and "height_cm" not in wide.columns:
        wide["height_cm"] = wide["height_in"] * 2.54
    return wide


def load_labs(labevents_path: Path, icustays_df: pd.DataFrame) -> pd.DataFrame:
    """Load lab results and pivot them to one row per (stay_id, charttime).

    The CSV is streamed in chunks of 500,000 rows. Only itemids present in
    ``config.LABS_MAP`` are kept. Rows are attached to ICU stays by joining on
    ``subject_id``/``hadm_id``; rows without a matching stay are dropped.

    NOTE(review): the join uses (subject_id, hadm_id) only, with no filter on
    the stay's time window, so an admission with several ICU stays gets each
    lab row attached to every one of them. Confirm this is intended.

    Args:
        labevents_path: CSV with at least ``subject_id``, ``hadm_id``,
            ``charttime``, ``itemid`` and ``valuenum``.
        icustays_df: Frame with ``subject_id``, ``hadm_id`` and ``stay_id``.

    Returns:
        A wide frame sorted by ``stay_id``/``charttime`` with one column per
        lab feature. When no row matches, an empty frame that still carries
        the ``stay_id`` and ``charttime`` key columns.
    """
    use_ids = set(config.LABS_MAP)
    stay_keys = icustays_df[["subject_id", "hadm_id", "stay_id"]]
    chunks = []
    for chunk in pd.read_csv(
        labevents_path,
        chunksize=500_000,
        parse_dates=["charttime"],
        usecols=["subject_id", "hadm_id", "charttime", "itemid", "valuenum"],
    ):
        # Copy the filtered rows so the assignment below writes to an
        # independent frame instead of a slice of the parsed chunk.
        chunk = chunk[chunk["itemid"].isin(use_ids)].copy()
        if chunk.empty:
            continue
        # Drop sentinels / absurd magnitudes.
        chunk["valuenum"] = chunk["valuenum"].where(chunk["valuenum"].abs() < 1e6, np.nan)
        chunk = chunk.merge(stay_keys, on=["subject_id", "hadm_id"], how="left")
        chunk = chunk.dropna(subset=["stay_id"])
        # The left join turns stay_id into float when some rows are unmatched.
        chunk["stay_id"] = chunk["stay_id"].astype(int)
        chunk["feature"] = chunk["itemid"].map(config.LABS_MAP)
        chunk["charttime"] = pd.to_datetime(chunk["charttime"]).dt.tz_localize(None)
        chunks.append(chunk[["stay_id", "charttime", "feature", "valuenum"]])
    if not chunks:
        # Keep the merge keys so a downstream merge on them does not raise KeyError.
        return pd.DataFrame(
            {
                "stay_id": pd.Series(dtype="int64"),
                "charttime": pd.Series(dtype="datetime64[ns]"),
            }
        )
    df = pd.concat(chunks, ignore_index=True)
    # pivot_table uses its default aggregation (mean) for repeated results.
    wide = (
        df.pivot_table(index=["stay_id", "charttime"], columns="feature", values="valuenum")
        .reset_index()
        .sort_values(["stay_id", "charttime"])
    )
    return wide


def load_vaso_flags(prescriptions_path: Path, icustays_df: pd.DataFrame) -> pd.DataFrame:
    """Return the vasopressor prescription intervals, attached to ICU stays.

    A prescription counts as a vasopressor when its ``drug`` text matches
    ``config.VASO_REGEX`` (case-insensitive). Rows are linked to stays through
    ``subject_id``/``hadm_id``; rows lacking a stay, a start or a stop time are
    discarded.

    Args:
        prescriptions_path: CSV with ``subject_id``, ``hadm_id``,
            ``starttime``, ``stoptime`` and ``drug``.
        icustays_df: Frame with ``subject_id``, ``hadm_id`` and ``stay_id``.

    Returns:
        The matching prescription rows plus an integer ``stay_id`` column.
    """
    vaso_pattern = re.compile(config.VASO_REGEX, flags=re.IGNORECASE)
    prescriptions = pd.read_csv(
        prescriptions_path,
        usecols=["subject_id", "hadm_id", "starttime", "stoptime", "drug"],
        parse_dates=["starttime", "stoptime"],
    )
    # A missing drug name is treated as an empty string, i.e. never a match.
    is_vaso = prescriptions["drug"].fillna("").str.contains(vaso_pattern)
    intervals = prescriptions[is_vaso].merge(
        icustays_df[["subject_id", "hadm_id", "stay_id"]],
        on=["subject_id", "hadm_id"],
        how="left",
    )
    intervals = intervals.dropna(subset=["stay_id", "starttime", "stoptime"])
    intervals["stay_id"] = intervals["stay_id"].astype(int)
    return intervals


def load_static(admissions_path: Path, icustays_path: Path) -> pd.DataFrame:
    """Build the per-stay static table (stay window, demographics, mortality).

    ``patients.csv`` is read from the same folder as ``admissions_path``.
    ``mortality`` is 1 when ``hospital_expire_flag`` equals 1, or when either
    ``deathtime`` or ``dod`` is filled in; otherwise 0.

    Args:
        admissions_path: Path to the admissions CSV.
        icustays_path: Path to the ICU stays CSV.

    Returns:
        One row per ICU stay with the columns ``stay_id``, ``subject_id``,
        ``hadm_id``, ``intime``, ``outtime``, ``los``, ``mortality``,
        ``gender`` and ``anchor_age``.
    """
    admissions = pd.read_csv(
        admissions_path,
        usecols=["subject_id", "hadm_id", "deathtime", "hospital_expire_flag", "admittime", "dischtime"],
        parse_dates=["deathtime", "admittime", "dischtime"],
    )
    stays = pd.read_csv(
        icustays_path,
        usecols=["subject_id", "hadm_id", "stay_id", "intime", "outtime", "los"],
        parse_dates=["intime", "outtime"],
    )
    patients_path = Path(admissions_path).parent / "patients.csv"
    patients = pd.read_csv(
        patients_path,
        usecols=["subject_id", "gender", "anchor_age", "dod"],
        parse_dates=["dod"],
    )

    # Stays drive the join, so every ICU stay is kept even without a match.
    static = stays.merge(admissions, on=["subject_id", "hadm_id"], how="left").merge(
        patients, on="subject_id", how="left"
    )

    died = (
        static["hospital_expire_flag"].eq(1)
        | static["deathtime"].notna()
        | static["dod"].notna()
    )
    static["mortality"] = died.astype(int)

    keep = ["stay_id", "subject_id", "hadm_id", "intime", "outtime", "los", "mortality", "gender", "anchor_age"]
    return static[keep]


def prep_merge_main():
    """CLI entry point: merge vitals and labs, flag vasopressor use, save parquet.

    Reads ``--base-dir`` and ``--out-dir`` from ``sys.argv``. Writes
    ``merged.parquet`` (one row per stay/charttime with every vital and lab
    column plus ``vasopressor_ativo``) and ``static.parquet`` into the output
    directory.

    Raises:
        SystemExit: when neither vitals nor labs produced any row.
    """
    parser = argparse.ArgumentParser(description="Prepara merge de vitais/labs/vaso para NWICU.")
    parser.add_argument("--base-dir", type=str, default=config.BASE_DIR, help="Pasta raiz com nw_icu/csv e nw_hosp/csv.")
    parser.add_argument("--out-dir", type=str, default=config.OUTPUT_DIR, help="Diretório de saída.")
    args = parser.parse_args()

    base = Path(args.base_dir)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    chartevents = base / "nw_icu" / "csv" / "chartevents.csv"
    labevents = base / "nw_hosp" / "csv" / "labevents.csv"
    prescriptions = base / "nw_hosp" / "csv" / "prescriptions.csv"
    admissions = base / "nw_hosp" / "csv" / "admissions.csv"
    icustays = base / "nw_icu" / "csv" / "icustays.csv"

    icu_df = pd.read_csv(icustays, usecols=["subject_id", "hadm_id", "stay_id"])

    print("Lendo vitais...")
    vitals = load_vitals(chartevents)
    print(f"Vitais: {len(vitals)} linhas")

    print("Lendo labs...")
    labs = load_labs(labevents, icu_df)
    print(f"Labs: {len(labs)} linhas")

    print("Lendo vasopressores...")
    vaso = load_vaso_flags(prescriptions, icu_df)
    print(f"Intervalos de vaso: {len(vaso)}")

    print("Unindo vitais + labs...")
    keys = ["stay_id", "charttime"]
    # A loader may hand back an empty frame without the key columns; merging
    # such a frame on the keys raises KeyError, so only merge non-empty inputs.
    frames = [frame for frame in (vitals, labs) if not frame.empty]
    if not frames:
        raise SystemExit("Nenhum vital/lab mapeado encontrado; nada para unir.")
    merged = frames[0] if len(frames) == 1 else pd.merge(vitals, labs, on=keys, how="outer")
    merged = merged.sort_values(keys)

    # Fallback conversions in case the vitals frame arrived without them.
    if "weight_kg" not in merged.columns and "weight_oz" in merged.columns:
        merged["weight_kg"] = merged["weight_oz"] * 0.0283495
    if "height_cm" not in merged.columns and "height_in" in merged.columns:
        merged["height_cm"] = merged["height_in"] * 2.54

    merged["vasopressor_ativo"] = 0
    if not vaso.empty:
        # Row labels of each stay, computed once, so every interval is compared
        # against that stay's timestamps only rather than the whole frame.
        rows_by_stay = merged.groupby("stay_id").groups
        for stay_id, group in vaso.groupby("stay_id"):
            stay_rows = rows_by_stay.get(stay_id)
            if stay_rows is None:
                continue
            stay_times = merged.loc[stay_rows, "charttime"]
            for start, stop in zip(group["starttime"], group["stoptime"]):
                # Both interval ends are inclusive.
                active = stay_times.between(start, stop)
                merged.loc[stay_times.index[active], "vasopressor_ativo"] = 1

    print("Salvando merged...")
    merged.to_parquet(out_dir / "merged.parquet", index=False)

    print("Gerando estáticos...")
    static = load_static(admissions, icustays)
    static.to_parquet(out_dir / "static.parquet", index=False)

    print("Feito.")

## 1) Preprocess pipeline

# NWICU preprocess (self-contained)

Este notebook contém TODO o preprocess do NWICU/CircEWS em um único fluxo, sem depender de scripts externos. Ajuste apenas os caminhos e execute em ordem.

## 0) Setup


### 0.1 Libraries


In [None]:
import argparse
import gc
import os
import re
import shutil
import time
from types import SimpleNamespace

import numpy as np
import pandas as pd
import tables
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Mappings do NWICU (extraídos de datasets/nwicu/src/config.py)

### 0.2 Paths


### 0.3 Defaults / mappings


## 2) Preprocess step: prep_merge

## 3) Preprocess step: impute

In [None]:
"""Imputação estilo CircularyFailure: grade 5 min, defaults fisiológicos, forward-fill e flags *_imputed."""

from __future__ import annotations

import argparse
import shutil
from pathlib import Path

import numpy as np
import pandas as pd



def iter_grids(df: pd.DataFrame, grid_minutes: int):
    """Yield ``(stay_id, grid)`` pairs, one regular time grid per stay.

    Each stay's rows are ordered by ``charttime``, indexed by it and resampled
    into bins of ``grid_minutes`` minutes, keeping the last non-null value of
    every column in each bin. Bins without observations come out as NaN.

    Args:
        df: Frame with ``stay_id`` and ``charttime`` plus the value columns.
        grid_minutes: Bin width in minutes.

    Yields:
        ``(stay_id, grid)`` where ``grid`` is indexed by the bin timestamp.
    """
    rule = f"{grid_minutes}min"
    per_stay = df.groupby("stay_id")
    for stay_id, rows in per_stay:
        timeline = rows.sort_values("charttime").set_index("charttime")
        yield stay_id, timeline.resample(rule).last()


def _build_global_defaults(merged: pd.DataFrame) -> pd.Series:
    """Return the per-column fill value used when a stay has no prior reading.

    Starts from the column medians of ``merged`` and then overrides every
    column listed in the hard-coded physiological table below.
    """
    numeric_cols = [c for c in merged.columns if c not in {"stay_id", "charttime"}]
    global_defaults = merged[numeric_cols].median(skipna=True)
    phys = {
        "heart_rate": 80.0,
        "resp_rate": 18.0,
        "sbp": 120.0,
        "sbp_art": 120.0,
        "dbp": 70.0,
        "dbp_art": 70.0,
        "mbp": 85.0,
        "spo2": 98.0,
        "temperature": 36.8,
        "temperature_f": 98.2,
        "glucose": 110.0,
        # labs (approximate reference values)
        "lab_50803": 24.0,   # bicarbonate / total CO2
        "lab_50805": 1.5,
        "lab_50808": 1.1,
        "lab_50809": 90.0,
        "lab_50813": 1.0,
        "lab_50852": 5.5,
        "lab_50855": 14.0,
        "lab_50856": 10.0,
        "lab_50861": 30.0,
        "lab_50862": 4.3,
        "lab_50878": 40.0,
        "lab_50883": 0.3,
        "lab_50884": 0.6,
        "lab_50885": 1.0,
        "lab_50889": 5.0,
        "lab_50908": 0.8,
        "lab_50910": 150.0,
        "lab_50912": 1.0,
        "lab_50915": 0.5,
        "lab_50924": 120.0,
        "lab_50928": 100.0,
        "lab_50931": 90.0,
        "lab_50935": 100.0,
        "lab_50960": 2.0,
        "lab_50963": 100.0,
        "lab_50966": 15.0,
        "lab_50967": 15.0,
        "lab_50968": 10.0,
        "lab_50969": 80.0,
        "lab_50970": 3.5,
        "lab_50971": 4.5,
        "lab_50975": 5.5,
        "lab_50983": 140.0,
        "lab_50990": 10.0,
        "lab_51002": 0.01,
        "lab_51003": 0.01,
        "lab_51006": 15.0,
        "lab_51099": 0.2,
        "lab_51196": 0.5,
        "lab_51214": 300.0,
        "lab_51222": 14.0,
        "lab_51223": 3.0,
        "lab_51265": 200.0,
        "lab_51266": 1.0,
        "lab_51274": 12.0,
        "lab_51275": 35.0,
        "lab_51290": 0.0,
        "lab_51291": 0.0,
        "lab_51292": 0.0,
        "lab_51464": 0.0,
        "lab_51568": 0.2,
        "lab_51569": 0.2,
        "lab_51570": 0.2,
        "lab_51580": 0.8,
        "lab_51623": 300.0,
        "lab_51631": 6.0,
        "lab_51638": 45.0,
        "lab_51640": 14.0,
        "lab_51643": 3.0,
        "lab_51647": 3.0,
        "lab_51966": 0.0,
        "lab_52116": 300.0,
        "lab_52117": 300.0,
        "lab_52142": 10.0,
        "lab_52144": 1.0,
        "lab_52546": 1.0,
        "lab_52551": 0.5,
        "lab_52642": 0.01,
        "lab_53085": 4.3,
    }
    # The override applies only to columns that actually exist in the data.
    for k, v in phys.items():
        if k in global_defaults:
            global_defaults[k] = v
    # Anthropometrics fall back to fixed values when absent or all-missing.
    if "weight_kg" not in global_defaults or pd.isna(global_defaults.get("weight_kg")):
        global_defaults["weight_kg"] = 70.0
    if "height_cm" not in global_defaults or pd.isna(global_defaults.get("height_cm")):
        global_defaults["height_cm"] = 170.0
    if "bmi" not in global_defaults or pd.isna(global_defaults.get("bmi")):
        global_defaults["bmi"] = 24.0
    return global_defaults


def _impute_grid(grid: pd.DataFrame, stay_id, global_defaults: pd.Series, static) -> pd.DataFrame:
    """Forward-fill one stay's grid, apply defaults and attach ``*_imputed`` flags."""
    grid = grid.sort_index()
    orig_mask = grid.notna()
    grid = grid.ffill()
    grid = grid.fillna(global_defaults)
    # Flag = 1 where the grid cell held no observation before filling. All the
    # flag columns are built as one frame and attached in a single concat
    # rather than inserted one by one, which fragments the frame.
    flags = (~orig_mask.drop(columns=["stay_id"], errors="ignore")).astype(int).add_suffix("_imputed")
    grid = pd.concat([grid, flags], axis=1)
    grid = grid.reset_index()
    grid["stay_id"] = stay_id
    # MBP if missing
    if "mbp" not in grid.columns and {"sbp", "dbp"}.issubset(grid.columns):
        grid["mbp"] = (2 * grid["dbp"] + grid["sbp"]) / 3.0
    # Keep weight / height / BMI consistent
    if "weight_kg" not in grid.columns and "weight_oz" in grid.columns:
        grid["weight_kg"] = grid["weight_oz"] * 0.0283495
    if "height_cm" not in grid.columns and "height_in" in grid.columns:
        grid["height_cm"] = grid["height_in"] * 2.54
    if "weight_kg" in grid.columns and "height_cm" in grid.columns:
        grid["bmi"] = grid.get("bmi", np.nan)
        grid["bmi"] = grid["bmi"].fillna(grid["weight_kg"] / ((grid["height_cm"] / 100.0) ** 2))
    # Mortality is constant within a stay
    if static is not None:
        grid = grid.merge(static, on="stay_id", how="left")
    else:
        grid["mortality"] = np.nan
    return grid[_ordered_imputed_columns(grid)]


def _ordered_imputed_columns(grid: pd.DataFrame) -> list:
    """Column order: charttime, stay_id, (col, col_imputed) pairs, vasopressor_ativo, mortality."""
    order = ["charttime", "stay_id"]
    base_cols = [c for c in grid.columns if not c.endswith("_imputed") and c not in {"charttime", "stay_id"}]
    for col in base_cols:
        if col in {"vasopressor_ativo", "mortality"}:
            continue
        order.append(col)
        flag = f"{col}_imputed"
        if flag in grid.columns:
            order.append(flag)
    for tail in ["vasopressor_ativo", "mortality"]:
        if tail in grid.columns and tail not in order:
            order.append(tail)
    return order


def _write_imputed_part(batch: list, part_path: Path) -> int:
    """Concatenate the buffered grids, write them as one parquet part, return the row count."""
    df_batch = pd.concat(batch, ignore_index=True)
    df_batch.to_parquet(part_path, index=False, compression="snappy", engine="pyarrow")
    return len(df_batch)


def impute_main():
    """CLI entry point: resample ``merged.parquet`` onto a fixed grid and impute.

    For every stay the data is put on a ``--grid-minutes`` grid, forward-filled,
    then filled with global defaults; a ``<col>_imputed`` flag marks the cells
    that held no observation. Results are written as parquet parts of 200
    stays each into ``--out``, which is deleted first if it already exists.
    """
    parser = argparse.ArgumentParser(description="Imputa merged NWICU com grade fixa, defaults e flags *_imputed.")
    parser.add_argument("--merged", type=str, required=True, help="Parquet merged (saída do prep_merge).")
    parser.add_argument("--static", type=str, default=config.OUTPUT_DIR / "preprocess" / "static.parquet", help="Parquet static com mortality (opcional).")
    parser.add_argument("--out", type=str, default=config.OUTPUT_DIR / "preprocess" / "imputed_full", help="Diretório de saída (ou base).")
    parser.add_argument("--grid-minutes", type=int, default=config.GRID_MINUTES, help="Tamanho da grade (min).")
    args = parser.parse_args()

    merged = pd.read_parquet(args.merged)
    merged["charttime"] = pd.to_datetime(merged["charttime"]).dt.tz_localize(None)

    # Global defaults (median + physiological overrides)
    global_defaults = _build_global_defaults(merged)

    # A file-like --out (with a suffix) is turned into a directory name.
    out_base = Path(args.out)
    out_dir = out_base if not out_base.suffix else out_base.with_suffix("")
    if out_dir.exists():
        shutil.rmtree(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    static = (
        pd.read_parquet(args.static)[["stay_id", "mortality"]].drop_duplicates("stay_id")
        if args.static and Path(args.static).exists()
        else None
    )

    batch = []
    batch_size = 200  # stays per batch
    total_rows = 0

    for idx, (stay_id, grid) in enumerate(iter_grids(merged, args.grid_minutes), start=1):
        batch.append(_impute_grid(grid, stay_id, global_defaults, static))

        if len(batch) >= batch_size:
            # Parts are named after the running stay counter at flush time.
            total_rows += _write_imputed_part(batch, out_dir / f"part_{idx:05d}.parquet")
            batch.clear()

    if batch:
        total_rows += _write_imputed_part(batch, out_dir / "part_tail.parquet")

    print(f"Imputed salvo em {out_dir} (~{total_rows} linhas, múltiplas parts).")

## 4) Preprocess step: label

In [None]:
"""Gera labels de falência (circEWS-like) a partir do imputed."""

from __future__ import annotations

import argparse
from pathlib import Path

import pandas as pd



def label_main():
    """CLI entry point: derive circulatory-failure labels from the imputed data.

    A row is labelled ``falencia = 1`` when ``mbp <= config.MAP_THRESHOLD``, or
    when a vasopressor is active and ``lab_50813 >= config.LACTATE_THRESHOLD``.
    ``falencia_onset`` marks the first such row of each stay. A missing input
    column is treated as all-NaN, so its condition is never met.

    ``--imputed`` may be a single parquet file or a directory of parts; the
    labels of all parts are written to ``--out`` as one parquet file.

    Raises:
        SystemExit: when ``--imputed`` is a directory without parquet files.
    """
    parser = argparse.ArgumentParser(description="Gera labels de falência (MAP<=65 ou vaso & lactato>=2).")
    parser.add_argument("--imputed", type=str, required=True, help="Parquet ou diretório com parts (saída do impute).")
    parser.add_argument("--out", type=str, default=config.OUTPUT_DIR / "preprocess" / "labels.parquet", help="Arquivo de saída.")
    args = parser.parse_args()

    imp_path = Path(args.imputed)
    files = sorted(imp_path.glob("*.parquet")) if imp_path.is_dir() else [imp_path]
    if not files:
        # Fail with a clear message before touching any previous output,
        # instead of an opaque error from pd.concat on an empty list.
        raise SystemExit(f"Nenhum parquet encontrado em {imp_path}")

    out_path = Path(args.out)
    if out_path.exists():
        out_path.unlink()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    parts = []
    for f in files:
        df = pd.read_parquet(f)
        df["charttime"] = pd.to_datetime(df["charttime"])
        df = df.sort_values(["stay_id", "charttime"])

        # Stand-in for a missing column: all NaN, so every comparison is False.
        missing = pd.Series(index=df.index, dtype=float)
        df["mbp_baixa"] = df.get("mbp", missing) <= config.MAP_THRESHOLD
        df["lactato_alto"] = df.get("lab_50813", missing) >= config.LACTATE_THRESHOLD
        df["vasopressor_ativo"] = df.get("vasopressor_ativo", missing).fillna(0) > 0

        failing = df["mbp_baixa"] | (df["vasopressor_ativo"] & df["lactato_alto"])
        df["falencia"] = failing.astype(int)

        # Onset = first failing row of each stay. Rows are already sorted by
        # time within a stay, so the first occurrence of each stay_id among
        # the failing rows is that stay's onset.
        df["falencia_onset"] = 0
        onset_rows = df.loc[failing, "stay_id"].dropna().drop_duplicates().index
        df.loc[onset_rows, "falencia_onset"] = 1

        parts.append(df[["stay_id", "charttime", "falencia", "falencia_onset"]])

    df_out = pd.concat(parts, ignore_index=True)
    df_out.to_parquet(out_path, index=False)
    print(f"Labels salvos em {out_path} ({len(df_out)} linhas).")

## 5) Preprocess step: feature extraction

In [None]:
"""Extrai features derivadas estilo CircularyFailure (n_meas/min/max/mean/instab/intens/cumul) por stay."""

from __future__ import annotations

import argparse
from pathlib import Path
from typing import List

import pandas as pd
from tqdm import tqdm



def derive_features_for_stay(sub: pd.DataFrame, feature_cols: List[str], window: int = 12) -> pd.DataFrame:
    """Compute the running summary features of one stay.

    Rows are ordered by ``charttime``. For every column ``c`` in
    ``feature_cols`` the output holds, in this order: ``n_meas_c`` (running
    count of non-null values), ``min_c`` / ``max_c`` / ``mean_c`` (expanding
    statistics), ``c_instab`` (rolling standard deviation over ``window``
    rows), ``c_intens`` (absolute step-to-step change), ``c_cumul`` (running
    sum with NaN counted as 0) and the raw ``c``.

    NOTE(review): ``n_meas_c`` counts non-null cells, so on input that has
    already been imputed it is just the row number. Confirm that is wanted.

    Args:
        sub: Rows of a single stay, with ``stay_id``, ``charttime`` and the
            feature columns.
        feature_cols: Names of the columns to summarise.
        window: Number of rows in the rolling window of ``c_instab``.

    Returns:
        A new frame with a fresh integer index, starting with ``stay_id`` and
        ``charttime``.
    """
    ordered = sub.sort_values("charttime")
    columns = {
        "stay_id": ordered["stay_id"].values,
        "charttime": ordered["charttime"].values,
    }
    for name in feature_cols:
        series = ordered[name]
        running = series.expanding()
        columns.update(
            {
                f"n_meas_{name}": series.notna().cumsum().values,
                f"min_{name}": running.min().values,
                f"max_{name}": running.max().values,
                f"mean_{name}": running.mean().values,
                f"{name}_instab": series.rolling(window=window, min_periods=1).std().values,
                f"{name}_intens": series.diff().abs().values,
                f"{name}_cumul": series.fillna(0).cumsum().values,
                name: series.values,
            }
        )
    return pd.DataFrame(columns)


def process_part(part_path: Path, labels_df: pd.DataFrame, out_dir: Path, window: int = 12):
    """Derive features for every stay of one imputed part and save them.

    Writes ``<part>_features.parquet`` and a 200-row CSV sample
    ``<part>_features_amostra.csv`` into ``out_dir``.

    Args:
        part_path: Imputed parquet part to read.
        labels_df: Labels with ``stay_id``, ``charttime``, ``falencia`` and
            ``falencia_onset``.
        out_dir: Destination folder (must already exist).
        window: Rolling-window length passed to ``derive_features_for_stay``.

    Returns:
        Path of the parquet file that was written.
    """
    part = pd.read_parquet(part_path)
    part["charttime"] = pd.to_datetime(part["charttime"])

    # Derived features are built for vitals/labs only; targets, static columns
    # and the *_imputed flags are passed through untouched.
    non_features = {"stay_id", "charttime", "falencia", "falencia_onset", "mortality", "vasopressor_ativo", "rel_minutes"}
    imputed_cols = [c for c in part.columns if c.endswith("_imputed")]
    feature_cols = [c for c in part.columns if not (c in non_features or c.endswith("_imputed"))]
    part_labels = labels_df[labels_df["stay_id"].isin(part["stay_id"].unique())]

    per_stay = []
    for stay_id, rows in tqdm(part.groupby("stay_id"), desc=f"{part_path.name}", leave=False):
        feats = derive_features_for_stay(rows, feature_cols, window=window)
        stay_labels = part_labels.loc[
            part_labels["stay_id"] == stay_id, ["charttime", "falencia", "falencia_onset"]
        ]
        feats = feats.merge(stay_labels, on="charttime", how="left")
        # Mortality and vasopressor_ativo are carried over as they are.
        for passthrough in ("mortality", "vasopressor_ativo"):
            feats = feats.merge(
                rows[["charttime", passthrough]].drop_duplicates("charttime"),
                on="charttime",
                how="left",
            )
        # The *_imputed columns are kept for inspection.
        if imputed_cols:
            feats = feats.merge(rows[["charttime"] + imputed_cols], on="charttime", how="left")
        per_stay.append(feats.fillna(0))

    out = pd.concat(per_stay, ignore_index=True)

    # Final layout: identifiers and targets first, then each variable's derived
    # columns followed by its *_imputed flag, then whatever is left
    # (rel_minutes is dropped if present).
    present = set(out.columns)
    leading = (
        "stay_id",
        "charttime",
        "falencia",
        "falencia_onset",
        "mortality",
        "vasopressor_ativo",
        "vasopressor_ativo_imputed",
    )
    ordered: List[str] = [name for name in leading if name in present]
    for col in feature_cols:
        family = (
            f"n_meas_{col}",
            f"min_{col}",
            f"max_{col}",
            f"mean_{col}",
            f"{col}_instab",
            f"{col}_intens",
            f"{col}_cumul",
            col,
            f"{col}_imputed",
        )
        ordered.extend(name for name in family if name in present)
    for name in out.columns:
        if name == "rel_minutes" or name in ordered:
            continue
        ordered.append(name)
    out = out[ordered]

    out_path = out_dir / part_path.name.replace(".parquet", "_features.parquet")
    out.to_parquet(out_path, index=False)
    # Save a CSV sample for inspection.
    sample_path = out_dir / part_path.name.replace(".parquet", "_features_amostra.csv")
    out.head(200).to_csv(sample_path, index=False)
    return out_path


def fe_extract_main():
    """CLI entry point: run ``process_part`` over every imputed part.

    Reads ``--imputed-dir``, ``--labels``, ``--out-dir`` and ``--window`` from
    ``sys.argv``.

    Raises:
        SystemExit: when the imputed directory contains no parquet file.
    """
    cli = argparse.ArgumentParser(description="Extrai features derivadas por stay dos imputed parts.")
    cli.add_argument("--imputed-dir", type=str, required=True, help="Diretório com imputed parts (parquet).")
    cli.add_argument("--labels", type=str, required=True, help="Parquet de labels (charttime, falencia, falencia_onset).")
    cli.add_argument("--out-dir", type=str, default=config.OUTPUT_DIR / "preprocess/batches/features_raw", help="Diretório de saída.")
    cli.add_argument("--window", type=int, default=12, help="Janela (n passos) para instabilidade (std rolling).")
    opts = cli.parse_args()

    imputed_dir = Path(opts.imputed_dir)
    features_dir = Path(opts.out_dir)
    features_dir.mkdir(parents=True, exist_ok=True)

    labels = pd.read_parquet(opts.labels)
    labels["charttime"] = pd.to_datetime(labels["charttime"])

    part_files = sorted(imputed_dir.glob("*.parquet"))
    if not part_files:
        raise SystemExit(f"Nenhum parquet encontrado em {imputed_dir}")

    for part_file in tqdm(part_files, desc="Parts"):
        process_part(part_file, labels, features_dir, window=opts.window)

    print(f"Features salvas em {features_dir}")

## 6) Build HDF5 from features

In [None]:
"""Constrói H5 a partir de features derivadas (partes) + labels."""

from __future__ import annotations

import argparse
import gc
import time
from pathlib import Path

import numpy as np
import pandas as pd
import tables
from sklearn.model_selection import train_test_split
from tqdm import tqdm



def create_groups(h5: tables.File, n_features: int):
    """Create the extendable arrays of every split inside an open HDF5 file.

    For each of ``train``, ``val`` and ``test`` this adds ``/data/<split>``
    (float32, ``n_features`` columns), ``/labels/<split>`` (float32, one
    column), ``/patient_windows/<split>`` (int32, three columns) and
    ``/patient_windows/<split>_stay_ids`` (int32, one-dimensional). The parent
    groups must already exist.

    Args:
        h5: HDF5 file opened for writing.
        n_features: Number of feature columns in the data arrays.
    """
    for split in ("train", "val", "test"):
        layout = (
            ("/data", split, tables.Float32Atom(), (0, n_features)),
            ("/labels", split, tables.Float32Atom(), (0, 1)),
            ("/patient_windows", split, tables.Int32Atom(), (0, 3)),
            ("/patient_windows", f"{split}_stay_ids", tables.Int32Atom(), (0,)),
        )
        for where, name, atom, shape in layout:
            h5.create_earray(where, name, atom=atom, shape=shape)


def _is_excluded_column(col: str) -> bool:
    """Tell whether ``col`` must be left out of the model features.

    Excluded are the identifiers, the targets, mortality, and every column
    belonging to an anthropometric variable (``bmi*``, ``height*``,
    ``weight*``): the raw value, its ``_imputed`` flag and all derived
    statistics, whatever the unit suffix (``height_cm``, ``weight_kg``, ...).
    """
    if col in {"stay_id", "charttime", "falencia", "falencia_onset", "mortality"}:
        return True
    # Strip the statistic prefix so "min_height_cm" is judged as "height_cm".
    core = col
    for prefix in ("n_meas_", "min_", "max_", "mean_"):
        if core.startswith(prefix):
            core = core[len(prefix):]
            break
    return any(core == base or core.startswith(f"{base}_") for base in ("bmi", "height", "weight"))


def _fit_train_scaler(parts, feature_cols, split_map):
    """Return ``(means, stds)`` of the feature columns over the training rows.

    The statistics are accumulated part by part by combining per-part means
    and squared deviations, so the whole dataset never has to be in memory.
    NaN counts as 0, matching how the values are written to the file. A
    zero or undefined standard deviation is replaced by 1.
    """
    count = 0
    mean = np.zeros(len(feature_cols), dtype=np.float64)
    m2 = np.zeros(len(feature_cols), dtype=np.float64)
    for part in parts:
        df = pd.read_parquet(part, columns=["stay_id"] + feature_cols)
        is_train = df["stay_id"].map(split_map).fillna("train") == "train"
        values = df.loc[is_train, feature_cols].fillna(0).to_numpy(dtype=np.float64)
        n_rows = len(values)
        if n_rows == 0:
            continue
        part_mean = values.mean(axis=0)
        part_m2 = ((values - part_mean) ** 2).sum(axis=0)
        delta = part_mean - mean
        total = count + n_rows
        mean = mean + delta * (n_rows / total)
        m2 = m2 + part_m2 + delta ** 2 * (count * n_rows / total)
        count = total
    # Sample standard deviation (ddof=1), like pandas' DataFrame.std.
    std = np.sqrt(m2 / (count - 1)) if count > 1 else np.ones(len(feature_cols), dtype=np.float64)
    std[(std == 0) | ~np.isfinite(std)] = 1.0
    return pd.Series(mean, index=feature_cols), pd.Series(std, index=feature_cols)


def build_h5_features_main():
    """CLI entry point: pack the derived features and labels into one HDF5 file.

    Stays are assigned to train/val/test either from the optional
    ``--split-path`` TSV or by a seeded random split. Features are z-scored
    with the mean and standard deviation of the training rows, computed once
    over all parts, and stored as float32. For each stay one row
    ``[start, stop, stay_id]`` is appended to ``/patient_windows/<split>``.
    The feature names and the scaler are stored as attributes of the root node.

    Raises:
        SystemExit: when ``--features-dir`` has no ``*_features.parquet``.
    """
    parser = argparse.ArgumentParser(description="Cria dataset HDF5 a partir de features derivadas + labels.")
    parser.add_argument("--features-dir", type=str, required=True, help="Diretório com *_features.parquet.")
    parser.add_argument("--out", type=str, default=config.OUTPUT_DIR / "preprocess" / "h5" / "dataset_features.h5")
    parser.add_argument("--split-path", type=str, default=None, help="TSV opcional stay_id/split.")
    args = parser.parse_args()

    feat_dir = Path(args.features_dir)
    parts = sorted(feat_dir.glob("*_features.parquet"))
    if not parts:
        raise SystemExit(f"Nenhum *_features.parquet em {feat_dir}")

    start = time.time()

    # The feature list is taken from the first part; the remaining parts are
    # expected to carry the same columns.
    sample = pd.read_parquet(parts[0])
    feature_cols = [c for c in sample.columns if not _is_excluded_column(c)]
    n_features = len(feature_cols)

    out_h5 = Path(args.out)
    out_h5.parent.mkdir(parents=True, exist_ok=True)
    if out_h5.exists():
        out_h5.unlink()

    # Split
    all_stays = pd.concat([pd.read_parquet(p, columns=["stay_id"]).drop_duplicates() for p in parts])["stay_id"].unique()
    if args.split_path and Path(args.split_path).exists():
        split_df = pd.read_csv(args.split_path, sep="\t")
        split_map = dict(zip(split_df["stay_id"], split_df["split"]))
    else:
        # 15% test, then 17.65% of the remainder for validation.
        train_ids, test_ids = train_test_split(all_stays, test_size=0.15, random_state=42)
        train_ids, val_ids = train_test_split(train_ids, test_size=0.1765, random_state=42)
        split_map = {sid: "train" for sid in train_ids}
        split_map.update({sid: "val" for sid in val_ids})
        split_map.update({sid: "test" for sid in test_ids})

    # One scaler for the whole dataset, fitted on training rows only, so a raw
    # value maps to the same scaled value whichever part it sits in and no
    # val/test statistics enter the scaling.
    means, stds = _fit_train_scaler(parts, feature_cols, split_map)

    with tables.open_file(out_h5, mode="w") as h5:
        h5.create_group("/", "data", "features")
        h5.create_group("/", "labels", "labels")
        h5.create_group("/", "patient_windows", "windows")
        create_groups(h5, n_features)

        total_stays = 0
        for part in tqdm(parts, desc="Parts"):
            df = pd.read_parquet(part)
            df = df.sort_values(["stay_id", "charttime"])
            df[feature_cols] = ((df[feature_cols].fillna(0) - means) / stds).astype(np.float32)
            df["falencia"] = df["falencia"].fillna(0).astype(np.float32)
            # Stays absent from the split map fall into "train".
            split_of_row = df["stay_id"].map(split_map).fillna("train")
            for split in ["train", "val", "test"]:
                sub = df[split_of_row == split]
                if sub.empty:
                    continue
                data_arr = h5.get_node(f"/data/{split}")
                label_arr = h5.get_node(f"/labels/{split}")
                window_arr = h5.get_node(f"/patient_windows/{split}")
                stay_ids_arr = h5.get_node(f"/patient_windows/{split}_stay_ids")
                for stay_id, sid_df in sub.groupby("stay_id"):
                    start_idx = data_arr.nrows
                    data_arr.append(sid_df[feature_cols].to_numpy())
                    label_arr.append(sid_df[["falencia"]].to_numpy())
                    stop_idx = data_arr.nrows
                    window_arr.append(np.array([[start_idx, stop_idx, int(stay_id)]], dtype=np.int32))
                    stay_ids_arr.append(np.array([int(stay_id)], dtype=np.int32))
                    total_stays += 1
            del df
            gc.collect()
        h5.set_node_attr("/", "feature_names", feature_cols)
        # Stored so the same scaling can be reproduced outside this pipeline.
        h5.set_node_attr("/", "feature_means", means.to_numpy())
        h5.set_node_attr("/", "feature_stds", stds.to_numpy())

    elapsed = time.time() - start
    print(f"H5 salvo em {out_h5} com {n_features} features derivadas, {total_stays} stays. Tempo: {elapsed/60:.1f} min.")

## 7) Run pipeline

In [None]:
# Run the five pipeline steps in order. Each entry point parses sys.argv with
# argparse, so the arguments are injected by overwriting sys.argv before the call.
import sys


def _run_step(program, entry_point, *cli_args):
    """Set ``sys.argv`` to ``[program, *cli_args]`` and call ``entry_point``."""
    sys.argv = [program, *cli_args]
    entry_point()


# 1) prep_merge
_run_step("prep_merge", prep_merge_main, "--base-dir", str(BASE_DIR), "--out-dir", str(PREPROCESS_DIR))

# 2) impute
merged_path = PREPROCESS_DIR / "merged.parquet"
static_path = PREPROCESS_DIR / "static.parquet"
_run_step(
    "impute",
    impute_main,
    "--merged", str(merged_path),
    "--static", str(static_path),
    "--out", str(IMPUTED_DIR),
    "--grid-minutes", str(GRID_MINUTES),
)

# 3) labels
_run_step("label", label_main, "--imputed", str(IMPUTED_DIR), "--out", str(LABELS_PATH))

# 4) feature extraction
_run_step(
    "fe_extract",
    fe_extract_main,
    "--imputed-dir", str(IMPUTED_DIR),
    "--labels", str(LABELS_PATH),
    "--out-dir", str(FEATURES_DIR),
    "--window", "12",
)

# 5) build HDF5
_run_step(
    "build_h5",
    build_h5_features_main,
    "--features-dir", str(FEATURES_DIR),
    "--out", str(H5_PATH),
)

## Build HDF5

In [None]:
# Build HDF5 (NWICU)
from pathlib import Path
import numpy as np
import pandas as pd
import tables

# Input folder scanned for batch_*.parquet files by this cell.
# NOTE(review): this is "features_filtered", not the "features_raw" folder the
# pipeline above writes to; the step producing it is not in this notebook section — confirm.
BATCH_DIR = Path(OUT_DIR) / "preprocess" / "batches" / "features_filtered"
# Output HDF5 file; deleted and rebuilt by this cell if it already exists.
H5_PATH = Path(OUT_DIR) / "preprocess" / "h5" / "dataset_features.h5"
# Cached stay_id -> split assignment; reused when present, written otherwise.
SPLIT_TSV = Path(OUT_DIR) / "split_all.tsv"

# Column names of the batch files.
ID_COL = "stay_id"
TIME_COL = "charttime"
LABEL_COL = "falencia"
# Split parameters: seed for the shuffle and the train / validation fractions
# (the remainder becomes the test split).
SEED = 42
TRAIN_FRAC = 0.8
VAL_FRAC = 0.1

# --- helpers ---
def ensure_groups(h5, groups):
    """Create each named top-level group in ``h5`` unless it already exists.

    Args:
        h5: Open HDF5 file handle.
        groups: Iterable of group names (without the leading slash).
    """
    for name in groups:
        if f"/{name}" in h5:
            continue
        h5.create_group("/", name, f"{name} group")


def create_earray(h5, path, atom, n_cols, expectedrows=10_000_000):
    """Create an empty, row-extendable 2-D array at ``path`` inside ``h5``.

    Args:
        h5: Open file object exposing ``create_earray``.
        path: Node path such as ``"/data/train"``; the text after the last
            ``"/"`` is the array name, the text before it the parent group.
            A root-level path (``"/name"``) or a bare ``"name"`` is placed
            directly under ``"/"``.
        atom: Atom describing the element type.
        n_cols: Number of columns; the array starts with shape ``(0, n_cols)``.
        expectedrows: Row-count hint forwarded to ``h5.create_earray``.

    Returns:
        The object returned by ``h5.create_earray``.
    """
    parent, _, name = path.rpartition("/")
    # rpartition yields an empty parent for "/name" and for "name"; both mean
    # the root group.
    return h5.create_earray(
        parent or "/",
        name,
        atom=atom,
        shape=(0, n_cols),
        expectedrows=expectedrows,
    )


def build_split(stay_ids, seed, train_frac, val_frac):
    """Partition stay ids into reproducible train/val/test sets.

    The ids are sorted, shuffled with a generator seeded by ``seed``, and cut
    at ``int(n * train_frac)`` and a further ``int(n * val_frac)``; the
    remainder becomes the test set.

    Args:
        stay_ids: Iterable of integer stay ids.
        seed: Seed for ``np.random.default_rng``.
        train_frac: Fraction of stays assigned to ``"train"``.
        val_frac: Fraction of stays assigned to ``"val"``.

    Returns:
        Dict with keys ``"train"``, ``"val"``, ``"test"`` (in that order), each
        mapping to a set of Python ints.
    """
    # Sorting first makes the result independent of the input iteration order.
    shuffled = np.array(sorted(stay_ids), dtype=int)
    np.random.default_rng(seed).shuffle(shuffled)

    total = len(shuffled)
    train_end = int(total * train_frac)
    val_end = train_end + int(total * val_frac)

    bounds = (
        ("train", slice(None, train_end)),
        ("val", slice(train_end, val_end)),
        ("test", slice(val_end, None)),
    )
    return {name: set(shuffled[span].tolist()) for name, span in bounds}


def load_split_map(split_tsv, stay_ids, seed, train_frac, val_frac):
    """Return a ``{stay_id: split_name}`` mapping, reusing ``split_tsv`` if usable.

    If ``split_tsv`` exists and has both the ``ID_COL`` column and a ``split``
    column, its content is returned as-is and the remaining arguments are
    ignored. Otherwise a new split is generated with ``build_split``, written
    to ``split_tsv`` (overwriting any unusable file), and returned.

    Args:
        split_tsv: ``pathlib.Path`` of the tab-separated split file.
        stay_ids: Iterable of stay ids used when a new split must be built.
        seed: Seed forwarded to ``build_split``.
        train_frac: Train fraction forwarded to ``build_split``.
        val_frac: Validation fraction forwarded to ``build_split``.

    Returns:
        Dict mapping each stay id to its split name.
    """
    if split_tsv.exists():
        # Tab separator written as an escape sequence so that
        # whitespace-normalising editors cannot alter it.
        df = pd.read_csv(split_tsv, sep="\t")
        if {ID_COL, "split"}.issubset(df.columns):
            return dict(zip(df[ID_COL].astype(int), df["split"]))

    # No usable cached split: build one and persist it for later runs.
    split_sets = build_split(stay_ids, seed, train_frac, val_frac)
    split_tsv.parent.mkdir(parents=True, exist_ok=True)
    with split_tsv.open("w", encoding="utf-8") as f:
        f.write(f"{ID_COL}\tsplit\n")
        for split, ids in split_sets.items():
            for sid in sorted(ids):
                f.write(f"{sid}\t{split}\n")
    return {sid: split for split, ids in split_sets.items() for sid in ids}


# --- discover the input batches ---
# Files whose name ends in "_labels.parquet" also match the glob and are skipped.
paths = sorted(p for p in BATCH_DIR.glob("batch_*.parquet") if not p.name.endswith("_labels.parquet"))
if not paths:
    raise SystemExit(f"No batch_*.parquet found in {BATCH_DIR}")

# Pass 1: read only the id column of every batch to collect all stay ids.
stay_ids = set()
for p in paths:
    df_ids = pd.read_parquet(p, columns=[ID_COL])
    stay_ids.update(df_ids[ID_COL].dropna().astype(int).unique().tolist())

split_map = load_split_map(SPLIT_TSV, stay_ids, SEED, TRAIN_FRAC, VAL_FRAC)

# Validate the assignment before the previous H5 file is deleted below, so a
# stale or malformed split file cannot leave a half-written dataset behind.
split_names = ["train", "val", "test"]
bad_labels = {split_map[sid] for sid in stay_ids if sid in split_map} - set(split_names)
if bad_labels:
    raise SystemExit(f"Unexpected split labels in {SPLIT_TSV}: {sorted(map(str, bad_labels))}")
unassigned_ids = stay_ids.difference(split_map)
if unassigned_ids:
    # These stays fall back to "train" in the write loop; make that visible.
    print(f"WARNING: {len(unassigned_ids)} stay(s) are not listed in {SPLIT_TSV}; writing them to 'train'")

# The feature schema is taken from the first batch: every column except the
# id, time and label columns.
first_df = pd.read_parquet(paths[0])
drop_cols = {ID_COL, TIME_COL, LABEL_COL}
feature_cols = [c for c in first_df.columns if c not in drop_cols]

H5_PATH.parent.mkdir(parents=True, exist_ok=True)
if H5_PATH.exists():
    H5_PATH.unlink()

with tables.open_file(H5_PATH, mode="w") as h5:
    ensure_groups(h5, ["data", "labels", "patient_windows"])
    data_arrays = {}
    label_arrays = {}
    window_arrays = {}
    stay_id_arrays = {}

    # One set of extendable arrays per split:
    #   /data/<split>                      float32, one row per input row
    #   /labels/<split>                    float32, one column
    #   /patient_windows/<split>           int32 rows of (start, stop, stay_id)
    #   /patient_windows/<split>_stay_ids  int32, one entry per window
    for split in split_names:
        data_arrays[split] = create_earray(h5, f"/data/{split}", tables.Float32Atom(), n_cols=len(feature_cols))
        label_arrays[split] = create_earray(h5, f"/labels/{split}", tables.Float32Atom(), n_cols=1)
        window_arrays[split] = create_earray(h5, f"/patient_windows/{split}", tables.Int32Atom(), n_cols=3)
        stay_id_arrays[split] = h5.create_earray("/patient_windows", f"{split}_stay_ids", atom=tables.Int32Atom(), shape=(0,))

    # Pass 2: append every stay's rows to the arrays of its split.
    # NOTE(review): windows are emitted per batch, so a stay whose rows are
    # spread over several batch files gets one window row per file.
    for idx, path in enumerate(paths, start=1):
        df = first_df if idx == 1 else pd.read_parquet(path)

        # Pass 1 ignored rows without a stay id; drop them here too so the
        # integer cast below cannot fail on NaN.
        missing_id = df[ID_COL].isna()
        if missing_id.any():
            df = df[~missing_id]

        if TIME_COL in df.columns:
            df = df.sort_values([ID_COL, TIME_COL])
        else:
            df = df.sort_values([ID_COL])
        df[ID_COL] = df[ID_COL].astype(int)

        # sort=False keeps the order produced by sort_values above.
        for stay_id, df_sid in df.groupby(ID_COL, sort=False):
            split = split_map.get(int(stay_id), "train")
            d_arr = data_arrays[split]
            l_arr = label_arrays[split]
            w_arr = window_arrays[split]
            sid_arr = stay_id_arrays[split]

            start = d_arr.nrows
            feat = df_sid[feature_cols].astype(np.float32).to_numpy()
            # Batches without the label column get an all-zero label vector.
            lbl = df_sid[[LABEL_COL]].astype(np.float32).to_numpy() if LABEL_COL in df_sid.columns else np.zeros((len(df_sid), 1), dtype=np.float32)
            d_arr.append(feat)
            l_arr.append(lbl)
            stop = d_arr.nrows

            # [start, stop) is this stay's row range inside /data/<split>.
            # NOTE(review): offsets and stay ids are stored as int32; confirm
            # both stay below 2**31.
            w_arr.append(np.array([[start, stop, int(stay_id)]], dtype=np.int32))
            sid_arr.append(np.array([int(stay_id)], dtype=np.int32))

        if idx % 10 == 0 or idx == 1 or idx == len(paths):
            print(f"[{idx}/{len(paths)}] {path.name}")

    h5.set_node_attr("/", "feature_names", feature_cols)
    h5.set_node_attr("/", "label_column", LABEL_COL)

print("H5 saved to", H5_PATH, "with", len(feature_cols), "features")

In [None]:
# Build the per-stay falencia summary (falencia_normal + 45/60min + mortality)
# by running the external summary script with the current interpreter.
import subprocess
import sys
from pathlib import Path

PREPROCESS_DIR = OUT_DIR / "preprocess"
INPUT_DIR = PREPROCESS_DIR / "batches" / "features_filtered"
OUT_CSV = PREPROCESS_DIR / "falencia_stay_summary.csv"
MORTALITY_PATH = PREPROCESS_DIR / "mortality_by_stay.csv"

# Relative path: resolved against the current working directory at run time.
summary_script = Path("src/clustering/minirocket/pipelines/build_falencia_stay_summary.py")

# (flag, value) pairs in the order they are passed on the command line.
cli_options = [
    ("--input_dir", str(INPUT_DIR)),
    ("--mortality_path", str(MORTALITY_PATH)),
    ("--out_csv", str(OUT_CSV)),
    ("--time_col", "charttime"),
    ("--id_col", "stay_id"),
    ("--mbp_col", "mbp"),
    ("--vaso_col", "vasopressor_ativo"),
    ("--lactate_col", "lab_50813"),
    ("--falencia_col", "falencia"),
    ("--step_min_45", "5"),
    ("--window_min_45", "45"),
    ("--step_min_60", "60"),
    ("--window_min_60", "60"),
]

cmd = [sys.executable, str(summary_script)]
for flag, value in cli_options:
    cmd.extend((flag, value))

# Echo the command, then run it; check=True raises if the script exits non-zero.
print(" ".join(cmd))
subprocess.run(cmd, check=True)
