In [None]:
import pandas as pd
import numpy as np
import os
import re

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score


DATA_DIR = os.path.join(Path('../data'), 'raw/')


In [2]:
file = os.path.join(DATA_DIR, "ed2022-stata.dta")

df = pd.read_stata(file, convert_categoricals=False)
print(df.shape)
display(df.head())

(16025, 913)


Unnamed: 0,VMONTH,VDAYR,ARRTIME,WAITTIME,LOV,AGE,AGER,AGEDAYS,RESIDNCE,SEX,...,RX30V3C2,RX30V3C3,RX30V3C4,SETTYPE,YEAR,CSTRATM,CPSUM,PATWT,EDWT,BOARDED
0,9,2,604,10,228,23,2,-7,1,1,...,,,,3,2022,20122201,100001,3665.56958,8.36413,-7
1,9,2,1053,40,319,15,2,-7,1,2,...,,,,3,2022,20122201,100001,3665.56958,,-7
2,9,2,1419,70,551,19,2,-7,1,2,...,,,,3,2022,20122201,100001,3665.56958,,-7
3,9,2,1825,-7,-9,0,1,298,1,2,...,,,,3,2022,20122201,100001,3665.56958,,-7
4,9,2,2243,14,168,18,2,-7,1,1,...,,,,3,2022,20122201,100001,3665.56958,,-7


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16025 entries, 0 to 16024
Columns: 913 entries, VMONTH to BOARDED
dtypes: float32(2), float64(90), int16(118), int32(49), int8(130), object(524)
memory usage: 83.8+ MB


In [8]:
list(df.columns)

['VMONTH',
 'VDAYR',
 'ARRTIME',
 'WAITTIME',
 'LOV',
 'AGE',
 'AGER',
 'AGEDAYS',
 'RESIDNCE',
 'SEX',
 'ETHUN',
 'ETHIM',
 'RACEUN',
 'RACER',
 'RACERETH',
 'ARREMS',
 'AMBTRANSFER',
 'NOPAY',
 'PAYPRIV',
 'PAYMCARE',
 'PAYMCAID',
 'PAYWKCMP',
 'PAYSELF',
 'PAYNOCHG',
 'PAYOTH',
 'PAYDK',
 'PAYTYPER',
 'TEMPF',
 'PULSE',
 'RESPR',
 'BPSYS',
 'BPDIAS',
 'POPCT',
 'IMMEDR',
 'PAINSCALE',
 'SEEN72',
 'RFV1',
 'RFV2',
 'RFV3',
 'RFV4',
 'RFV5',
 'RFV13D',
 'RFV23D',
 'RFV33D',
 'RFV43D',
 'RFV53D',
 'EPISODE',
 'INJURY',
 'INJPOISAD',
 'INJURY72',
 'INTENT15',
 'INJURY_ENC',
 'CAUSE1',
 'CAUSE2',
 'CAUSE3',
 'DIAG1',
 'DIAG2',
 'DIAG3',
 'DIAG4',
 'DIAG5',
 'PRDIAG1',
 'PRDIAG2',
 'PRDIAG3',
 'PRDIAG4',
 'PRDIAG5',
 'ETOHAB',
 'ALZHD',
 'ASTHMA',
 'CANCER',
 'CEBVD',
 'CKD',
 'COPD',
 'CHF',
 'CAD',
 'DEPRN',
 'DIABTYP1',
 'DIABTYP2',
 'DIABTYP0',
 'ESRD',
 'HPE',
 'EDHIV',
 'HYPLIPID',
 'HTN',
 'OBESITY',
 'OSA',
 'OSTPRSIS',
 'SUBSTAB',
 'NOCHRON',
 'TOTCHRON',
 'DIAGSCRN',
 'ABG',
 'B

In [38]:
def pipe_fit(X, num_cols, cat_cols, target, multi_class="ovr"):
    # alvo
    y = df[target].copy()

    # remove NaN no y e alinha X
    mask = y.notna()
    y = y[mask]
    X = X.loc[mask]

    # pré-processamento
    numeric_tf = Pipeline([("imputer", SimpleImputer(strategy="median"))])
    categorical_tf = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
    ])
    pre = ColumnTransformer([
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ])

    clf = RandomForestClassifier(
        n_estimators=400, random_state=42, n_jobs=-1,
        class_weight="balanced_subsample"
    )
    pipe = Pipeline([("pre", pre), ("clf", clf)])

    # split (se for multiclasse, o stratify aceita y categórico)
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    pipe.fit(X_tr, y_tr)
    y_pred = pipe.predict(X_te)

    print(classification_report(y_te, y_pred, digits=3))

    # === AUC binário vs multiclasse ===
    # n_classes detectado a partir do estimador treinado
    classes_ = pipe.named_steps["clf"].classes_
    n_classes = len(classes_)

    if n_classes == 2:
        # binário: usar coluna da classe positiva
        y_proba = pipe.predict_proba(X_te)[:, 1]
        auc = roc_auc_score(y_te, y_proba)
        print("AUC (binário):", auc)
    else:
        # multiclasse: passar a matriz inteira de probabilidades
        # shape esperado: (n_amostras, n_classes); cada linha soma 1.0
        y_proba = pipe.predict_proba(X_te)  # NÃO fatie [:, 1] aqui!
        # se vier lista (raro), empilha
        if isinstance(y_proba, list):
            y_proba = np.column_stack(y_proba)
        auc = roc_auc_score(
            y_te, y_proba, multi_class=multi_class, average="weighted"
        )
        print(f"AUC (multiclasse={multi_class}, weighted):", auc)

    # === Importância das features (robusto) ===
    rf  = pipe.named_steps["clf"]
    pre = pipe.named_steps["pre"]

    feat_names = []

    # nomes numéricos (se o bloco numérico existir)
    if "num" in pre.named_transformers_ and pre.named_transformers_["num"] != "drop":
        feat_names += list(num_cols)

    # nomes categóricos (se o bloco categórico existir e estiver fitted)
    if "cat" in pre.named_transformers_ and pre.named_transformers_["cat"] != "drop":
        cat_pipe = pre.named_transformers_["cat"]
        if hasattr(cat_pipe, "named_steps") and "onehot" in cat_pipe.named_steps:
            ohe = cat_pipe.named_steps["onehot"]
            if hasattr(ohe, "categories_"):
                feat_names += ohe.get_feature_names_out(cat_cols).tolist()

    # Fallback: se ainda ficar vazio, tente usar o próprio ColumnTransformer
    if not feat_names:
        try:
            feat_names = pre.get_feature_names_out().tolist()
        except Exception:
            # último recurso: crie índices
            feat_names = [f"f{i}" for i in range(rf.n_features_in_)]

    importances = pd.DataFrame(
        {"feature": feat_names, "importance": rf.feature_importances_}
    ).sort_values("importance", ascending=False)

    print(importances.head(20))

    return pipe, importances

In [39]:
from sklearn.ensemble import RandomForestClassifier

# features = [
#     "AGE","SEX","ETHIM","RACER","ARREMS",         # demografia / chegada
#     "TEMPDF","PULSED","RESPR","BPSYS","BPDIAS",   # sinais vitais (versões imputadas/derivadas)
#     "RFV1","DIAG1"                                # categóricas de alta cardinalidade
# ]
features = ["AGE","SEX","ETHIM","RACER","ARREMS","TEMPDF","PULSED","RESPR","BPSYS","BPDIAS","RFV1"]
target = "ADMITHOS"

X = df[features].copy()


def top_k_or_other(s, k=200):
    top = s.value_counts().nlargest(k).index
    return s.where(s.isin(top), other="__OTHER__")

for c in ["RFV1"]:
    if c in cat_cols:
        X[c] = top_k_or_other(X[c].astype(str), k=200)

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

print("num_cols:", num_cols)
print("cat_cols:", cat_cols)
assert len(num_cols) + len(cat_cols) > 0, "Sem colunas! Reveja X/features."

pipe_fit(X, num_cols=num_cols, cat_cols=cat_cols, target=target)


num_cols: ['AGE', 'SEX', 'ETHIM', 'RACER', 'ARREMS', 'TEMPDF', 'PULSED', 'RESPR', 'BPSYS', 'BPDIAS', 'RFV1']
cat_cols: []
              precision    recall  f1-score   support

           0      0.879     0.986     0.929      2781
           1      0.545     0.113     0.188       424

    accuracy                          0.870      3205
   macro avg      0.712     0.549     0.558      3205
weighted avg      0.835     0.870     0.831      3205

AUC (binário): 0.7729416424117835
   feature  importance
0      AGE    0.223737
10    RFV1    0.156275
8    BPSYS    0.126941
9   BPDIAS    0.120624
6   PULSED    0.102280
7    RESPR    0.080565
4   ARREMS    0.076676
5   TEMPDF    0.050198
3    RACER    0.025732
1      SEX    0.021568
2    ETHIM    0.015404


(Pipeline(steps=[('pre',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median'))]),
                                                   ['AGE', 'SEX', 'ETHIM',
                                                    'RACER', 'ARREMS', 'TEMPDF',
                                                    'PULSED', 'RESPR', 'BPSYS',
                                                    'BPDIAS', 'RFV1']),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='most_frequent')),
                                                                   ('onehot',
                                                                    OneHotEncoder(handle_unknown='ig

In [40]:
target = "HDSTAT"
pipe_fit(X, num_cols=num_cols, cat_cols=cat_cols, target=target, multi_class='ovr')


              precision    recall  f1-score   support

          -9      0.000     0.000     0.000         5
          -8      0.000     0.000     0.000         5
          -7      0.873     0.994     0.930      2781
           1      0.525     0.052     0.095       402
           2      0.000     0.000     0.000        12

    accuracy                          0.869      3205
   macro avg      0.280     0.209     0.205      3205
weighted avg      0.824     0.869     0.819      3205

AUC (multiclasse=ovr, weighted): 0.7886810157445944
   feature  importance
0      AGE    0.189363
10    RFV1    0.167290
9   BPDIAS    0.125858
8    BPSYS    0.124029
6   PULSED    0.113553
7    RESPR    0.095246
4   ARREMS    0.058027
5   TEMPDF    0.052742
3    RACER    0.028020
1      SEX    0.026835
2    ETHIM    0.019037


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


(Pipeline(steps=[('pre',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median'))]),
                                                   ['AGE', 'SEX', 'ETHIM',
                                                    'RACER', 'ARREMS', 'TEMPDF',
                                                    'PULSED', 'RESPR', 'BPSYS',
                                                    'BPDIAS', 'RFV1']),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='most_frequent')),
                                                                   ('onehot',
                                                                    OneHotEncoder(handle_unknown='ig

In [24]:
print(df["HDSTAT"].value_counts(dropna=False))
print(df["HDSTAT"].unique())


HDSTAT
-7    13904
 1     2011
 2       59
-8       28
-9       23
Name: count, dtype: int64
[-7  2  1 -8 -9]


In [15]:
pyxis["gsn"] = pyxis["gsn"].fillna("Unknown")

In [16]:
display(triage.isnull().sum()/len(triage))

subject_id        0.000000
stay_id           0.000000
temperature       0.055083
heartrate         0.040204
resprate          0.047880
o2sat             0.048451
sbp               0.043029
dbp               0.044911
pain              0.030424
acuity            0.016437
chiefcomplaint    0.000054
dtype: float64

In [19]:
triage.dtypes

subject_id               int64
stay_id                  int64
temperature            float64
heartrate              float64
resprate               float64
o2sat                  float64
sbp                    float64
dbp                    float64
pain                    object
acuity                 float64
chiefcomplaint          object
temperature_missing      int64
heartrate_missing        int64
resprate_missing         int64
o2sat_missing            int64
sbp_missing              int64
dbp_missing              int64
pain_missing             int64
dtype: object

In [21]:
# Dados numéricos

imputer = SimpleImputer(strategy="median")
for col in ["temperature","heartrate","resprate","o2sat","sbp","dbp"]:
    triage[col+"_missing"] = triage[col].isna().astype(int)
    triage[col] = imputer.fit_transform(triage[[col]])

In [22]:
imputer = SimpleImputer(strategy="most_frequent")
triage["acuity_missing"] = triage["acuity"].isna().astype(int)
triage["acuity"] = imputer.fit_transform(triage[["acuity"]])

In [23]:
triage["chiefcomplaint"] = triage["chiefcomplaint"].fillna("unknown")

In [26]:
list(triage["pain"].unique())

['7',
 '0',
 '10',
 '13',
 '5',
 '8',
 '1',
 '3',
 '6',
 '2',
 '4',
 'critical',
 nan,
 'unable',
 '9',
 'uta',
 'Non-verbal',
 'ett',
 'UTA',
 '2-3',
 'moderate',
 'u/a',
 'ua',
 'some',
 'Critical',
 'c',
 'unable ',
 'ok',
 '>10',
 '5-6',
 'UA',
 '0-3',
 '9.5',
 'denies ',
 'yes',
 'denies',
 '8.5',
 '3.5',
 '8.4',
 'o',
 '7.5',
 '.5',
 '7-8',
 'leg pain',
 '"pretty high"',
 'pre-hosp',
 'uable',
 '3-4',
 '4-5',
 'crit',
 'Unable',
 'alot',
 'ETT',
 '8 9 or 10',
 '6 ',
 '1-2',
 '?',
 ' 0',
 'sore',
 '"15"',
 '8-9',
 '15',
 '15-20',
 'u',
 '20',
 '1.5',
 '0 ',
 '5/10',
 'pre hosp',
 'UTA ',
 '11',
 '00',
 '10 ',
 'critical ',
 '"alot"',
 'sleeping',
 'Pt states he is unable to give me a number ',
 'sedated',
 '0.5',
 '6-7',
 'asleep',
 'bad',
 '___',
 '4-10',
 'Bad',
 '"a little"',
 'NAD',
 'not bad',
 '12',
 'pressure',
 '"feels like shit"',
 '.',
 '069',
 'crying',
 'mild',
 'uto',
 'too much',
 'total body pain',
 'unqble',
 '0-8',
 '0-',
 '6.5',
 'burning',
 'slight',
 'feels ok'

In [None]:
import re
def clean_pain(value):
    if pd.isna(value):
        return np.nan, "missing"
    
    val = str(value).strip().lower()
    
    # Casos de ausência
    if any(x in val for x in [
                                      "unable", "intub", "sedat", "refus", "unresp",
                                      "non-verbal", "uta","ett", "u/a", "ua",
                                      "uable", "sleeping", "Pt states he is unable to give me a number",
                                      "sedated", "asleep", "uto", "unqble",
                                      "unable to score - abd pain", "uanble", "non verbal",
                                      "nonverbal", "numb", "unable to rate",
                                      "unabl3", "unabke"
                                      ]):
        return np.nan, "unable"
    
    if any(x in val for x in [
                                "no pain", "denies", "none", "ok", "nad", "0pain", "o",
                                "feels ok", "no", "dont"
                              ]):
        return 0, "valid"
    
    # Extrair números
    nums = re.findall(r"\d+\.?\d*", val)
    if nums:
        nums = [float(x) for x in nums]
        score = np.mean(nums)  # média se for range tipo 7-8
        if score > 10:
            score = 10
        return score, "valid"
    
    # Casos de descrições qualitativas
    if any(x in val for x in [
                                "mild", "slight", "some", "\"a little\"", "not bad",
                                "tightness", "under control", "litte", "less sharp"
                        ]):
        return 2, "approx"
    if any(x in val for x in [
                                "moderate", "yes", "5/10", "pressure", "\"hurts\"",
                                "\"annoying\"", "not much", "not much pain",
                                "not strong at all", "\"pain\"", "manageable",
                                "annoying", "uncomfortable", "pain", "it hurts"
                            ]):
        return 5, "approx"
    if any(x in val for x in [
                                "\"pretty high \"", "pre-hosp", "pre hosp", "sore", "bad",
                                "feels like shit", "crying", "total body pain", "burning",
                                "mucho", "+"
                              ]):
        return 7, "approx"
    if "7/8" in val:
        return 7.5, "approx"
    if any(x in val for x in [
                                "severe", "terrible", "awful", "hurts a lot", "critical",
                                "crit", "alot", "8 9 or 10", "\"alot\"", "too much",
                                "very painful", "a lot", "throbbing", "very bad", "++",
                                "moaning", "very"
                              ]):
        return 9, "approx"
    if any(x in val for x in [
                                "\"15\"", "10+", "10/10"
                              ]):
        return 10, "approx"
    
    return np.nan, "unknown"


SyntaxError: invalid syntax (1776592811.py, line 54)

In [None]:
display(vitalsign.isnull().sum())