<h1>Outcomes Using Trust Features</h1>
Does performance improve for tasks (code status, leaving AMA, and in-hosp mortality) when adding mistrust features on top of demographics?
Yes

In [1]:
import psycopg2
import pandas as pd
from time import gmtime, strftime
import tqdm
from datetime import timedelta
import os
from pandas_gbq import read_gbq
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from collections import defaultdict
import pylab as pl
from scipy.stats import mannwhitneyu

# Make pandas dataframes prettier
from IPython.display import display, HTML, Image
%matplotlib inline

plt.style.use('ggplot')
plt.rcParams.update({'font.size': 20})

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery
from google.cloud.bigquery import Client

In [2]:
from google.colab import drive
import shutil
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define save paths
drive_path = "/content/drive/MyDrive/mimic_data"
os.makedirs(drive_path, exist_ok=True)

Mounted at /content/drive


In [3]:
# authenticate
auth.authenticate_user()

In [4]:
# Set up environment variables
project_id = 'CHANGE-ME'
if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project.')
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

bq_client = bigquery.Client(project=project_id)

# Modified run_query function using BigQuery client
def run_query(query: str):
    query_job = bq_client.query(query)
    return query_job.to_dataframe(create_bqstorage_client=True)

# set the dataset
# if you want to use the demo, change this to mimic_demo
hosp_dataset_4 = 'mimiciv_3_1_hosp'
icu_dataset_4 = 'mimiciv_3_1_icu'
derived_dataset_4 = 'mimiciv_3_1_derived'
derived_dataset_3 = 'mimiciii_derived'
clinical_dataset_3 = 'mimiciii_clinical'
note_dataset_3 = 'mimiciii_notes'

#indicate whether to run a limited sample size for testing purposes
limited_sample = False

In [5]:
# Print current UTC time
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# LABEL: code status
code_query = f"""
SELECT DISTINCT hadm_id, label, value
FROM physionet-data.{clinical_dataset_3}.chartevents c
JOIN physionet-data.{clinical_dataset_3}.d_items i ON i.itemid = c.itemid
WHERE label = 'Code Status'
"""
code_status = run_query(code_query)

# Binary labels
code_labels = {}
for _, row in tqdm.tqdm(code_status.iterrows(), total=code_status.shape[0]):
    if isinstance(row.value, str):
        if any(term in row.value for term in ['DNR', 'DNI', 'Comfort', 'Do Not']):
            label = 'DNR/CMO'
        elif row.value.lower() == 'full code':
            label = 'Full Code'
    code_labels[int(row.hadm_id)] = label

2025-05-07 16:48:11


100%|██████████| 46121/46121 [00:03<00:00, 11822.79it/s]


In [6]:
print(set(code_status['value'].values))

{'CPR Not Indicate', 'DNR (do not resuscitate)', 'Other/Remarks', 'Do Not Intubate', None, 'Comfort Measures', 'Comfort measures only', 'DNI (do not intubate)', 'Do Not Resuscita', 'DNR / DNI', 'Full Code', 'Full code'}


In [7]:
import pickle

with open(os.path.join(drive_path, "code_labels.pkl"), "wb") as f:
    pickle.dump(code_labels, f)

In [8]:
# hadm -> race
import tqdm

def normalize_race(race):
    if 'HISPANIC' in race:
        return 'Hispanic'
    if 'SOUTH AMERICAN' in race:
        return 'Hispanic'
    if 'AMERICAN INDIAN' in race:
        return 'Native American'
    if 'ASIAN' in race:
        return 'Asian'
    if 'BLACK' in race:
        return 'Black'
    if 'WHITE' in race:
        return 'White'
    return 'Other'

def normalize_insurance(insurance):
    if insurance in ['Medicare', 'Medicaid', 'Government']:
        return 'Public'
    else:
        return insurance

In [9]:
# LABEL: left hospital against medical advice

# Query for discharge info
discharge_query = f"""
SELECT DISTINCT hadm_id, discharge_location
FROM physionet-data.{clinical_dataset_3}.admissions
"""
discharge = run_query(discharge_query)

# Binary labels
ama_labels = {}
for _, row in tqdm.tqdm(discharge.iterrows(), total=discharge.shape[0]):
    label = 'AMA' if row.discharge_location == 'LEFT AGAINST MEDICAL ADVI' else 'compliant'
    ama_labels[int(row.hadm_id)] = label

#discharge.head()

100%|██████████| 58976/58976 [00:03<00:00, 17187.36it/s]


In [10]:
with open(os.path.join(drive_path, "ama_labels.pkl"), "wb") as f:
    pickle.dump(ama_labels, f)

In [11]:
# LABEL: in-hospital mortality

# Query for discharge info
mortality_query = f"""
SELECT DISTINCT hadm_id, hospital_expire_flag
FROM physionet-data.{clinical_dataset_3}.admissions
"""
mortality = run_query(mortality_query)

# Binary labels
mortality_labels = {}
for _, row in tqdm.tqdm(mortality.iterrows(), total=mortality.shape[0]):
    label = 'deceased' if row.hospital_expire_flag else 'survived'
    mortality_labels[int(row.hadm_id)] = label

#mortality.head()

100%|██████████| 58976/58976 [00:03<00:00, 17900.86it/s]


In [12]:
with open(os.path.join(drive_path, "mortality_labels.pkl"), "wb") as f:
    pickle.dump(mortality_labels, f)

# Helper functions

In [13]:
import random

#Data splitting function
def data_split(ids, ratio=0.6):
    random.shuffle(ids)
    train = ids[:int(len(ids)*ratio) ]
    test  = ids[ int(len(ids)*ratio):]
    return train, test

In [14]:
# Write informative features code
def analyze(task, vect, clf, count_top=False):
    ind2feat = {i: f for f, i in vect.vocabulary_.items()}

    # Get coefficients from classifier
    coef_ = clf.coef_

    print(task)

    # Get indices of features sorted by importance
    informative_feats = np.argsort(coef_)

    # If binary classification, flatten arrays
    if informative_feats.ndim == 2:
        informative_feats = informative_feats[0]
        coef_ = coef_[0]

    # Print most informative features in descending order
    for feat in reversed(informative_feats):
        val = coef_[feat]
        word = ind2feat.get(feat, f"[unknown_{feat}]")
        print(f'\t{str(word):<25}: {val:7.4f}')

In [15]:
%matplotlib inline

import numpy as np
import sklearn
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
import matplotlib.pyplot as plt


def compute_stats(task, pred, P, ref, labels_map, verbose):
    if len(labels_map) == 2:
        scores = P[:, 1] - P[:, 0]
        return compute_stats_binary(task, pred, scores, ref, labels_map, verbose)
    else:
        return compute_stats_multiclass(task, pred, P, ref, labels_map, verbose)


def compute_stats_binary(task, pred, P, ref, labels, verbose):
    assert all((P > 0).astype(int) == pred)

    conf = np.zeros((2, 2), dtype='int32')
    for p, r in zip(pred, ref):
        conf[p][r] += 1

    if verbose:
        print(conf)
        print()

    tp = conf[1, 1]
    tn = conf[0, 0]
    fp = conf[1, 0]
    fn = conf[0, 1]

    precision   = tp / (tp + fp + 1e-9)
    recall      = tp / (tp + fn + 1e-9)
    sensitivity = recall
    specificity = tn / (tn + fp + 1e-9)
    f1 = (2 * precision * recall) / (precision + recall + 1e-9)

    tpr = true_positive_rate(pred, ref)
    fpr = false_positive_rate(pred, ref)

    accuracy = (tp + tn) / (tp + tn + fp + fn + 1e-9)

    auc = None
    if len(set(ref)) == 2:
        auc = sklearn.metrics.roc_auc_score(ref, P)

    if verbose:
        print(f'\tspecificity: {specificity:.3f}')
        print(f'\tsensitivity: {sensitivity:.3f}')
        print(f'\tauc:         {auc:.3f}' if auc is not None else '')
        print(f'\taccuracy:    {accuracy:.3f}')
        print(f'\tprecision:   {precision:.3f}')
        print(f'\trecall:      {recall:.3f}')
        print(f'\tf1:          {f1:.3f}')
        print(f'\tTPR:         {tpr:.3f}')
        print(f'\tFPR:         {fpr:.3f}')
        print('TODO: VIZ THE ROC CURVE')

    return {
        'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1,
        'tpr': tpr, 'fpr': fpr, 'auc': auc,
        'sensitivity': sensitivity, 'specificity': specificity
    }


def compute_stats_multiclass(task, pred, P, ref, labels_map, verbose):
    assert all(P.argmax(axis=1) == pred)

    n = len(labels_map)
    conf = np.zeros((n, n), dtype='int32')
    for p, r in zip(pred, ref):
        conf[p][r] += 1

    labels = [label for label, _ in sorted(labels_map.items(), key=lambda t: t[1])]

    print(conf)

    precisions, recalls, f1s = [], [], []
    print('\t prec  rec    f1   label')
    for i in range(n):
        label = labels[i]
        tp = conf[i, i]
        pred_pos = conf[i, :].sum()
        ref_pos = conf[:, i].sum()

        precision = tp / (pred_pos + 1e-9)
        recall = tp / (ref_pos + 1e-9)
        f1 = (2 * precision * recall) / (precision + recall + 1e-9)

        print(f'\t{precision:.3f} {recall:.3f} {f1:.3f} {label}')
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    avg_f1 = np.mean(f1s)

    print('\t--------------------------')
    print(f'\t{avg_precision:.3f} {avg_recall:.3f} {avg_f1:.3f} avg')
    print('TODO: VIZ THE F1S')

    return {'precisions': precisions, 'recalls': recalls, 'f1s': f1s}


def true_positive_rate(pred, ref):
    tp = sum(1 for p, r in zip(pred, ref) if p == 1 and r == 1)
    fn = sum(1 for p, r in zip(pred, ref) if p == 0 and r == 1)
    return tp / (tp + fn + 1e-9)


def false_positive_rate(pred, ref):
    fp = sum(1 for p, r in zip(pred, ref) if p == 1 and r == 0)
    tn = sum(1 for p, r in zip(pred, ref) if p == 0 and r == 0)
    return fp / (fp + tn + 1e-9)


def classification_results(svm, labels_map, X, Y, task, verbose=True):
    P_ = svm.decision_function(X)

    if len(labels_map) == 2:
        m = X.shape[0]
        P = np.zeros((m, 2))
        P[:, 0] = -P_
        P[:, 1] = P_
    else:
        P = P_

    pred = P.argmax(axis=1)

    if verbose:
        print(task)
    res = compute_stats(task, pred, P, Y, labels_map, verbose)
    if verbose:
        print('\n')
    return res


def regression_results(lr, test_X, test_Y, description, verbose=True):
    res = {}
    pred_Y = lr.predict(test_X)
    res['rms'] = sqrt(mean_squared_error(test_Y, pred_Y))
    res['mas'] = mean_absolute_error(test_Y, pred_Y)

    if verbose:
        print(description)
        print(f'\tRMS: {res["rms"]}')
        print(f'\tMAS: {res["mas"]}')

        plt.figure()
        perfect = np.linspace(min(test_Y), max(test_Y), 100)
        plt.scatter(perfect, perfect, color='red', s=0.01)
        plt.scatter(test_Y, pred_Y, color='blue', s=1)
        plt.xlabel('actual')
        plt.ylabel('prediction')
        plt.show()

    return res

In [19]:
import numpy as np
import pandas as pd
import pickle
from google.cloud import bigquery

# Initialize BigQuery client
project_id = 'my-project-1532744781277'
bq_client = bigquery.Client(project=project_id)

# BigQuery run_query helper
def run_query(query: str):
    query_job = bq_client.query(query)
    return query_job.to_dataframe(create_bqstorage_client=True)

# Normalization function
def normalize(scores: dict) -> dict:
    vals = np.array(list(scores.values()))
    mu, std = vals.mean(), vals.std()
    return {k: (v - mu) / std for k, v in scores.items()}

# Query data from BigQuery
insurance_query = """
SELECT DISTINCT hadm_id, insurance
FROM physionet-data.mimiciii_clinical.admissions
"""
insurance = run_query(insurance_query)

oasis_query = """
SELECT DISTINCT hadm_id, oasis.oasis
FROM physionet-data.mimiciii_derived.oasis
"""
oasis = run_query(oasis_query)

'''
patients_query = """
SELECT DISTINCT hadm_id, gender, admission_age, ethnicity, los_hospital
FROM physionet-data.mimiciii_derived.icustay_detail
"""
patients = run_query(patients_query)
print(len(patients))
old_patients = patients.loc[patients['admission_age'] > 0]
'''

# This query had to be rewritten due to schema changes
bq_patients_query = """
SELECT DISTINCT
  icu.hadm_id,
  icu.gender,
  icu.admission_age,
  icu.ethnicity,
  adm.admission_type,
  icu.los_hospital
FROM
  `physionet-data.mimiciii_derived.icustay_detail` AS icu
JOIN
  `physionet-data.mimiciii_clinical.admissions` AS adm
ON
  icu.hadm_id = adm.hadm_id
"""
patients = run_query(bq_patients_query)
print("num_patients:", len(patients))
new_patients = patients.loc[patients['admission_type'] != 'NEWBORN']

patients = new_patients

# Load trust score dictionaries from Google Drive
base_path = "/content/drive/MyDrive/mimic_data/data"

with open(f"{base_path}/mistrust_noncompliant.pkl", "rb") as f:
    noncompliant_dict = normalize(pickle.load(f))
print("noncompliant:", len(noncompliant_dict))
noncompliant_df = pd.DataFrame(noncompliant_dict.items(), columns=['hadm_id', 'noncompliant'])

with open(f"{base_path}/mistrust_autopsy.pkl", "rb") as f:
    autopsy_dict = normalize(pickle.load(f))
print("autopsy:", len(autopsy_dict))
autopsy_df = pd.DataFrame(autopsy_dict.items(), columns=['hadm_id', 'autopsy'])

with open(f"{base_path}/neg_sentiment.pkl", "rb") as f:
    sentiment_dict = normalize(pickle.load(f))
print("sentiment:", len(sentiment_dict))
sentiment_df = pd.DataFrame(sentiment_dict.items(), columns=['hadm_id', 'sentiment'])

with open(f"{base_path}/neg_sentiment_gpt_batched.pkl", "rb") as f:
  gpt_sentiment_dict = normalize(pickle.load(f))
print("gpt_sentiment:", len(gpt_sentiment_dict))
print("sample:", list(gpt_sentiment_dict.items())[:5])
gpt_sentiment_df = pd.DataFrame(gpt_sentiment_dict.items(), columns=['hadm_id', 'gpt_sentiment'])

# Merge all data into a single DataFrame
merged = insurance.merge(oasis, on='hadm_id') \
                  .merge(noncompliant_df, on='hadm_id') \
                  .merge(autopsy_df, on='hadm_id') \
                  .merge(sentiment_df, on='hadm_id') \
                  .merge(patients, on='hadm_id')

merged_with_gpt = merged.merge(gpt_sentiment_df, on='hadm_id')

use_gpt_sentiment=True
if (use_gpt_sentiment):
  merged = merged_with_gpt

# Normalize categorical values
merged['ethnicity'] = merged['ethnicity'].apply(normalize_race)
merged['insurance'] = merged['insurance'].apply(normalize_insurance)

# Final column renaming
merged = merged.rename(columns={'ethnicity': 'race', 'los_hospital': 'los'})

# `demographics` now holds the merged and cleaned data
demographics = merged


num_patients: 57422
noncompliant: 54510
autopsy: 54510
sentiment: 52726
gpt_sentiment: 34195
sample: [(114823, np.float64(-0.4437749004668177)), (175058, np.float64(-0.6656915545295373)), (113103, np.float64(-0.776649881560897)), (147438, np.float64(-0.5547332274981774)), (189690, np.float64(-0.3328165734354578))]


In [20]:
len(merged_with_gpt)

36023

In [21]:
demographics.rename(columns={'admission_age': 'age'}, inplace=True)
demographics.head()

Unnamed: 0,hadm_id,insurance,oasis,noncompliant,autopsy,sentiment,gender,age,race,admission_type,los,gpt_sentiment
0,108222,Public,33,2.607748,0.458767,-0.473128,M,29,Hispanic,EMERGENCY,6,1.997308
1,174959,Public,23,0.267848,1.586757,-0.13508,M,26,Other,EMERGENCY,4,-0.77665
2,185657,Public,31,-0.490961,0.455034,2.053435,M,24,Other,EMERGENCY,4,-0.887608
3,153005,Public,17,0.75174,-0.346618,0.461578,F,18,Other,EMERGENCY,4,-0.665692
4,107501,Public,46,0.025457,-1.148198,-0.17808,F,62,White,EMERGENCY,16,-0.887608


In [22]:
print("Sanity check size of demographics: ", len(demographics))

Sanity check size of demographics:  36023


In [None]:
import os

# Ensure target directory exists
drive_path = "/content/drive/MyDrive/mimic_data"
os.makedirs(drive_path, exist_ok=True)

# Save demographics DataFrame to CSV in Google Drive
demographics.to_csv(os.path.join(drive_path, "demographics.csv"), index=False)

In [None]:
demographics = pd.read_csv(os.path.join(drive_path, "demographics.csv"))

# Load label dictionaries
with open(os.path.join(drive_path, "ama_labels.pkl"), "rb") as f:
    ama_labels = pickle.load(f)

with open(os.path.join(drive_path, "code_labels.pkl"), "rb") as f:
    code_labels = pickle.load(f)

with open(os.path.join(drive_path, "mortality_labels.pkl"), "rb") as f:
    mortality_labels = pickle.load(f)

In [23]:
print(ama_labels)
print(code_labels)
print(mortality_labels)
demographics.head(5)

{134067: 'compliant', 109129: 'compliant', 121510: 'compliant', 106469: 'compliant', 133732: 'compliant', 119601: 'compliant', 123010: 'compliant', 174800: 'compliant', 173950: 'compliant', 171044: 'compliant', 169611: 'compliant', 123389: 'compliant', 196192: 'compliant', 121512: 'compliant', 113266: 'compliant', 150986: 'compliant', 144903: 'compliant', 119289: 'compliant', 143362: 'compliant', 189942: 'compliant', 102466: 'compliant', 145091: 'compliant', 165749: 'compliant', 175144: 'compliant', 122027: 'compliant', 102192: 'compliant', 188102: 'compliant', 122636: 'compliant', 157031: 'compliant', 116439: 'compliant', 171889: 'compliant', 153430: 'compliant', 105713: 'compliant', 134912: 'compliant', 125236: 'compliant', 156383: 'compliant', 141877: 'compliant', 104085: 'compliant', 173299: 'compliant', 169176: 'compliant', 192805: 'compliant', 137291: 'compliant', 185022: 'compliant', 186222: 'compliant', 129490: 'compliant', 136261: 'compliant', 192113: 'compliant', 166082: 'com

Unnamed: 0,hadm_id,insurance,oasis,noncompliant,autopsy,sentiment,gender,age,race,admission_type,los,gpt_sentiment
0,108222,Public,33,2.607748,0.458767,-0.473128,M,29,Hispanic,EMERGENCY,6,1.997308
1,174959,Public,23,0.267848,1.586757,-0.13508,M,26,Other,EMERGENCY,4,-0.77665
2,185657,Public,31,-0.490961,0.455034,2.053435,M,24,Other,EMERGENCY,4,-0.887608
3,153005,Public,17,0.75174,-0.346618,0.461578,F,18,Other,EMERGENCY,4,-0.665692
4,107501,Public,46,0.025457,-1.148198,-0.17808,F,62,White,EMERGENCY,16,-0.887608


In [24]:
from time import strftime, gmtime
import numpy as np
import tqdm
from sklearn.feature_extraction import DictVectorizer

# Print current UTC time
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

def normalize_mean_std(value, mu, std):
    return (value - mu) / std

# Normalize ages
ages = demographics['age'].to_numpy()
age_mu, age_std = ages.mean(), ages.std()
demographics['age'] = demographics['age'].apply(lambda val: normalize_mean_std(val, age_mu, age_std))

# Normalize OASIS scores
oasis_vals = demographics['oasis'].to_numpy()
oasis_mu, oasis_std = oasis_vals.mean(), oasis_vals.std()
demographics['oasis'] = demographics['oasis'].apply(lambda val: normalize_mean_std(val, oasis_mu, oasis_std))

# Normalize length of stay
los_vals = demographics['los'].to_numpy()
los_mu, los_std = los_vals.mean(), los_vals.std()
demographics['los'] = demographics['los'].apply(lambda val: normalize_mean_std(val, los_mu, los_std))

def build_features(enabled):
    demographics_features = {}
    for _, row in tqdm.tqdm(demographics.iterrows(), total=demographics.shape[0]):
        feats = {}

        if 'admission_type' in enabled: feats[('admission_type', row.admission_type)] = 1
        if 'oasis'          in enabled: feats[('oasis', None)] = row.oasis
        if 'age'            in enabled: feats[('age', None)] = row.age
        if 'los'            in enabled: feats[('los', None)] = row.los
        if 'insurance'      in enabled: feats[('insurance', row.insurance)] = 1
        if 'gender'         in enabled: feats[('gender', row.gender)] = 1
        if 'race'           in enabled: feats[('race', row.race)] = 1
        if 'noncompliant'   in enabled: feats[('noncompliant', None)] = row.noncompliant
        if 'autopsy'        in enabled: feats[('autopsy', None)] = row.autopsy
        if 'sentiment'      in enabled: feats[('sentiment', None)] = row.sentiment
        if 'gpt_sentiment'  in enabled: feats[('gpt_sentiment', None)] = row.gpt_sentiment

        demographics_features[int(row.hadm_id)] = feats

    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # Fit vectorizer
    vect = DictVectorizer()
    vect.fit(demographics_features.values())
    print('num_features:', len(vect.get_feature_names_out()))

    # Construct feature matrix
    ids = list(demographics_features.keys())
    print('\t', strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    X = vect.transform([demographics_features[hadm_id] for hadm_id in ids])

    return demographics_features, vect

print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))


2025-05-07 16:51:55
2025-05-07 16:51:55


In [25]:
demographics.head(10)

Unnamed: 0,hadm_id,insurance,oasis,noncompliant,autopsy,sentiment,gender,age,race,admission_type,los,gpt_sentiment
0,108222,Public,0.181936,2.607748,0.458767,-0.473128,M,-0.83602,Hispanic,EMERGENCY,-0.396357,1.997308
1,174959,Public,-0.949388,0.267848,1.586757,-0.13508,M,-0.890258,Other,EMERGENCY,-0.548227,-0.77665
2,185657,Public,-0.044329,-0.490961,0.455034,2.053435,M,-0.926416,Other,EMERGENCY,-0.548227,-0.887608
3,153005,Public,-1.628183,0.75174,-0.346618,0.461578,F,-1.034892,Other,EMERGENCY,-0.548227,-0.665692
4,107501,Public,1.652657,0.025457,-1.148198,-0.17808,F,-0.239406,White,EMERGENCY,0.362992,-0.887608
5,136071,Public,1.426392,-0.42927,-1.183689,-0.16211,F,-0.203248,Other,EMERGENCY,0.970472,-0.554733
6,135365,Public,-0.044329,2.539062,0.239711,0.114276,M,-0.890258,White,EMERGENCY,-0.624162,-0.221858
7,165326,Public,-0.723123,2.17378,0.239711,0.484759,M,-0.781783,White,EMERGENCY,-0.244487,1.775392
8,178233,Public,0.408201,0.332707,0.239711,0.615991,M,-0.510594,Other,EMERGENCY,-0.548227,1.553475
9,174348,Public,1.086995,0.457834,-1.148198,0.263297,M,-0.600991,Other,EMERGENCY,-0.244487,1.553475


In [27]:
from time import strftime, gmtime
from collections import defaultdict, Counter
import tqdm
import numpy as np
from sklearn.linear_model import LogisticRegression

print(f"[{strftime('%Y-%m-%d %H:%M:%S', gmtime())}] Starting AMA classifier evaluation...\n")

featlists = {
    #'BASELINE'             :['age', 'los', 'insurance', 'gender'],
    #'BASELINE+RACE'        :['age', 'los', 'insurance', 'gender', 'race'],
    #'BASELINE+NONCOMPLIANT':['age', 'los', 'insurance', 'gender', 'noncompliant'],
    #'BASELINE+AUTOPSY'     :['age', 'los', 'insurance', 'gender', 'autopsy'],
    #'BASELINE+SENTIMENT'   :['age', 'los', 'insurance', 'gender', 'sentiment'],
    'BASELINE+GPT'         :['age', 'los', 'insurance', 'gender', 'gpt_sentiment'],
    'BASELINE+ALL+GPT'     :['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment', 'gpt_sentiment'],
    'BASELINE+ALL'         :['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']
}

ama_Y_vect = {'AMA': 1, 'compliant': 0}
feature_weights = defaultdict(list)

for name, featlist in featlists.items():
    print(f"=== Model Configuration: {name} ===")
    print("Enabled features:", featlist)

    demographics_features, vect = build_features(featlist)
    ind2feat = {i: f for f, i in vect.vocabulary_.items()}

    ama_ids = list(set(discharge['hadm_id'].values) & set(demographics_features.keys()))
    print(f"\nNumber of patients in cohort: {len(ama_ids)}")
    print("Label distribution:", Counter([ama_Y_vect[ama_labels[hadm_id]] for hadm_id in ama_ids]))

    aucs = []

    print("\n--- Running 100 Iterations for AUC Estimation ---")
    for iteration in tqdm.tqdm(range(100)):
        # train/test split
        ama_train_ids, ama_test_ids = data_split(ama_ids)

        # extract features
        ama_train_features = [demographics_features[hadm_id] for hadm_id in ama_train_ids]
        ama_test_features  = [demographics_features[hadm_id] for hadm_id in ama_test_ids]

        ama_train_X = vect.transform(ama_train_features)
        ama_test_X  = vect.transform(ama_test_features)

        ama_train_Y = [ama_Y_vect[ama_labels[hadm_id]] for hadm_id in ama_train_ids]
        ama_test_Y  = [ama_Y_vect[ama_labels[hadm_id]] for hadm_id in ama_test_ids]

        # train logistic regression
        ama_svm = LogisticRegression(C=0.1, penalty='l1', tol=0.01, solver='liblinear')
        ama_svm.fit(ama_train_X, ama_train_Y)

        # evaluate model
        res = classification_results(ama_svm, ama_Y_vect, ama_test_X, ama_test_Y, 'test: ama', verbose=False)
        aucs.append(res['auc'])

        if name == 'BASELINE+ALL' or name == 'BASELINE+ALL+GPT':
            for feat, val in enumerate(ama_svm.coef_[0]):
                featname = ind2feat[feat]
                feature_weights[featname].append(val)

    aucs = np.array(aucs)
    mean_auc = aucs.mean()
    std_auc = aucs.std()
    ci_low = mean_auc - 1.96 * std_auc
    ci_high = mean_auc + 1.96 * std_auc

    print("\n--- AUC Results Summary ---")
    print(f"AUCs (n=100): mean = {mean_auc:.4f}, 1.96*std = {1.96 * std_auc:.4f}")
    print(f"95% Confidence Interval: ({ci_low:.4f}, {ci_high:.4f})")

    print("\n--- Most Informative Features ---")
    analyze('ama', vect, ama_svm)
    print("\n" + "=" * 60 + "\n")

    if name == 'BASELINE+ALL' or name == 'BASELINE+ALL+GPT':
        print("--- Feature Weight Summary Across Iterations ---")
        for featname, vals in sorted(feature_weights.items()):
            v = np.array(vals)
            mu, std = v.mean(), v.std()
            print(f"{str(featname[0]):<12}:{str(featname[1]):<20} || {mu:.2f} ± {1.96 * std:.2f}")

print(f"\n[{strftime('%Y-%m-%d %H:%M:%S', gmtime())}] Finished all evaluations.")

[2025-05-07 16:56:54] Starting AMA classifier evaluation...

=== Model Configuration: BASELINE+GPT ===
Enabled features: ['age', 'los', 'insurance', 'gender', 'gpt_sentiment']


100%|██████████| 36023/36023 [00:03<00:00, 11953.17it/s]


2025-05-07 16:56:57
num_features: 8
	 2025-05-07 16:56:57

Number of patients in cohort: 33408
Label distribution: Counter({0: 33180, 1: 228})

--- Running 100 Iterations for AUC Estimation ---


100%|██████████| 100/100 [00:30<00:00,  3.32it/s]



--- AUC Results Summary ---
AUCs (n=100): mean = 0.9421, 1.96*std = 0.0222
95% Confidence Interval: (0.9199, 0.9644)

--- Most Informative Features ---
ama
	('gpt_sentiment', None)  :  1.2380
	('insurance', 'Self Pay'):  0.0000
	('gender', 'M')          :  0.0000
	('insurance', 'Public')  :  0.0000
	('gender', 'F')          : -0.3237
	('insurance', 'Private') : -0.7120
	('los', None)            : -0.8107
	('age', None)            : -1.8339


=== Model Configuration: BASELINE+ALL+GPT ===
Enabled features: ['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment', 'gpt_sentiment']


100%|██████████| 36023/36023 [00:03<00:00, 9297.38it/s]


2025-05-07 16:57:31
num_features: 17
	 2025-05-07 16:57:31

Number of patients in cohort: 33408
Label distribution: Counter({0: 33180, 1: 228})

--- Running 100 Iterations for AUC Estimation ---


100%|██████████| 100/100 [00:45<00:00,  2.20it/s]



--- AUC Results Summary ---
AUCs (n=100): mean = 0.9500, 1.96*std = 0.0177
95% Confidence Interval: (0.9323, 0.9677)

--- Most Informative Features ---
ama
	('gpt_sentiment', None)  :  1.1960
	('noncompliant', None)   :  0.4699
	('autopsy', None)        :  0.0544
	('sentiment', None)      :  0.0306
	('race', 'White')        :  0.0000
	('race', 'Hispanic')     :  0.0000
	('race', 'Black')        :  0.0000
	('race', 'Native American'):  0.0000
	('race', 'Other')        :  0.0000
	('gender', 'M')          :  0.0000
	('race', 'Asian')        :  0.0000
	('insurance', 'Public')  :  0.0000
	('insurance', 'Self Pay'):  0.0000
	('insurance', 'Private') : -0.3536
	('gender', 'F')          : -0.6133
	('los', None)            : -0.7446
	('age', None)            : -1.4096


--- Feature Weight Summary Across Iterations ---
age         :None                 || -1.40 ± 0.26
autopsy     :None                 || 0.17 ± 0.11
gender      :F                    || -0.39 ± 0.22
gender      :M               

100%|██████████| 36023/36023 [00:03<00:00, 10267.58it/s]


2025-05-07 16:58:20
num_features: 16
	 2025-05-07 16:58:21

Number of patients in cohort: 33408
Label distribution: Counter({0: 33180, 1: 228})

--- Running 100 Iterations for AUC Estimation ---


100%|██████████| 100/100 [00:41<00:00,  2.42it/s]


--- AUC Results Summary ---
AUCs (n=100): mean = 0.8869, 1.96*std = 0.0280
95% Confidence Interval: (0.8589, 0.9149)

--- Most Informative Features ---
ama
	('noncompliant', None)   :  0.7179
	('sentiment', None)      :  0.2237
	('autopsy', None)        :  0.1324
	('race', 'Other')        :  0.0000
	('race', 'Native American'):  0.0000
	('race', 'Hispanic')     :  0.0000
	('race', 'Black')        :  0.0000
	('race', 'White')        :  0.0000
	('race', 'Asian')        :  0.0000
	('insurance', 'Self Pay'):  0.0000
	('gender', 'M')          :  0.0000
	('insurance', 'Public')  :  0.0000
	('gender', 'F')          : -0.2002
	('insurance', 'Private') : -0.6379
	('los', None)            : -1.3474
	('age', None)            : -1.6202


--- Feature Weight Summary Across Iterations ---
age         :None                 || -1.57 ± 0.44
autopsy     :None                 || 0.15 ± 0.12
gender      :F                    || -0.41 ± 0.21
gender      :M                    || 0.00 ± 0.00
gpt_sentiment:No




In [None]:
# Original Code Status code redacted. Cell is left for output comparison


0it [00:00, ?it/s]

2019-01-04 23:56:54
BASELINE+ALL
['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']


50976it [00:09, 5236.42it/s]


2019-01-04 23:57:03
num_features: 16
	2019-01-04 23:57:04



  0%|          | 0/100 [00:00<?, ?it/s]

{0: ('age', None), 1: ('autopsy', None), 2: ('concompliant', None), 3: ('gender', 'F'), 4: ('gender', 'M'), 5: ('insurance', 'Private'), 6: ('insurance', 'Public'), 7: ('insurance', 'Self Pay'), 8: ('los', None), 9: ('race', 'Asian'), 10: ('race', 'Black'), 11: ('race', 'Hispanic'), 12: ('race', 'Native American'), 13: ('race', 'Other'), 14: ('race', 'White'), 15: ('sentiment', None)}
patients: 39125
Counter({0: 36667, 1: 2458})


100%|██████████| 100/100 [00:56<00:00,  1.73it/s]

AUCS:  [0.7797595  0.7864562  0.79014992 0.77520486 0.78525847 0.77857338
 0.78509857 0.7884304  0.79088733 0.78313686 0.78970985 0.78382875
 0.78810281 0.78749812 0.78864029 0.79968577 0.79118141 0.7766818
 0.78442367 0.79835093 0.79320046 0.78782693 0.77922702 0.77785324
 0.77653062 0.7739758  0.77545866 0.78683858 0.78426325 0.79146758
 0.78851617 0.78034419 0.78179609 0.78053583 0.79187255 0.77957677
 0.77715471 0.78672934 0.78565199 0.78691719 0.7823756  0.79901502
 0.78399456 0.79059203 0.78916896 0.78086252 0.78317603 0.789163
 0.77354958 0.7843122  0.78671648 0.78893047 0.78814728 0.79108546
 0.78363495 0.78281243 0.78065414 0.78432409 0.78813066 0.79278352
 0.7875232  0.7914536  0.78602473 0.78989935 0.78689736 0.78563218
 0.77992618 0.78788532 0.7715448  0.77773195 0.78589674 0.7889953
 0.77955621 0.78865128 0.77682282 0.78573882 0.79123264 0.79180179
 0.79263443 0.78321795 0.77307465 0.78996017 0.7806325  0.78791611
 0.77808487 0.78548196 0.78705411 0.79317152 0.7917187  0.7




In [28]:
from time import strftime, gmtime
from collections import defaultdict, Counter
import tqdm
import numpy as np
from sklearn.linear_model import LogisticRegression

print(f"[{strftime('%Y-%m-%d %H:%M:%S', gmtime())}] Starting Code Status classifier evaluation...\n")

featlists = {
    #'BASELINE'             :['age', 'los', 'insurance', 'gender'],
    #'BASELINE+RACE'        :['age', 'los', 'insurance', 'gender', 'race'],
    #'BASELINE+NONCOMPLIANT':['age', 'los', 'insurance', 'gender', 'noncompliant'],
    #'BASELINE+AUTOPSY'     :['age', 'los', 'insurance', 'gender', 'autopsy'],
    #'BASELINE+SENTIMENT'   :['age', 'los', 'insurance', 'gender', 'sentiment'],
    'BASELINE+GPT'         :['age', 'los', 'insurance', 'gender', 'gpt_sentiment'],
    'BASELINE+ALL+GPT'     :['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment', 'gpt_sentiment'],
    'BASELINE+ALL'         :['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']
}

cs_Y_vect = {'DNR/CMO': 1, 'Full Code': 0}
feature_weights = defaultdict(list)

for name, featlist in featlists.items():
    print(f"=== Model Configuration: {name} ===")
    print("Enabled features:", featlist)

    demographics_features, vect = build_features(featlist)
    ind2feat = {i: f for f, i in vect.vocabulary_.items()}

    print("\nFeature index mapping:")
    print(ind2feat)

    cs_ids = list(set(code_labels.keys()) & set(demographics_features.keys()))
    print(f"\nNumber of patients in cohort: {len(cs_ids)}")
    print("Label distribution:", Counter([cs_Y_vect[code_labels[hadm_id]] for hadm_id in cs_ids]))

    aucs = []

    print("\n--- Running 100 Iterations for AUC Estimation ---")
    for iteration in tqdm.tqdm(range(100)):
        cs_train_ids, cs_test_ids = data_split(cs_ids)

        cs_train_features = [demographics_features[hadm_id] for hadm_id in cs_train_ids]
        cs_test_features  = [demographics_features[hadm_id] for hadm_id in cs_test_ids]

        cs_train_X = vect.transform(cs_train_features)
        cs_test_X  = vect.transform(cs_test_features)

        cs_train_Y = [cs_Y_vect[code_labels[hadm_id]] for hadm_id in cs_train_ids]
        cs_test_Y  = [cs_Y_vect[code_labels[hadm_id]] for hadm_id in cs_test_ids]

        cs_svm = LogisticRegression(C=0.1, penalty='l1', tol=0.01, solver='liblinear')
        cs_svm.fit(cs_train_X, cs_train_Y)

        res = classification_results(cs_svm, cs_Y_vect, cs_test_X, cs_test_Y, 'test: cs', verbose=False)
        aucs.append(res['auc'])

        if name == 'BASELINE+ALL':
            for feat, val in enumerate(cs_svm.coef_[0]):
                featname = ind2feat[feat]
                feature_weights[featname].append(val)

    aucs = np.array(aucs)
    mean_auc = aucs.mean()
    std_auc = aucs.std()
    ci_low = mean_auc - 1.96 * std_auc
    ci_high = mean_auc + 1.96 * std_auc

    print("\n--- AUC Results Summary ---")
    print(f"AUCs (n=100): mean = {mean_auc:.4f}, 1.96*std = {1.96 * std_auc:.4f}")
    print(f"95% Confidence Interval: ({ci_low:.4f}, {ci_high:.4f})")

    print("\n--- Most Informative Features ---")
    analyze('cs', vect, cs_svm)
    print("\n" + "=" * 60 + "\n")

    if name == 'BASELINE+ALL':
        print("--- Feature Weight Summary Across Iterations ---")
        for featname, vals in sorted(feature_weights.items()):
            v = np.array(vals)
            mu, std = v.mean(), v.std()
            print(f"{str(featname[0]):<12}:{str(featname[1]):<20} || {mu:.2f} ± {1.96 * std:.2f}")

print(f"\n[{strftime('%Y-%m-%d %H:%M:%S', gmtime())}] Finished all evaluations.")

[2025-05-07 17:06:25] Starting Code Status classifier evaluation...

=== Model Configuration: BASELINE+GPT ===
Enabled features: ['age', 'los', 'insurance', 'gender', 'gpt_sentiment']


100%|██████████| 36023/36023 [00:02<00:00, 12413.54it/s]


2025-05-07 17:06:28
num_features: 8
	 2025-05-07 17:06:28

Feature index mapping:
{0: ('age', None), 1: ('gender', 'F'), 2: ('gender', 'M'), 3: ('gpt_sentiment', None), 4: ('insurance', 'Private'), 5: ('insurance', 'Public'), 6: ('insurance', 'Self Pay'), 7: ('los', None)}

Number of patients in cohort: 26341
Label distribution: Counter({0: 24498, 1: 1843})

--- Running 100 Iterations for AUC Estimation ---


100%|██████████| 100/100 [00:24<00:00,  4.02it/s]



--- AUC Results Summary ---
AUCs (n=100): mean = 0.7671, 1.96*std = 0.0142
95% Confidence Interval: (0.7529, 0.7813)

--- Most Informative Features ---
cs
	('age', None)            :  0.4460
	('gpt_sentiment', None)  :  0.2473
	('insurance', 'Self Pay'):  0.0000
	('insurance', 'Public')  :  0.0000
	('los', None)            : -0.5823
	('insurance', 'Private') : -0.8625
	('gender', 'F')          : -1.4960
	('gender', 'M')          : -1.8962


=== Model Configuration: BASELINE+ALL+GPT ===
Enabled features: ['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment', 'gpt_sentiment']


100%|██████████| 36023/36023 [00:04<00:00, 7451.28it/s]


2025-05-07 17:06:58
num_features: 17
	 2025-05-07 17:06:58

Feature index mapping:
{0: ('age', None), 1: ('autopsy', None), 2: ('gender', 'F'), 3: ('gender', 'M'), 4: ('gpt_sentiment', None), 5: ('insurance', 'Private'), 6: ('insurance', 'Public'), 7: ('insurance', 'Self Pay'), 8: ('los', None), 9: ('noncompliant', None), 10: ('race', 'Asian'), 11: ('race', 'Black'), 12: ('race', 'Hispanic'), 13: ('race', 'Native American'), 14: ('race', 'Other'), 15: ('race', 'White'), 16: ('sentiment', None)}

Number of patients in cohort: 26341
Label distribution: Counter({0: 24498, 1: 1843})

--- Running 100 Iterations for AUC Estimation ---


100%|██████████| 100/100 [00:38<00:00,  2.63it/s]



--- AUC Results Summary ---
AUCs (n=100): mean = 0.7937, 1.96*std = 0.0124
95% Confidence Interval: (0.7813, 0.8061)

--- Most Informative Features ---
cs
	('age', None)            :  0.4335
	('sentiment', None)      :  0.2507
	('gpt_sentiment', None)  :  0.2301
	('noncompliant', None)   :  0.1466
	('race', 'White')        :  0.0240
	('insurance', 'Self Pay'):  0.0000
	('insurance', 'Public')  :  0.0000
	('race', 'Asian')        :  0.0000
	('race', 'Native American'):  0.0000
	('race', 'Other')        : -0.0282
	('race', 'Black')        : -0.2105
	('race', 'Hispanic')     : -0.3973
	('autopsy', None)        : -0.4448
	('los', None)            : -0.6233
	('insurance', 'Private') : -0.8255
	('gender', 'F')          : -1.2927
	('gender', 'M')          : -1.6218


=== Model Configuration: BASELINE+ALL ===
Enabled features: ['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']


100%|██████████| 36023/36023 [00:03<00:00, 10820.77it/s]


2025-05-07 17:07:40
num_features: 16
	 2025-05-07 17:07:40

Feature index mapping:
{0: ('age', None), 1: ('autopsy', None), 2: ('gender', 'F'), 3: ('gender', 'M'), 4: ('insurance', 'Private'), 5: ('insurance', 'Public'), 6: ('insurance', 'Self Pay'), 7: ('los', None), 8: ('noncompliant', None), 9: ('race', 'Asian'), 10: ('race', 'Black'), 11: ('race', 'Hispanic'), 12: ('race', 'Native American'), 13: ('race', 'Other'), 14: ('race', 'White'), 15: ('sentiment', None)}

Number of patients in cohort: 26341
Label distribution: Counter({0: 24498, 1: 1843})

--- Running 100 Iterations for AUC Estimation ---


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


--- AUC Results Summary ---
AUCs (n=100): mean = 0.7865, 1.96*std = 0.0131
95% Confidence Interval: (0.7734, 0.7996)

--- Most Informative Features ---
cs
	('age', None)            :  0.4310
	('sentiment', None)      :  0.2516
	('noncompliant', None)   :  0.2299
	('race', 'White')        :  0.0916
	('race', 'Native American'):  0.0000
	('race', 'Asian')        :  0.0000
	('insurance', 'Self Pay'):  0.0000
	('race', 'Other')        :  0.0000
	('insurance', 'Public')  :  0.0000
	('race', 'Black')        : -0.2561
	('race', 'Hispanic')     : -0.3394
	('autopsy', None)        : -0.4652
	('los', None)            : -0.6643
	('insurance', 'Private') : -0.8303
	('gender', 'F')          : -1.4727
	('gender', 'M')          : -1.7776


--- Feature Weight Summary Across Iterations ---
age         :None                 || 0.42 ± 0.03
autopsy     :None                 || -0.43 ± 0.06
gender      :F                    || -0.58 ± 1.27
gender      :M                    || -0.90 ± 1.27
insurance   :Pri




In [None]:
# Original mortality code redacted. Cell is left for output comparison


0it [00:00, ?it/s]

2019-01-04 23:58:01
BASELINE+ALL
['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']


50976it [00:09, 5101.28it/s]


2019-01-04 23:58:11
num_features: 16
	2019-01-04 23:58:11



  0%|          | 0/100 [00:00<?, ?it/s]

patients: 47543
Counter({0: 42429, 1: 5114})


100%|██████████| 100/100 [01:04<00:00,  1.54it/s]

AUCS:  [0.67597942 0.66796189 0.68214396 0.66944476 0.67395821 0.67354888
 0.67068775 0.67290468 0.66699559 0.66895833 0.67467822 0.67241983
 0.67598472 0.67080436 0.67627854 0.67294662 0.67546048 0.66784427
 0.66833621 0.67154392 0.67297642 0.68192794 0.67203645 0.67081948
 0.66367444 0.67145488 0.6727461  0.66890287 0.66642047 0.67044154
 0.66952413 0.66924098 0.67655431 0.67170851 0.66759812 0.67527588
 0.67156593 0.68010592 0.67829128 0.67254243 0.66715664 0.67592364
 0.6731543  0.6670147  0.66636753 0.6692419  0.68142159 0.67192896
 0.67877901 0.67566752 0.66671362 0.67740352 0.67710221 0.67190979
 0.67630895 0.67681584 0.67535052 0.67356308 0.67016684 0.66646245
 0.68222291 0.67226293 0.67685544 0.66950803 0.66685135 0.67073491
 0.67617529 0.66689762 0.66978465 0.66456813 0.67865127 0.67193433
 0.67314576 0.67493967 0.67532346 0.66796884 0.67105338 0.66678113
 0.66869323 0.66587796 0.66838179 0.67589294 0.66866678 0.66446579
 0.66724467 0.66733614 0.67077739 0.67085804 0.67256763




In [29]:
from time import strftime, gmtime
from collections import defaultdict, Counter
import tqdm
import numpy as np
from sklearn.linear_model import LogisticRegression

print(f"[{strftime('%Y-%m-%d %H:%M:%S', gmtime())}] Starting Mortality classifier evaluation...\n")

featlists = {
    #'BASELINE'             :['age', 'los', 'insurance', 'gender'],
    #'BASELINE+RACE'        :['age', 'los', 'insurance', 'gender', 'race'],
    #'BASELINE+NONCOMPLIANT':['age', 'los', 'insurance', 'gender', 'noncompliant'],
    #'BASELINE+AUTOPSY'     :['age', 'los', 'insurance', 'gender', 'autopsy'],
    #'BASELINE+SENTIMENT'   :['age', 'los', 'insurance', 'gender', 'sentiment'],
    'BASELINE+GPT'         :['age', 'los', 'insurance', 'gender', 'gpt_sentiment'],
    'BASELINE+ALL+GPT'     :['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment', 'gpt_sentiment'],
    'BASELINE+ALL'         :['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']
}

mortality_Y_vect = {'deceased': 1, 'survived': 0}
feature_weights = defaultdict(list)

for name, featlist in featlists.items():
    print(f"=== Model Configuration: {name} ===")
    print("Enabled features:", featlist)

    demographics_features, vect = build_features(featlist)
    ind2feat = {i: f for f, i in vect.vocabulary_.items()}

    mortality_ids = list(set(mortality_labels.keys()) & set(demographics_features.keys()))
    print(f"\nNumber of patients in cohort: {len(mortality_ids)}")
    print("Label distribution:", Counter([mortality_Y_vect[mortality_labels[hadm_id]] for hadm_id in mortality_ids]))

    aucs = []

    print("\n--- Running 100 Iterations for AUC Estimation ---")
    for iteration in tqdm.tqdm(range(100)):
        mortality_train_ids, mortality_test_ids = data_split(mortality_ids)

        mortality_train_features = [demographics_features[hadm_id] for hadm_id in mortality_train_ids]
        mortality_test_features  = [demographics_features[hadm_id] for hadm_id in mortality_test_ids]

        mortality_train_X = vect.transform(mortality_train_features)
        mortality_test_X  = vect.transform(mortality_test_features)

        mortality_train_Y = [mortality_Y_vect[mortality_labels[hadm_id]] for hadm_id in mortality_train_ids]
        mortality_test_Y  = [mortality_Y_vect[mortality_labels[hadm_id]] for hadm_id in mortality_test_ids]

        mortality_svm = LogisticRegression(C=0.1, penalty='l1', tol=0.01, solver='liblinear')
        mortality_svm.fit(mortality_train_X, mortality_train_Y)

        res = classification_results(
            mortality_svm, mortality_Y_vect, mortality_test_X, mortality_test_Y,
            'test: mortality', verbose=False
        )
        aucs.append(res['auc'])

        if name == 'BASELINE+ALL':
            for feat, val in enumerate(mortality_svm.coef_[0]):
                featname = ind2feat[feat]
                feature_weights[featname].append(val)

    aucs = np.array(aucs)
    mean_auc = aucs.mean()
    std_auc = aucs.std()
    ci_low = mean_auc - 1.96 * std_auc
    ci_high = mean_auc + 1.96 * std_auc

    print("\n--- AUC Results Summary ---")
    print(f"AUCs (n=100): mean = {mean_auc:.4f}, 1.96*std = {1.96 * std_auc:.4f}")
    print(f"95% Confidence Interval: ({ci_low:.4f}, {ci_high:.4f})")

    print("\n--- Most Informative Features ---")
    analyze('mortality', vect, mortality_svm)
    print("\n" + "=" * 60 + "\n")

    if name == 'BASELINE+ALL':
        print("--- Feature Weight Summary Across Iterations ---")
        for featname, vals in sorted(feature_weights.items()):
            v = np.array(vals)
            mu, std = v.mean(), v.std()
            print(f"{str(featname[0]):<12}:{str(featname[1]):<20} || {mu:.2f} ± {1.96 * std:.2f}")

    print("\n\n")

print(f"[{strftime('%Y-%m-%d %H:%M:%S', gmtime())}] Finished all evaluations.")

[2025-05-07 17:08:30] Starting Mortality classifier evaluation...

=== Model Configuration: BASELINE+GPT ===
Enabled features: ['age', 'los', 'insurance', 'gender', 'gpt_sentiment']


100%|██████████| 36023/36023 [00:03<00:00, 11489.99it/s]


2025-05-07 17:08:33
num_features: 8
	 2025-05-07 17:08:33

Number of patients in cohort: 33408
Label distribution: Counter({0: 29985, 1: 3423})

--- Running 100 Iterations for AUC Estimation ---


100%|██████████| 100/100 [00:28<00:00,  3.55it/s]



--- AUC Results Summary ---
AUCs (n=100): mean = 0.6070, 1.96*std = 0.0172
95% Confidence Interval: (0.5897, 0.6242)

--- Most Informative Features ---
mortality
	('gpt_sentiment', None)  :  0.3215
	('age', None)            :  0.2198
	('los', None)            :  0.1378
	('insurance', 'Self Pay'):  0.0000
	('insurance', 'Public')  : -0.0028
	('gender', 'M')          : -0.1737
	('gender', 'F')          : -0.2474
	('insurance', 'Private') : -0.3460





=== Model Configuration: BASELINE+ALL+GPT ===
Enabled features: ['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment', 'gpt_sentiment']


100%|██████████| 36023/36023 [00:03<00:00, 9425.90it/s]


2025-05-07 17:09:05
num_features: 17
	 2025-05-07 17:09:06

Number of patients in cohort: 33408
Label distribution: Counter({0: 29985, 1: 3423})

--- Running 100 Iterations for AUC Estimation ---


100%|██████████| 100/100 [00:44<00:00,  2.25it/s]



--- AUC Results Summary ---
AUCs (n=100): mean = 0.6901, 1.96*std = 0.0118
95% Confidence Interval: (0.6783, 0.7018)

--- Most Informative Features ---
mortality
	('sentiment', None)      :  0.4972
	('race', 'Other')        :  0.3159
	('gpt_sentiment', None)  :  0.2513
	('age', None)            :  0.2156
	('noncompliant', None)   :  0.0757
	('los', None)            :  0.0649
	('autopsy', None)        :  0.0000
	('race', 'Asian')        :  0.0000
	('race', 'Native American'):  0.0000
	('insurance', 'Self Pay'):  0.0000
	('race', 'White')        : -0.2051
	('race', 'Hispanic')     : -0.5499
	('race', 'Black')        : -0.6132
	('gender', 'F')          : -0.6389
	('gender', 'M')          : -0.6427
	('insurance', 'Public')  : -0.6726
	('insurance', 'Private') : -1.0246





=== Model Configuration: BASELINE+ALL ===
Enabled features: ['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']


100%|██████████| 36023/36023 [00:03<00:00, 10601.20it/s]


2025-05-07 17:09:54
num_features: 16
	 2025-05-07 17:09:54

Number of patients in cohort: 33408
Label distribution: Counter({0: 29985, 1: 3423})

--- Running 100 Iterations for AUC Estimation ---


100%|██████████| 100/100 [00:41<00:00,  2.42it/s]


--- AUC Results Summary ---
AUCs (n=100): mean = 0.6819, 1.96*std = 0.0121
95% Confidence Interval: (0.6698, 0.6941)

--- Most Informative Features ---
mortality
	('sentiment', None)      :  0.4891
	('race', 'Other')        :  0.3417
	('age', None)            :  0.2219
	('noncompliant', None)   :  0.0876
	('los', None)            :  0.0604
	('race', 'Native American'):  0.0000
	('race', 'Asian')        :  0.0000
	('autopsy', None)        : -0.0167
	('gender', 'F')          : -0.0200
	('gender', 'M')          : -0.0231
	('race', 'White')        : -0.1302
	('race', 'Black')        : -0.4130
	('race', 'Hispanic')     : -0.5009
	('insurance', 'Self Pay'): -0.8831
	('insurance', 'Public')  : -1.6857
	('insurance', 'Private') : -1.9896


--- Feature Weight Summary Across Iterations ---
age         :None                 || 0.21 ± 0.02
autopsy     :None                 || -0.01 ± 0.03
gender      :F                    || -0.61 ± 1.02
gender      :M                    || -0.61 ± 1.02
insurance




In [None]:
metrics = {
    'noncompliant': noncompliant_dict,
    'autopsy': autopsy_dict,
    'sentiment': sentiment_dict,
    'gpt_sentiment': gpt_sentiment_dict
}

def mort_rate(label, hadm_ids):
    cohort = mortality[mortality['hadm_id'].isin(hadm_ids)]
    if len(cohort) == 0:
        print(f"\t{label:<15}: No data available")
        return
    rate = cohort['hospital_expire_flag'].mean()
    print(f"\t{label:<15}: Mortality rate = {rate:.3f} ({len(cohort)} patients)")

for metric, scores in metrics.items():
    print(f"\n=== Analyzing Mortality vs. Trust Metric: {metric.upper()} ===")

    vals = sorted(scores.values())
    n = len(vals)
    t1 = vals[n // 4]
    t2 = vals[n // 2]
    t3 = vals[3 * n // 4]

    lowest  = [hadm_id for hadm_id, score in scores.items() if score <= t1]
    highest = [hadm_id for hadm_id, score in scores.items() if score > t3]

    mort_rate("Most Trusting (Q1)", lowest)
    mort_rate("Least Trusting (Q4)", highest)



=== Analyzing Mortality vs. Trust Metric: NONCOMPLIANT ===
	Most Trusting (Q1): Mortality rate = 0.059 (13717 patients)
	Least Trusting (Q4): Mortality rate = 0.131 (13615 patients)

=== Analyzing Mortality vs. Trust Metric: AUTOPSY ===
	Most Trusting (Q1): Mortality rate = 0.125 (14396 patients)
	Least Trusting (Q4): Mortality rate = 0.093 (13088 patients)

=== Analyzing Mortality vs. Trust Metric: SENTIMENT ===
	Most Trusting (Q1): Mortality rate = 0.044 (13182 patients)
	Least Trusting (Q4): Mortality rate = 0.171 (13181 patients)
