In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
file_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/ventilation_cleaned.csv'

# Load the CSV into a DataFrame
ventilation = pd.read_csv(file_path)

# Display the first few rows to confirm it loaded correctly
print(ventilation.head())

   subject_id   stay_id            charttime  itemid  valuenum_normalized
0    10002428  38875437  2156-04-22 07:00:00  220339             0.333333
1    10002428  38875437  2156-04-22 07:00:00  223835             0.240506
2    10002428  38875437  2156-04-22 07:00:00  224685             0.437000
3    10002428  38875437  2156-04-22 07:00:00  224687             0.600000
4    10002428  38875437  2156-04-22 07:00:00  224688             0.440000


In [3]:
file_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/postwean.csv'

# Load the CSV into a DataFrame
outcomes = pd.read_csv(file_path)

# Display the first few rows to confirm it loaded correctly
print(outcomes.head())

   original_index  subject_id     hadm_id   stay_id  hadm_no  stay_no  \
0              16    10002428  28662225.0  38875437        1        1   
1              17    10002428  28662225.0  38875437        1        1   
2              18    10002428  28662225.0  38875437        1        1   
3              19    10002428  28662225.0  38875437        1        1   
4              20    10002428  28662225.0  38875437        1        1   

  intubation_time ventilation_starttime  ventilation_endtime  \
0             NaN   2156-04-19 20:10:00  2156-04-22 17:05:00   
1             NaN   2156-04-19 20:10:00  2156-04-22 17:05:00   
2             NaN   2156-04-19 20:10:00  2156-04-22 17:05:00   
3             NaN   2156-04-19 20:10:00  2156-04-22 17:05:00   
4             NaN   2156-04-19 20:10:00  2156-04-22 17:05:00   

   ventilation_time  ... age  niv_48 reintubation_48  died_48  ext_success  \
0            4135.0  ...  80     NaN             NaN      NaN            1   
1            4135.0 

In [4]:
outcomes = outcomes[['subject_id', 'stay_id', 'ventilation_time', 'ext_success']]

outcomes.head()

Unnamed: 0,subject_id,stay_id,ventilation_time,ext_success
0,10002428,38875437,4135.0,1
1,10002428,38875437,4135.0,1
2,10002428,38875437,4135.0,1
3,10002428,38875437,4135.0,1
4,10002428,38875437,4135.0,1


In [7]:
outcomes.shape

(205449, 4)

In [5]:
# Drop duplicate rows in outcomes dataframe

outcomes = outcomes.drop_duplicates()
outcomes.shape

(6460, 4)

In [9]:
print("Unique subject_ids in outcomes:", outcomes['subject_id'].nunique())
print("Unique subject_ids in ventilation:", ventilation['subject_id'].nunique())

Unique subject_ids in outcomes: 5832
Unique subject_ids in ventilation: 5832


In [10]:
# Combine dataframes

combined = pd.merge(
    ventilation,
    outcomes[['subject_id', 'stay_id', 'ext_success']],
    on=['subject_id', 'stay_id'],
    how='inner'  # 
)

In [11]:
combined.head()

Unnamed: 0,subject_id,stay_id,charttime,itemid,valuenum_normalized,ext_success
0,10002428,38875437,2156-04-22 07:00:00,220339,0.333333,1
1,10002428,38875437,2156-04-22 07:00:00,223835,0.240506,1
2,10002428,38875437,2156-04-22 07:00:00,224685,0.437,1
3,10002428,38875437,2156-04-22 07:00:00,224687,0.6,1
4,10002428,38875437,2156-04-22 07:00:00,224688,0.44,1


In [12]:
# Perform logistic regression using mean of last 10 values of each included itemid

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

combined_sorted = combined.sort_values(['subject_id', 'stay_id', 'itemid', 'charttime'])

# Select last 10 values
combined_sorted = combined.sort_values(['subject_id', 'stay_id', 'itemid', 'charttime'])

last10 = (
    combined_sorted
    .groupby(['subject_id', 'stay_id', 'itemid'])
    .tail(10)
)

# Aggregate by mean
agg = (
    last10
    .groupby(['subject_id', 'stay_id', 'itemid'])['valuenum_normalized']
    .mean()
    .reset_index()
)

# Pivot to wide format (features as columns)
vent_features = agg.pivot(index=['subject_id', 'stay_id'],
                          columns='itemid',
                          values='valuenum_normalized').reset_index()

# Attach outcome
outcome_df = (
    combined[['subject_id', 'stay_id', 'ext_success']]
    .drop_duplicates(subset=['subject_id', 'stay_id'])
)

data = pd.merge(vent_features, outcome_df, on=['subject_id', 'stay_id'], how='inner').dropna()

# Prepare data for logistic regression
X = data.drop(columns=['subject_id', 'stay_id', 'ext_success'])
y = data['ext_success']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Fit logistic regression with L1 penalty (Lasso)
clf = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
clf.fit(X_train, y_train)

# Evaluate performance
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.3f}")

              precision    recall  f1-score   support

           0       0.76      0.29      0.42        75
           1       0.72      0.95      0.82       144

    accuracy                           0.73       219
   macro avg       0.74      0.62      0.62       219
weighted avg       0.73      0.73      0.68       219

ROC AUC: 0.733


In [13]:
# View non-zero coefficients

feature_names = X.columns
coefficients = clf.coef_[0]
non_zero = [(name, coef) for name, coef in zip(feature_names, coefficients) if coef != 0]

print("\nNon-zero features selected by L1:")
for name, coef in non_zero:
    print(f"{name}: {coef:.3f}")


Non-zero features selected by L1:
220210: 0.205
220339: -1.194
223849: 0.520
224369: 0.048
224370: -0.542
224373: 0.144
224684: 0.004
224685: 0.976
224687: -0.977
224689: 0.203
224695: -3.532
224700: 0.034
224701: -0.022


In [14]:
# Load the d_items CSV
d_items_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/icu/d_items.csv'
d_items = pd.read_csv(d_items_path)

feature_ids = feature_names.astype(int)

# Create a lookup dictionary from d_items: itemid -> label
itemid_to_label = dict(zip(d_items['itemid'], d_items['label'])) 

# Prepare list of (label, coef) for non-zero coefficients
non_zero = []
for itemid, coef in zip(feature_ids, coefficients):
    if coef != 0:
        label = itemid_to_label.get(itemid, f'ItemID {itemid}')  # fallback to itemid if no label found
        non_zero.append((label, coef))

# Print
print("\nNon-zero features selected by L1 with labels:")
for label, coef in non_zero:
    print(f"{label}: {coef:.3f}")


Non-zero features selected by L1 with labels:
Respiratory Rate: 0.205
PEEP set: -1.194
Ventilator Mode: 0.520
Sputum Consistency: 0.048
Sputum Color: -0.542
Sputum Amount: 0.144
Tidal Volume (set): 0.004
Tidal Volume (observed): 0.976
Minute Volume: -0.977
Respiratory Rate (spontaneous): 0.203
Peak Insp. Pressure: -3.532
Total PEEP Level: 0.034
PSV Level: -0.022


In [2]:
file_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/vitals_cleaned.csv'

# Load the CSV into a DataFrame
vitals = pd.read_csv(file_path)

# Display the first few rows to confirm it loaded correctly
print(vitals.head())

   subject_id   stay_id            charttime  itemid  valuenum_normalized
0    10002428  38875437  2156-04-19 18:17:00  220045             0.524038
1    10002428  38875437  2156-04-19 18:25:00  220277             0.910000
2    10002428  38875437  2156-04-19 18:27:00  220179             0.411290
3    10002428  38875437  2156-04-19 18:27:00  220180             0.257576
4    10002428  38875437  2156-04-19 18:27:00  220181             0.310000


In [6]:
print("Unique subject_ids in outcomes:", outcomes['subject_id'].nunique())
print("Unique subject_ids in vitals:", vitals['subject_id'].nunique())

Unique subject_ids in outcomes: 5832
Unique subject_ids in vitals: 5832


In [7]:
# Combine dataframes

combined1 = pd.merge(
    vitals,
    outcomes[['subject_id', 'stay_id', 'ext_success']],
    on=['subject_id', 'stay_id'],
    how='inner'   
)

In [8]:
combined1.head()

Unnamed: 0,subject_id,stay_id,charttime,itemid,valuenum_normalized,ext_success
0,10002428,38875437,2156-04-19 18:17:00,220045,0.524038,1
1,10002428,38875437,2156-04-19 18:25:00,220277,0.91,1
2,10002428,38875437,2156-04-19 18:27:00,220179,0.41129,1
3,10002428,38875437,2156-04-19 18:27:00,220180,0.257576,1
4,10002428,38875437,2156-04-19 18:27:00,220181,0.31,1


In [10]:
# Perform logistic regression using mean of last 10 values of each included itemid

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

combined_sorted = combined1.sort_values(['subject_id', 'stay_id', 'itemid', 'charttime'])

# Select last 10 values
combined_sorted = combined1.sort_values(['subject_id', 'stay_id', 'itemid', 'charttime'])

last10 = (
    combined_sorted
    .groupby(['subject_id', 'stay_id', 'itemid'])
    .tail(10)
)

# Aggregate by mean
agg = (
    last10
    .groupby(['subject_id', 'stay_id', 'itemid'])['valuenum_normalized']
    .mean()
    .reset_index()
)

# Pivot to wide format (features as columns)
vitals_features = agg.pivot(index=['subject_id', 'stay_id'],
                          columns='itemid',
                          values='valuenum_normalized').reset_index()

# Attach outcome
outcome_df = (
    combined1[['subject_id', 'stay_id', 'ext_success']]
    .drop_duplicates(subset=['subject_id', 'stay_id'])
)

data = pd.merge(vitals_features, outcome_df, on=['subject_id', 'stay_id'], how='inner').dropna()

# Prepare data for logistic regression
X = data.drop(columns=['subject_id', 'stay_id', 'ext_success'])
y = data['ext_success']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Fit logistic regression with L1 penalty (Lasso)
clf = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
clf.fit(X_train, y_train)

# Evaluate performance
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.3f}")

              precision    recall  f1-score   support

           0       0.87      0.41      0.56       302
           1       0.81      0.98      0.89       783

    accuracy                           0.82      1085
   macro avg       0.84      0.69      0.72      1085
weighted avg       0.83      0.82      0.79      1085

ROC AUC: 0.795


In [11]:
# View non-zero coefficients

feature_names = X.columns
coefficients = clf.coef_[0]
non_zero = [(name, coef) for name, coef in zip(feature_names, coefficients) if coef != 0]

print("\nNon-zero features selected by L1:")
for name, coef in non_zero:
    print(f"{name}: {coef:.3f}")


Non-zero features selected by L1:
220045: -2.801
220050: 1.516
220052: 5.314
220179: 0.327
220180: 4.628
220277: 12.615
223761: -12.481
224689: 1.097


In [12]:
# Load the d_items CSV
d_items_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/icu/d_items.csv'
d_items = pd.read_csv(d_items_path)

feature_ids = feature_names.astype(int)

# Create a lookup dictionary from d_items: itemid -> label
itemid_to_label = dict(zip(d_items['itemid'], d_items['label'])) 

# Prepare list of (label, coef) for non-zero coefficients
non_zero = []
for itemid, coef in zip(feature_ids, coefficients):
    if coef != 0:
        label = itemid_to_label.get(itemid, f'ItemID {itemid}')  # fallback to itemid if no label found
        non_zero.append((label, coef))

# Print
print("\nNon-zero features selected by L1 with labels:")
for label, coef in non_zero:
    print(f"{label}: {coef:.3f}")


Non-zero features selected by L1 with labels:
Heart Rate: -2.801
Arterial Blood Pressure systolic: 1.516
Arterial Blood Pressure mean: 5.314
Non Invasive Blood Pressure systolic: 0.327
Non Invasive Blood Pressure diastolic: 4.628
O2 saturation pulseoxymetry: 12.615
Temperature Fahrenheit: -12.481
Respiratory Rate (spontaneous): 1.097


In [14]:
file_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/bloodgas_cleaned.csv'

# Load the CSV into a DataFrame
bloodgas = pd.read_csv(file_path)

# Display the first few rows to confirm it loaded correctly
print(bloodgas.head())

   subject_id            charttime  itemid  valuenum_normalized
0    10002428  2156-04-12 10:29:00   50813             0.075862
1    10002428  2156-04-12 12:24:00   50813             0.068966
2    10002428  2156-04-12 19:49:00   50813             0.055172
3    10002428  2156-04-13 01:58:00   50813             0.051724
4    10002428  2156-04-13 06:23:00   50813             0.062069


In [15]:
print("Unique subject_ids in outcomes:", outcomes['subject_id'].nunique())
print("Unique subject_ids in vitals:", bloodgas['subject_id'].nunique())

Unique subject_ids in outcomes: 5832
Unique subject_ids in vitals: 5818


In [18]:
# Combine dataframes

combined2 = pd.merge(
    bloodgas,
    outcomes[['subject_id', 'ext_success']],
    on=['subject_id'],
    how='inner'   
)

In [19]:
combined2.head()

Unnamed: 0,subject_id,charttime,itemid,valuenum_normalized,ext_success
0,10002428,2156-04-12 10:29:00,50813,0.075862,1
1,10002428,2156-04-12 12:24:00,50813,0.068966,1
2,10002428,2156-04-12 19:49:00,50813,0.055172,1
3,10002428,2156-04-13 01:58:00,50813,0.051724,1
4,10002428,2156-04-13 06:23:00,50813,0.062069,1


In [23]:
# Perform logistic regression using mean of last 10 values of each included itemid

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

combined_sorted = combined2.sort_values(['subject_id', 'itemid', 'charttime'])

# Select last 10 values
combined_sorted = combined2.sort_values(['subject_id', 'itemid', 'charttime'])

last10 = (
    combined_sorted
    .groupby(['subject_id', 'itemid'])
    .tail(10)
)

# Aggregate by mean
agg = (
    last10
    .groupby(['subject_id', 'itemid'])['valuenum_normalized']
    .mean()
    .reset_index()
)

# Pivot to wide format (features as columns)
bloodgas_features = agg.pivot(index=['subject_id'],
                          columns='itemid',
                          values='valuenum_normalized').reset_index()

# Attach outcome
outcome_df = (
    combined2[['subject_id', 'ext_success']]
    .drop_duplicates(subset=['subject_id']
))

data = pd.merge(bloodgas_features, outcome_df, on=['subject_id'], how='inner').dropna()

# Prepare data for logistic regression
X = data.drop(columns=['subject_id', 'ext_success'])
y = data['ext_success']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Fit logistic regression with L1 penalty (Lasso)
clf = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
clf.fit(X_train, y_train)

# Evaluate performance
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.3f}")

              precision    recall  f1-score   support

           0       0.67      0.11      0.19        91
           1       0.76      0.98      0.86       266

    accuracy                           0.76       357
   macro avg       0.71      0.55      0.52       357
weighted avg       0.74      0.76      0.69       357

ROC AUC: 0.637


In [24]:
# View non-zero coefficients

feature_names = X.columns
coefficients = clf.coef_[0]
non_zero = [(name, coef) for name, coef in zip(feature_names, coefficients) if coef != 0]

print("\nNon-zero features selected by L1:")
for name, coef in non_zero:
    print(f"{name}: {coef:.3f}")


Non-zero features selected by L1:
50802: 2.506
50803: -1.228
50806: 0.801
50811: 0.320
50813: -2.637
50818: -5.186
50820: 2.816
50821: 0.139
50822: -1.086


In [26]:
# Load the d_items CSV
d_labitems_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/mimic-iv-3.1/hosp/d_labitems_2.csv'
d_labitems = pd.read_csv(d_labitems_path)

feature_ids = feature_names.astype(int)

# Create a lookup dictionary from d_items: itemid -> label
itemid_to_label = dict(zip(d_labitems['itemid'], d_labitems['label'])) 

# Prepare list of (label, coef) for non-zero coefficients
non_zero = []
for itemid, coef in zip(feature_ids, coefficients):
    if coef != 0:
        label = itemid_to_label.get(itemid, f'ItemID {itemid}')  # fallback to itemid if no label found
        non_zero.append((label, coef))

# Print
print("\nNon-zero features selected by L1 with labels:")
for label, coef in non_zero:
    print(f"{label}: {coef:.3f}")


Non-zero features selected by L1 with labels:
Base Excess: 2.506
Calculated Bicarbonate, Whole Blood: -1.228
Chloride, Whole Blood: 0.801
Hemoglobin: 0.320
Lactate: -2.637
pCO2: -5.186
pH: 2.816
pO2: 0.139
Potassium, Whole Blood: -1.086
