<a href="https://colab.research.google.com/github/nkilaru213/ai-selfeval/blob/main/SelfEvaluation_MLDL_MIMIC_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Load Data MIMIC health data tables**

In [4]:
# import all the packages needed
import pandas as pd
from google.colab import files

In [2]:
# Connect the the drive to access the google drive file system
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load tables

# Read ADMISSIONS data table from google drive file system
#admissions_df=pd.read_csv('/content/drive/My Drive/Exercises/ADMISSIONS.csv.gz', compression='gzip')
admissions_df=pd.read_csv('/content/drive/My Drive/Exercises/ADMISSIONS.csv')


num_records = len(admissions_df)
print("Number of admissions", num_records)

# Read PATIENTS data table from google drive file system
#patients_df=pd.read_csv('/content/drive/My Drive/Exercises/PATIENTS.csv.gz', compression='gzip')
patients_df=pd.read_csv('/content/drive/My Drive/Exercises/PATIENTS.csv')


num_records = len(patients_df)


# Read PRESCRIPTIONS data table from google drive file system
#prescriptions_df=pd.read_csv('/content/drive/My Drive/Exercises/PRESCRIPTIONS.csv.gz', compression='gzip')
prescriptions_df=pd.read_csv('/content/drive/My Drive/Exercises/PRESCRIPTIONS.csv')



# Read ICU_STAYS data table from google drive file system
#icustays_df=pd.read_csv('/content/drive/My Drive/Exercises/ICUSTAYS.csv.gz', compression='gzip')
icustays_df=pd.read_csv('/content/drive/My Drive/Exercises/ICUSTAYS.csv')



# Read TRANSFERS data table from google drive file system
#transfers_df=pd.read_csv('/content/drive/My Drive/Exercises/TRANSFERS.csv.gz', compression='gzip')
transfers_df=pd.read_csv('/content/drive/My Drive/Exercises/TRANSFERS.csv')


# Display ADMISSIONS table info
admissions_df.info()

# Display ADMISSIONS table info
patients_df.info()

# Display ADMISSIONS table info
prescriptions_df.info()

# Display ADMISSIONS table info
icustays_df.info()

# Display ADMISSIONS table info
transfers_df.info()

Number of admissions 129
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   row_id                129 non-null    int64 
 1   subject_id            129 non-null    int64 
 2   hadm_id               129 non-null    int64 
 3   admittime             129 non-null    object
 4   dischtime             129 non-null    object
 5   deathtime             40 non-null     object
 6   admission_type        129 non-null    object
 7   admission_location    129 non-null    object
 8   discharge_location    129 non-null    object
 9   insurance             129 non-null    object
 10  language              81 non-null     object
 11  religion              128 non-null    object
 12  marital_status        113 non-null    object
 13  ethnicity             129 non-null    object
 14  edregtime             92 non-null     object
 15  edouttime      

***Tables***

**admissions**  : Stores patient hospital admission details (time of admission, type, location, insurance, diagnosis, etc.)

**patients** : Contains demographic details (gender, birth date, death date, expiration flag).

**prescriptions** : Contains medication details (drug type, dosage, route, start & end dates).

**icustays** : Stores ICU admissions (care units, length of stay, entry/exit times).

**transfers** : Records patient movements between hospital units (previous/current ward).



In [6]:
# Display few rows from each dataframe
admissions_df.head()
patients_df.head()
prescriptions_df.head()
icustays_df.head()
transfers_df.head()

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,dbsource,eventtype,prev_careunit,curr_careunit,prev_wardid,curr_wardid,intime,outtime,los
0,54440,10006,142345,206504.0,carevue,admit,,MICU,,52.0,2164-10-23 21:10:15,2164-10-25 12:21:07,39.18
1,54441,10006,142345,,carevue,transfer,MICU,,52.0,45.0,2164-10-25 12:21:07,2164-11-01 17:14:27,172.89
2,54442,10006,142345,,carevue,discharge,,,45.0,,2164-11-01 17:14:27,,
3,54460,10011,105331,232110.0,carevue,admit,,MICU,,15.0,2126-08-14 22:34:00,2126-08-28 18:59:00,332.42
4,54461,10011,105331,,carevue,discharge,MICU,,15.0,,2126-08-28 18:59:00,,


In [7]:
merged_df = pd.merge(admissions_df, patients_df, on='subject_id')

num_records = len(merged_df)
print("Number of records", num_records)



Number of records 129


# **Data Cleaning**

1.   **Handle missing values** (e.g., fill missing ethnicity values)
2.   **Convert categorical variables** (e.g., gender, admission type) to numerical values.
3.   **Calculate Length of Stay (LOS)** for ICU admissions.

In [8]:
# Fill missing values for categorical columns
admissions_df.update(admissions_df["ethnicity"].fillna("unknown"))

# Convert categorical columns to numerical values
admissions_df["hospital_expire_flag"] = admissions_df["hospital_expire_flag"].astype(int)
patients_df["gender"] = patients_df["gender"].map({"M": 1, "F": 0})  # Encode gender as 1/0

# Calculate ICU stay duration
icustays_df["icu_duration"] = (pd.to_datetime(icustays_df["outtime"]) - pd.to_datetime(icustays_df["intime"])).dt.total_seconds() / 3600

# **Merge Tables to Build a Feature Set**



In [9]:
# Merge Admissions & Patients
data = pd.merge(admissions_df, patients_df, on="subject_id", how="inner") # Changed 'subject_id' to 'SUBJECT_ID'

# Merge with ICU stays
data = pd.merge(data, icustays_df, on=["subject_id", "hadm_id"], how="left") # Changed 'subject_id' to 'SUBJECT_ID' and 'hadm_id' to 'HADM_ID'

# Select useful columns
data = data[["subject_id", "gender", "ethnicity", "admission_type", "diagnosis", "icu_duration", "hospital_expire_flag"]] # Changed 'subject_id' to 'SUBJECT_ID'


# Drop missing ICU durations
data.dropna(subset=["icu_duration"], inplace=True)

# Convert categorical variables to dummy variables
data = pd.get_dummies(data, columns=["admission_type", "ethnicity", "diagnosis"], drop_first=True)

# Display processed data
data.head()


Unnamed: 0,subject_id,gender,icu_duration,hospital_expire_flag,admission_type_EMERGENCY,admission_type_URGENT,ethnicity_ASIAN,ethnicity_BLACK/AFRICAN AMERICAN,ethnicity_HISPANIC OR LATINO,ethnicity_HISPANIC/LATINO - PUERTO RICAN,...,diagnosis_TRACHEAL ESOPHAGEAL FISTULA,diagnosis_TRACHEAL STENOSIS,diagnosis_UNSTABLE ANGINA,diagnosis_UPPER GI BLEED,diagnosis_URINARY TRACT INFECTION;PYELONEPHRITIS,diagnosis_UROSEPSIS,diagnosis_UTI/PYELONEPHRITIS,diagnosis_VARICEAL BLEED,diagnosis_VF ARREST,diagnosis_VOLVULUS
0,10006,0,39.181111,0,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,10011,0,332.416667,1,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,10013,0,63.597778,1,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,10017,0,51.446667,0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,10019,1,31.052222,1,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# **Machine Learning Model**


In [10]:
# Use Logistic Regression to predict patient mortality.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Define features & target variable
X = data.drop(["hospital_expire_flag", "subject_id"], axis=1)
y = data["hospital_expire_flag"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6428571428571429
              precision    recall  f1-score   support

           0       0.73      0.80      0.76        20
           1       0.33      0.25      0.29         8

    accuracy                           0.64        28
   macro avg       0.53      0.53      0.52        28
weighted avg       0.61      0.64      0.63        28



# **Deep Learning Model**


In [11]:
# Use a Neural Network to improve accuracy.


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define Neural Network Model
nn_model = Sequential([
    Dense(64, activation="relu", input_shape=(X_train.shape[1],)),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid")  # Output layer for binary classification
])

# Compile Model
nn_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train Model
nn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate Model
loss, accuracy = nn_model.evaluate(X_test, y_test)
print("Neural Network Accuracy:", accuracy)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 89ms/step - accuracy: 0.4881 - loss: 0.7748 - val_accuracy: 0.4643 - val_loss: 0.7043
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.5436 - loss: 0.6675 - val_accuracy: 0.6429 - val_loss: 0.6715
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.7054 - loss: 0.6201 - val_accuracy: 0.6786 - val_loss: 0.6507
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.7039 - loss: 0.5901 - val_accuracy: 0.7143 - val_loss: 0.6354
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.7044 - loss: 0.5494 - val_accuracy: 0.7143 - val_loss: 0.6237
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.7289 - loss: 0.5146 - val_accuracy: 0.7143 - val_loss: 0.6160
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━

# **Evaluate Model Performance**


##### Comparision Logistic Regression vs Neural Network performance.



| Model                  | Accuracy  |
|------------------------|----------|
| Logistic Regression    | **64.2%** |
| Neural Network        | **67.8%**   |



Deep Learning Model performs better than Logistic Regression in predicting ICU mortality.





# **30 Day Readmission - Risk Prediction**


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


# **Data Cleaning**

# **Merge Tables to Build a Feature Set**


In [14]:
merged_data = admissions_df.merge(patients_df, on="subject_id", how="inner")
merged_data = merged_data.merge(icustays_df, on=["subject_id", "hadm_id"], how="left")
# Specify suffixes to avoid conflicts
merged_data = merged_data.merge(transfers_df, on=["subject_id", "hadm_id"], how="left", suffixes=('_merged', '_transfers'))
# The 'row_id' column from transfers_df will now be named 'row_id_transfers'.
# Adjust subsequent code to use the new column names if necessary.

# ***Feature Engineering***


***Create Readmission Label***

In [17]:
# Calculate age from 'dob' and 'admittime'
merged_data['admittime'] = pd.to_datetime(merged_data['admittime'])
merged_data['dob'] = pd.to_datetime(merged_data['dob'])

# Apply to_pydatetime before subtracting to handle large timedeltas
merged_data['age'] = merged_data.apply(lambda row: (row['admittime'].to_pydatetime() - row['dob'].to_pydatetime()).days / 365.25, axis=1)

merged_data['age'] = merged_data['age'].astype(int) # Convert age to integers

# Identify if a patient was readmitted within 30 days after discharge
merged_data["readmission_flag"] = merged_data.groupby("subject_id")["admittime"].shift(-1)
merged_data["readmission_flag"] = (pd.to_datetime(merged_data["readmission_flag"]) - pd.to_datetime(merged_data["dischtime"]))
merged_data["readmission_flag"] = merged_data["readmission_flag"].dt.days <= 30
merged_data["readmission_flag"] = merged_data["readmission_flag"].astype(int)

# Select Features and Target Variable
features = ["age", "icu_duration", "admission_type", "gender"]  # Selected predictive features
X = merged_data[features]
y = merged_data["readmission_flag"]  # Target variable

***Preprocessing***

In [18]:
# Preprocess Categorical Features using One-Hot Encoding
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(merged_data[["admission_type", "gender"]]).toarray()
X_numeric = merged_data[["icu_duration", "age"]].fillna(0)  # Fill missing values
X = np.hstack([X_numeric, X_encoded])  # Combine numeric and encoded features

# Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize Numerical Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# ***Train ML Model: Random Forest***


In [19]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)  # Train Random Forest model
y_pred_rf = rf_model.predict(X_test)  # Predict using test set
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))  # Display accuracy
print(classification_report(y_test, y_pred_rf))  # Display classification metrics


Random Forest Accuracy: 0.7280701754385965
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.81      0.87      0.84        95

    accuracy                           0.73       114
   macro avg       0.41      0.44      0.42       114
weighted avg       0.68      0.73      0.70       114



# ***Train DL Model: Neural Network***


In [20]:
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),  # First hidden layer
    Dropout(0.3),  # Dropout for regularization
    Dense(32, activation='relu'),  # Second hidden layer
    Dropout(0.3),  # Additional dropout layer
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])


# Compile the Neural Network Model
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the Neural Network Model
nn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.3460 - loss: 0.7822 - val_accuracy: 0.7982 - val_loss: 0.6297
Epoch 2/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7107 - loss: 0.6396 - val_accuracy: 0.8333 - val_loss: 0.5564
Epoch 3/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7657 - loss: 0.6085 - val_accuracy: 0.8333 - val_loss: 0.5062
Epoch 4/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7645 - loss: 0.5754 - val_accuracy: 0.8333 - val_loss: 0.4754
Epoch 5/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7663 - loss: 0.5739 - val_accuracy: 0.8333 - val_loss: 0.4653
Epoch 6/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7779 - loss: 0.5437 - val_accuracy: 0.8333 - val_loss: 0.4577
Epoch 7/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7e244eb45f90>

# ***Evaluate DL Model***


In [21]:
# Evaluate DL Model
loss, accuracy = nn_model.evaluate(X_test, y_test)
print("Neural Network Accuracy:", accuracy)  # Display model accuracy


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8052 - loss: 0.5098 
Neural Network Accuracy: 0.8333333134651184


# **Evaluate Model Performance**


##### Comparision Random Forest vs Neural Network performance.



| Model                  | Accuracy  |
|------------------------|----------|
| Random Forest Regression    | **72.8%** |
| Neural Network        | **83%**   |



Deep Learning Model performs better than Random Forest  in predicting 30 day re-admission.
