# Code to test HAIM framework on a specific application using HAIM-MIMIC-MM dataset

This notebook demonstrates how a user can use the embedding csv files to make predictions for a task of interest. As a demonstration, we predict whether a patient dies within the next 48 hours. A patient's death status is defined by their hospital discharge location: if it is morgue, hospice, or expired, we consider the patient to have died.

### Project Info
 ->Copyright 2020 (Last Update: June 07, 2022)
 
 -> Authors: 
        Luis R Soenksen (<soenksen@mit.edu>),
        Yu Ma (<midsumer@mit.edu>),
        Cynthia Zeng (<czeng12@mit.edu>),
        Ignacio Fuentes (<ifuentes@mit.edu>),
        Leonard David Jean Boussioux (<leobix@mit.edu>),
        Agni Orfanoudaki (<agniorf@mit.edu>),
        Holly Mika Wiberg (<hwiberg@mit.edu>),
        Michael Lingzhi Li (<mlli@mit.edu>),
        Kimberly M Villalobos Carballo (<kimvc@mit.edu>),
        Liangyuan Na (<lyna@mit.edu>),
        Dimitris J Bertsimas (<dbertsim@mit.edu>),

```
**Licensed under the Apache License, Version 2.0**
You may not use this file except in compliance with the License. You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
```

### Requires 
```
 -> Previously generated pickle files from HAIM-MIMIC-MM Dataset
```

## I. Import Packages

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import xgboost as xgb
import numpy as np

## II. Reading Data and Constructing Prediction Target

In [None]:
# Load the embedding table and build the binary prediction target `y`.
# NOTE(review): `fname` is not assigned in any cell shown in this notebook; it
# must be set to the path of the embedding csv before this cell is run.
df = pd.read_csv(fname)

# Row filters on length of stay and death status.
# Presumably `img_length_of_stay` is expressed in hours -- TODO confirm.
stay_under_48 = df['img_length_of_stay'] < 48
stay_48_or_more = df['img_length_of_stay'] >= 48
died = df['death_status'] == 1
survived = df['death_status'] == 0

# `.assign` returns a new frame with the label column added, so no value is
# ever written onto a filtered slice of `df` (the pattern that triggers
# pandas' SettingWithCopyWarning).
# Positive class: death recorded with a stay shorter than 48.
df_death_small48 = df[stay_under_48 & died].assign(y=1)
# Negative class: stay of 48 or more, whatever the final death status.
df_alive_big48 = df[stay_48_or_more & survived].assign(y=0)
df_death_big48 = df[stay_48_or_more & died].assign(y=0)

# Rows with a stay shorter than 48 and death_status == 0 belong to none of the
# three subsets above, so they are left out of the modelling frame.
df = pd.concat([df_death_small48, df_alive_big48, df_death_big48], axis=0)

# Remove identifier/timestamp columns and the columns the target was built from.
df = df.drop(
    columns=['img_id', 'img_charttime', 'img_deltacharttime', 'discharge_location',
             'img_length_of_stay', 'death_status']
)

## III. Training/Testing Set Split

In [None]:
# Seed for the train/test split. It was not assigned in any visible cell; 42
# matches the seed passed to XGBClassifier in the model-training cell.
seed = 42

# Split on unique haim_id values rather than on rows, so that every row sharing
# a haim_id ends up on the same side of the split.
pkl_list = df['haim_id'].unique().tolist()
train_id, test_id = train_test_split(pkl_list, test_size=0.3, random_state=seed)

# haim_id of every row that falls in the training / testing partition
train_idx = df[df['haim_id'].isin(train_id)]['haim_id'].tolist()
test_idx = df[df['haim_id'].isin(test_id)]['haim_id'].tolist()

# Keep only rows without any missing value. `x_y` was previously read before
# being defined; it is now derived from `df`.
x_y = df.dropna(axis=0, how='any')

# Row masks, computed once and reused for both features and target
in_train = x_y['haim_id'].isin(train_idx)
in_test = x_y['haim_id'].isin(test_idx)

y_train = x_y.loc[in_train, 'y']
y_test = x_y.loc[in_test, 'y']

# Features are every remaining column except the target and the identifier
x_train = x_y.loc[in_train].drop(columns=['y', 'haim_id'])
x_test = x_y.loc[in_test].drop(columns=['y', 'haim_id'])

print('train, test shapes', x_train.shape, x_test.shape, y_train.shape, y_test.shape)
print('train set, death outcome case = %s, percentage = %s' %(y_train.sum(),  y_train.sum()/len(y_train)))
print('test set, death outcome case = %s, percentage = %s' %(y_test.sum(),  y_test.sum()/len(y_test)))

## IV: Model Training

In [None]:
# Grid-search settings: number of cross-validation folds and selection metric
cv_folds = 5
gs_metric = 'roc_auc'

# Hyper-parameter values explored by the grid search
param_grid = {
    'max_depth': [5, 6, 7, 8],
    'n_estimators': [200, 300],
    'learning_rate': [0.3, 0.1, 0.05],
}

# Class counts in the training target; their ratio (negatives / positives) is
# passed as scale_pos_weight.
n_positive = sum(y_train)
n_negative = len(y_train) - n_positive

est = xgb.XGBClassifier(
    verbosity=0,
    scale_pos_weight=n_negative / n_positive,
    seed=42,
    eval_metric='logloss',
)

gs = GridSearchCV(estimator=est, param_grid=param_grid, scoring=gs_metric, cv=cv_folds)
gs.fit(x_train, y_train)

# Predicted probabilities and labels from the fitted search object, for both
# the training and the testing set
y_pred_prob_train = gs.predict_proba(x_train)
y_pred_prob_test = gs.predict_proba(x_test)

y_pred_train = gs.predict(x_train)
y_pred_test = gs.predict(x_test)

## V: Reporting Performance Metrics

In [None]:
# Performance metrics on the training set.
# Label-based metrics use the predicted classes.
f1_train = metrics.f1_score(y_train, y_pred_train, average='macro')
accu_train = metrics.accuracy_score(y_train, y_pred_train)
accu_bl_train = metrics.balanced_accuracy_score(y_train, y_pred_train)
# For a binary target roc_auc_score expects a 1-D score for the class with the
# greater label. predict_proba yields one column per class, so column 1
# (probability of y == 1) is passed instead of the full 2-D array.
auc_train = metrics.roc_auc_score(y_train, y_pred_prob_train[:, 1])
conf_matrix_train = metrics.confusion_matrix(y_train, y_pred_train)

In [None]:
# Report the training-set metrics, one per line
print(
    f'F1 Score for Training Set is: {f1_train}',
    f'Accuracy for Training Set is: {accu_train}',
    f'Balanced Accuracy for Training Set is: {accu_bl_train}',
    f'AUC for Training Set is: {auc_train}',
    f'Confusion Matrix for Training Set is: {conf_matrix_train}',
    sep='\n',
)

In [None]:
# Performance metrics on the held-out testing set.
# Label-based metrics use the predicted classes.
f1_test = metrics.f1_score(y_test, y_pred_test, average='macro')
accu_test = metrics.accuracy_score(y_test, y_pred_test)
accu_bl_test = metrics.balanced_accuracy_score(y_test, y_pred_test)
# For a binary target roc_auc_score expects a 1-D score for the class with the
# greater label. predict_proba yields one column per class, so column 1
# (probability of y == 1) is passed instead of the full 2-D array.
auc_test = metrics.roc_auc_score(y_test, y_pred_prob_test[:, 1])
conf_matrix_test = metrics.confusion_matrix(y_test, y_pred_test)

In [None]:
# Report the testing-set metrics, one per line
print(
    f'F1 Score for Testing Set is: {f1_test}',
    f'Accuracy for Testing Set is: {accu_test}',
    f'Balanced Accuracy for Testing Set is: {accu_bl_test}',
    f'AUC for Testing Set is: {auc_test}',
    f'Confusion Matrix for Testing Set is: {conf_matrix_test}',
    sep='\n',
)