In [6]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from imblearn.over_sampling import SMOTE

from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer

import shap

from utils import acc_balanced, mse_balanced

In [2]:
df = pd.read_csv('data/preprocessed.csv', lineterminator='\n')

## Predicting Race/APSII from Notes
### Classification (Race)

In [3]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['TEXT'])
y = 1*(df['ETHNICITY']=='BLACK')

scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
clf = LogisticRegression(random_state=0, 
                         max_iter=100, 
                         penalty='l2',
                         class_weight='balanced').fit(X_train, y_train)

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [21]:
logreg_coef_lst = list(zip(list(vectorizer.get_feature_names_out()), list(clf.coef_[0])))
logreg_coef_lst = sorted(logreg_coef_lst, key=lambda x: -x[1])

In [27]:
logreg_coef_lst[:20]

[('4233', 0.3316337154927785),
 ('gsw', 0.3043980445695834),
 ('haitian', 0.2836621604108215),
 ('esrd', 0.28067627682615526),
 ('sickle', 0.2639422010568879),
 ('relapsed', 0.26380616030012805),
 ('hd', 0.2626747400479761),
 ('g6pd', 0.24788482135534248),
 ('dronedarone', 0.2314523989506795),
 ('fibroids', 0.213076461707557),
 ('dismal', 0.20471126224815475),
 ('failure', 0.20254201470300548),
 ('formalin', 0.18967202348104958),
 ('14503', 0.18717580165352712),
 ('cocaine', 0.1852745461846615),
 ('mom', 0.18457616078775763),
 ('laminotomy', 0.1835690369518447),
 ('she', 0.17907132221798627),
 ('thrombotic', 0.17887742123307473),
 ('htn', 0.1785499676292473)]

In [26]:
logreg_coef_lst[-20:][::-1]

[('russian', -0.6954018939852876),
 ('osh', -0.4296643940987133),
 ('cholangitis', -0.35642259653902936),
 ('inflicted', -0.3090761367262856),
 ('wife', -0.2891765680289175),
 ('melanoma', -0.2879477092478057),
 ('bypass', -0.28734631590433074),
 ('cml', -0.2779768254808201),
 ('eccymotic', -0.2749831943821543),
 ('cll', -0.2747479784870075),
 ('squamous', -0.2607840969953475),
 ('afib', -0.26030160099975075),
 ('ercp', -0.25908830230901075),
 ('cea', -0.24654476229389274),
 ('mottled', -0.24360120961218246),
 ('sotalol', -0.23710988374160907),
 ('impaction', -0.23671724732391874),
 ('aortic', -0.2354627395002561),
 ('revascularization', -0.23155939231270717),
 ('coccyx', -0.22690384836739833)]

In [28]:
# LR: Acc
# Acc 1: 0.8954014079958664, Acc 0: 0.8561647142898531, Acc Avg: 0.8757830611428598
# Acc 1: 0.6057825484764543, Acc 0: 0.804342371900029, Acc Avg: 0.7050624601882416
acc_balanced(clf.predict(X_train), y_train)
acc_balanced(clf.predict(X_test), y_test)

Acc 1: 0.8954014079958664, Acc 0: 0.8561647142898531, Acc Avg: 0.8757830611428598
Acc 1: 0.6057825484764543, Acc 0: 0.804342371900029, Acc Avg: 0.7050624601882416


In [29]:
# LR: ROC-AUC: 0.7698727769986204
enc = OneHotEncoder(handle_unknown='ignore')
y_test_ohe = enc.fit_transform(np.array(y_test).reshape(-1,1)).todense()
roc_auc_score(y_test_ohe, clf.predict_proba(X_test))

np.matrix usage is deprecated in 1.0 and will raise a TypeError in 1.2. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html


0.7698727775555061

In [30]:
X_train, y_train = SMOTE().fit_resample(X_train, y_train)
X_test,  y_test  = SMOTE().fit_resample(X_test,  y_test)

In [46]:
# Rf Acc
# Acc 1: 0.8101832626743538, Acc 0: 0.5874384751824419, Acc Avg: 0.6988108689283978
# Acc 1: 0.8656695294155489, Acc 0: 0.5849593103670109, Acc Avg: 0.7253144198912799
clf = RandomForestClassifier(n_estimators=50, max_depth=4, random_state=0)
clf.fit(X_train, y_train)

acc_balanced(clf.predict(X_train), y_train)
acc_balanced(clf.predict(X_test), y_test)

Acc 1: 0.8116704812858325, Acc 0: 0.5856518813959253, Acc Avg: 0.698661181340879
Acc 1: 0.8653929042426581, Acc 0: 0.582971469008331, Acc Avg: 0.7241821866254945


In [47]:
# RF ROC-AUC: 0.8448827558552359
enc = OneHotEncoder(handle_unknown='ignore')
y_test_ohe = enc.fit_transform(np.array(y_test).reshape(-1,1)).todense()
roc_auc_score(y_test_ohe, clf.predict_proba(X_test))

np.matrix usage is deprecated in 1.0 and will raise a TypeError in 1.2. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html


0.8395741576391722

In [48]:
rf_coef_lst = list(zip(list(vectorizer.get_feature_names_out()), 
                       clf.feature_importances_))
rf_coef_lst = sorted(rf_coef_lst, key=lambda x: -x[1])


In [50]:
rf_coef_lst[:20]

[('12', 0.024919001322369905),
 ('nc', 0.015630447222842573),
 ('rate', 0.014734274917822112),
 ('gi', 0.014652811405912956),
 ('am', 0.01463327906597654),
 ('stitle', 0.014323173129562519),
 ('monitor', 0.01343912169498776),
 ('resp', 0.013107519556953037),
 ('dose', 0.012974888473933336),
 ('patent', 0.01221652256933256),
 ('thick', 0.012116014843212538),
 ('bases', 0.012065269342883619),
 ('dr', 0.011823712815035789),
 ('hr', 0.011622559024537694),
 ('lungs', 0.01092438117325458),
 ('intact', 0.010727266957251175),
 ('sats', 0.010524633210947364),
 ('ls', 0.01009731559815544),
 ('diminished', 0.009911666998090228),
 ('via', 0.009601658994005325)]

### Regression (APSIII)

In [51]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['TEXT'])
y = df['apsiii']
r = 1*(df['ETHNICITY']=='BLACK')

# scaler = StandardScaler(with_mean=False)
# X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test, r_train, r_test = train_test_split(X, y, r, 
                                                                     test_size=0.2, 
                                                                     random_state=42)

In [58]:
import random

In [69]:
idx = random.sample(range(X_train.shape[0]), X_train.shape[0]//10)

In [None]:
reg = LinearRegression().fit(X_train[idx], np.array(y_train)[idx])

In [None]:
# Train MSE: 167.74819976462678
# Test MSE:  360.5120580410933
print(f"Train MSE: {mean_squared_error(reg.predict(X_train), y_train)}")
print(f"Test MSE: {mean_squared_error(reg.predict(X_test), y_test)}")

Train MSE: 167.74819976462678
Test MSE: 360.5120580410933


In [None]:
# MSE 1: 147.49089812367117, Acc 0: 170.77714066513687, Acc Avg: 159.13401939440402
# Avg. Deviation 1: -0.7082664754667595 ± 12.123912599627277
# Avg. Deviation 0: 0.10590242145592403 ± 13.067743697450856
mse_balanced(reg.predict(X_train), y_train, r_train)

MSE 1: 147.49089812367117, Acc 0: 170.77714066513687, Acc Avg: 159.13401939440402
Avg. Deviation 1: -0.7082664754667595 ± 12.123912599627277
Avg. Deviation 0: 0.10590242145592403 ± 13.067743697450856


In [None]:
# MSE 1: 330.95810368059006, Acc 0: 364.90470213736575, Acc Avg: 347.93140290897793
# Avg. Deviation 1: -1.1453378946427526 ± 18.156164374329876
# Avg. Deviation 0: 0.13663823751172163 ± 19.101990266184707
mse_balanced(reg.predict(X_test), y_test, r_test)

MSE 1: 330.95810368059006, Acc 0: 364.90470213736575, Acc Avg: 347.93140290897793
Avg. Deviation 1: -1.1453378946427526 ± 18.156164374329876
Avg. Deviation 0: 0.13663823751172163 ± 19.101990266184707


In [None]:
linearreg_coef_lst = list(zip(list(vectorizer.get_feature_names_out()), 
                       reg.coef_))
linearreg_coef_lst = sorted(linearreg_coef_lst, key=lambda x: -x[1])


In [None]:
linearreg_coef_lst[:20]

In [None]:
linearreg_coef_lst[-20:]

In [None]:
reg = RandomForestRegressor(max_depth=4, n_estimators=50, random_state=0)
reg.fit(X_train, y_train)

In [None]:
# Train MSE: 330.8376609887305
# Test MSE: 332.61579437026865
print(f"Train MSE: {mean_squared_error(reg.predict(X_train), y_train)}")
print(f"Test MSE: {mean_squared_error(reg.predict(X_test), y_test)}")

Train MSE: 330.8376609887305
Test MSE: 332.61579437026865


In [None]:
# MSE 1: 300.8333949524116, Acc 0: 335.3240012821214, Acc Avg: 318.0786981172665
# Avg. Deviation 1: -1.2538477642888646 ± 17.299169365504213
# Avg. Deviation 0: 0.1856462586975391 ± 18.310913050657877
mse_balanced(reg.predict(X_train), y_train, r_train)

MSE 1: 300.8333949524116, Acc 0: 335.3240012821214, Acc Avg: 318.0786981172665
Avg. Deviation 1: -1.2538477642888646 ± 17.299169365504213
Avg. Deviation 0: 0.1856462586975391 ± 18.310913050657877


In [None]:
# MSE 1: 300.4668771565827, Acc 0: 337.3941312952583, Acc Avg: 318.9305042259205
# Avg. Deviation 1: -1.4347986065554001 ± 17.27449652277048
# Avg. Deviation 0: 0.1883365672714545 ± 18.36732589770995
mse_balanced(reg.predict(X_test), y_test, r_test)

MSE 1: 300.4668771565827, Acc 0: 337.3941312952583, Acc Avg: 318.9305042259205
Avg. Deviation 1: -1.4347986065554001 ± 17.27449652277048
Avg. Deviation 0: 0.1883365672714545 ± 18.36732589770995


### De-racifying notes (using all notes)


### Application to subset of data
    - Only keep each patient's first nursing note
    df_nte = df_nte.sort_values(by=['SUBJECT_ID','HADM_ID','CHARTDATE'])\
                .groupby(['SUBJECT_ID','HADM_ID'])\
                .head(1)
    - Test on pneumonia and fever datasets
        

In [None]:
PNEUMONIA_KEYS = ['PNEUMONIA','PMEUMONIA','PNEUMOMIA',
                  'PNEUMONI','PNAUMONIA','PNEMONIA',
                  'PNEUMNOIA','PNEUMONIN','PNEUMONNIA']
FEVER_KEYS = ['FEVER','FEER']

In [None]:
df.loc[df['DIAGNOSIS'].apply(lambda s: any([k in s for k in PNEUMONIA_KEYS]))]

In [None]:
df.loc[df['DIAGNOSIS'].apply(lambda s: any([k in s for k in FEVER_KEYS]))]