# LR w/ClinicalBERT embeddings for baseline performance

In [1]:
import utils
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer
import model_helpers
LABELS = ["ABDOMINAL",
        "ADVANCED-CAD",
        "ALCOHOL-ABUSE",
        "ASP-FOR-MI",
        "CREATININE",
        "DIETSUPP-2MOS",
        "DRUG-ABUSE",
        "ENGLISH",
        "HBA1C",
        "KETO-1YR",
        "MAJOR-DIABETES",
        "MAKES-DECISIONS",
        "MI-6MOS"]

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [2]:
# Both will be dataframes with a 'notes' column and a column for each label
train_data = utils.get_note_data(LABELS, folder_name='train')
test_data = utils.get_note_data(LABELS, folder_name='test')

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Tokenize text
encoded_input = tokenizer(train_data['notes'], return_tensors='tf')

# Get word embeddings
word_embeddings = encoded_input['input_ids'].numpy()

# Flatten the word embeddings - one input vector per sequence
flattened_embeddings = np.mean(word_embeddings, axis=1)

models = {}
for label in LABELS:
    # Define logistic regression model
    logistic_regression_model = tf.keras.Sequential([
        tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(flattened_embeddings.shape[1],))
    ])

    # Compile the model
    logistic_regression_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    logistic_regression_model.fit(flattened_embeddings, train_data[label], epochs=5)

    models[label] = logistic_regression_model



In [None]:
label_to_predictions = {}
for label, model in models.items():
    print(f"Predicting for model: {label}")
    label_to_predictions[label] = model_helpers.get_predictions(model, test_data)


Predicting for model: ABDOMINAL
[[0.2414437 ]
 [0.15770328]
 [0.2840973 ]
 [0.14279576]
 [0.23869914]
 [0.197795  ]
 [0.26773468]
 [0.15957266]
 [0.23185757]
 [0.24648981]
 [0.19184572]
 [0.2231623 ]
 [0.1649169 ]
 [0.23290344]
 [0.20445669]
 [0.20872465]
 [0.15718919]
 [0.21298273]
 [0.16097684]
 [0.19958724]
 [0.20998082]
 [0.18410239]
 [0.17957325]
 [0.16367117]
 [0.26657534]
 [0.30540603]
 [0.16570383]
 [0.14840782]
 [0.24877639]
 [0.26570857]
 [0.2948003 ]
 [0.23967612]
 [0.18088737]
 [0.18813384]
 [0.28471982]
 [0.18206395]
 [0.17414749]
 [0.20830086]
 [0.24213487]
 [0.20787425]
 [0.22716814]
 [0.15660971]
 [0.23418112]
 [0.15611242]
 [0.29636976]
 [0.15414767]
 [0.21193619]
 [0.21650712]
 [0.16983722]
 [0.18683328]
 [0.16694435]
 [0.23897076]
 [0.2077289 ]
 [0.21732768]
 [0.15772642]
 [0.21203758]
 [0.18335319]
 [0.2559509 ]
 [0.2224181 ]
 [0.27251002]
 [0.19017339]
 [0.20210706]
 [0.25041097]
 [0.1911733 ]
 [0.1631854 ]
 [0.21375366]
 [0.19509749]
 [0.26186112]
 [0.25060195]
 [

Save predictions and load predictions here if wanted.

In [None]:
utils.save_preds(label_to_predictions, "LR_predictions")

In [None]:
label_to_predictions = utils.read_preds("LR_predictions")

  label_to_predictions[row[0]] = list(row[1:])


### Performance:

In [None]:
label_to_micro_f1, overall_f1 = utils.get_f1_scores_for_labels(LABELS, test_data, label_to_predictions)
print('overall-f1:', overall_f1)

overall-f1: nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
