In [1]:
import pandas as pd
import os
import xml.etree.ElementTree as ET
import torch
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import numpy as np
from tensorflow.keras.optimizers import Adam
labels = ["ABDOMINAL",
        "ADVANCED-CAD",
        "ALCOHOL-ABUSE",
        "ASP-FOR-MI",
        "CREATININE",
        "DIETSUPP-2MOS",
        "DRUG-ABUSE",
        "ENGLISH",
        "HBA1C",
        "KETO-1YR",
        "MAJOR-DIABETES",
        "MAKES-DECISIONS",
        "MI-6MOS"]




2024-04-06 19:29:06.093975: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def get_note_data(labels, folder_name='train', separate=False):
    if separate:
        headers = ["note1", "note2", "note3", "note4", "note5"]
        headers.extend(labels)
        overall_df = pd.DataFrame(columns=headers)
    else:
        headers = ["notes"]
        headers.extend(labels)
        overall_df = pd.DataFrame(columns=headers)

    current_directory = os.getcwd()
    directory = os.path.join(current_directory, folder_name)
    for filename in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, filename)):
            patient_num = os.path.splitext(filename)[0]
            row_to_add = {}
            # Load the XML file
            tree = ET.parse(os.path.join(directory, filename))
            root = tree.getroot()
            # Access elements and attributes
            for child in root:
                if child.tag == "TEXT":
                    if separate:
                        notes = child.text.split("****************************************************************************************************")
                        notes = [note.strip() for note in notes if note.strip()]
                        i = 1
                        for note in notes:
                            row_to_add[f"note{i}"] = note
                            i += 1
                        for j in range(i, 6):
                            row_to_add[f"note{j}"] = ""
                    else:
                        note = child.text
                        row_to_add['notes'] = note
                if child.tag == "TAGS":
                    for subchild in child:
                        row_to_add[subchild.tag] = 1 if subchild.attrib.get('met') == 'met' else 0
            overall_df.loc[patient_num] = row_to_add

    return overall_df


In [20]:
patient_notes_data = get_note_data(labels)
patient_notes_data.sort_index().head()

Unnamed: 0,notes,ABDOMINAL
100,\n\nRecord date: 2106-02-12\n\nCampbell Orthop...,0
101,\n\nRecord date: 2079-05-12\n\n\n\n\n\nMERCY C...,0
102,\n\nRecord date: 2120-09-19\n\nPersonal Data a...,1
103,\n\nRecord date: 2067-11-24\n\n ...,0
104,\n\nRecord date: 2094-02-16\n\nJENNIFER BOOKER...,0


In [4]:
def make_model_for_label_combined_notes(df, label, num_epochs, save=False, save_name=None):
    '''
    Makes a fine-tuned ClinicalBERT model for a given label with data.

    Arguments:
    df - DataFrame containing a 'notes' column with the corresponding clinical notes
    label - Str that is the name of the column to use as y/labels for the model training
    save - Flag to save the model
    save_name - What the directory for the model should be named

    Returns:
    TFAutoModelForSequenceClassification
    '''
    if save and save_name is None:
        save_name = f'{label}_model'
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    model = TFAutoModelForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", from_pt=True)
    clinical_notes = list(df['notes'])

    # Tokenize and pad the clinical notes
    tokenized_notes = tokenizer(clinical_notes, padding='max_length', max_length=512, truncation=True, return_tensors="tf")
    tokenized_data = dict(tokenized_notes)
    labels = np.array(df[label])

    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor=.5, patience=3)
    model.compile(optimizer=Adam(3e-5), metrics='accuracy')  # No loss argument!

    model.fit(tokenized_data, labels, epochs=num_epochs, verbose=True, callbacks=[lr_scheduler])
    if save:
        tokenizer.save_pretrained(save_name)
        model.save_pretrained(save_name)
    return model



In [5]:
def make_models_for_labels(labels, make_model_function, df, epochs, save=False):
    '''
    Makes a model for each label using the given function, data, and labels.

    Arguments:
    labels - List[Str] where each string is a label
    make_model_function - Callable to make the model that takes in a df and label
    df - DataFrame of the data
    save - Flag to save each model

    Returns:
    Dict Str -> TFAutoModelForSequenceClassification
    '''
    models = {}
    for label in labels:
        print(f"Making model for label {label}...")
        model = make_model_function(df, label, epochs, save=save)
        print("Finished making model.")
        models[label] = model
    return models

In [6]:
# # Example usage of make_models_for_labels
# # models = make_models_for_labels(["ABDOMINAL",
#         "ADVANCED-CAD",
#         "ALCOHOL-ABUSE",
#         "ASP-FOR-MI",
#         "CREATININE"], make_model_for_label_combined_notes, get_note_data(), epochs=2, save=True)

In [7]:
def tokenize_data(note):
    import nltk
    from nltk.corpus import stopwords
    import re
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    pattern = r"[\n\.]"
    # Split the text into sentences
    sentences = re.split(pattern, note.lower())
    # Split the text into tokens
    tokens = [nltk.word_tokenize(sentence) for sentence in sentences if sentence.strip()]
    # Remove stop words
    #filtered_tokens = [token for token in tokens if token not in stop_words]
    return tokens

In [12]:
loaded_models = {}
labels = ["ABDOMINAL"]
for label in labels:
    loaded_models[label] = (TFAutoModelForSequenceClassification.from_pretrained(f'{label}_model'), AutoTokenizer.from_pretrained(f'{label}_model'))
print(loaded_models)
    

Some layers from the model checkpoint at ABDOMINAL_model were not used when initializing TFBertForSequenceClassification: ['dropout_721']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ABDOMINAL_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


{'ABDOMINAL': (<transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x17ed3b5d0>, BertTokenizerFast(name_or_path='ABDOMINAL_model', vocab_size=28996, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, spec

In [38]:
threshold = .5
def get_predictions(model, tokenizer, verbose=True):
    tokenized_notes = tokenizer(list(patient_notes_data["notes"]), padding='max_length', max_length=512, truncation=True, return_tensors="tf")
    tokenized_data = dict(tokenized_notes)
    model_predictions = model.predict(tokenized_data)
    logits = model_predictions.logits
    probabilities = tf.nn.softmax(logits)
    list_probabilities = probabilities.numpy()
    predictions = (list_probabilities[:,0] < .5).astype(int).tolist()
    return predictions

label_to_predictions = {}
for label, model_tokenizer in loaded_models.items():
    label_to_predictions[label] = get_predictions(model_tokenizer[0], model_tokenizer[1])




In [41]:
def get_accuracy_from_preds(predictions, true_labels, verbose=True):
    accuracy = np.mean(predictions == true_labels)
    if verbose:
        print(f"{label} Accuracy: {accuracy}")
    return accuracy

label_to_accuracy = {}
for label, predictions in label_to_predictions.items():
    true_labels = np.array(patient_notes_data[label])
    accuracy = get_accuracy_from_preds(predictions, true_labels)
    label_to_accuracy[label] = accuracy

ABDOMINAL Accuracy: 0.6188118811881188


In [42]:
pd.DataFrame(label_to_accuracy).to_csv("accuracies_for_labels")

ValueError: If using all scalar values, you must pass an index