In [3]:
import pandas as pd
import os
import xml.etree.ElementTree as ET
import torch
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import numpy as np
from tensorflow.keras.optimizers import Adam
# Names of the per-patient criteria; each one is used as a DataFrame column
# and as the prefix of a saved '<label>_model' directory.
labels = [
    "ABDOMINAL",
    "ADVANCED-CAD",
    "ALCOHOL-ABUSE",
    "ASP-FOR-MI",
    "CREATININE",
    "DIETSUPP-2MOS",
    "DRUG-ABUSE",
    "ENGLISH",
    "HBA1C",
    "KETO-1YR",
    "MAJOR-DIABETES",
    "MAKES-DECISIONS",
    "MI-6MOS",
]


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def get_note_data(folder_name='train', separate=False):
    '''
    Loads every patient XML file in a folder into a single DataFrame.

    Each file becomes one row, indexed by the file name without its extension.
    The text of the TEXT element fills the note column(s). Every child of the
    TAGS element becomes 1 when its 'met' attribute equals 'met', otherwise 0.
    Tags that are not listed in the module-level `labels` are ignored.

    Arguments:
    folder_name - Str folder to read, resolved against the current working
                  directory (an absolute path is used as-is)
    separate - Flag to split the text on the asterisk separator line into
               'note1', 'note2', ... columns (at least five, padded with "")
               instead of storing it whole in a single 'notes' column

    Returns:
    DataFrame with the note column(s) followed by one column per label

    Raises:
    FileNotFoundError if the folder does not exist
    '''
    note_separator = "****************************************************************************************************"
    directory = os.path.join(os.getcwd(), folder_name)

    patient_ids = []
    rows = []
    # The original layout reserves five note columns; widen only if a file has more.
    max_notes = 5
    # Sort so the row order does not depend on the file system's listing order.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue
        row = {}
        root = ET.parse(path).getroot()
        for child in root:
            if child.tag == "TEXT":
                # An empty <TEXT/> element has text None; treat it as no text.
                text = child.text or ""
                if separate:
                    pieces = [piece.strip() for piece in text.split(note_separator)]
                    notes = [piece for piece in pieces if piece]
                    max_notes = max(max_notes, len(notes))
                    for position, note in enumerate(notes, start=1):
                        row[f"note{position}"] = note
                else:
                    row['notes'] = text
            elif child.tag == "TAGS":
                for subchild in child:
                    row[subchild.tag] = 1 if subchild.attrib.get('met') == 'met' else 0
        patient_ids.append(os.path.splitext(filename)[0])
        rows.append(row)

    if separate:
        note_columns = [f"note{position}" for position in range(1, max_notes + 1)]
        # Pad patients with fewer notes so every note column holds a string.
        for row in rows:
            for column in note_columns:
                row.setdefault(column, "")
    else:
        note_columns = ["notes"]

    # list.extend() returns None, so the schema is built by concatenation, and
    # the frame is created once rather than grown one row at a time.
    columns = note_columns + list(labels)
    return pd.DataFrame(rows, index=patient_ids, columns=columns)


In [5]:
# Load the patient files with the defaults: the 'train' folder under the
# current working directory, with each patient's text kept in one 'notes' column.
patient_notes_data = get_note_data()
# Preview the first rows of the loaded frame.
patient_notes_data.head()

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'c:\\Users\\nlabu\\Downloads\\train'

In [6]:
def make_model_for_label_combined_notes(df, label, num_epochs, save=False, save_name=None):
    '''
    Makes a fine-tuned ClinicalBERT model for a given label with data.

    Every note is tokenized to exactly 512 tokens (longer notes are truncated,
    shorter ones padded) and the model is trained on the whole DataFrame; no
    validation split is made.

    Arguments:
    df - DataFrame containing a 'notes' column with the corresponding clinical notes
    label - Str that is the name of the column to use as y/labels for the model training
    num_epochs - Int number of training epochs
    save - Flag to save the model
    save_name - What the directory for the model should be named
                (defaults to '<label>_model' when save is set)

    Returns:
    TFAutoModelForSequenceClassification
    '''
    if save and save_name is None:
        save_name = f'{label}_model'
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    model = TFAutoModelForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", from_pt=True)
    clinical_notes = list(df['notes'])

    # Tokenize and pad the clinical notes
    tokenized_notes = tokenizer(clinical_notes, padding='max_length', max_length=512, truncation=True, return_tensors="tf")
    tokenized_data = dict(tokenized_notes)
    # Force an integer dtype: a column assembled from Python objects would
    # otherwise reach Keras as an object array.
    targets = np.asarray(df[label], dtype="int32")

    # No validation data is passed to fit(), so 'val_loss' (the callback's
    # default monitor) never exists; watch the training loss instead.
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=.5, patience=3)
    # No loss argument: the model is left to use its own built-in loss.
    model.compile(optimizer=Adam(3e-5), metrics=['accuracy'])

    model.fit(tokenized_data, targets, epochs=num_epochs, verbose=True, callbacks=[lr_scheduler])
    if save:
        tokenizer.save_pretrained(save_name)
        model.save_pretrained(save_name)
    return model



In [7]:
def make_models_for_labels(labels, make_model_function, df, epochs, save=False):
    '''
    Trains one model per label and collects the results by label name.

    Arguments:
    labels - List[Str] of label names, processed in the given order
    make_model_function - Callable invoked as
                          make_model_function(df, label, epochs, save=save)
    df - DataFrame of the data, handed unchanged to every call
    epochs - Number of epochs, handed unchanged to every call
    save - Flag forwarded to make_model_function so each model can be saved

    Returns:
    Dict Str -> whatever make_model_function returned for that label
    '''
    trained = {}
    for name in labels:
        print(f"Making model for label {name}...")
        trained[name] = make_model_function(df, name, epochs, save=save)
        print("Finished making model.")
    return trained

In [8]:
# Example usage of make_models_for_labels:
# models = make_models_for_labels(["ABDOMINAL",
#         "ADVANCED-CAD",
#         "ALCOHOL-ABUSE",
#         "ASP-FOR-MI",
#         "CREATININE"], make_model_for_label_combined_notes, get_note_data(), epochs=2, save=True)

In [9]:
def tokenize_data(note, remove_stopwords=False):
    '''
    Splits a note into sentences and word-tokenizes each sentence.

    The note is lower-cased and cut at every newline and period; pieces that
    are empty or whitespace-only are skipped.

    Arguments:
    note - Str text of a clinical note
    remove_stopwords - Flag to drop NLTK's English stop words from every
                       sentence. The stopwords corpus is only downloaded when
                       this is set. A sentence made up solely of stop words
                       is kept as an empty list.

    Returns:
    List[List[Str]] with one list of tokens per sentence
    '''
    import re
    import nltk

    # Split the text into sentences
    sentences = re.split(r"[\n\.]", note.lower())
    # Split each non-blank sentence into tokens
    tokens = [nltk.word_tokenize(sentence) for sentence in sentences if sentence.strip()]
    if remove_stopwords:
        from nltk.corpus import stopwords
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))
        # Filter inside each sentence: `tokens` is a list of token lists.
        tokens = [[token for token in sentence_tokens if token not in stop_words]
                  for sentence_tokens in tokens]
    return tokens

In [26]:
# Reload each saved classifier with its tokenizer from the '<label>_model'
# directory, stored as label -> (model, tokenizer).
loaded_models = {}
for label in labels:
    checkpoint_dir = f'{label}_model'
    label_model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
    label_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    loaded_models[label] = (label_model, label_tokenizer)
print(loaded_models)
    

Some layers from the model checkpoint at ABDOMINAL_model were not used when initializing TFBertForSequenceClassification: ['dropout_721']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ABDOMINAL_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.
Some layers from the model checkpoint at ADVANCED-CAD_model were not used when

{'ABDOMINAL': <transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x5873e0700>, 'ADVANCED-CAD': <transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x358bd0a00>, 'ALCOHOL-ABUSE': <transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x595d43b20>, 'ASP-FOR-MI': <transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x587342c20>, 'CREATININE': <transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x32b4bffa0>, 'DIETSUPP-2MOS': <transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x593ec10c0>, 'DRUG-ABUSE': <transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x32b0fdd80>, 'ENGLISH': <transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x32b5fa980>, 'HBA1C': <transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object 

In [27]:
def get_accuracy_for_label(label, model, tokenizer, verbose=True, df=None):
    '''
    Computes the accuracy of a label's model on a DataFrame of notes.

    Arguments:
    label - Str name of the column holding the true 0/1 values
    model - Model whose predict() returns class logits, either directly or
            under a 'logits' attribute/key
    tokenizer - Tokenizer matching the model
    verbose - Flag to print the accuracy
    df - DataFrame with a 'notes' column and the label column; defaults to the
         notebook-level patient_notes_data

    Returns:
    Fraction of rows whose predicted class equals the true label
    '''
    if df is None:
        df = patient_notes_data
    # The tokenizer needs the note texts themselves, not the DataFrame.
    clinical_notes = list(df['notes'])
    tokenized_notes = tokenizer(clinical_notes, padding='max_length', max_length=512, truncation=True, return_tensors="tf")
    tokenized_data = dict(tokenized_notes)
    true_labels = np.array(df[label])

    predictions = model.predict(tokenized_data)
    # predict() may return an output object/dict wrapping the logits instead
    # of a bare array; argmax must run on the logits array itself.
    if isinstance(predictions, dict):
        logits = predictions["logits"]
    else:
        logits = getattr(predictions, "logits", predictions)
    predicted_labels = np.argmax(logits, axis=1)

    accuracy = np.mean(predicted_labels == true_labels)
    if verbose:
        print(f"{label} Accuracy: {accuracy}")

    return accuracy

# Score every loaded (model, tokenizer) pair on its own label.
label_to_accuracy = {}
for label, (label_model, label_tokenizer) in loaded_models.items():
    label_to_accuracy[label] = get_accuracy_for_label(label, label_model, label_tokenizer)














