In [2]:
import pandas as pd
import os
import xml.etree.ElementTree as ET
import torch
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import numpy as np
from tensorflow.keras.optimizers import Adam


In [3]:
def get_note_data(folder_name='train', separate=False):
    '''
    Loads every patient XML file in a folder into one DataFrame.

    Each regular file in <current working directory>/<folder_name> is parsed as
    XML. The text of its TEXT element supplies the note column(s) and every
    child of its TAGS element supplies one 0/1 label (1 when the child's 'met'
    attribute equals 'met', otherwise 0). Rows are indexed by the file name
    without its extension.

    Arguments:
    folder_name - Str name of the folder (relative to the current working directory) holding the XML files
    separate - Flag; when True the text is split on the asterisk separator line into
               columns note1..note5 (padded with "" when there are fewer notes, notes
               beyond the fifth are dropped); when False the whole text goes in 'notes'

    Returns:
    DataFrame with the note column(s) followed by the 13 label columns.
    Columns a file provides no value for are left as NaN; an empty TEXT
    element yields empty string(s) rather than raising.
    '''
    label_columns = [
        "ABDOMINAL",
        "ADVANCED-CAD",
        "ALCOHOL-ABUSE",
        "ASP-FOR-MI",
        "CREATININE",
        "DIETSUPP-2MOS",
        "DRUG-ABUSE",
        "ENGLISH",
        "HBA1C",
        "KETO-1YR",
        "MAJOR-DIABETES",
        "MAKES-DECISIONS",
        "MI-6MOS",
    ]
    note_separator = "****************************************************************************************************"
    max_notes = 5

    if separate:
        note_columns = [f"note{i}" for i in range(1, max_notes + 1)]
    else:
        note_columns = ["notes"]
    columns = note_columns + label_columns

    directory = os.path.join(os.getcwd(), folder_name)

    # Collect one record per patient and build the frame once at the end;
    # growing a DataFrame row by row re-allocates it on every insert.
    # A dict keyed by patient id keeps "last file wins" for duplicate stems.
    records = {}
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue
        patient_num = os.path.splitext(filename)[0]
        row = {}
        root = ET.parse(path).getroot()
        for child in root:
            if child.tag == "TEXT":
                # ElementTree reports an empty element's text as None.
                text = child.text or ""
                if separate:
                    pieces = [piece.strip() for piece in text.split(note_separator)]
                    notes = [piece for piece in pieces if piece]
                    # Truncate/pad to exactly max_notes entries.
                    padded = notes[:max_notes] + [""] * (max_notes - len(notes))
                    row.update(zip(note_columns, padded))
                else:
                    row['notes'] = text
            elif child.tag == "TAGS":
                for subchild in child:
                    row[subchild.tag] = 1 if subchild.attrib.get('met') == 'met' else 0
        records[patient_num] = row

    if not records:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(list(records.values()), index=list(records.keys()), columns=columns)


In [4]:
# Load the training notes with one combined 'notes' string per patient,
# then preview the first five rows.
patient_notes_data = get_note_data(folder_name='train', separate=False)
patient_notes_data.head(n=5)

Unnamed: 0,notes,ABDOMINAL,ADVANCED-CAD,ALCOHOL-ABUSE,ASP-FOR-MI,CREATININE,DIETSUPP-2MOS,DRUG-ABUSE,ENGLISH,HBA1C,KETO-1YR,MAJOR-DIABETES,MAKES-DECISIONS,MI-6MOS
162,\n\nRecord date: 2068-02-04\n\nASSOCIATED ARTH...,1,1,0,1,0,0,0,1,0,0,0,1,0
176,\n\nRecord date: 2085-04-22\n\n \nThis patient...,1,0,1,0,0,1,0,1,0,0,0,1,0
189,\n\nRecord date: 2090-07-07\n\nWillow Gardens ...,0,1,0,1,1,1,0,1,1,0,1,1,0
214,\n\nRecord date: 2096-07-15\n\n\n\nResults01/3...,0,1,0,1,0,1,0,0,1,0,0,1,1
200,\n\nRecord date: 2170-02-17\n\n \n\nReason for...,1,0,0,1,0,1,0,1,0,0,1,1,0


In [5]:
def make_model_for_label_combined_notes(df, label, num_epochs, save=False, save_name=None):
    '''
    Makes a fine-tuned ClinicalBERT model for a given label with data.

    Arguments:
    df - DataFrame containing a 'notes' column with the corresponding clinical notes
         and a column named `label` holding the targets
    label - Str that is the name of the column to use as y/labels for the model training
    num_epochs - Int number of epochs to train for
    save - Flag to save the model
    save_name - What the directory for the model should be named
                (defaults to '<label>_model' when save is set and no name is given)

    Returns:
    TFAutoModelForSequenceClassification
    '''
    checkpoint = "emilyalsentzer/Bio_ClinicalBERT"
    if save and save_name is None:
        save_name = f'{label}_model'

    # Load tokenizer and model (weights are converted from the PyTorch checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, from_pt=True)
    clinical_notes = list(df['notes'])

    # Tokenize the clinical notes, padding/truncating each one to 512 tokens
    tokenized_notes = tokenizer(clinical_notes, padding='max_length', max_length=512, truncation=True, return_tensors="tf")
    tokenized_data = dict(tokenized_notes)
    labels = np.array(df[label])

    # fit() below receives no validation data, so the callback's default
    # monitor ('val_loss') is never logged and the scheduler would never fire.
    # Monitor the training loss instead.
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=.5, patience=3)
    # No loss argument: compile is deliberately left to fall back to the model's own loss.
    model.compile(optimizer=Adam(3e-5), metrics=['accuracy'])

    model.fit(tokenized_data, labels, epochs=num_epochs, verbose=True, callbacks=[lr_scheduler])
    if save:
        model.save(save_name, save_format="tf")
    return model



In [6]:
def make_models_for_labels(labels, make_model_function, df, epochs, save=False):
    '''
    Makes a model for each label using the given function, data, and labels.

    Arguments:
    labels - List[Str] where each string is a label
    make_model_function - Callable to make the model that takes in a df, a label and a
                          number of epochs; it must also accept a `save` keyword
                          argument if this function is called with save=True
    df - DataFrame of the data
    epochs - Int number of training epochs passed through to make_model_function
    save - Flag to save each model

    Returns:
    Dict Str -> TFAutoModelForSequenceClassification
    '''
    models = {}
    for label in labels:
        print(f"Making model for label {label}...")
        if save:
            # Forward the flag; it was previously accepted but silently ignored.
            model = make_model_function(df, label, epochs, save=True)
        else:
            # Keep the three-argument call so callables without a `save`
            # parameter continue to work when saving is not requested.
            model = make_model_function(df, label, epochs)
        print("Finished making model.")
        models[label] = model
    return models

In [7]:
# Example usage of make_models_for_labels: fine-tune a single model for the ABDOMINAL label.
# The notes already loaded into `patient_notes_data` are reused so the XML files
# are not parsed a second time.
models = make_models_for_labels(['ABDOMINAL'], make_model_for_label_combined_notes, patient_notes_data, epochs=10)
abdominal_model = models['ABDOMINAL']
# Alias kept for any later cell that still uses the original (misspelled) name.
abuse_mobel = abdominal_model

Making model for label ABDOMINAL...


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 

In [None]:
def tokenize_data(note, remove_stop_words=False):
    '''
    Splits a note into lower-cased sentences and tokenizes each sentence.

    The note is lower-cased and split on newlines and periods; every piece that
    is not blank is tokenized with nltk.word_tokenize.

    Arguments:
    note - Str containing the text of a clinical note; None or "" gives an empty list
    remove_stop_words - Flag; when True, English stop words (nltk 'stopwords' corpus)
                        are removed from every sentence. The corpus is downloaded only
                        if it is requested and not already available.

    Returns:
    List[List[Str]] with one token list per non-blank sentence. With
    remove_stop_words=True a sentence made up only of stop words is kept as an
    empty list.
    '''
    # nltk is not imported at the top of the notebook, so it stays a local import.
    import re
    import nltk

    if not note:
        return []

    pattern = r"[\n\.]"
    # Split the text into sentences
    sentences = re.split(pattern, note.lower())
    # Split each non-blank sentence into tokens
    tokens = [nltk.word_tokenize(sentence) for sentence in sentences if sentence.strip()]

    if remove_stop_words:
        from nltk.corpus import stopwords
        try:
            stop_words = set(stopwords.words('english'))
        except LookupError:
            # Corpus not installed yet: fetch it once and retry.
            nltk.download('stopwords')
            stop_words = set(stopwords.words('english'))
        # `tokens` is a list of token lists, so filter inside each sentence.
        tokens = [[token for token in sentence if token not in stop_words] for sentence in tokens]
    return tokens

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 