In [1]:
import re
import nltk
import numpy as np
import pandas as pd
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import sys
sys.path.insert(0, '../../data')
import data_loaders

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yuvalmamana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def phrase(text: str) -> list:
    """
    Divide patient notes into phrases by splitting by commas and periods and using nltk sent_tokenize.
    Gets a piece of text and returns a list of phrases
    """
    phrases = text.split(', ')
    phrases = [i.split('. ') for i in phrases]
    phrases = [sent_tokenize(val) for sublist in phrases for val in sublist]
    phrases = [val for sublist in phrases for val in sublist]

    return phrases

In [3]:
def process_feature_text(text: str) -> str:
    """
    Cleans feature text
    """
    text = re.sub('I-year', '1-year', text)
    text = re.sub('-OR-', " or ", text)
    text = re.sub('-', ' ', text)

    return text

In [4]:
def get_features(case=None):
    features = data_loaders.get_features()
    features['feature_text'] = features['feature_text'].apply(process_feature_text)
    if case is not None:
        features = features[features['case_num'] == case].reset_index(drop=True)
    return features

In [5]:
def train_bert(sentences: list, show_pbar=False):
    model = SentenceTransformer('all-mpnet-base-v2')
    embeddings = model.encode(sentences, show_progress_bar=show_pbar)

    return embeddings, model

In [6]:
def bert_similarity(s: str, embeddings, model: SentenceTransformer) -> (int, float):
    """
    return the index and similarity score of the most similar feature to a given string
    """
    y = model.encode(s).reshape(1, -1)
    sim = cosine_similarity(embeddings, y)
    opt_index = np.argmax(sim)

    return opt_index, sim[opt_index][0]

In [7]:
def predict(feature_num: int, case_num:int, pn_note: str, threshold=0.47) -> list:
    """
    returns a list of tuples of predicted locations of a given feature in a given text
    """
    features = get_features(case_num)
    embeddings, model = train_bert(features['feature_text'])
    phrases = phrase(pn_note)
    start = 0
    locations = []
    similarities = []
    for phrase_txt in phrases:
        end = start + len(phrase_txt)
        index, sim = bert_similarity(phrase_txt, embeddings, model)
        f_num = features['feature_num'][index]
        if f_num == feature_num and sim > threshold:
            similarities.append(sim)
            locations.append((start, end))
        start += len(phrase_txt) + 2

    return locations

In [16]:
def prepare_test_data() -> pd.DataFrame:
    """
    get test data and adds the notes text
    """
    notes: pd.DataFrame = data_loaders.get_patient_notes()
    notes = notes.set_index('pn_num', drop=True)
    test = pd.read_csv('../../data/test.csv')
    test['pn_history'] = test['pn_num'].apply(lambda x: notes.loc[int(x)]['pn_history'])

    return test

In [37]:
test = prepare_test_data()

In [38]:
test['pred'] = test.apply(lambda row: predict(row['feature_num'], row['case_num'], row['pn_history'], threshold=0.48), axis=1)

In [39]:
test

Unnamed: 0,id,case_num,pn_num,feature_num,pn_history,pred
0,80245_816,8,80245,816,Pt is a 67 year old female with PMH of high bl...,"[(354, 402)]"
1,80307_800,8,80307,800,Ms. Wicks is a 67 yo F h/o HTN and breast ca i...,"[(382, 398), (651, 678)]"
2,80307_817,8,80307,817,Ms. Wicks is a 67 yo F h/o HTN and breast ca i...,"[(400, 436), (680, 699)]"
3,80307_811,8,80307,811,Ms. Wicks is a 67 yo F h/o HTN and breast ca i...,"[(701, 717)]"
4,80307_805,8,80307,805,Ms. Wicks is a 67 yo F h/o HTN and breast ca i...,[]


In [29]:
import spacy
import numpy as np

def plot_annotation(df, pn_num):
    """
    see example bellow - mark the anotation on the text, and adds the required feature text for each.
    """
    options = {"colors": {}}

    df_text = df[df["pn_num"] == pn_num].reset_index(drop=True)

    text = df_text["pn_history"][0]
    ents = []

    for spans, feature_text, feature_num in df_text[["span", "feature_text", "feature_num"]].values:
        print(spans)
        for s in spans:
            ents.append({"start": int(s[0]), "end": int(s[1]), "label": feature_text})

        options["colors"][feature_text] =  f"rgb{tuple(np.random.randint(100, 255, size=3))}"

    doc = {"text": text, "ents": sorted(ents, key=lambda i: i["start"])}

    spacy.displacy.render(doc, style="ent", options=options, manual=True, jupyter=True)

In [31]:
test['feature_text'] = ['No-suicidal-ideations', 'Increased-appetite', 'Difficulty-with-sleep', 'Diminished-energy-OR-feeling-drained']
test

Unnamed: 0,id,case_num,pn_num,feature_num,pn_history,pred,feature_text
0,80245_816,8,80245,816,Pt is a 67 year old female with PMH of high bl...,"[(354, 402)]",No-suicidal-ideations
1,80307_800,8,80307,800,Ms. Wicks is a 67 yo F h/o HTN and breast ca i...,"[(382, 398), (651, 678)]",Increased-appetite
2,80307_817,8,80307,817,Ms. Wicks is a 67 yo F h/o HTN and breast ca i...,"[(400, 436), (680, 699)]",Difficulty-with-sleep
3,80307_811,8,80307,811,Ms. Wicks is a 67 yo F h/o HTN and breast ca i...,"[(701, 717)]",Diminished-energy-OR-feeling-drained


In [32]:
test['span'] = test['pred']

In [33]:
plot_annotation(test, 80307)

[(382, 398), (651, 678)]
[(400, 436), (680, 699)]
[(701, 717)]
