## Similarity of phrases

In [13]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import ast
import sys
sys.path.insert(0, '../../data')
import data_loaders
import re

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yuvalmamana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Load the data, split each patient note into phrases, and add a note_len column for analysis

In [14]:
# Load the training frame via the project-local data_loaders module
# (importable because '../../data' was prepended to sys.path in the imports cell).
train = data_loaders.get_clean_train_data()

In [15]:
def add_phrases_manual(train):
    """
    'Manually' divide patient notes into phrases by splitting on ', ' and '. '.

    Each value of train['pn_history'] is split on the two-character separator
    ', ' and every resulting piece is split again on '. '. The flattened list
    of pieces is stored in a new 'phrases_manual' column.

    Note: the column is added to the frame that is passed in (the caller's
    object is modified), and a re-indexed copy is returned.

    :param train: DataFrame with a string column 'pn_history'.
    :return: copy of `train` with a fresh 0..n-1 index and the extra
             'phrases_manual' column (list of str per row).
    """
    def _split_note(note):
        # Two-level split flattened in one pass: commas first, then periods.
        return [piece for chunk in note.split(', ') for piece in chunk.split('. ')]

    train['phrases_manual'] = train['pn_history'].apply(_split_note)

    # drop=True discards the old index outright. The previous
    # reset_index().drop('index', axis=1) failed with KeyError when the index
    # had a name, and removed a genuine column called 'index' if one existed.
    return train.reset_index(drop=True)

def add_phrases_tokenizer(train, tokenizer):
    """
    Add two tokenizer-derived columns to `train` (in place) and return it.

     - phrases_tokenizer: `tokenizer` applied to each full 'pn_history' note
     - phrases: `tokenizer` applied to every entry of 'phrases_manual',
       with the per-entry results concatenated into one flat list

    :param train: DataFrame holding 'pn_history' and 'phrases_manual' columns.
    :param tokenizer: callable mapping a string to an iterable of strings.
    :return: the same DataFrame object, with the two columns added.
    """
    def _retokenize(manual_pieces):
        # Tokenize every manual piece first, then flatten the results.
        tokenized = [tokenizer(piece) for piece in manual_pieces]
        flat = []
        for tokens in tokenized:
            flat.extend(tokens)
        return flat

    train['phrases_tokenizer'] = train['pn_history'].apply(tokenizer)
    train['phrases'] = train['phrases_manual'].apply(_retokenize)

    return train

In [16]:
# Split every note into phrases (manual split first, then the NLTK sentence tokenizer).
train = add_phrases_tokenizer(add_phrases_manual(train), sent_tokenize)
train['note_len'] = train['pn_history'].apply(len)

# initializing results columns
# Each row gets its own fresh list so that later appends do not share state across rows.
n_rows = len(train['pn_num'])
for result_col in ('similarity', 'phrase', 'res_loc', 'res_loc_bin', 'expected_bin'):
    train[result_col] = [list() for _ in range(n_rows)]

#### The pre-processed train dataset:

In [17]:
# Preview the first rows to check the phrase columns and the empty result columns.
train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location_raw,feature_text,pn_history,location,phrases_manual,phrases_tokenizer,phrases,note_len,similarity,phrase,res_loc,res_loc_bin,expected_bin
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,"[(696, 724)]","[HPI: 17yo M presents with palpitations, Patie...","[HPI: 17yo M presents with palpitations., Pati...","[HPI: 17yo M presents with palpitations, Patie...",938,[],[],[],[],[]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...,"[(668, 693)]","[HPI: 17yo M presents with palpitations, Patie...","[HPI: 17yo M presents with palpitations., Pati...","[HPI: 17yo M presents with palpitations, Patie...",938,[],[],[],[],[]
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...,"[(203, 217)]","[HPI: 17yo M presents with palpitations, Patie...","[HPI: 17yo M presents with palpitations., Pati...","[HPI: 17yo M presents with palpitations, Patie...",938,[],[],[],[],[]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...,"[(70, 91), (176, 183)]","[HPI: 17yo M presents with palpitations, Patie...","[HPI: 17yo M presents with palpitations., Pati...","[HPI: 17yo M presents with palpitations, Patie...",938,[],[],[],[],[]
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...,"[(222, 258)]","[HPI: 17yo M presents with palpitations, Patie...","[HPI: 17yo M presents with palpitations., Pati...","[HPI: 17yo M presents with palpitations, Patie...",938,[],[],[],[],[]


In [18]:
# Narrow the frame to the columns shown below; this drops pn_history,
# location_raw and the intermediate phrases_manual / phrases_tokenizer columns.
train = train[[
    'id', 'case_num', 'pn_num', 'feature_num',
    'phrases', 'note_len', 'annotation', 'phrase',
    'feature_text', 'location',
    'res_loc', 'similarity', 'res_loc_bin', 'expected_bin',
]]
train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,phrases,note_len,annotation,phrase,feature_text,location,res_loc,similarity,res_loc_bin,expected_bin
0,00016_000,0,16,0,"[HPI: 17yo M presents with palpitations, Patie...",938,[dad with recent heart attcak],[],Family-history-of-MI-OR-Family-history-of-myoc...,"[(696, 724)]",[],[],[],[]
1,00016_001,0,16,1,"[HPI: 17yo M presents with palpitations, Patie...",938,"[mom with ""thyroid disease]",[],Family-history-of-thyroid-disorder,"[(668, 693)]",[],[],[],[]
2,00016_002,0,16,2,"[HPI: 17yo M presents with palpitations, Patie...",938,[chest pressure],[],Chest-pressure,"[(203, 217)]",[],[],[],[]
3,00016_003,0,16,3,"[HPI: 17yo M presents with palpitations, Patie...",938,"[intermittent episodes, episode]",[],Intermittent-symptoms,"[(70, 91), (176, 183)]",[],[],[],[]
4,00016_004,0,16,4,"[HPI: 17yo M presents with palpitations, Patie...",938,[felt as if he were going to pass out],[],Lightheaded,"[(222, 258)]",[],[],[],[]


#### Loading features data:

In [19]:
# Load the feature table via the project-local data_loaders module; the display
# further down shows it carries feature_num, case_num and feature_text columns.
features = data_loaders.get_features()

In [20]:

def process_feature_text(text):
    """
    Turn a hyphen-joined feature label into plain, space-separated text.

    Substitutions are applied in order:
      1. 'I-year' -> '1-year' (presumably a typo in the raw labels)
      2. '-OR-'   -> ' or '
      3. every remaining '-' -> ' '

    :param text: raw feature label (str).
    :return: the rewritten label (str).
    """
    # Order matters: the two specific patterns must run before the generic
    # hyphen replacement removes the hyphens they rely on.
    substitutions = (
        ('I-year', '1-year'),
        ('-OR-', " or "),
        ('-', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Overwrite feature_text with its cleaned form (hyphens -> spaces, '-OR-' -> ' or '),
# then display the frame to inspect the result.
features['feature_text'] = features['feature_text'].apply(process_feature_text)
features

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family history of MI or Family history of myoc...
1,1,0,Family history of thyroid disorder
2,2,0,Chest pressure
3,3,0,Intermittent symptoms
4,4,0,Lightheaded
...,...,...,...
138,912,9,Family history of migraines
139,913,9,Female
140,914,9,Photophobia
141,915,9,No known illness contacts


#### Embedding all feature texts with a pretrained BERT-style sentence-transformer model (no training is performed):

In [21]:
# Load the 'all-mpnet-base-v2' sentence-transformer checkpoint by name.
model = SentenceTransformer('all-mpnet-base-v2')
# Encode a plain list rather than the pandas Series: encode() indexes its input
# by integer position, which only matches a Series when its index is the default
# 0..n-1 range. With a list, row i of `embeddings` always corresponds to the
# i-th row (by position) of `features`.
embeddings = model.encode(features['feature_text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [22]:
def bert(s, embeddings):
    """
    Find the feature embedding that is most similar to a given string.

    The string is encoded with the module-level `model` and compared to every
    row of `embeddings` by cosine similarity.

    :param s: the phrase to encode.
    :param embeddings: 2-D array with one embedding per row.
    :return: tuple (row index of the best match, its cosine similarity).
    """
    query_vec = model.encode(s).reshape(1, -1)
    # cosine_similarity yields one column (a single query); take it as a 1-D vector.
    scores = cosine_similarity(embeddings, query_vec)[:, 0]
    best = np.argmax(scores)
    return best, scores[best]

#### Main matching loop - find the most similar feature for every phrase and record the phrases that match the row's feature

In [None]:
# Minimum cosine similarity for a phrase to be accepted as a match for a feature.
SIMILARITY_THRESHOLD = 0.48
# Number of leading rows of `features` to process (the cell was written to run on
# the first two features only; raise this to cover more).
MAX_FEATURES = 2

# phrase -> (best feature position, similarity). The best match of a phrase does
# not depend on the row being processed, and the same note text appears in many
# rows, so each distinct phrase is encoded only once.
best_match_cache = {}

# NOTE: this cell appends to the per-row result lists, so re-running it without
# re-initialising those columns will duplicate entries.
selected_features = features.iloc[:MAX_FEATURES]
for case, feature in zip(selected_features['case_num'], selected_features['feature_num']):
    # Restrict to the rows annotated for THIS feature. Filtering on case_num alone
    # appended matches for `feature` to the rows of every other feature of the
    # case, polluting their res_loc before evaluation.
    feature_rows = train[(train['case_num'] == case) & (train['feature_num'] == feature)]
    for i, row in feature_rows.iterrows():
        start = 0
        for phrase in row['phrases']:
            end = start + len(phrase)
            if phrase not in best_match_cache:
                best_match_cache[phrase] = bert(phrase, embeddings)
            index, sim = best_match_cache[phrase]
            # `index` is a row position in `embeddings`, so look it up positionally.
            f_num = features['feature_num'].iloc[index]
            if f_num == feature and sim > SIMILARITY_THRESHOLD:
                train.loc[i, 'similarity'].append(sim)
                train.loc[i, 'phrase'].append(phrase)
                train.loc[i, 'res_loc'].append((start, end))
            # NOTE(review): assumes exactly two separator characters (', ' or '. ')
            # follow every phrase; boundaries introduced by the tokenizer may not
            # satisfy this, so offsets can drift - verify against the original note.
            start += len(phrase) + 2

In [None]:
sys.path.insert(0, '../../experiments/theoviel_reproduction')
import reproduced_model

In [None]:
# Compare the spans collected in res_loc with the annotated spans in location.
# NOTE(review): _evaluate_f1 is an underscore-prefixed helper of reproduced_model;
# presumably it returns an F1 score and expects (predictions, ground truth) in
# this order - confirm against its definition.
reproduced_model._evaluate_f1(train['res_loc'], train['location'])