## Similarity of phrases

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import ast
import sys
sys.path.insert(0, '../../data')
import data_loaders
import re

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yuvalmamana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Load the data, filter by feature, split the notes into phrases and add a note_len column for analysis

In [2]:
# Keep only the rows whose feature_num is 806. The .copy() detaches the subset
# from the loaded frame, so columns added to it later are not written into a
# possible view (which pandas reports as SettingWithCopyWarning).
train = data_loaders.get_clean_train_data()
train = train[train['feature_num'] == 806].copy()

In [3]:
def add_phrases_manual(train):
    """
    'Manually' divide patient notes into phrases by splitting on ', ' and '. '.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a 'pn_history' column of strings. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and an extra 'phrases_manual'
        column holding, for every note, the flat list of its phrases.
    """
    # reset_index(drop=True) returns a new frame, so the caller's data is not
    # mutated, and it works whatever the old index was called.
    result = train.reset_index(drop=True)
    result['phrases_manual'] = result['pn_history'].apply(
        lambda note: [
            phrase
            for chunk in note.split(', ')
            for phrase in chunk.split('. ')
        ]
    )
    return result

def add_phrases_tokenizer(train, tokenizer):
    """
    Attach tokenizer-based phrase columns to `train` (modified in place).

    New columns:
     - phrases_tokenizer: the result of calling `tokenizer` on each 'pn_history' value
     - phrases: every entry of 'phrases_manual' passed through `tokenizer`,
       with the per-entry results concatenated into one flat list

    Returns the same `train` object.
    """
    def _retokenize(manual_phrases):
        # Tokenize each manual phrase and flatten the per-phrase results.
        flat = []
        for manual_phrase in manual_phrases:
            flat.extend(tokenizer(manual_phrase))
        return flat

    train['phrases_tokenizer'] = train['pn_history'].apply(tokenizer)
    train['phrases'] = train['phrases_manual'].apply(_retokenize)
    return train

In [4]:
# Split every note into phrases (manual split first, then the sentence tokenizer)
# and record the note length in characters.
train = add_phrases_tokenizer(add_phrases_manual(train), sent_tokenize)
train['note_len'] = train['pn_history'].apply(len)

# initializing results columns: one independent empty list per row
n_rows = len(train['pn_num'])
for result_col in ('similarity', 'phrase', 'res_loc', 'res_loc_bin', 'expected_bin'):
    train[result_col] = [[] for _ in range(n_rows)]

#### The pre-processed train dataset:

In [5]:
# Preview the first five rows of the pre-processed frame.
train.head(n=5)

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location_raw,feature_text,pn_history,location,phrases_manual,phrases_tokenizer,phrases,note_len,similarity,phrase,res_loc,res_loc_bin,expected_bin
0,80039_806,8,80039,806,[Difficulty falling asleep],[76 101],Difficulty-falling-asleep,Loraine Wicks is a 67 y.o. woman here for diff...,"[(76, 101)]","[Loraine Wicks is a 67 y.o, woman here for dif...","[Loraine Wicks is a 67 y.o., woman here for di...","[Loraine Wicks is a 67 y.o, woman here for dif...",921,[],[],[],[],[]
1,80042_806,8,80042,806,[trouble falling asleep],[51 73],Difficulty-falling-asleep,67yo F presents with trouble sleeping. She rep...,"[(51, 73)]","[67yo F presents with trouble sleeping, She re...","[67yo F presents with trouble sleeping., She r...","[67yo F presents with trouble sleeping, She re...",949,[],[],[],[],[]
2,80066_806,8,80066,806,[difficulty falling asleep],[86 111],Difficulty-falling-asleep,Pt is a 67 yo F presenting with insomnia x 3 w...,"[(86, 111)]",[Pt is a 67 yo F presenting with insomnia x 3 ...,[Pt is a 67 yo F presenting with insomnia x 3 ...,[Pt is a 67 yo F presenting with insomnia x 3 ...,949,[],[],[],[],[]
3,80098_806,8,80098,806,[],[],Difficulty-falling-asleep,Mrs Loraine Wicks a 67 Y old F pt presenting t...,[],[Mrs Loraine Wicks a 67 Y old F pt presenting ...,[Mrs Loraine Wicks a 67 Y old F pt presenting ...,[Mrs Loraine Wicks a 67 Y old F pt presenting ...,714,[],[],[],[],[]
4,80245_806,8,80245,806,[difficulty falling asleep],[182 207],Difficulty-falling-asleep,Pt is a 67 year old female with PMH of high bl...,"[(182, 207)]",[Pt is a 67 year old female with PMH of high b...,[Pt is a 67 year old female with PMH of high b...,[Pt is a 67 year old female with PMH of high b...,857,[],[],[],[],[]


In [6]:
# Narrow the frame to the identifier, phrase, annotation and result columns.
kept_columns = [
    'id', 'case_num', 'pn_num', 'feature_num', 'phrases', 'note_len',
    'annotation', 'phrase', 'feature_text', 'location', 'res_loc',
    'similarity', 'res_loc_bin', 'expected_bin',
]
train = train[kept_columns]
train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,phrases,note_len,annotation,phrase,feature_text,location,res_loc,similarity,res_loc_bin,expected_bin
0,80039_806,8,80039,806,"[Loraine Wicks is a 67 y.o, woman here for dif...",921,[Difficulty falling asleep],[],Difficulty-falling-asleep,"[(76, 101)]",[],[],[],[]
1,80042_806,8,80042,806,"[67yo F presents with trouble sleeping, She re...",949,[trouble falling asleep],[],Difficulty-falling-asleep,"[(51, 73)]",[],[],[],[]
2,80066_806,8,80066,806,[Pt is a 67 yo F presenting with insomnia x 3 ...,949,[difficulty falling asleep],[],Difficulty-falling-asleep,"[(86, 111)]",[],[],[],[]
3,80098_806,8,80098,806,[Mrs Loraine Wicks a 67 Y old F pt presenting ...,714,[],[],Difficulty-falling-asleep,[],[],[],[],[]
4,80245_806,8,80245,806,[Pt is a 67 year old female with PMH of high b...,857,[difficulty falling asleep],[],Difficulty-falling-asleep,"[(182, 207)]",[],[],[],[]


#### Loading features data:

In [7]:
# Load the feature table; later cells read its 'feature_text' and 'feature_num' columns.
features = data_loaders.get_features()

In [8]:

def process_feature_text(text):
    """
    Normalize a raw feature description into plain, space-separated text.

    Applies three substitutions in order: 'I-year' becomes '1-year',
    '-OR-' becomes ' or ', and every remaining '-' becomes a space.
    """
    substitutions = (
        ('I-year', '1-year'),
        ('-OR-', " or "),
        ('-', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Normalize every feature description, then display the resulting table.
features['feature_text'] = features['feature_text'].map(process_feature_text)
features

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family history of MI or Family history of myoc...
1,1,0,Family history of thyroid disorder
2,2,0,Chest pressure
3,3,0,Intermittent symptoms
4,4,0,Lightheaded
...,...,...,...
138,912,9,Family history of migraines
139,913,9,Female
140,914,9,Photophobia
141,915,9,No known illness contacts


#### Encoding all features with a pretrained sentence-transformer model:

In [9]:
# Load the pretrained sentence-embedding model and embed every feature description.
# The texts are passed as a plain list: encode() is documented for a string or a
# list of strings, and a list also avoids label-based lookups if the library
# indexes its input by integer position.
model = SentenceTransformer('all-mpnet-base-v2')
embeddings = model.encode(features['feature_text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
def bert(s, embeddings):
    """
    Find the row of `embeddings` closest to the encoding of `s`.

    `s` is encoded with the module-level `model`, compared against every row
    of `embeddings` by cosine similarity, and the best match is returned as
    a `(row_index, similarity)` pair.
    """
    query = model.encode(s).reshape(1, -1)
    # cosine_similarity yields one column (the single query); keep it as a 1-D vector.
    scores = cosine_similarity(embeddings, query)[:, 0]
    best = np.argmax(scores)
    return best, scores[best]

#### Main matching loop - get the most similar feature for every phrase

In [11]:
# Minimum cosine similarity for a phrase to be accepted as a match.
SIMILARITY_THRESHOLD = 0.48

# Extend this list to process more (case_num, feature_num) pairs.
for case, feature in [(8, 806)]:
    for i, row in train[train['case_num'] == case].iterrows():
        # Running character offset of the current phrase.
        # NOTE(review): this assumes consecutive phrases are always separated by
        # exactly two characters (the ', ' / '. ' separators used for the manual
        # split); phrases produced by the sentence tokenizer may not satisfy
        # that, so the spans can drift - confirm against the original notes.
        start = 0
        for phrase in row['phrases']:
            end = start + len(phrase)
            index, sim = bert(phrase, embeddings)
            # `index` is a row position in `embeddings`, so look the feature
            # up by position rather than by index label.
            f_num = features['feature_num'].iloc[index]
            if f_num == feature and sim > SIMILARITY_THRESHOLD:
                # Each cell holds its own list, so appending mutates the frame in place.
                train.at[i, 'similarity'].append(sim)
                train.at[i, 'phrase'].append(phrase)
                train.at[i, 'res_loc'].append((start, end))
            start += len(phrase) + 2

In [12]:
sys.path.insert(0, '../../experiments/theoviel_reproduction')
import reproduced_model

Using cache dir: /Users/yuvalmamana/.cache, please set environment variable TRANSFORMERS_CACHE to override


In [13]:
# Score the spans collected in 'res_loc' against the dataset's 'location' spans.
# NOTE(review): this calls a private helper; the argument order is assumed to be
# (predictions, ground truth) - confirm against reproduced_model.
reproduced_model._evaluate_f1(train['res_loc'], train['location'])

0.5248287671232877