## Similarity of phrases

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import ast
import sys
sys.path.insert(0, '../../data')
import data_loaders

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yuvalmamana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Load the data and keep only the rows for feature 800 (Increased-appetite)

In [2]:
# Load the cleaned training set and keep only the rows for feature 800.
# `.copy()` makes the filtered frame independent of the loaded one, so the
# columns added to it in later cells are not chained assignments on a slice
# (which pandas reports as SettingWithCopyWarning).
train = data_loaders.get_clean_train_data()
train = train[train['feature_num'] == 800].copy()

In [3]:
def add_phrases_manual(train):
    """
    'Manually' divide patient notes into phrases by splitting on ', ' and '. '.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a 'pn_history' column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and an added 'phrases_manual'
        column holding, for every note, the flat list of its phrases.
        The frame passed in is left unchanged.
    """
    # reset_index(drop=True) returns a new frame, so adding the column below
    # neither mutates the caller's frame nor assigns into a filtered slice.
    # It also discards the old index whatever its name is.
    train = train.reset_index(drop=True)

    # Split on ', ' first, then split every resulting chunk on '. ',
    # flattening both levels into a single list per note.
    train['phrases_manual'] = train['pn_history'].apply(
        lambda note: [
            phrase
            for chunk in note.split(', ')
            for phrase in chunk.split('. ')
        ]
    )

    return train

def add_phrases_tokenizer(train, tokenizer):
    """
    Return a copy of train with 2 extra columns:
     - phrases_tokenizer: each patient note ('pn_history') divided into
       phrases by the given tokenizer
     - phrases: every entry of 'phrases_manual' divided further by the given
       tokenizer, flattened into one list per note

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a 'pn_history' column of strings and a 'phrases_manual'
        column of lists of strings.
    tokenizer : callable
        Maps a string to a list of strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train`` (same index) with the two columns added. The frame
        passed in is left unchanged.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    train = train.copy()

    train['phrases_tokenizer'] = train['pn_history'].apply(tokenizer)
    # Tokenize every manual phrase and flatten the per-phrase results.
    train['phrases'] = train['phrases_manual'].apply(
        lambda chunks: [piece for chunk in chunks for piece in tokenizer(chunk)]
    )

    return train

In [4]:
# Split each note into phrases: first on ', ' / '. ', then with NLTK's sent_tokenize.
train = add_phrases_tokenizer(add_phrases_manual(train), sent_tokenize)

# Initialise the result columns. Every row gets its own empty list
# (built by the comprehension), so rows never share one list object.
n_rows = len(train)
for result_col in ('similarity', 'phrase', 'res_loc'):
    train[result_col] = [[] for _ in range(n_rows)]

#### The pre-processed train dataset:

In [5]:
# Print the phrases extracted from the note labelled 34, one per line.
for sample_phrase in train['phrases'][34]:
    print('-', sample_phrase)

# Last expression of the cell: render the pre-processed frame.
train

- HPI:67 Y F c/o trouble sleeping since last 3 weeks
- son died in a motor vehicle accident 3 weeks ago when the symptoms started,takes AMPIEN for problem but not relieved completely
- no compla complains of eating more since then
- no complaints of,no weight gain
- constipation.
- PMH:kc/o high blood pressure takes HCTZ since last 15 years and LIsinopril since last 5 years
- h/o breast cancer- lumpectomy 10 years ago
 h/o laparotomy for appendicitis at 20 years of age
- no history of travel
FH: not significant

SH: retired receptionist
- lives with husband,
      no smokig
- drinks occasionally
-  no recreational drugs
      sexually active with husband


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location_raw,feature_text,pn_history,location,phrases_manual,phrases_tokenizer,phrases,similarity,phrase,res_loc
0,80039_800,8,80039,800,"[appetite has increased, eating more]","[468 490, 502 513]",Increased-appetite,Loraine Wicks is a 67 y.o. woman here for diff...,"[(468, 490), (502, 513)]","[Loraine Wicks is a 67 y.o, woman here for dif...","[Loraine Wicks is a 67 y.o., woman here for di...","[Loraine Wicks is a 67 y.o, woman here for dif...",[],[],[]
1,80042_800,8,80042,800,[],[],Increased-appetite,67yo F presents with trouble sleeping. She rep...,[],"[67yo F presents with trouble sleeping, She re...","[67yo F presents with trouble sleeping., She r...","[67yo F presents with trouble sleeping, She re...",[],[],[]
2,80066_800,8,80066,800,[eating more than usual],[364 386],Increased-appetite,Pt is a 67 yo F presenting with insomnia x 3 w...,"[(364, 386)]",[Pt is a 67 yo F presenting with insomnia x 3 ...,[Pt is a 67 yo F presenting with insomnia x 3 ...,[Pt is a 67 yo F presenting with insomnia x 3 ...,[],[],[]
3,80098_800,8,80098,800,[Increased appetite],[255 273],Increased-appetite,Mrs Loraine Wicks a 67 Y old F pt presenting t...,"[(255, 273)]",[Mrs Loraine Wicks a 67 Y old F pt presenting ...,[Mrs Loraine Wicks a 67 Y old F pt presenting ...,[Mrs Loraine Wicks a 67 Y old F pt presenting ...,[],[],[]
4,80245_800,8,80245,800,[],[],Increased-appetite,Pt is a 67 year old female with PMH of high bl...,[],[Pt is a 67 year old female with PMH of high b...,[Pt is a 67 year old female with PMH of high b...,[Pt is a 67 year old female with PMH of high b...,[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,84191_800,8,84191,800,[appetite is increased],[510 531],Increased-appetite,"Identity: Loraine Wicks, a 67-year-old female\...","[(510, 531)]","[Identity: Loraine Wicks, a 67-year-old female...","[Identity: Loraine Wicks, a 67-year-old female...","[Identity: Loraine Wicks, a 67-year-old female...",[],[],[]
96,84231_800,8,84231,800,[appetite has increased],[358 380],Increased-appetite,Loraine Wicks is a 67 yo F with a history of H...,"[(358, 380)]",[Loraine Wicks is a 67 yo F with a history of ...,[Loraine Wicks is a 67 yo F with a history of ...,[Loraine Wicks is a 67 yo F with a history of ...,[],[],[]
97,84255_800,8,84255,800,[appetite has increased],[503 525],Increased-appetite,"HPI: 67 yo F, G1P1, c/o trouble sleeping for t...","[(503, 525)]","[HPI: 67 yo F, G1P1, c/o trouble sleeping for ...","[HPI: 67 yo F, G1P1, c/o trouble sleeping for ...","[HPI: 67 yo F, G1P1, c/o trouble sleeping for ...",[],[],[]
98,84273_800,8,84273,800,[Increased appetite],[211 229],Increased-appetite,"67F w/ PMH of breast CA in remission, presenti...","[(211, 229)]","[67F w/ PMH of breast CA in remission, present...","[67F w/ PMH of breast CA in remission, present...","[67F w/ PMH of breast CA in remission, present...",[],[],[]


#### Loading features data:

In [6]:
# Feature table; its 'feature_text' and 'feature_num' columns are used by the cells below.
features = data_loaders.get_features()

#### Loading a pretrained sentence-transformer model and embedding all feature texts:

In [7]:
# Load the pretrained 'all-mpnet-base-v2' sentence-transformer; nothing is trained here.
model = SentenceTransformer('all-mpnet-base-v2')

# One embedding per feature text, in the row order of `features`.
# A plain list is passed rather than the pandas Series so that any positional
# indexing inside `encode` cannot be affected by the DataFrame's index labels.
embeddings = model.encode(features['feature_text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [8]:
def bert(s, embeddings):
    """
    Find the embedding most similar to a given string.

    The string is encoded with the module-level ``model`` and compared by
    cosine similarity against every row of ``embeddings``.

    Returns a tuple ``(position, similarity)``: the row position of the best
    match in ``embeddings`` and its cosine similarity to ``s``.
    """
    query = model.encode(s).reshape(1, -1)
    # cosine_similarity yields one row per embedding and a single column for the query.
    scores = cosine_similarity(embeddings, query)
    column = scores[:, 0]
    best = column.argmax()
    return best, column[best]

#### Main matching loop - get the most similar feature for every phrase

In [9]:
TARGET_FEATURE_NUM = 800      # feature under evaluation; same value used to filter `train`
SIMILARITY_THRESHOLD = 0.48   # minimum cosine similarity for a phrase to count as a match

for i, row in train.iterrows():
    note = row['pn_history']

    # Empty this row's result lists first so re-running the cell
    # does not append the same matches a second time.
    for result_col in ('similarity', 'phrase', 'res_loc'):
        train.loc[i, result_col].clear()

    cursor = 0  # position in the note after the previously located phrase
    for phrase in row['phrases']:
        # Locate the phrase in the original note rather than assuming every
        # delimiter is exactly 2 characters: the tokenizer can also split on
        # newlines or other whitespace, which makes fixed-width offsets drift.
        start = note.find(phrase, cursor)
        if start == -1:
            # Phrase not found verbatim; fall back to the running position.
            start = cursor
        end = start + len(phrase)
        cursor = end

        index, sim = bert(phrase, embeddings)
        # `index` is a row position in `embeddings`, so look it up positionally.
        f_num = features['feature_num'].iloc[index]
        if f_num == TARGET_FEATURE_NUM and sim > SIMILARITY_THRESHOLD:
            train.loc[i, 'similarity'].append(sim)
            train.loc[i, 'phrase'].append(phrase)
            train.loc[i, 'res_loc'].append((start, end))

In [10]:
# Order notes by their similarity lists (highest first) and show the matched
# phrases and spans next to the annotation columns.
result_columns = ['pn_num', 'annotation', 'phrase', 'location', 'res_loc', 'similarity']
train.sort_values('similarity', ascending=False)[result_columns]

Unnamed: 0,pn_num,annotation,phrase,location,res_loc,similarity
14,80656,[increased appetite],[increased appetite],"[(411, 429)]","[(411, 429)]",[0.9319909]
86,83907,[increased appetite],[increased appetite],"[(398, 416)]","[(398, 416)]",[0.9319909]
31,81437,[increased appetite],[increased appetite],"[(618, 636)]","[(618, 636)]",[0.9319909]
76,83376,[increased appetite],[increased appetite],"[(323, 341)]","[(323, 341)]",[0.9319909]
41,81856,[increased appetite],[increased appetite],"[(510, 528)]","[(510, 528)]",[0.9319909]
...,...,...,...,...,...,...
24,81015,[],[],[],[],[]
70,82967,[],[],[],[],[]
38,81687,[increase appettite],[],"[(262, 280)]",[],[]
23,80950,[EATING MORE],[],"[(311, 322)]",[],[]


In [12]:
# Keep the 'location' spans and the spans found by the loop ('res_loc') side by side for comparison.
a = train[['location', 'res_loc']]

The main weakness of this approach is the way the notes are divided into phrases.
I believe that a better splitting method would improve the results significantly.
The same approach can be applied to other features as well.