In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import sklearn

In [2]:
# Main NLP Class

from pipeUtils import Document
from pipeUtils import Annotation
import re

class PadClassificationSystem:
    def __init__(self):
        #initiate necessary components        
        self.target_rules=self.getTargetRegexes()        
        self.negation_rules = self.getNegRegexes()
                
    def process(self, document):
        document_id = document.document_id
        ann_index=0
        for reg in self.target_rules:
            for match in reg.finditer(document.text):
                ann_id = 'NLP_'+ str(document_id) + '_' + str(ann_index)
                ann_index=ann_index+1
                new_annotation = Annotation(start_index=int(match.start()), 
                                    end_index=int(match.end()), 
                                    type='pad_annotation',
                                    ann_id = ann_id
                                    )
                new_annotation.spanned_text = document.text[new_annotation.start_index:new_annotation.end_index]

                # Check negation right before the found target up to 30 charachers before, 
                # making sure that the pre-text does not cross the text boundary and is valid

                if new_annotation.start_index - 30 > 0:
                    pre_text_start = new_annotation.start_index - 30
                else:
                    pre_text_start = 0

                # ending index of the pre_text is the beginning of the found target    
                pre_text_end = new_annotation.start_index    

                # substring the document text to identify the pre_text string
                pre_text = doc.text[pre_text_start: pre_text_end]

                # We do not need to know the exact location of the negation keyword, so re.search is acceptable
                for neg_regex in self.negation_rules:
                    if re.search(neg_regex, pre_text):
                        new_annotation.attributes["Negation"] ="Negated"

                document.annotations.append(new_annotation)
        
        return document 
    
    def getTargetRegexes(self):
        target_regexes = []
        regexes = [
            r'\bp(eri\w+)?\s*(a(rt\w+)?|v(as\w+)?)\s*d(ise\w+)?\b',
            r'\ba(nk\w+)?(\s*\-|\\|\/)?\s*b(ra\w+)?\s*i(nd\w+)?\b(\W*)?((?:\w+\W+){1,5}?((0*(\.\d{1,2}))|(1*(\.[4-9])))?)?',
            
        ]
        for reg in regexes:
            target_regexes.append(re.compile(reg, re.IGNORECASE))
        return target_regexes

    def getNegRegexes(self):
        target_regexes = []
        regexes = [
            r'/\bno\b',
            r'/no\s*evidence\s*of'  ,
            r'/does\s*not\s*have',
            r'/denies'
        ]
        for reg in regexes:
            target_regexes.append(re.compile(reg, re.IGNORECASE))
        return target_regexes

In [3]:
#  test case
nlp_system = PadClassificationSystem()
doc_text = '''
Patient has peripheral artery disease. ---------- \nPatient also has PVD or peripheral vascular\ndisease or pvd . 
\n The patient does not have any peripheral artery disease 
but has peripheral arterial disease . The patient denies having peripheral vascular disease . \n 
The patient has a femoral and illiac occlusion which is suggestive of peripheral arterial disease.
'''
doc=Document(text=doc_text, document_id='Doc1')
 
out_doc=nlp_system.process(doc)
print(out_doc.toString())

Doc1
-------

Patient has peripheral artery disease. ---------- 
Patient also has PVD or peripheral vascular
disease or pvd . 

 The patient does not have any peripheral artery disease 
but has peripheral arterial disease . The patient denies having peripheral vascular disease . 
 
The patient has a femoral and illiac occlusion which is suggestive of peripheral arterial disease.

-------
NLP_Doc1_0 pad_annotation 13 38 peripheral artery disease 
NLP_Doc1_1 pad_annotation 69 72 PVD 
NLP_Doc1_2 pad_annotation 76 103 peripheral vascular
disease 
NLP_Doc1_3 pad_annotation 107 110 pvd 
NLP_Doc1_4 pad_annotation 146 171 peripheral artery disease 
NLP_Doc1_5 pad_annotation 181 208 peripheral arterial disease 
NLP_Doc1_6 pad_annotation 237 264 peripheral vascular disease 
NLP_Doc1_7 pad_annotation 340 367 peripheral arterial disease 



In [4]:
from pipeUtils import Annotation
from pipeUtils import Document
 
import os
import glob 

In [5]:
# Read all test documents
unid="u1166466"
project_1 = "PAD_TRAIN"
project_2 = "PAD_ABI"
path_1 = "/home/"+str(unid)+"/BRAT/"+str(unid)+"/"+project_1
path_2 = "/home/"+"u0420717"+"/BRAT/"+str(unid)+"/"+project_2

In [6]:
test_docs=dict()
test_doc_paths = glob.glob(str(path_2+'/*.txt')) 
for d in test_doc_paths:
    doc = Document()
    #print(d)
    doc.load_document_from_file(d)
    #print(str(d[:-3])+'ann')
    doc.load_annotations_from_brat(str(d[:-3])+'ann')
    #print(os.path.basename(d))
    test_docs[os.path.basename(d)]=doc


test_docs

{'10083_27.txt': <pipeUtils.Document at 0x7fa0f4d1d320>,
 '10594_28.txt': <pipeUtils.Document at 0x7fa0f4d1ffd0>,
 '10594_29.txt': <pipeUtils.Document at 0x7fa0f4d95d68>,
 '12272_30.txt': <pipeUtils.Document at 0x7fa0f4d8beb8>,
 '12403_31.txt': <pipeUtils.Document at 0x7fa0f4d2ba90>,
 '12403_32.txt': <pipeUtils.Document at 0x7fa0f4d99390>,
 '12573_33.txt': <pipeUtils.Document at 0x7fa0f4d1d7b8>,
 '1266_4.txt': <pipeUtils.Document at 0x7fa0f4d99f60>,
 '1266_5.txt': <pipeUtils.Document at 0x7fa0f4d31e48>,
 '1266_6.txt': <pipeUtils.Document at 0x7fa0f4d7a4e0>,
 '13260_34.txt': <pipeUtils.Document at 0x7fa0f4d8add8>,
 '13625_35.txt': <pipeUtils.Document at 0x7fa0f4d8bfd0>,
 '1369_8.txt': <pipeUtils.Document at 0x7fa0f4d7b470>,
 '1371_7.txt': <pipeUtils.Document at 0x7fa0f4d31860>,
 '14566_36.txt': <pipeUtils.Document at 0x7fa0f4d1dfd0>,
 '15011_37.txt': <pipeUtils.Document at 0x7fa0f4d2be80>,
 '1604_10.txt': <pipeUtils.Document at 0x7fa194288b38>,
 '1604_9.txt': <pipeUtils.Document at 0x7f

In [15]:
# Processing the all notes
nlp_system = PadClassificationSystem()

for doc_id in  test_docs.keys():
    nlp_system.process(test_docs.get(doc_id))

In [16]:
tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).compare_types_by_span('PAD','pad_annotation', False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

for a in tp_list_total:
    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 0 FP = 177 FN = 0
NLP_17411_39.txt_0 pad_annotation 1073 1101 ankle-brachial index is 0.78 
NLP_17411_39.txt_1 pad_annotation 1501 1529 ankle-brachial index is 0.87 
NLP_1604_10.txt_0 pad_annotation 1142 1170 ankle-brachial indices, and  
NLP_1604_10.txt_1 pad_annotation 1320 1351 ankle-
     brachial index of . 
NLP_1604_10.txt_2 pad_annotation 1511 1518 ABI of  
NLP_12272_30.txt_0 pad_annotation 176 182 ABI B/ 
NLP_12272_30.txt_1 pad_annotation 557 563 ABI B/ 
NLP_12272_30.txt_2 pad_annotation 1074 1106 ankle brachial
     index could  
NLP_12272_30.txt_3 pad_annotation 1412 1440 Ankle brachial index is 0.79 
NLP_13625_35.txt_0 pad_annotation 440 443 PVD 
NLP_13625_35.txt_1 pad_annotation 1007 1031 ankle/brachial index is  
NLP_13625_35.txt_2 pad_annotation 1229 1253 ankle/brachial index is  
NLP_732_1.txt_0 pad_annotation 419 422 PVD 
NLP_732_1.txt_1 pad_annotation 986 1015 ankle-brachial index
     is  
NLP_21380_51.txt_0 pad_annotation 2116 2143 Peripheral vascular disease 
N

In [17]:
tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
attributes_to_compare=[]
# To compare attributes, create a list of tuples for each pair to compare:
# attributes_to_compare.append[(A1_type, A1_att_name, A1_att_value),(A2_type, A2_att_name, A2_att_value)]
attributes_to_compare.append([('PAD', 'Negation', 'Negated'),('pad_annotation', 'Negation', 'Negated')])

for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).\
    compare_types_by_span_and_attributes('PAD','pad_annotation', attributes_to_compare , False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

#for a in tp_list_total:
#    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 0 FP = 177 FN = 0
NLP_17411_39.txt_0 pad_annotation 1073 1101 ankle-brachial index is 0.78 
NLP_17411_39.txt_1 pad_annotation 1501 1529 ankle-brachial index is 0.87 
NLP_1604_10.txt_0 pad_annotation 1142 1170 ankle-brachial indices, and  
NLP_1604_10.txt_1 pad_annotation 1320 1351 ankle-
     brachial index of . 
NLP_1604_10.txt_2 pad_annotation 1511 1518 ABI of  
NLP_12272_30.txt_0 pad_annotation 176 182 ABI B/ 
NLP_12272_30.txt_1 pad_annotation 557 563 ABI B/ 
NLP_12272_30.txt_2 pad_annotation 1074 1106 ankle brachial
     index could  
NLP_12272_30.txt_3 pad_annotation 1412 1440 Ankle brachial index is 0.79 
NLP_13625_35.txt_0 pad_annotation 440 443 PVD 
NLP_13625_35.txt_1 pad_annotation 1007 1031 ankle/brachial index is  
NLP_13625_35.txt_2 pad_annotation 1229 1253 ankle/brachial index is  
NLP_732_1.txt_0 pad_annotation 419 422 PVD 
NLP_732_1.txt_1 pad_annotation 986 1015 ankle-brachial index
     is  
NLP_21380_51.txt_0 pad_annotation 2116 2143 Peripheral vascular disease 
N

In [None]:
import pymysql
import pandas as pd
import getpass
import re

conn = pymysql.connect(host="mysql",
                       port=3306,user="jovyan",
                       passwd=getpass.getpass("Enter MySQL passwd for jovyan"),db='mimic2')
cursor = conn.cursor()

In [None]:
pad_data = pd.read_sql("""SELECT subject_id, 
                      category, 
                      text FROM noteevents limit 100000 """,conn)

In [None]:
final_nlp_system = PadClassificationSystem()

In [None]:
output = []
counter = 0
for index , row in pad_data.sample(10000).iterrows():    
    doc = Document(document_id=str(row.subject_id) + '_' + str(index), text=row.text)
    final_nlp_system.process(doc)
    if(len(doc.annotations) > 0):
        i = 1
        for a in doc.annotations:
            if( a.type == 'pad_annotation'):
                neg_flag = 0
                # Switch the flag to 1 when the mention is negated
                if('definite_negated_existence' in a.attributes):
                    neg_flag=1
                ### Each row in the dictionary
                record_id  = str(row.subject_id) + '_' + str(index)+'_'+str(i)
                subject_id =  row.subject_id
                note_id = str(row.subject_id) + '_' + str(index)
                annotation_type = a.type
                snippet = doc.text[int(a.start_index): int(a.end_index)]
                out_list = [record_id, subject_id, note_id, annotation_type, \
                            a.start_index, a.end_index, \
                            snippet, neg_flag]
                output.append(out_list)
                i=i+1
                counter=counter+1
                # Print . after 10 identified records
                if counter%10 == 0:
                    print('.', end='')
        else:
            continue
        break
    

In [None]:
columns=['record_id','subject_id', 'note_id', 'annotation_type', 'span_start', 'span_end', 'PAD_snippet', 'neg_flag']
result_data_frame = (pd.DataFrame(output, columns=columns))

result_data_frame.describe()
result_data_frame

In [None]:
result_data_frame.to_csv('out_table.csv', index=False)
print('Done')