In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import sklearn

In [2]:
# Main NLP Class

from pipeUtils import Document
from pipeUtils import Annotation
import re

class PadClassificationSystem:
    def __init__(self):
        #initiate necessary components        
        self.target_rules=self.getTargetRegexes()        
        self.negation_rules = self.getNegRegexes()
                
    def process(self, document):
        document_id = document.document_id
        ann_index=0
        for reg in self.target_rules:
            for match in reg.finditer(document.text):
                ann_id = 'NLP_'+ str(document_id) + '_' + str(ann_index)
                ann_index=ann_index+1
                new_annotation = Annotation(start_index=int(match.start()), 
                                    end_index=int(match.end()), 
                                    type='pad_annotation',
                                    ann_id = ann_id
                                    )
                new_annotation.spanned_text = document.text[new_annotation.start_index:new_annotation.end_index]

                # Check negation right before the found target up to 30 charachers before, 
                # making sure that the pre-text does not cross the text boundary and is valid

                if new_annotation.start_index - 30 > 0:
                    pre_text_start = new_annotation.start_index - 30
                else:
                    pre_text_start = 0

                # ending index of the pre_text is the beginning of the found target    
                pre_text_end = new_annotation.start_index    

                # substring the document text to identify the pre_text string
                pre_text = doc.text[pre_text_start: pre_text_end]

                # We do not need to know the exact location of the negation keyword, so re.search is acceptable
                for neg_regex in self.negation_rules:
                    if re.search(neg_regex, pre_text):
                        new_annotation.attributes["Negation"] ="Negated"

                document.annotations.append(new_annotation)
        
        return document 
    
    def getTargetRegexes(self):
        target_regexes = []
        regexes = [
            #r'\bp(eri\w+)?\s*(a(rt\w+)?|v(as\w+)?)\s*d(ise\w+)?\b',
            #r'\bhistory\s*(?:\w+\W+){1,5}?p(eri\w+)?\s*(a(rt\w+)?|v(as\w+)?)\s*d(ise\w+)?\b',
            #r'\bpvd\b',
            #r'\bhas\s*(?:\w+\W+){1,5}?p(eri\w+)?\s*(a(rt\w+)?|v(as\w+)?)\s*d(ise\w+)?\b',
            #r'\bsug(es\w+)\s*(?:\w+\W+){1,5}?p(eri\w+)?\s*(a(rt\w+)?|v(as\w+)?)\s*d(ise\w+)?\b',
            #r'\bdiagnosis\s*(?:\w+\W+){1,5}?p(eri\w+)?\s*(a(rt\w+)?|v(as\w+)?)\s*d(ise\w+)?\b',
            #r'\ba(nk\w+)?(\s*\-|\\|\/)?\s*b(ra\w+)?\s*i(nd\w+)?\b((\W*)?(?:\w+\W+){1,5}?(0*(\.\d{1,2}))|(1*(\.[4-9])))+'
            r'\ba(nk\w+)?(\s*\-|\\|\/)?\s*b(ra\w+)?\s*i(nd\w+)?\b',
            r'\ba(nk\w+)?(\s*\-|\\|\/)?\s*b(ra\w+)?\s*i(nd\w+)?\b\W*(?:\w+\W+){1,5}?(\d*\.\d*\d*)'
            
        ]
        for reg in regexes:
            target_regexes.append(re.compile(reg, re.IGNORECASE))
        return target_regexes

    def getNegRegexes(self):
        target_regexes = []
        regexes = [
            r'\bno\b',
            r'no\s*evidence\s*of'  ,
            r'does\s*not\s*have',
            r'denies'
            #r'^(?!.*(\bp(eri\w+)?\s*(a(rt\w+)?|v(as\w+)?)\s*d(ise\w+)?\b)).*$',
            #r'^(?!.*(\ba(nk\w+)?(\s*\-|\\|\/)?\s*b(ra\w+)?\s*i(nd\w+)?\b)).*$'
        ]
        for reg in regexes:
            target_regexes.append(re.compile(reg, re.IGNORECASE))
        return target_regexes

In [3]:
#  test case
nlp_system = PadClassificationSystem()
doc_text = '''
Patient has peripheral artery disease. ---------- \nPatient also has PVD or peripheral vascular\ndisease or pvd . 
\n The patient does not have any peripheral artery disease 
but has peripheral arterial disease . The patient denies having peripheral vascular disease . \n 
The patient has a femoral and illiac occlusion which is suggestive of peripheral arterial disease.\nthis is just some random text, has no meaning at all
'''
doc=Document(text=doc_text, document_id='Doc1')
 
out_doc=nlp_system.process(doc)
print(out_doc.toString())

Doc1
-------

Patient has peripheral artery disease. ---------- 
Patient also has PVD or peripheral vascular
disease or pvd . 

 The patient does not have any peripheral artery disease 
but has peripheral arterial disease . The patient denies having peripheral vascular disease . 
 
The patient has a femoral and illiac occlusion which is suggestive of peripheral arterial disease.
this is just some random text, has no meaning at all

-------



In [4]:
from pipeUtils import Annotation
from pipeUtils import Document
 
import os
import glob 

In [5]:
# Read all test documents
unid="u1166466"
project_1 = "ABI"
project_2 = "TEST"
path_1 = "/home/"+"u0420717"+"/BRAT/"+str(unid)+"/"+project_1
path_2 = "/home/"+"u0420717"+"/BRAT/"+str(unid)+"/"+project_2

In [6]:
test_docs=dict()
test_doc_paths = glob.glob(str(path_2+'/*.txt')) 
for d in test_doc_paths:
    doc = Document()
    #print(d)
    doc.load_document_from_file(d)
    #print(str(d[:-3])+'ann')
    doc.load_annotations_from_brat(str(d[:-3])+'ann')
    #print(os.path.basename(d))
    test_docs[os.path.basename(d)]=doc


test_docs 

{'10088_1.txt': <pipeUtils.Document at 0x7f5c5b6df400>,
 '1234_8.txt': <pipeUtils.Document at 0x7f5c5b6dfcf8>,
 '24631_2.txt': <pipeUtils.Document at 0x7f5c5b6df390>,
 '25879_5.txt': <pipeUtils.Document at 0x7f5c5b6df3c8>,
 '3678_0.txt': <pipeUtils.Document at 0x7f5c5b6df358>,
 '3830_3.txt': <pipeUtils.Document at 0x7f5c5b6dfd30>,
 '3830_7.txt': <pipeUtils.Document at 0x7f5c5b6df048>,
 '781_6.txt': <pipeUtils.Document at 0x7f5c5b6dfda0>,
 '781_9.txt': <pipeUtils.Document at 0x7f5c5b6df0b8>,
 '8396_4.txt': <pipeUtils.Document at 0x7f5c5b6ee5c0>}

In [7]:
# Processing the all notes
nlp_system = PadClassificationSystem()

for doc_id in  test_docs.keys():
    nlp_system.process(test_docs.get(doc_id))
    print(test_docs.get(doc_id))

<pipeUtils.Document object at 0x7f5c5b6dfda0>
<pipeUtils.Document object at 0x7f5c5b6dfd30>
<pipeUtils.Document object at 0x7f5c5b6df048>
<pipeUtils.Document object at 0x7f5c5b6df400>
<pipeUtils.Document object at 0x7f5c5b6df358>
<pipeUtils.Document object at 0x7f5c5b6df0b8>
<pipeUtils.Document object at 0x7f5c5b6dfcf8>
<pipeUtils.Document object at 0x7f5c5b6df3c8>
<pipeUtils.Document object at 0x7f5c5b6df390>
<pipeUtils.Document object at 0x7f5c5b6ee5c0>


In [8]:
tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).compare_types_by_span('ABI-VALUE','pad_annotation', True)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

for a in tp_list_total:
    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 0 FP = 0 FN = 0


In [9]:
tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
attributes_to_compare=[]
# To compare attributes, create a list of tuples for each pair to compare:
# attributes_to_compare.append[(A1_type, A1_att_name, A1_att_value),(A2_type, A2_att_name, A2_att_value)]
attributes_to_compare.append([('ABI', 'Positive', 'Negative','Unknown'),('pad_annotation', 'Negation', 'Negated','Hypothetical','Affirmed')])

for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).\
    compare_types_by_span_and_attributes('ABI-VALUE','pad_annotation', attributes_to_compare , False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

#for a in tp_list_total:
#    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 0 FP = 0 FN = 0


In [10]:
import pymysql
import pandas as pd
import getpass
import re

conn = pymysql.connect(host="mysql",
                       port=3306,user="jovyan",
                       passwd=getpass.getpass("Enter MySQL passwd for jovyan"),db='mimic2')
cursor = conn.cursor()

Enter MySQL passwd for jovyan········


In [11]:
pad_data = pd.read_sql("""SELECT subject_id, 
                      category, 
                      text FROM noteevents limit 100000 """,conn)

In [12]:
final_nlp_system = PadClassificationSystem()

In [13]:
output = []
counter = 0
for index , row in pad_data.sample(10000).iterrows():    
    doc = Document(document_id=str(row.subject_id) + '_' + str(index), text=row.text)
    final_nlp_system.process(doc)
    if(len(doc.annotations) > 0):
        i = 1
        for a in doc.annotations:
            if( a.type == 'pad_annotation'):
                neg_flag = 0
                # Switch the flag to 1 when the mention is negated
                if('definite_negated_existence' in a.attributes):
                    neg_flag=1
                ### Each row in the dictionary
                record_id  = str(row.subject_id) + '_' + str(index)+'_'+str(i)
                subject_id =  row.subject_id
                note_id = str(row.subject_id) + '_' + str(index)
                annotation_type = a.type
                snippet = doc.text[int(a.start_index): int(a.end_index)]
                out_list = [record_id, subject_id, note_id, annotation_type, \
                            a.start_index, a.end_index, \
                            snippet, neg_flag]
                output.append(out_list)
                i=i+1
                counter=counter+1
                # Print . after 10 identified records
                if counter%10 == 0:
                    print('.', end='')
        else:
            continue
        break
    

..

In [14]:
counter

24

In [15]:
columns=['record_id','subject_id', 'note_id', 'annotation_type', 'span_start', 'span_end', 'PAD_snippet', 'neg_flag']
result_data_frame = (pd.DataFrame(output, columns=columns))

result_data_frame.describe()
result_data_frame

Unnamed: 0,record_id,subject_id,note_id,annotation_type,span_start,span_end,PAD_snippet,neg_flag
0,1999_13810_1,1999,1999_13810,pad_annotation,1142,1145,ABI,0
1,14061_85453_1,14061,14061_85453,pad_annotation,1300,1303,ABI,0
2,14061_85453_2,14061,14061_85453,pad_annotation,1437,1440,ABI,0
3,14061_85453_3,14061,14061_85453,pad_annotation,2135,2138,ABI,0
4,14061_85453_4,14061,14061_85453,pad_annotation,2254,2257,ABI,0
5,14061_85453_5,14061,14061_85453,pad_annotation,1300,1309,ABI 0.53.,0
6,14061_85453_6,14061,14061_85453,pad_annotation,1437,1453,ABI of\n 0.47,0
7,10083_60081_1,10083,10083_60081,pad_annotation,1182,1202,ankle brachial index,0
8,10083_60081_2,10083,10083_60081,pad_annotation,1182,1209,ankle brachial index is 1.4,0
9,14123_85314_1,14123,14123_85314,pad_annotation,955,977,ankle brachial indices,0


In [16]:
result_data_frame.to_csv('ABI.csv', index=False)
print('Done')

Done
