In [1]:
import os
import string
import collections
from collections import defaultdict
import spacy
from spacy.pipeline import EntityRuler
import csv
import scispacy
from scispacy.umls_linking import UmlsEntityLinker
from scispacy.linking import EntityLinker
import en_core_web_lg as lg
import en_core_sci_lg as scilg
import en_core_sci_sm as scism
from negspacy.negation import Negex
from spacy.matcher import PhraseMatcher
from spacy.matcher import Matcher
from sqlalchemy.engine import create_engine
import pandas as pd
import re

In [2]:
def clean_text(text):
    """Normalise a term or text fragment before lemmatisation/matching.

    The input is lower-cased, every character other than ASCII letters,
    digits, spaces, ``/`` and ``-`` is removed, and a single space sitting
    directly before or directly after a ``/`` is dropped.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    # Keep only letters, digits, spaces, '/' and '-'.
    filtered = re.compile('[^A-Za-z0-9 /-]+').sub('', lowered)
    # Remove a space that immediately follows or immediately precedes a slash.
    return re.compile(r'(?:(?<=\/) | (?=\/))').sub('', filtered)

In [3]:
def join_words(words):
    """Join tokens back into one string, gluing '-' and '/' to their neighbours.

    Tokens are separated by a single space, except that no space is inserted
    before a token that is '-' or '/', nor before a token that directly
    follows '-' or '/'.

    Args:
        words: Indexable sequence of token strings (may be empty).

    Returns:
        The joined string; an empty string when ``words`` is empty (an empty
        input previously raised IndexError).
    """
    if not words:
        return ''

    special = ('-', '/')
    pieces = [words[0]]
    # Walk consecutive (previous, current) pairs to decide on the separator.
    for previous, current in zip(words, words[1:]):
        if current in special or previous in special:
            pieces.append(current)
        else:
            pieces.append(' ' + current)

    return ''.join(pieces)

In [4]:
def write_to_csv(_dict, output):
    """Write a mapping to ``output`` as a two-column CSV (key, value).

    Args:
        _dict: Mapping whose items are written one per row, in iteration order.
        output: Path of the CSV file to create or overwrite.
    """
    # newline='' is required by the csv module; without it every row is
    # followed by an extra blank line on Windows.
    with open(output, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows([key, value] for key, value in _dict.items())

In [5]:
def write_to_csv_pos_neg_final(_dict_positive, _dict_negative, _dict_final, output):
    """Write the per-file positive / final / negative flags to a CSV file.

    The header is ``file_id`` followed by three columns per CDC symptom:
    ``<symptom>_p``, ``<symptom>`` and ``<symptom>_n`` (spaces replaced by
    underscores). Each data row is the file id followed by, for every key of
    that file's entry in ``_dict_positive``, the positive, final and negative
    values for that key.

    NOTE(review): the header order comes from the hard-coded symptom list while
    the row order comes from the inner dictionaries' key order; the columns
    only line up if both use the same order -- confirm against the gazetteer.

    Args:
        _dict_positive: ``{file_id: {symptom: flag}}`` of affirmed mentions.
        _dict_negative: Same shape, negated mentions.
        _dict_final: Same shape, combined value.
        output: Path of the CSV file to create or overwrite.
    """
    cdc_symptoms = ['cough', 'shortness of breath or difficulty breathing', 'fatigue', 'headache',
                    'ache', 'new loss of taste or smell', 'sore throat', 'congestion or runny nose',
                    'nausea or vomit', 'diarrhea', 'fever or chill']

    header = ['file_id']
    for symptom in cdc_symptoms:
        base = '_'.join(symptom.split())
        # Column order per symptom: positive, final (neutral name), negative.
        header.extend([base + '_p', base, base + '_n'])

    # newline='' stops the csv module's '\r\n' from being doubled on Windows.
    with open(output, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)

        for file_id, flags in _dict_positive.items():
            row = [file_id]
            for symptom in flags:
                row.extend([flags[symptom],
                            _dict_final[file_id][symptom],
                            _dict_negative[file_id][symptom]])
            writer.writerow(row)

In [6]:
def init_dict(_dict, notes_for_training, dict_gold_cdc_cui):
    """Zero-initialise the per-note flag table in place.

    For every note name (stripped of surrounding whitespace) and every key of
    ``dict_gold_cdc_cui``, sets ``_dict[note][key] = 0``.

    Args:
        _dict: Two-level mapping to fill; the inner mapping for a note must
            exist or be created on access (callers pass ``defaultdict(dict)``).
        notes_for_training: Iterable of note file names.
        dict_gold_cdc_cui: Mapping whose keys become the inner keys.
    """
    for note in notes_for_training:
        note_key = note.strip()
        for concept in dict_gold_cdc_cui:
            _dict[note_key][concept] = 0

In [7]:
# Uses a dictionary of list
def check_dict(_dict, element):
    """Return the first key whose value contains ``element``.

    Args:
        _dict: Mapping from a key to a container of members.
        element: The member to look for (tested with ``in``).

    Returns:
        The first key, in the mapping's iteration order, whose value contains
        ``element``; the string ``'null'`` when no value contains it.
    """
    matching_keys = (key for key, members in _dict.items() if element in members)
    return next(matching_keys, 'null')

In [8]:
def update_mdict(_dict, file, parent):
    """Mark ``parent`` as mentioned for ``file`` by storing 1 in ``_dict[file][parent]``.

    Args:
        _dict: Two-level mapping ``{file: {parent: flag}}``, modified in place.
        file: Outer key.
        parent: Inner key whose flag is set to 1.
    """
    flags_for_file = _dict[file]
    flags_for_file[parent] = 1

In [9]:
def update_final_mdict(_dict_final, _dict_positive, _dict_negative):
    """Combine the positive and negative flags into ``_dict_final`` in place.

    For every ``(file, symptom)`` already present in ``_dict_final``:
    a positive flag of 1 sets the final value to 1; a positive flag of 0
    together with a negative flag of 1 sets it to -1; otherwise the existing
    value is left untouched.

    Args:
        _dict_final: ``{file: {symptom: value}}`` to update.
        _dict_positive: Same shape, affirmed-mention flags.
        _dict_negative: Same shape, negated-mention flags.
    """
    for note, concepts in _dict_final.items():
        for concept in concepts:
            affirmed = _dict_positive[note][concept]
            negated = _dict_negative[note][concept]

            # An affirmed mention always wins over a negated one.
            if affirmed == 1:
                concepts[concept] = 1
            elif affirmed == 0 and negated == 1:
                concepts[concept] = -1

In [10]:
def diff(li1, li2):
    """Return the items of ``li1`` that do not occur in ``li2``.

    Order and duplicates of ``li1`` are preserved.

    Args:
        li1: Items to filter.
        li2: Items to exclude (membership tested with ``in``).

    Returns:
        A new list with the surviving items.
    """
    return [item for item in li1 if item not in li2]

In [11]:
def load_gaz_cdc(filename):
    """Load the gazetteer CSV into a ``{second column: [first column, ...]}`` mapping.

    The first two cells of each row are normalised with ``clean_text``,
    lemmatised with the ``en_core_web_lg`` spaCy model and re-joined with
    ``join_words``. The normalised first cell is appended to the list stored
    under the normalised second cell.

    Rows with fewer than two cells (e.g. blank lines), and rows where either
    cell normalises to an empty string, are skipped; they previously raised
    IndexError.

    Args:
        filename: Path of the gazetteer CSV file.

    Returns:
        ``defaultdict(list)`` mapping each normalised second-column value to
        the normalised first-column values seen with it.
    """
    nlp = spacy.load('en_core_web_lg')
    _dict = defaultdict(list)

    # newline='' lets the csv module do its own line-ending handling.
    with open(filename, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if len(row) < 2:
                continue

            normalised = []
            # Only the first two columns are used, so only those are lemmatised.
            for cell in row[:2]:
                lemmas = [token.lemma_ for token in nlp(clean_text(cell).strip())]
                normalised.append(join_words(lemmas) if lemmas else '')

            term, parent = normalised
            if not term or not parent:
                continue
            _dict[parent].append(term)

    return _dict

In [12]:
def create_gazetteer(filename):
    """Build the de-duplicated list of normalised gazetteer terms.

    The first two cells of every row are normalised with ``clean_text``,
    lemmatised with the ``en_core_web_lg`` spaCy model and re-joined with
    ``join_words``. Each distinct result is kept once, in order of first
    appearance (first cell before second cell within a row).

    Rows with fewer than two cells and cells that normalise to an empty string
    are skipped; they previously raised IndexError.

    Args:
        filename: Path of the gazetteer CSV file.

    Returns:
        List of unique normalised terms.
    """
    nlp = spacy.load('en_core_web_lg')
    gaz = []
    seen = set()  # constant-time membership test alongside the ordered list

    # newline='' lets the csv module do its own line-ending handling.
    with open(filename, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if len(row) < 2:
                continue

            for cell in row[:2]:
                lemmas = [token.lemma_ for token in nlp(clean_text(cell).strip())]
                term = join_words(lemmas) if lemmas else ''
                if term and term not in seen:
                    seen.add(term)
                    gaz.append(term)

    return gaz

In [13]:
def get_gaz_matches(tokenizer, phrases, texts, span_left, span_right):
    """Yield gazetteer phrase matches together with a surrounding context window.

    Args:
        tokenizer: spaCy tokenizer; its vocab backs the ``PhraseMatcher``.
        phrases: Pattern ``Doc`` objects to match.
        texts: Iterable of strings (the caller passes an open file, i.e. lines);
            each one is lower-cased before tokenisation.
        span_left: Number of tokens of context to include before the match.
        span_right: Number of tokens of context to include after the match.

    Yields:
        ``(match_id, matched_text, context_text)`` tuples, where
        ``context_text`` spans from ``span_left`` tokens before the match
        (clamped to the start of the text) to ``span_right`` tokens after it.
    """
    matcher = PhraseMatcher(tokenizer.vocab)
    matcher.add("Phrase", None, *phrases)
    for text in texts:
        doc = tokenizer(text.lower())
        # Touch every token's lexeme in the vocab before matching.
        # NOTE(review): presumably ensures the lexemes exist -- kept as-is.
        for w in doc:
            _ = doc.vocab[w.text]
        for ent_id, start, end in matcher(doc):
            # Clamp rather than drop the left context: a match near the start
            # of the text used to lose all preceding tokens, including cues
            # such as "no" or "denies" that negation detection relies on.
            window_start = max(start - span_left, 0)
            yield (ent_id, doc[start:end].text, doc[window_start:(end + span_right)].text)

In [14]:
def create_matcher(nlp):
    """Build a spaCy ``Matcher`` loaded with the symptom token patterns.

    Fixes two defects that made the function unusable: a missing comma between
    the ``congested`` and ``congestion ... runny nose`` patterns (which turned
    them into a list-subscript expression), and a match key computed with
    ``pattern[0].items()[1]``, which is not subscriptable in Python 3.

    Each pattern is registered under a key made of the first attribute value of
    every token spec joined by ``_`` (e.g. ``abdominal_pain``). The spaCy 2.x
    call form ``add(key, on_match, *patterns)`` is used, as elsewhere in this
    notebook.

    Args:
        nlp: Loaded spaCy pipeline whose vocab the matcher shares.

    Returns:
        The populated ``Matcher``.
    """
    patterns = [[{'LEMMA': 'abdominal'}, {'LEMMA': 'bloating'}],
                [{'LEMMA': 'abdominal'}, {'LEMMA': 'bloat'}],
                [{'LEMMA': 'abdominal'}, {'LEMMA': 'cramping'}],
                [{'LEMMA': 'abdominal'}, {'LEMMA': 'cramp'}],
                [{'LEMMA': 'abdominal'}, {'LEMMA': 'pain'}],
                ############## possible variations ############
                [{'LEMMA': 'stomach'}, {'LEMMA': 'pain'}],
                [{'LEMMA': 'stomach'}, {'LEMMA': 'bloating'}],
                [{'LEMMA': 'stomach'}, {'LEMMA': 'bloat'}],
                [{'LEMMA': 'stomach'}, {'LEMMA': 'cramping'}],
                [{'LEMMA': 'stomach'}, {'LEMMA': 'cramp'}],
                [{'LEMMA': 'gastric'}, {'LEMMA': 'bloating'}],
                [{'LEMMA': 'gastric'}, {'LEMMA': 'bloat'}],
                [{'LEMMA': 'gastric'}, {'LEMMA': 'cramping'}],
                [{'LEMMA': 'gastric'}, {'LEMMA': 'cramp'}],
                [{'LEMMA': 'gastric'}, {'LEMMA': 'pain'}],
                ###############################################
                [{'LEMMA': 'ache'}, {'POS': 'CCONJ'}, {'LEMMA': 'pain'}],
                [{'LEMMA': 'achey'}],
                [{'LEMMA': 'achiness'}],
                [{'LEMMA': 'achy'}],
                [{'LEMMA': 'asthenia'}],
                [{'LEMMA': 'calf'}, {'LEMMA': 'pain'}],
                ############# use case: can't and cannot ############
                [{'LEMMA': 'can'}, {'LEMMA': 'not'}, {'LEMMA': 'breathe'}],
                [{'LEMMA': 'can'}, {'LEMMA': 'not'}, {'LEMMA': 'smell'}],
                [{'LEMMA': 'can'}, {'LEMMA': 'not'}, {'LEMMA': 'taste'}],
                #####################################################
                [{'LOWER': 'catharsis'}],
                [{'LEMMA': 'cephalic'}, {'LEMMA': 'pain'}],
                [{'LEMMA': 'congested'}],
                [{'LEMMA': 'congestion'}, {'POS': 'CCONJ'}, {'LEMMA': 'runny'}, {'LEMMA': 'nose'}],
                [{'LEMMA': 'cramp'}],
                [{'LEMMA': 'diaphoretic'}],
                [{'LEMMA': 'discombobulation'}],
                [{'LEMMA': 'distaste'}],
                [{'LEMMA': 'doe'}],
                [{'LEMMA': 'dysphagia'}],
                [{'LEMMA': 'elevated'}, {'LEMMA': 'temp'}],
                ############## possible variations. use cases: high, higher ############
                [{'LEMMA': 'high'}, {'LEMMA': 'temp'}],
                [{'LEMMA': 'elevated'}, {'LEMMA': 'temperature'}],
                [{'LEMMA': 'high'}, {'LEMMA': 'temperature'}],
                ########################################################################
                [{'LEMMA': 'exhaustion'}],
                [{'LOWER': 'f'}, {'IS_PUNCT': True}, {'LOWER': 'c'}],
                [{'LEMMA': 'febrile'}],
                [{'LEMMA': 'fever'}, {'POS': 'CCONJ'}, {'LEMMA': 'chill'}],
                [{'LEMMA': 'food'}, {'LEMMA': 'aversion'}],
                [{'LEMMA': 'gag'}],
                [{'LEMMA': 'gi'}, {'LEMMA': 'symptom'}],
                [{'LEMMA': 'headache'}],
                [{'LEMMA': 'hyperthermia'}],
                [{'LEMMA': 'inspiratory'}, {'LEMMA': 'pain'}],
                [{'LEMMA': 'lack'}, {'TAG': 'IN'}, {'LEMMA': 'olfactory'}, {'LEMMA': 'sense'}],
                [{'LEMMA': 'loose'}, {'LEMMA': 'stool'}],
                [{'LEMMA': 'loss'}, {'TAG': 'IN'}, {'LEMMA': 'smell'}],
                [{'LEMMA': 'loss'}, {'TAG': 'IN'}, {'LEMMA': 'taste'}],
                [{'LEMMA': 'low'}, {'LEMMA': 'energy'}],
                [{'LEMMA': 'metallic'}, {'LEMMA': 'taste'}],
                ############## possible variation ############
                [{'LEMMA': 'metal'}, {'LEMMA': 'taste'}],
                ###############################################
                [{'LEMMA': 'mucusy'}],
                [{'LEMMA': 'muscle'}, {'LEMMA': 'pain'}],
                [{'LEMMA': 'myalgia'}],
                [{'LEMMA': 'myodynia'}],
                [{'LOWER': 'n'}, {'IS_PUNCT': True}, {'LOWER': 'v'}],
                [{'LEMMA': 'nausea'}, {'POS': 'CCONJ'}, {'LEMMA': 'vomit'}],
                ############## possible variation ############
                [{'LEMMA': 'nauseous'}],
                ##############################################
                [{'LEMMA': 'neck'}, {'LEMMA': 'pain'}],
                [{'LEMMA': 'new'}, {'LEMMA': 'loss'}, {'POS': 'ADP', 'TAG': 'IN'}, {'LEMMA': 'taste'}, {'POS': 'CCONJ', 'TAG': 'CC'}, {'LEMMA': 'smell'}],
                [{'LEMMA': 'not', 'TAG': 'RB'}, {'LEMMA': 'eat'}],
                [{'LEMMA': 'not', 'TAG': 'RB'}, {'LEMMA': 'hungry'}],
                [{'LEMMA': 'out'}, {'LEMMA': 'of', 'TAG': 'IN'}, {'LOWER': 'it', 'TAG': 'PRP'}],
                [{'LEMMA': 'pain'}, {'LEMMA': 'with', 'POS': 'ADP'}, {'LEMMA': 'inspiration'}],
                [{'LEMMA': 'painful'}, {'LEMMA': 'swallow'}],
                [{'LEMMA': 'pleurisy'}],
                [{'LOWER': 'pnd'}],
                [{'LEMMA': 'poor'}, {'LEMMA': 'appetite'}],
                [{'LOWER': 'purgation'}],
                [{'LEMMA': 'queasiness'}],
                [{'LEMMA': 'queasy'}],
                [{'LEMMA': 'respiratory'}, {'LEMMA': 'difficulty'}],
                [{'LEMMA': 'respiratory'}, {'LEMMA': 'distress'}],
                [{'LEMMA': 'retch'}],
                [{'LEMMA': 'rhinorrhea'}],
                [{'LEMMA': 'rigor'}],
                [{'LEMMA': 'rhinorrhea'}],
                [{'LEMMA': 'runny'}, {'LEMMA': 'nose'}],
                [{'LEMMA': 'scratchy'}, {'LEMMA': 'throat'}],
                [{'LEMMA': 'shake'}],
                [{'LEMMA': 'shakey'}],
                [{'LEMMA': 'shakiness'}],
                [{'LEMMA': 'short'}, {'POS': 'ADP'}, {'LEMMA': 'breath'}],
                [{'LEMMA': 'shortness'}, {'POS': 'ADP'}, {'LEMMA': 'breath'}],
                [{'LEMMA': 'shortness'}, {'POS': 'ADP'}, {'LEMMA': 'breath'}, {'POS': 'CCONJ'}, {'LEMMA': 'difficulty'}, {'LEMMA': 'breathing'}],
                ############## possible variation ############
                [{'LEMMA': 'short'}, {'POS': 'ADP'}, {'LEMMA': 'breath'}, {'POS': 'CCONJ'}, {'LEMMA': 'difficulty'}, {'LEMMA': 'breathing'}],
                ##############################################
                [{'LEMMA': 'sob'}],
                [{'LEMMA': 'sore'}, {'LEMMA': 'throat'}],
                [{'LEMMA': 'soreness'}],
                [{'LEMMA': 'stuffy'}, {'LEMMA': 'nose'}],
                [{'LEMMA': 'sweaty'}],
                ############## possible variation ############
                [{'LEMMA': 'sweat'}],
                [{'LEMMA': 'sweating'}],
                ##############################################
                [{'LEMMA': 'throw'}, {'LEMMA': 'up'}],
                [{'LOWER': 'tired'}],
                [{'LOWER': 'tiredness'}],
                [{'LOWER': 'tremor'}],
                [{'LOWER': 'trot'}],
                [{'LEMMA': 'trouble'}, {'LEMMA': 'breathing'}],
                [{'LEMMA': 'watery'}, {'LEMMA': 'stool'}],
                [{'LEMMA': 'weariness'}],
                [{'LEMMA': 'wear'}, {'LEMMA': 'out'}]
                ]

    matcher = Matcher(nlp.vocab)

    for pattern in patterns:
        # Readable match id: first attribute value of each token spec.
        # NOTE(review): repeated keys (e.g. the duplicated 'rhinorrhea' entry)
        # are expected to extend the existing rule in spaCy 2.x -- confirm.
        key = '_'.join(str(next(iter(token_spec.values()))) for token_spec in pattern)
        matcher.add(key, None, pattern)

    print('patterns added: {}'.format(len(patterns)))

    return matcher

In [15]:
def create_ruler(nlp):
    """Build an ``EntityRuler`` that tags the symptom token patterns as ``ENTITY``.

    Every token pattern below is wrapped as
    ``{'label': 'ENTITY', 'pattern': <token pattern>}`` and added to a ruler
    created with ``overwrite_ents=True``.

    Args:
        nlp: Loaded spaCy pipeline the ruler is created for.

    Returns:
        The populated ``EntityRuler`` (not yet added to the pipeline).
    """
    token_patterns = [
        [{'LEMMA': 'abdominal'}, {'LEMMA': 'bloating'}],
        [{'LEMMA': 'abdominal'}, {'LEMMA': 'bloat'}],
        [{'LEMMA': 'abdominal'}, {'LEMMA': 'cramping'}],
        [{'LEMMA': 'abdominal'}, {'LEMMA': 'cramp'}],
        [{'LEMMA': 'abdominal'}, {'LEMMA': 'pain'}],
        # --- possible variations ---
        [{'LEMMA': 'stomach'}, {'LEMMA': 'pain'}],
        [{'LEMMA': 'stomach'}, {'LEMMA': 'bloating'}],
        [{'LEMMA': 'stomach'}, {'LEMMA': 'bloat'}],
        [{'LEMMA': 'stomach'}, {'LEMMA': 'cramping'}],
        [{'LEMMA': 'stomach'}, {'LEMMA': 'cramp'}],
        [{'LEMMA': 'gastric'}, {'LEMMA': 'bloating'}],
        [{'LEMMA': 'gastric'}, {'LEMMA': 'bloat'}],
        [{'LEMMA': 'gastric'}, {'LEMMA': 'cramping'}],
        [{'LEMMA': 'gastric'}, {'LEMMA': 'cramp'}],
        [{'LEMMA': 'gastric'}, {'LEMMA': 'pain'}],
        # ---
        [{'LEMMA': 'ache'}, {'POS': 'CCONJ'}, {'LEMMA': 'pain'}],
        [{'LEMMA': 'achey'}],
        [{'LEMMA': 'achiness'}],
        [{'LEMMA': 'achy'}],
        [{'LEMMA': 'asthenia'}],
        [{'LEMMA': 'calf'}, {'LEMMA': 'pain'}],
        # --- use case: can't and cannot ---
        [{'LEMMA': 'can'}, {'LEMMA': 'not'}, {'LEMMA': 'breathe'}],
        [{'LEMMA': 'can'}, {'LEMMA': 'not'}, {'LEMMA': 'smell'}],
        [{'LEMMA': 'can'}, {'LEMMA': 'not'}, {'LEMMA': 'taste'}],
        # ---
        [{'LOWER': 'catharsis'}],
        [{'LEMMA': 'cephalic'}, {'LEMMA': 'pain'}],
        [{'LEMMA': 'congested'}],
        [{'LEMMA': 'congestion'}, {'POS': 'CCONJ'}, {'LEMMA': 'runny'}, {'LEMMA': 'nose'}],
        [{'LEMMA': 'cramp'}],
        [{'LEMMA': 'diaphoretic'}],
        [{'LEMMA': 'discombobulation'}],
        [{'LEMMA': 'distaste'}],
        [{'LEMMA': 'doe'}],
        [{'LEMMA': 'dysphagia'}],
        [{'LEMMA': 'elevated'}, {'LEMMA': 'temp'}],
        # --- possible variations. use cases: high, higher ---
        [{'LEMMA': 'high'}, {'LEMMA': 'temp'}],
        [{'LEMMA': 'elevated'}, {'LEMMA': 'temperature'}],
        [{'LEMMA': 'high'}, {'LEMMA': 'temperature'}],
        # ---
        [{'LEMMA': 'exhaustion'}],
        [{'LOWER': 'f'}, {'IS_PUNCT': True}, {'LOWER': 'c'}],
        [{'LEMMA': 'febrile'}],
        [{'LEMMA': 'fever'}, {'POS': 'CCONJ'}, {'LEMMA': 'chill'}],
        [{'LEMMA': 'food'}, {'LEMMA': 'aversion'}],
        [{'LEMMA': 'gag'}],
        [{'LEMMA': 'gi'}, {'LEMMA': 'symptom'}],
        [{'LEMMA': 'headache'}],
        [{'LEMMA': 'hyperthermia'}],
        [{'LEMMA': 'inspiratory'}, {'LEMMA': 'pain'}],
        [{'LEMMA': 'lack'}, {'TAG': 'IN'}, {'LEMMA': 'olfactory'}, {'LEMMA': 'sense'}],
        [{'LEMMA': 'loose'}, {'LEMMA': 'stool'}],
        [{'LEMMA': 'loss'}, {'TAG': 'IN'}, {'LEMMA': 'smell'}],
        [{'LEMMA': 'loss'}, {'TAG': 'IN'}, {'LEMMA': 'taste'}],
        [{'LEMMA': 'low'}, {'LEMMA': 'energy'}],
        [{'LEMMA': 'metallic'}, {'LEMMA': 'taste'}],
        # --- possible variation ---
        [{'LEMMA': 'metal'}, {'LEMMA': 'taste'}],
        # ---
        [{'LEMMA': 'mucusy'}],
        [{'LEMMA': 'muscle'}, {'LEMMA': 'pain'}],
        [{'LEMMA': 'myalgia'}],
        [{'LEMMA': 'myodynia'}],
        [{'LOWER': 'n'}, {'IS_PUNCT': True}, {'LOWER': 'v'}],
        [{'LEMMA': 'nausea'}, {'POS': 'CCONJ'}, {'LEMMA': 'vomit'}],
        # --- possible variation ---
        [{'LEMMA': 'nauseous'}],
        # ---
        [{'LEMMA': 'neck'}, {'LEMMA': 'pain'}],
        [{'LEMMA': 'new'}, {'LEMMA': 'loss'}, {'POS': 'ADP', 'TAG': 'IN'}, {'LEMMA': 'taste'}, {'POS': 'CCONJ', 'TAG': 'CC'}, {'LEMMA': 'smell'}],
        [{'LEMMA': 'not', 'TAG': 'RB'}, {'LEMMA': 'eat'}],
        [{'LEMMA': 'not', 'TAG': 'RB'}, {'LEMMA': 'hungry'}],
        [{'LEMMA': 'out'}, {'LEMMA': 'of', 'TAG': 'IN'}, {'LOWER': 'it', 'TAG': 'PRP'}],
        [{'LEMMA': 'pain'}, {'LEMMA': 'with', 'POS': 'ADP'}, {'LEMMA': 'inspiration'}],
        [{'LEMMA': 'painful'}, {'LEMMA': 'swallow'}],
        [{'LEMMA': 'pleurisy'}],
        [{'LOWER': 'pnd'}],
        [{'LEMMA': 'poor'}, {'LEMMA': 'appetite'}],
        [{'LOWER': 'purgation'}],
        [{'LEMMA': 'queasiness'}],
        [{'LEMMA': 'queasy'}],
        [{'LEMMA': 'respiratory'}, {'LEMMA': 'difficulty'}],
        [{'LEMMA': 'respiratory'}, {'LEMMA': 'distress'}],
        [{'LEMMA': 'retch'}],
        [{'LEMMA': 'rhinorrhea'}],
        [{'LEMMA': 'rigor'}],
        [{'LEMMA': 'rhinorrhea'}],
        [{'LEMMA': 'runny'}, {'LEMMA': 'nose'}],
        [{'LEMMA': 'scratchy'}, {'LEMMA': 'throat'}],
        [{'LEMMA': 'shake'}],
        [{'LEMMA': 'shakey'}],
        [{'LEMMA': 'shakiness'}],
        [{'LEMMA': 'short'}, {'POS': 'ADP'}, {'LEMMA': 'breath'}],
        [{'LEMMA': 'shortness'}, {'POS': 'ADP'}, {'LEMMA': 'breath'}],
        [{'LEMMA': 'shortness'}, {'POS': 'ADP'}, {'LEMMA': 'breath'}, {'POS': 'CCONJ'}, {'LEMMA': 'difficulty'}, {'LEMMA': 'breathing'}],
        # --- possible variation ---
        [{'LEMMA': 'short'}, {'POS': 'ADP'}, {'LEMMA': 'breath'}, {'POS': 'CCONJ'}, {'LEMMA': 'difficulty'}, {'LEMMA': 'breathing'}],
        # ---
        [{'LEMMA': 'sob'}],
        [{'LEMMA': 'sore'}, {'LEMMA': 'throat'}],
        [{'LEMMA': 'soreness'}],
        [{'LEMMA': 'stuffy'}, {'LEMMA': 'nose'}],
        [{'LEMMA': 'sweaty'}],
        # --- possible variation ---
        [{'LEMMA': 'sweat'}],
        [{'LEMMA': 'sweating'}],
        # ---
        [{'LEMMA': 'throw'}, {'LEMMA': 'up'}],
        [{'LOWER': 'tired'}],
        [{'LOWER': 'tiredness'}],
        [{'LOWER': 'tremor'}],
        [{'LOWER': 'trot'}],
        [{'LEMMA': 'trouble'}, {'LEMMA': 'breathing'}],
        [{'LEMMA': 'watery'}, {'LEMMA': 'stool'}],
        [{'LEMMA': 'weariness'}],
        [{'LEMMA': 'wear'}, {'LEMMA': 'out'}],
    ]

    # Every pattern shares the same entity label.
    labelled_patterns = [{'label': 'ENTITY', 'pattern': tokens} for tokens in token_patterns]

    entity_ruler = EntityRuler(nlp, overwrite_ents=True)
    entity_ruler.add_patterns(labelled_patterns)

    return entity_ruler

In [16]:
def create_gazetteer_lexicon_count_dictionary(gazetteer):
    """Create a per-term occurrence counter initialised to zero.

    Args:
        gazetteer: Iterable of gazetteer terms.

    Returns:
        ``defaultdict(int)`` with every gazetteer term mapped to 0. The default
        factory is ``int`` (it used to be ``list``) so that incrementing a term
        that is not in the gazetteer starts from 0 instead of failing with
        ``[] + 1``.
    """
    dict_lexicon_count = defaultdict(int)
    for term in gazetteer:
        dict_lexicon_count[term] = 0

    return dict_lexicon_count

In [17]:
def mention_using_gaz(gaz_csv, notes_for_training, doc_folder, dict_gaz, output, output_lex_count):
    """Detect affirmed / negated symptom mentions in notes and write the results.

    For each note, gazetteer phrases are located line by line with
    ``get_gaz_matches``; the context window around each match is lemmatised,
    split on '.', and every fragment of at least two tokens is run through the
    scispacy pipeline extended with the entity ruler and negspacy ``Negex``.
    Each entity whose text appears in a ``dict_gaz`` value list sets the
    positive or negative flag of its parent for that note and increments the
    per-term counter.

    Compared with the previous version, work whose result was never used has
    been removed (lemmatising the matched mention on every hit, building a
    ``content`` string, two unused span constants), and the counter increment
    no longer depends on the default factory of the counter dictionary.

    Args:
        gaz_csv: Path of the gazetteer CSV used to build the phrase patterns.
        notes_for_training: Iterable of note file names inside ``doc_folder``.
        doc_folder: Directory containing the note files.
        dict_gaz: Mapping ``{parent: [term, ...]}`` used to resolve an entity
            text to its parent symptom.
        output: Path of the positive/final/negative CSV to write.
        output_lex_count: Path of the per-term count CSV to write.

    Returns:
        ``{note: {parent: value}}`` where value is 1 (affirmed), -1 (only
        negated) or 0 (not mentioned).
    """
    # Tokenizer-only pipeline used for the gazetteer phrase matching.
    nlp = spacy.blank('en')
    nlp.vocab.lex_attr_getters = {}
    gaz = create_gazetteer(gaz_csv)
    patterns = [nlp.make_doc(text) for text in gaz]

    nlp_lemma = lg.load()

    dict_files_positive = defaultdict(dict)
    init_dict(dict_files_positive, notes_for_training, dict_gaz)

    dict_files_negative = defaultdict(dict)
    init_dict(dict_files_negative, notes_for_training, dict_gaz)

    dict_files_final = defaultdict(dict)
    init_dict(dict_files_final, notes_for_training, dict_gaz)

    dict_lex_count = create_gazetteer_lexicon_count_dictionary(gaz)

    span_left = 10   # tokens of context kept before a match
    span_right = 2   # tokens of context kept after a match
    min_tokens = 2   # fragments shorter than this are not analysed

    # Negation pipeline: scispacy model + entity linker + symptom ruler + negex.
    # The order in which the components are added is preserved from the original.
    nlp_neg = scilg.load()
    linker = EntityLinker(resolve_abbreviations=True, name="umls")
    nlp_neg.add_pipe(linker)
    ruler = create_ruler(nlp_neg)
    nlp_neg.add_pipe(ruler)
    negex = Negex(nlp_neg, language = "en_clinical_sensitive", chunk_prefix = ["without", "no"])
    negex.add_patterns(preceding_negations = ['deny'])
    nlp_neg.add_pipe(negex, last = True)

    for file in notes_for_training:
        name = file.strip()
        with open(os.path.join(doc_folder, file), 'r') as f:
            for _ent_id, _men, text in get_gaz_matches(nlp.tokenizer, patterns, f, span_left, span_right):
                sent_words = [token.lemma_ for token in nlp_lemma(text.lower().strip())]
                new_str_sent = join_words(sent_words)

                # Split on '.' so a negation in one sentence does not leak into
                # the next, e.g. '... no fever. Patient has sore throat ...'.
                for sub in new_str_sent.split('.'):
                    if len(sub.split()) < min_tokens:
                        continue

                    for e in nlp_neg(sub).ents:
                        parent = check_dict(dict_gaz, e.text)
                        if parent == 'null':
                            continue

                        # Tolerates a missing or non-numeric default value.
                        dict_lex_count[e.text] = (dict_lex_count.get(e.text) or 0) + 1

                        if e._.negex:
                            update_mdict(dict_files_negative, name, parent)
                        else:
                            update_mdict(dict_files_positive, name, parent)

    update_final_mdict(dict_files_final, dict_files_positive, dict_files_negative)
    write_to_csv_pos_neg_final(dict_files_positive, dict_files_negative, dict_files_final, output)
    write_to_csv(dict_lex_count, output_lex_count)

    return dict_files_final

In [18]:
def select_notes_for_gaz_tuning(engine_name):
    """Fetch the note file names listed in ``public.training_notes``.

    Args:
        engine_name: SQLAlchemy database URL passed to ``create_engine``.

    Returns:
        List with the values of the ``file`` column.
    """
    query = """SELECT file
             FROM public.training_notes"""

    db_engine = create_engine(engine_name)
    notes_frame = pd.read_sql(query, db_engine)
    return notes_frame.file.to_list()

In [19]:
def select_notes_for_gold_standard(gold_standard_notes):
    """Read the gold-standard metadata CSV and return the note file names.

    The first row is treated as a header and skipped. Every other non-blank
    row must have exactly five columns
    (mdm_link_id, encounter_date, note_id, pat_id, note_status) and yields
    ``'<pat_id>_<note_id>.txt'``.

    An empty file now returns an empty list (it used to raise IndexError from
    ``pop(0)``) and blank rows are skipped instead of breaking the unpacking.

    Args:
        gold_standard_notes: Path of the annotation metadata CSV.

    Returns:
        List of gold-standard note file names.

    Raises:
        ValueError: If a non-blank data row does not have exactly five columns.
    """
    gold_notes = []
    # newline='' lets the csv module do its own line-ending handling.
    with open(gold_standard_notes, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header row, if there is one
        for row in reader:
            if not row:
                continue
            _mdm_link_id, _encounter_date, note_id, pat_id, _note_status = row
            gold_notes.append(pat_id + '_' + note_id + '.txt')

    print('length of notes for gold evaluation: {}'.format(len(gold_notes)))
    return gold_notes

In [20]:
def main():
    """Run the gazetteer tuning pipeline on the non-gold training notes.

    Loads the gazetteer, reads the gold-standard note list and the full note
    list, removes the gold notes from the full list, and runs
    ``mention_using_gaz`` on the remainder, writing the two output CSV files.

    The database URL is read from the ``COVID_DB_URL`` environment variable
    (a SQLAlchemy URL such as ``postgresql+psycopg2://user:password@host/db``)
    instead of being hard-coded, so credentials are not stored in the notebook.

    Raises:
        RuntimeError: If ``COVID_DB_URL`` is not set.
    """
    # Fail fast, before the expensive model loading, if the DB URL is missing.
    engine_name = os.environ.get('COVID_DB_URL')
    if not engine_name:
        raise RuntimeError('Set the COVID_DB_URL environment variable to the '
                           'SQLAlchemy database URL before running main().')

    gaz_csv = 'GAZ_group.csv'
    dict_gaz_cdc = load_gaz_cdc(gaz_csv)

    gold_standard_csv = 'Z:\\ed_provider_notes\\methods_paper\\gold_standard_eval\\annotation_metadata.csv'
    doc_folder = 'Z:\\ed_provider_notes\\methods_paper\\notes_for_analysis'
    output_gaz = 'covid_gaz_cdc_mentions.csv'
    output_lex_count = 'covid_gaz_lexicon_count.csv'

    gold_standard_notes = select_notes_for_gold_standard(gold_standard_csv)
    all_notes = select_notes_for_gaz_tuning(engine_name)
    print('total notes including some gold notes: {}'.format(len(all_notes)))
    # Gold-standard notes are held out of tuning.
    training_notes = diff(all_notes, gold_standard_notes)
    print('total notes for training: {}'.format(len(training_notes)))
    mention_using_gaz(gaz_csv, training_notes, doc_folder, dict_gaz_cdc, output_gaz, output_lex_count)

if __name__ == "__main__":
    main()

length of notes for gold evaluation: 53
total notes including some gold notes: 1700
total notes for training: 1690


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]
