In [None]:
import pandas as pd
import spacy
import re
from termcolor import colored
from spacy import displacy
import nltk
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
# Stanford NER tagger (through NLTK) using the 3-class English CRF classifier file.
# getLocsNltk uses it to find tokens tagged LOCATION. Both paths are relative to
# the working directory the notebook is run from.
st = StanfordNERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                        'stanford-ner/stanford-ner.jar',
                        encoding='utf-8')
# General-purpose spaCy pipeline: the other cells read its entities, noun chunks,
# sentences and dependency labels.
nlp = spacy.load('en_core_web_lg')
# Second spaCy pipeline; getStats reads DISEASE entity labels from its output.
nlpSci = spacy.load("en_ner_bc5cdr_md")

In [None]:
# Load the classification CSV. The first row of the file is skipped and the three
# columns are named explicitly; later cells filter on 'is_epi' and pass 'abs'
# (presumably the abstract text -- confirm against the file) to the extractors.
epi_abs = pd.read_csv('epidemiology_classifications.csv', header=None, skiprows=[0],
                        names=['is_epi','pmid','abs'])

In [None]:
def printHighlighted(doc, indices):
    """Print ``doc`` with the given token spans highlighted.

    Args:
        doc: Sliceable document; every slice must expose a ``.text`` attribute.
        indices: Sequence of ``[start, end]`` token spans, in document order.
            Each span is rendered bold red on yellow via ``colored``.

    Returns:
        None. The assembled string is written with ``print``.
    """
    pieces = []
    cursor = 0
    for span in indices:
        # Plain text between the previous highlight and this one ...
        pieces.append(doc[cursor:span[0]].text)
        # ... followed by the highlighted span itself.
        pieces.append(colored(doc[span[0]:span[1]].text, 'red', 'on_yellow', attrs=['bold']))
        cursor = span[1]
    # Whatever follows the last highlighted span.
    pieces.append(doc[cursor:].text)
    print(' '.join(pieces))

In [None]:
def removeDuplicates(a):
    """Remove repeated elements from the list ``a`` in place.

    The first occurrence of every element is kept and the relative order of the
    survivors is unchanged. Duplicates are removed wherever they appear, not
    only when they sit next to each other, so the function also works on
    unsorted lists (e.g. ``[A, B, A, B]`` becomes ``[A, B]``).

    Args:
        a: List to de-duplicate. Elements only need to support ``==`` (they may
            be unhashable, such as ``[start, end]`` lists).

    Returns:
        None. ``a`` itself is modified.
    """
    unique = []
    for item in a:
        # Linear membership test keeps unhashable elements supported; the lists
        # handled here are short.
        if item not in unique:
            unique.append(item)
    # Slice assignment mutates the caller's list object instead of rebinding.
    a[:] = unique

## LOCATION

In [1]:
def getLocsNltk(text):
    """Return the set of tokens in ``text`` that the Stanford tagger labels LOCATION.

    Args:
        text: Raw text; it is split with ``word_tokenize`` and tagged with ``st``.

    Returns:
        set of token strings whose tag equals ``'LOCATION'``.
    """
    tagged = st.tag(word_tokenize(text))
    # Each tagged item is indexed as (token, tag).
    return {pair[0] for pair in tagged if pair[1] == 'LOCATION'}

In [None]:
def getLocsSpacy(doc):
    """Collect the GPE entities of ``doc`` together with their token texts.

    Args:
        doc: Parsed document exposing ``ents``; each entity has ``label_``,
            ``text`` and iterates over tokens that have ``text``.

    Returns:
        dict mapping each GPE entity's text to the set of token texts seen for
        that entity text (merged across repeated mentions).
    """
    gpeTokens = {}
    for entity in doc.ents:
        if entity.label_ != 'GPE':
            continue
        # Create the entry on first sight, then merge this mention's tokens in.
        gpeTokens.setdefault(entity.text, set()).update(tok.text for tok in entity)
    return gpeTokens

In [None]:
def getLocs(text):
    """Return the location entities that both NER systems agree on.

    A spaCy GPE entity is kept when at least one of its tokens is also in the
    set returned by ``getLocsNltk`` for the same text.

    Args:
        text: Raw text to analyse.

    Returns:
        list of entity texts, in the order ``getLocsSpacy`` produced them.
    """
    parsed = nlp(text)
    tokensByEntity = getLocsSpacy(parsed)
    stanfordTokens = getLocsNltk(text)
    # A non-empty intersection means the two taggers overlap on this entity.
    return [name for name, tokens in tokensByEntity.items() if tokens & stanfordTokens]

## STATS

In [None]:
def getTokenChunkDict(doc):
    """Map every token index covered by a noun chunk to that chunk's span.

    Args:
        doc: Parsed document exposing ``noun_chunks``; each chunk has integer
            ``start`` and ``end`` attributes.

    Returns:
        dict ``{token_index: [chunk.start, chunk.end]}`` for every index in
        ``range(chunk.start, chunk.end)`` of every chunk. Tokens outside any
        chunk have no entry.
    """
    return {
        index: [phrase.start, phrase.end]
        for phrase in doc.noun_chunks
        for index in range(phrase.start, phrase.end)
    }

In [None]:
def isValidStat(token):
    """Decide whether ``token`` should be treated as a reported statistic value.

    A token is rejected when any of its syntactic ancestors is one of a fixed
    set of words (confidence-interval / p-value / "type" / "times" wording), or
    when it is the word "one" directly followed by "of". Otherwise it is
    accepted only if its entity type is CARDINAL or QUANTITY.

    Args:
        token: Token exposing ``ancestors``, ``text``, ``i``, ``doc`` and
            ``ent_type_``.

    Returns:
        bool: True when the token counts as a statistic value.
    """
    excludedHeads = {'ci', 'confidence', 'interval', 'p', 'p-value', 'type', 'times'}
    headWords = {head.text.lower() for head in token.ancestors}
    if headWords & excludedHeads:
        return False

    # "one of ..." is a partitive phrase, not a measurement.
    nextIndex = token.i + 1
    if token.text.lower() == 'one' and len(token.doc) > nextIndex and token.doc[nextIndex].text == 'of':
        return False

    # Every other entity type (including DATE and no entity at all) is rejected.
    return token.ent_type_ in {'CARDINAL', 'QUANTITY'}

In [None]:
# Words (compared lower-cased) that mark a sentence as reporting a rate.
_STAT_KEYWORDS = {'prevalence', 'incidence', 'frequency', 'prevalences', 'occurrence'}
# Abbreviations compared case-sensitively. 'PR' used to sit in the lower-cased
# set above, where it could never match anything.
_STAT_ABBREVIATIONS = {'PR'}


def _spanForToken(index, tokenToChunk):
    """Return the noun-chunk span containing token ``index``, else the one-token span."""
    return tokenToChunk.get(index, [index, index + 1])


def getStats(abst, display=False):
    """Extract (keyword, value, disease) groups from each sentence of ``abst``.

    For every sentence, three lists of ``[start, end]`` token spans are built:
    keywords (rate words such as "prevalence", or the abbreviation "PR"),
    values (tokens accepted by ``isValidStat``) and diseases (tokens whose text
    ``nlpSci`` labels DISEASE). A token that belongs to a noun chunk contributes
    the whole chunk. Sentences lacking either a keyword or a value are skipped.

    Args:
        abst: Text to analyse.
        display: When True, the text is printed through ``printHighlighted``
            with every collected span highlighted.

    Returns:
        list with one tuple ``(keywords, values, diseases)`` per retained
        sentence; each element is a list of slices of the parsed document.
    """
    doc = nlp(abst)
    tokenToChunk = getTokenChunkDict(doc)
    indices = []
    key_val_dz = []

    for sent in doc.sents:
        keywords = []
        values = []

        for token in sent:
            if token.text.lower() in _STAT_KEYWORDS or token.text in _STAT_ABBREVIATIONS:
                keywords.append(_spanForToken(token.i, tokenToChunk))
            # Fallback: re-parse the token on its own and test that, in case the
            # in-context analysis rejected it. Costs one pipeline run per rejected token.
            if isValidStat(token) or isValidStat(nlp(token.text)[0]):
                values.append(_spanForToken(token.i, tokenToChunk))

        if not keywords or not values:
            continue

        # The disease model is only needed for sentences that are kept, so it is
        # run once per retained sentence (previously it also ran on every token
        # and the per-token result was discarded).
        diseaseWords = {
            sciToken.text for sciToken in nlpSci(sent.text)
            if sciToken.ent_type_ == 'DISEASE'
        }
        # Walk the sentence in order so spans come out in document order and
        # repeats of the same chunk are adjacent; the old nested loops yielded
        # interleaved duplicates when a disease word occurred more than once.
        dzs = [
            _spanForToken(token.i, tokenToChunk)
            for token in sent
            if token.text in diseaseWords
        ]

        removeDuplicates(keywords)
        removeDuplicates(values)
        removeDuplicates(dzs)

        key_val_dz.append((
            [doc[start:end] for start, end in keywords],
            [doc[start:end] for start, end in values],
            [doc[start:end] for start, end in dzs],
        ))
        indices += keywords + values + dzs

    # Highlighting needs the spans in document order without repeats.
    indices = sorted(indices)
    removeDuplicates(indices)
    if display:
        printHighlighted(doc, indices)
    return key_val_dz
    

In [None]:
# Example sentence used to try out the extraction functions.
sent = 'Incidence of the disease in Olmsted County, Minnesota, was 2.6/million/year.'
# Parse it with the general spaCy pipeline.
doc = nlp(sent)

In [None]:
# Run the statistics extractor on the example sentence; display=True also prints
# the text with the matched spans highlighted.
getStats(sent, True)

## INFORMATION EXTRACTION

In [None]:
# For every row flagged as epidemiological, run both extractors on the abstract
# and print the locations, the extracted statistics and the PMID.
for i, row in epi_abs.iterrows():
    if not (row['is_epi'] == True):
        continue
    locs = getLocs(row['abs'])
    # display=True: getStats prints the highlighted abstract before the results below.
    info = getStats(row['abs'], True)
    # One item per line, then the same blank-line separator as before.
    print(locs, info, row['pmid'], '\n', sep='\n')

In [None]:
def get_entities(sent, verbose=True):
    """Pick a subject token and an object token from ``sent`` by dependency label.

    The text is parsed with ``nlp``. Ignoring punctuation, the last token whose
    dependency label contains "subj" becomes the first entity and the last token
    whose label contains "obj" becomes the second.

    Args:
        sent: Sentence text.
        verbose: When True (the default, matching the previous behaviour), the
            current pair is printed after every token.

    Returns:
        list ``[subject_text, object_text]``; an entry is ``''`` when no
        matching token was found.
    """
    ent1 = ""
    ent2 = ""

    for tok in nlp(sent):
        # Punctuation never contributes an entity.
        if tok.dep_ != "punct":
            # Substring test. The previous `find(...) == True` compared the match
            # index with True (== 1), so labels where the substring starts at any
            # other position (e.g. a plain "obj") were silently ignored.
            if "subj" in tok.dep_:
                ent1 = tok.text

            if "obj" in tok.dep_:
                ent2 = tok.text

        if verbose:
            print('\nent1:', ent1)
            print('ent2:', ent2)

    return [ent1.strip(), ent2.strip()]

In [None]:
def standardizeSent(sent):
    """Replace selected named entities in ``sent`` with their entity label.

    Entities labelled PERCENT, CARDINAL, GPE, LOC, DATE, TIME, QUANTITY or
    ORDINAL are substituted by the label text; any such entity whose text
    starts with a digit is substituted by ``'CARDINAL'`` regardless of label.

    Args:
        sent: Sentence text; it is parsed with ``nlp``.

    Returns:
        str: The sentence with the substitutions applied.
    """
    replaceable = {'PERCENT','CARDINAL','GPE','LOC','DATE','TIME','QUANTITY','ORDINAL'}
    result = sent
    # Go right-to-left so character offsets of earlier entities stay valid.
    for entity in reversed(nlp(sent).ents):
        if entity.label_ not in replaceable:
            continue
        placeholder = 'CARDINAL' if entity.text[0].isdigit() else entity.label_
        begin = entity.start_char
        result = result[:begin] + placeholder + result[begin + len(entity.text):]
    return result

In [None]:
# Example: replace the entities of the sample sentence with their labels, then
# inspect how spaCy parses the standardized text.
sent = 'Incidence of the disease in Olmsted County, Minnesota, was 2.6/million/year'
sent = standardizeSent(sent)
print(sent)
doc = nlp(sent)
# Noun chunks found in the standardized sentence.
print([chunk for chunk in doc.noun_chunks])

# Dependency label of every token.
for token in doc:
    print(token.text, token.dep_)

In [None]:
# Subject/object pair extracted from the current value of `sent`.
get_entities(sent)
