The purpose of this notebook is to test (and archive) functions that will be put into *extract_abs.py*

In [None]:
import sys
#!{sys.executable} -m pip install nltk
#!{sys.executable} -m pip install Unidecode
#!{sys.executable} -m pip install spacy
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import tokenize
STOPWORDS = set(stopwords.words('english'))
import string
PUNCTUATION = set(char for char in string.punctuation)
import csv
import spacy
import re
from transformers import BertConfig, AutoModelForTokenClassification, BertTokenizer, pipeline
import numpy as np
import pandas as pd
import torch
import requests
import xml.etree.ElementTree as ET
import classify_abs
#import extract_abs
from unidecode import unidecode

In [None]:
from extract_abs import autosearch, str2sents, get_diseases, load_GARD_diseases

In [None]:
## Section: Prepare ML/DL Models
# This fuction prepares the model. Should call before running in notebook.
def init_NER_pipeline(name_or_path_to_model_folder = "ncats/EpiExtract4GARD-v2"): #NER_pipeline, labels = init_NER_pipeline()
    tokenizer = BertTokenizer.from_pretrained(name_or_path_to_model_folder)
    custommodel = AutoModelForTokenClassification.from_pretrained(name_or_path_to_model_folder)
    customNER = pipeline('ner', custommodel, tokenizer=tokenizer, aggregation_strategy='simple')
    
    config = BertConfig.from_pretrained(name_or_path_to_model_folder)
    labels = {re.sub(".-","",label) for label in config.label2id.keys() if label != "O"}
    return customNER, labels

In [None]:
#Input: Sentences & Model Outputs Output: Dictionary with all entity types (dynamic to fit multiple models)
#model_outputs is list of NER_pipeline outputs
#labels are a set of all the possible entities (not including "O"). This is a misnomer. Was originally named "entities" but changed to not get confused with other code
def parse_info(sentences, model_outputs, labels, GARD_dict, max_length):
    #do not use dict.fromkeys(labels,set()) as the value is a single instance which all keys point to. The value is therefore effectively immutable. See: https://docs.python.org/3/library/stdtypes.html?highlight=dict%20fromkeys#dict.fromkeys
    output_dict = {label:([] if label =='STAT' else set()) for label in labels}
    for output in model_outputs:
        #This abstracts the labels so that models with different types and numbers of labels can be used.
        for label in labels:
            if label == 'STAT':
                #no unique filtering for stats, also means that results stay in order
                output_dict[label]+=[entity_dict['word'] for entity_dict in output if entity_dict['entity_group'] ==label]
            else:
                #used sets to auto-filter duplicates
                output_dict[label].update({entity_dict['word'] for entity_dict in output if entity_dict['entity_group'] ==label})
                
    if 'DIS' not in labels:
        for sentence in sentences:
            diseases,ids = get_diseases(sentence, GARD_dict, max_length)
            output_dict['DIS'] = diseases
            output_dict['IDS'] = ids

    return output_dict

In [None]:
#Extracts Disease GARD ID, Disease Name, Location, Epidemiologic Identifier, Epidemiologic Statistic given a PubMed ID
def PMID_extraction(pmid, NER_pipeline, labels, GARD_dict, max_length): #extraction = PMID_extraction(pmid, NER_pipeline, labels, GARD_dict, max_length)
    text = classify_abs.PMID_getAb(pmid)
    if len(text)>5:
        sentences = str2sents(text)
        model_outputs = [NER_pipeline(sent) for sent in sentences]
        output_dict = parse_info(sentences, model_outputs, labels, GARD_dict, max_length)
        output_dict['ABSTRACT'] = text
        return output_dict
    else:
        out = ['ABSTRACT']
        out+=list(labels)
        output_dict =dict.fromkeys(out,"N/A")
        output_dict['ABSTRACT'] = '*ABSTRACT NOT FOUND*'
        return output_dict

In [None]:
#Extract if you already have the text and you do not want epi_predictions (this makes things much faster)
#extraction = abstract_extraction(text, NER_pipeline, labels, GARD_dict, max_length)
def abstract_extraction(text, NER_pipeline, labels, GARD_dict, max_length): 
    if len(text)>5:
        sentences = str2sents(text)
        model_outputs = [NER_pipeline(sent) for sent in sentences]
        output_dict = parse_info(sentences, model_outputs, labels, GARD_dict, max_length)
        output_dict['ABSTRACT'] = text
        return output_dict
    else:
        out = ['ABSTRACT']
        out+=list(labels)
        output_dict =dict.fromkeys(out,"N/A")
        output_dict['ABSTRACT'] = '*ABSTRACT NOT FOUND*'
        return output_dict

In [None]:
disease,ids = set(), set()

In [None]:
disease

In [None]:
ids

In [None]:
disease.add('fire')

In [None]:
disease

In [None]:
ids

In [None]:
#This ensures that there is a standardized ordering of df columns while ensuring dynamics with multiple models. This is used by search_term_extraction.
def order_labels(entities):
    ordered_labels = []
    label_order = ['ABRV','EPI','STAT','LOC','DATE','SEX','ETHN']
    ordered_labels = [label for label in label_order if label in entities]
    #This adds any extra entities (from yet-to-be-created models) to the end of the ordered list of labels 
    for entity in entities:
        if entity not in label_order:
            ordered_labels.append(entity)
    return ordered_labels

In [None]:
#Given a search term and max results to return, this will acquire PubMed IDs and Title+Abstracts and Classify them as epidemiological.
#It then extracts Epidemiologic Information[Disease GARD ID, Disease Name, Location, Epidemiologic Identifier, Epidemiologic Statistic] for each abstract
# results = search_term_extraction(search_term, maxResults, NER_pipeline, labels, GARD_dict, max_length, nlp, nlpSci, nlpSci2, classify_model, classify_tokenizer)
def search_term_extraction(search_term, maxResults, #for abstract search
                           NER_pipeline, labels, GARD_dict, max_length, #for extraction 
                           nlp, nlpSci, nlpSci2, classify_model, classify_tokenizer): #for classification
    #Format of Output
    ordered_labels = order_labels(labels)
    columns = ['PMID', 'ABRSTRACT','EPI_PROB','IsEpi','IDS','DIS']+ordered_labels
    results = pd.DataFrame(columns=columns)
    
    ##Check to see if search term maps to anything in the GARD dictionary, if so it pulls up all synonyms for the search
    search_term_list = autosearch(search_term, GARD_dict)
    
    #Gather title+abstracts into a dictionary {pmid:abstract}
    pmid_abs = classify_abs.search_getAbs(search_term_list,maxResults)
    
    for pmid, abstract in pmid_abs.items():
        epi_prob, isEpi = classify_abs.getTextPredictions(abstract, nlp, nlpSci, nlpSci2, classify_model, classify_tokenizer)
        if isEpi:
            #Preprocessing Functions for Extraction
            sentences = str2sents(abstract)
            model_outputs = [NER_pipeline(sent) for sent in sentences]
            extraction = parse_info(sentences, model_outputs, labels, GARD_dict, max_length)
            extraction.update({'PMID':pmid, 'ABRSTRACT':abstract, 'EPI_PROB':epi_prob, 'IsEpi':isEpi})
            #Slow dataframe update
            results = results.append(extraction, ignore_index=True)
            
    return results.sort_values('EPI_PROB', ascending=False)

In [None]:
GARD_dict, max_length = load_GARD_diseases()

In [None]:
NER_pipeline, labels = init_NER_pipeline()

In [None]:
nlp, nlpSci, nlpSci2, classify_model, classify_tokenizer = classify_abs.init_classify_model()

In [None]:
def search(term,num_results = 50):
    return search_term_extraction(term, num_results, NER_pipeline, labels, GARD_dict, max_length, nlp, nlpSci, nlpSci2, classify_model, classify_tokenizer)

In [None]:
c = search('Fellman syndrome')
c

In [None]:
df = pd.DataFrame.from_dict({'LOC':[set(),set(),set(),set()],'DIS':[set(),set(),{'wow'},set()],'NEW':[['dang','fire'],list(),['this','is','a','lot'],[]],'woah':[[],[],[],[]]})
df

In [None]:
x = df.replace(to_replace=[list(),set()], value=np.NaN, inplace=False)
x

In [None]:
o = PMID_extraction(34449519, NER_pipeline, labels, GARD_dict, max_length)

In [None]:
o

In [None]:
labels

previous iteration

In [None]:
#These are the main three main functions that can be called in a noteboook.
#Extracts Disease GARD ID, Disease Name, Location, Epidemiologic Identifier, Epidemiologic Statistic given a PubMed ID
def PMID_extraction(pmid, NER_pipeline, GARD_dict, max_length):
    text = classify_abs.PMID_getAb(pmid)
    if len(text)>5:
        sentences = str2sents(text)
        model_outputs = [NER_pipeline(sent) for sent in sentences]
        ab_ids, ab_dis, ab_locs, ab_epis, ab_stats = parse_info(sentences, model_outputs, GARD_dict, max_length)
        return text, ab_ids, ab_dis, ab_locs, ab_epis, ab_stats
    else:
        return '*ABSTRACT NOT FOUND*',{"N/A"},{"N/A"},{"N/A"},{"N/A"},["N/A"]

In [None]:
import json
import codecs

def load_GARD_diseases():
    diseases = json.load(codecs.open('gard-id-name-synonyms.json', 'r', 'utf-8-sig'))

    #keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
    GARD_dict = {}
    GARD_firstwds = set()

    #Find out what the length of the longest disease name sequence is, of all names and synonyms
    max_length = -1
    for entry in diseases:
        if entry['name'] not in GARD_dict.keys():
            s = entry['name'].lower().strip()
            if s not in STOPWORDS and len(s)>4:
                GARD_dict[s] = entry['gard_id']
                #This will increase the false negative rate a little bit, but decrease the false positive rate tremendously
                if s.split()[0] not in STOPWORDS:
                    GARD_firstwds.add(s.split()[0])
                #compare length
                l = len(s.split())
                if l>max_length:
                    max_length = l
        if entry['synonyms']:
            for synonym in entry['synonyms']:
                if synonym not in GARD_dict.keys():
                    s = synonym.lower().strip()
                    if s not in STOPWORDS and len(s)>4:
                        GARD_dict[s] = entry['gard_id']
                        #This will increase the false negative rate a little bit, but decrease the false positive rate tremendously
                        if s.split()[0] not in STOPWORDS:
                            GARD_firstwds.add(s.split()[0])
                        #compare length
                        l = len(s.split())
                        if l>max_length:
                            max_length = l
    return GARD_dict, max_length

In [None]:
#GARD.csv d.synonyms has oddly saved string data that cannot be converted directly into a list, this converts that
def str2list(string):
    string = str(string).replace('[','')
    string = string.replace(']','')
    string = string.strip()
    str_list = string.split(',')
    for s in str_list:
        s = s.strip()
        if s=='nan':
            str_list.remove('nan')
    return str_list

In [None]:
def load_GARD_diseases():    
    GARD_df = pd.read_csv('GARD.csv')
    #Convert d.synonym strings into lists
    i=0
    for i in range(len(GARD_df['d.synonyms'])):
        GARD_df['d.synonyms'][i] = str2list(GARD_df['d.synonyms'][i])
    #Set up a new & easier to use list of diseases
    rowlist = []
    i=0
    for i in range(len(GARD_df)):
        columnlist=[]
        columnlist.append(GARD_df['d.name'][i])
        columnlist+=GARD_df['d.synonyms'][i]
        rowlist.append(columnlist)

    #keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
    GARD_dict = {}
    GARD_firstwd_dict = {}

    #Find out what the length of the longest disease name sequence is, of all names and synonyms
    max_length = -1
    for i in range(len(rowlist)):
        for j in range(len(rowlist[i])):
            if rowlist[i][j] not in GARD_dict.keys():
                s = str(rowlist[i][j]).lower().strip()
                if len(s.split())>0 and s not in STOPWORDS:
                    if len(s.split())==1 and (len(s.split()[0])==1 or s.split()[0] in STOPWORDS):
                        #We dont want anything that is one letter or a stopword(if it is one word)
                        pass
                    else:
                        GARD_dict[s] = GARD_df['d.gard_id'][i]
                        #GARD_firstwd_dict[s.split()[0]] = GARD_df['d.gard_id'][i]
                        #This will increase the false negative rate a little bit, but decrease the false positive rate tremendously
                        if s.split()[0] not in STOPWORDS and len(s.split()[0])>1:
                            GARD_firstwd_dict[s.split()[0]] = GARD_df['d.gard_id'][i]
                #compare length
                l = len(s.split())
                if l>max_length:
                    max_length = l
    return GARD_dict, max_length

In [None]:
def get_diseases(sentence, GARD_dict, max_length):   
    tokens = [s.strip() for s in nltk.word_tokenize(sentence)]
    diseases, ids = [],[]
    i=0
    while i <len(tokens):  
        if (len(tokens)-i) < max_length:
            compare_length=len(tokens)-i
        else:
            compare_length = max_length
        #Compares longest sequences first and goes down until there is a match
        #print('(start compare_length)',compare_length)
        exit = False
        while compare_length>0:
            s = ' '.join(tokens[i:i+compare_length])
            for key in GARD_dict.keys():
                if key==s.lower():
                    #print('MATCH',s)
                    diseases.append(s)
                    ids.append(GARD_dict[key])
                    #Need to skip over the next few indexes
                    i+=compare_length-1
                    exit = True #this allows you to break out of two loops
                    break
            #break out of loop in case there are multiple rare diseases in the same sentence
            if exit:
                break
            else:
                compare_length-=1
        i+=1  
    return diseases,ids

In [None]:
def str2sents(string):
    for in_sent, replacement in regex_subs.items():
        string = in_sent

In [None]:
def str2sents(string):
    string = re.sub('<.{1,4}>', ' ', string)
    string = re.sub("  *", " " , string)
    string = re.sub("^ ", "" , string)
    string = re.sub("$", "" , string)
    string = re.sub("™", "" , string)
    string = re.sub("®", "" , string)
    string = re.sub("•", "" , string)
    string = re.sub("…", "" , string)
    string = re.sub("♀", "female" , string)
    string = re.sub("♂", "male" , string)
    string = re.sub("α", "[alpha]" , string)
    string = re.sub("β", "[beta]" , string)
    string = re.sub("γ", "[gamma]" , string)
    string = re.sub("δ", "[delta]" , string)
    string = re.sub("ε", "[epsilon]" , string)
    string = re.sub("ζ", "[zeta]" , string)
    string = re.sub("η", "[eta]" , string)
    string = re.sub("θ", "[theta]" , string)
    string = re.sub("ι", "[iota]" , string)
    string = re.sub("κ", "[kappa]" , string)
    string = re.sub("λ", "[lambda]" , string)
    string = re.sub("μ", "[mu]" , string)
    string = re.sub("ν", "[nu]" , string)
    string = re.sub("ξ", "[xi]" , string)
    string = re.sub("ο", "[omicron]" , string)
    string = re.sub("π", "[pi]" , string)
    string = re.sub("ρ", "[rho]" , string)
    string = re.sub("σ", "[sigma]" , string)
    string = re.sub("ς", "[sigma]" , string)
    string = re.sub("τ", "[tau]" , string)
    string = re.sub("υ", "[upsilon]" , string)
    string = re.sub("φ", "[phi]" , string)
    string = re.sub("χ", "[chi]" , string)
    string = re.sub("ψ", "[psi]" , string)
    string = re.sub("ω", "[omega]" , string)
    string = unidecode(string)
    string=string.strip()
    sentences = tokenize.sent_tokenize(string)
    return sentences

In [None]:
def init_NER_pipeline(path_to_model_folder = "./NER/outputLG7/"):
    config = BertConfig.from_json_file(str(path_to_model_folder+'config.json'))
    tokenizer = BertTokenizer.from_pretrained(path_to_model_folder)
    custommodel = AutoModelForTokenClassification.from_pretrained(path_to_model_folder,config=config,local_files_only=True)
    customNER = pipeline('ner', custommodel, config=config,tokenizer=tokenizer,aggregation_strategy='simple')
    return customNER

In [None]:
GARD_dict, max_length = load_GARD_diseases()

In [None]:
#Can search by 7-digit GARD_ID, 12-digit "GARD:{GARD_ID}", matched search term, or arbitrary search term
#Returns list of terms to search by
def autosearch(searchterm, GARD_dict, matching=2):
    while matching>=1:
        if 'GARD:' in searchterm and len(searchterm)==12:
            return [k for k,v in GARD_dict.items() if v==searchterm]
        
        elif len(searchterm)==7 and searchterm[0].isdigit() and searchterm[-1].isdigit():
            searchterm = 'GARD:'+searchterm
            return [k for k,v in GARD_dict.items() if v==searchterm]
        
        elif searchterm in GARD_dict.keys():
            return [k for k,v in GARD_dict.items() if v==GARD_dict[searchterm]]
        
        else:
            searchterm = searchterm.replace(' ','-')
            return autosearch(searchterm, GARD_dict, matching-1)
    print("SEARCH TERM DID NOT MATCH TO GARD DICTIONARY. SEARCHING BY USER INPUT")
    return [searchterm]

In [None]:
searchterm_list = 'tay sachs'
print(searchterm_list)
#type validation, allows string or list input
if type(searchterm_list)!=list:
    if type(searchterm_list)==str:
        searchterm_list = [searchterm_list]
    else:
        searchterm_list = list(searchterm_list)
print(searchterm_list)

In [None]:
autosearch('tay sachs',GARD_dict)

In [None]:
extract_abs.autosearch('Tay-Sachs Disease',GARD_dict)

In [None]:
def combined_autosearch_API(searchterm_list, maxResults):
    i = 0
    pmids = set()
    pmid_abs = {}
    
    for dz in searchterm_list:
        term = ''
        dz_words = dz.split()
        for word in dz_words:
            term += word + '%20'
        query = term[:-3]

        ## get results from searching for disease name through PubMed API
        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
        r = requests.get(url)
        root = ET.fromstring(r.content)

        # loop over resulting articles
        for result in root.iter('IdList'):
            if i >= maxResults:
                break
            pmidlist = [pmid.text for pmid in result.iter('Id')]
            pmids.update(pmidlist)
            i+=len(pmidlist)

        ## get results from searching for disease name through EBI API
        url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
        r = requests.get(url)
        root = ET.fromstring(r.content)

        # loop over resulting articles
        for result in root.iter('result'):
            if i >= maxResults:
                break
            pmidlist = [pmid.text for pmid in result.iter('id')]
            if len(pmidlist) > 0:
                pmid = pmidlist[0]
                if pmid[0].isdigit():
                    pmids.add(pmid)
                    i += 1

    ## get abstracts from EBI PMID API and output a dictionary
    for pmid in pmids:
        abstract = classify_abs.PMID_getAb(pmid)
        if len(abstract)>5:
            pmid_abs[pmid] = abstract
    
    return pmid_abs

In [None]:
def combined_autosearch_API(searchterm_list, maxResults):
    i = 0
    pmids = set()
    pmid_abs = {}
    
    for dz in searchterm_list:
        term = ''
        dz_words = dz.split()
        for word in dz_words:
            term += word + '%20'
        query = term[:-3]

        ## get results from searching for disease name through PubMed API
        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
        r = requests.get(url)
        root = ET.fromstring(r.content)

        # loop over resulting articles
        for result in root.iter('IdList'):
            if i >= maxResults:
                break
            pmidlist = [pmid.text for pmid in result.iter('Id')]
            pmids.update(pmidlist)
            i+=len(pmidlist)

        ## get results from searching for disease name through EBI API
        url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
        r = requests.get(url)
        root = ET.fromstring(r.content)

        # loop over resulting articles
        for result in root.iter('result'):
            if i >= maxResults:
                break
            pmidlist = [pmid.text for pmid in result.iter('id')]
            if len(pmidlist) > 0:
                pmid = pmidlist[0]
                if pmid[0].isdigit():
                    pmids.add(pmid)
                    i += 1

    ## get abstracts from EBI PMID API and output a dictionary
    for pmid in pmids:
        abstract = classify_abs.PMID_getAb(pmid)
        if len(abstract)>5:
            pmid_abs[pmid] = abstract
    
    return pmid_abs

In [None]:
termlist = autosearch('GARD:0007737', GARD_dict)

In [None]:
termlist

In [None]:
dic = combined_autosearch_API(termlist, 100)

In [None]:
dic

In [None]:
len(dic)

In [None]:
def getAbs(PMID):
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'+str(PMID)+'&resulttype=core'
    r = requests.get(url)
    root = ET.fromstring(r.content)
    titles = [title.text for title in root.iter('title')]
    abstracts = [abstract.text for abstract in root.iter('abstractText')]
    if len(abstracts) > 0 and len(abstracts[0])>5:
        return titles[0]+' '+abstracts[0]
    else:
        return ''

In [None]:
def pubmed_API(searchterm, maxResults):
    # get results from searching for disease name through EBI API
    term = ''
    dz_words = searchterm.split()
    for word in dz_words:
        term += word + '%20'
    query = term[:-3]
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
    r = requests.get(url)
    root = ET.fromstring(r.content)

    pmids = []
    i = 0

    # loop over resulting articles
    for result in root.iter('IdList'):
        if i >= maxResults:
            break
        pmids = [pmid.text for pmid in result.iter('Id')]
    
    pmid_to_abs = {}
    for pmid in pmids:
        abstract = classify_abs.PMID_getAb(pmid)
        if len(abstract)>5:
            pmid_to_abs[pmid]=abstract
    
    return pmid_to_abs

In [None]:
def EBI_API(searchterm, maxResults):
    # get results from searching for disease name through EBI API
    term = ''
    dz_words = searchterm.split()
    for word in dz_words:
        term += word + '%20'
    query = term[:-3]
    print('query',query)
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
    print('url',url)
    r = requests.get(url)
    root = ET.fromstring(r.content)

    pmid_to_abs = {}
    i = 0

    # loop over resulting articles
    for result in root.iter('result'):
        if i >= maxResults:
            break
        pmids = [pmid.text for pmid in result.iter('id')]
        if len(pmids) > 0:
            pmid = pmids[0]
            if pmid[0].isdigit():
                abstracts = [abstract.text for abstract in result.iter('abstractText')]
                if len(abstracts) > 0:
                    pmid_to_abs[pmid] = abstracts[0]
                    i += 1
    return pmid_to_abs

In [None]:
def combined_API(searchterm, maxResults):
    term = ''
    dz_words = searchterm.split()
    for word in dz_words:
        term += word + '%20'
    query = term[:-3]
    i = 0
    pmids = set()
    
    ## get results from searching for disease name through PubMed API
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
    r = requests.get(url)
    root = ET.fromstring(r.content)
    
    # loop over resulting articles
    for result in root.iter('IdList'):
        if i >= maxResults:
            break
        pmids = {pmid.text for pmid in result.iter('Id')}
    
    ## get results from searching for disease name through EBI API
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
    r = requests.get(url)
    root = ET.fromstring(r.content)
    
    # loop over resulting articles
    for result in root.iter('result'):
        if i >= maxResults:
            break
        pmidlist = [pmid.text for pmid in result.iter('id')]
        if len(pmidlist) > 0:
            pmid = pmidlist[0]
            if pmid[0].isdigit():
                pmids.add(pmid)
                i += 1
    
    ## get abstracts from EBI PMID API and output a dictionary
    pmid_abs = {}
    for pmid in pmids:
        abstract = classify_abs.PMID_getAb(pmid)
        if len(abstract)>5:
            pmid_abs[pmid] = abstract
    
    return pmid_abs

In [None]:
#Combined API, but optimized to have fewer API calls
def optimized_API(search_term,maxResults): #returns a dictionary of {pmids:abstracts}    
    term = ''
    dz_words = search_term.split()
    for word in dz_words:
        term += word + '%20'
    query = term[:-3]
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
    r = requests.get(url)
    root = ET.fromstring(r.content)

    pmids_abs = {}
    i = 0

    # loop over resulting articles
    for result in root.iter('result'):
        if i >= maxResults:
            break
        pmids = [pmid.text for pmid in result.iter('id')]
        if len(pmids) > 0:
            pmid = pmids[0]
            if pmid[0].isdigit():
                abstracts = [abstract.text for abstract in result.iter('abstractText')]
                titles = [title.text for title in result.iter('title')]
                if len(abstracts) > 0 and len(abstracts[0])>5:
                    pmids_abs[pmid] = titles[0]+' '+abstracts[0]
                    i+=1
    
    #PubMed API gets different results
    url2 = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
    r2 = requests.get(url2)
    root2 = ET.fromstring(r2.content)
    
    for result in root.iter('IdList'):
        if i >= maxResults:
            break
        pmids = [pmid.text for pmid in result.iter('Id')]
        i+=len(pmids)
        for pmid in pmids:
            if pmid not in pmids_abs.keys():
                abstract = classify_abs.PMID_getAb(pmid)
                if len(abstract)>5:
                    pmids_abs[pmid]=abstract
     
    return pmids_abs

In [None]:
### DEPRECATED, old function, only takes in string input for search term
## Gets results from searching through both PubMed and EBI search term APIs, also makes use of the EBI API for PMIDs. 
## EBI API and PubMed API give different results
# This makes n+2 API calls where n<=maxResults, which is slow 
# There is a way to optimize by gathering abstracts from the EBI API when also getting pmids but did not pursue due to time constraints
def search_getAbs(searchterm, maxResults):
    term = ''
    dz_words = searchterm.split()
    for word in dz_words:
        term += word + '%20'
    query = term[:-3]
    i = 0
    pmids = set()
    
    ## get results from searching for disease name through PubMed API
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='+query
    r = requests.get(url)
    root = ET.fromstring(r.content)
    
    # loop over resulting articles
    for result in root.iter('IdList'):
        if i >= maxResults:
            break
        pmids = {pmid.text for pmid in result.iter('Id')}
    
    ## get results from searching for disease name through EBI API
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
    r = requests.get(url)
    root = ET.fromstring(r.content)
    
    # loop over resulting articles
    for result in root.iter('result'):
        if i >= maxResults:
            break
        pmidlist = [pmid.text for pmid in result.iter('id')]
        if len(pmidlist) > 0:
            pmid = pmidlist[0]
            if pmid[0].isdigit():
                pmids.add(pmid)
                i += 1
    
    ## get abstracts from EBI PMID API and output a dictionary
    pmid_abs = {}
    for pmid in pmids:
        abstract = PMID_getAb(pmid)
        if len(abstract)>5:
            pmid_abs[pmid] = abstract
    
    return pmid_abs

In [None]:
def PMID_extraction(pmid, NER_pipeline, GARD_dict, max_length):
    text = getAbs(pmid)
    if len(text)>5:
        sentences = str2sents(text)
        results = [NER_pipeline(sent) for sent in sentences]
        ab_ids, ab_dis, ab_locs, ab_epis, ab_stats = parse_info(sentences, results)
        return text, ab_ids, ab_dis, ab_locs, ab_epis, ab_stats
    else:
        return '*ABSTRACT NOT FOUND*',{"N/A"},{"N/A"},{"N/A"},{"N/A"},{"N/A"}

In [None]:
def text_extraction(text, NER_pipeline, GARD_dict, max_length):
    if len(text)>5:
        sentences = str2sents(text)
        results = [NER_pipeline(sent) for sent in sentences]
        ab_ids, ab_dis, ab_locs, ab_epis, ab_stats = parse_info(sentences, results)
        return text, ab_ids, ab_dis, ab_locs, ab_epis, ab_stats
    else:
        return '*Text too short*',{"N/A"},{"N/A"},{"N/A"},{"N/A"},{"N/A"}

In [None]:
customNER = init_NER_pipeline()

In [None]:
GARD_dict, max_length = load_GARD_diseases()

In [None]:
text, ab_ids, ab_dis, ab_locs, ab_epis, ab_stats = PMID_extraction(25274184, customNER, GARD_dict, max_length)

In [None]:
print(text,
      '\n\nGARD Disease ID: ',ab_ids, 
      '\nGARD Disease: ',ab_dis, 
      '\nLocations: ',ab_locs, 
      '\nEpi Identifier: ',ab_epis, 
      '\nEpi Statistics: ',ab_stats)

In [None]:
if __name__ == '__main__':
    print('Loading NER Pipeline..')
    path_to_model_folder = input('Input path_to_model_folder. Input "d" to use default model.')
    if path_to_model_folder == 'd':
        NER_pipeline = init_NER_pipeline()
    else:
        NER_pipeline = init_NER_pipeline(path_to_model_folder)
    print('NER Pipeline Loaded')
    
    print('Loading GARD Disease Dictionary....')
    GARD_dict, GARD_firstwd_dict, max_length = load_GARD_diseases()
    print('Loading GARD Diseases Loaded')
    pmid = input('\nEnter PubMed PMID (or DONE): ')
    while pmid != 'DONE':
        text, ab_ids, ab_dis, ab_locs, ab_epis, ab_stats = PMID_extraction(pmid, NER_pipeline, GARD_dict, max_length)
        print('GARD Disease ID: ',ab_ids, 
              '\nGARD Disease: ',ab_dis, 
              '\nLocations: ',ab_locs, 
              '\nEpi Identifier: ',ab_epis, 
              '\nEpi Statistics: ',ab_stats)
        pmid = input('\nEnter PubMed PMID (or DONE): ')

In [None]:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
import re
def filter_results(searchterm_list,pmid_abs):
    
    terms = set(searchterm_list).union(set(str(re.sub(',','',' '.join(searchterm_list))).split()).difference(STOPWORDS))
    '''
    joined = ' '.join(searchterm_list)
    comma_gone = re.sub(',','',joined)
    split = set(comma_gone.split())
    key_words = split.difference(STOPWORDS)
    search_set = set(searchterm_list)
    terms = search_set.union(key_words)
    '''
    print(len(terms),terms)
    print()
    for abstract in pmid_abs.values():
        for term in terms:
            print(term)
            if term in abstract:
                print('yes')
            else:
                print('no')
            print()


In [5]:
searchterm_list = ['facioscapulohumeral muscular dystrophy', 'muscular dystrophy, facioscapulohumeral', 'facioscapulohumeral muscular dystrophy 1a', 'fshmd1a', 'muscular dystrophy, facioscapulohumeral, type 1a', 'fshd1a', 'landouzy-dejerine muscular dystrophy']
import extract_abs
GARD_dict, max_length = extract_abs.load_GARD_diseases()

In [6]:
d= {14321:"Predictors of functional outcomes in patients with facioscapulohumeral muscular dystrophy. Facioscapulohumeral muscular dystrophy (FSHD) is one of the most prevalent muscular dystrophies characterized by considerable variability in severity, rates of progression and functional outcomes. Few studies follow FSHD cohorts long enough to understand predictors of disease progression and functional outcomes, creating gaps in our understanding, which impacts clinical care and the design of clinical trials. Efforts to identify molecularly targeted therapies create a need to better understand disease characteristics with predictive value to help refine clinical trial strategies and understand trial outcomes. Here we analysed a prospective cohort from a large, longitudinally followed registry of patients with FSHD in the USA to determine predictors of outcomes such as need for wheelchair use. This study analysed de-identified data from 578 individuals with confirmed FSHD type 1 enrolled in the United States National Registry for FSHD Patients and Family members. Data were collected from January 2002 to September 2019 and included an average of 9 years (range 0-18) of follow-up surveys. Data were analysed using descriptive epidemiological techniques, and risk of wheelchair use was determined using Cox proportional hazards models. Supervised machine learning analysis was completed using Random Forest modelling and included all 189 unique features collected from registry questionnaires. A separate medications-only model was created that included 359 unique medications reported by participants. Here we show that smaller allele sizes were predictive of earlier age at onset, diagnosis and likelihood of wheelchair use. Additionally, we show that females were more likely overall to progress to wheelchair use and at a faster rate as compared to males, independent of genetics. Use of machine learning models that included all reported clinical features showed that the effect of allele size on progression to wheelchair use is small compared to disease duration, which may be important to consider in trial design. Medical comorbidities and medication use add to the risk for need for wheelchair dependence, raising the possibility for better medical management impacting outcomes in FSHD. The findings in this study will require further validation in additional, larger datasets but could have implications for clinical care, and inclusion criteria for future clinical trials in FSHD.",
34242: "Promising Perspective to Treatment: Nutraceuticals and Phytochemicals.  (FSHD) is in the top three list of all dystrophies with an approximate 1:8000 incidence. It is not a life-threatening disease; however, progression of the disease extends over being wheel-chair bound. Despite some drug trials have been continuing, including DUX4 inhibition, TGF-ß inhibition and resokine which promote healthier muscle, there is not an applicable treatment option for FSHD today. Still, there is a need for new agent or agents to heal, to stop or at least to slow down the muscle wasting. Current FSHD studies with nutraceuticals as vitamin C, vitamin E, coenzyme Q10, zinc, selenium, and phytochemicals as curcumin or genistein, daidzein flavonoids provide promising treatment strategies. In this review we will present the clinical and molecular nature of FSHD and focus on nutraceuticals and phytochemicals that may alleviate FSHD. Via interconnection of impaired pathophysiological FSHD pathways together with nutraceuticals and phytochemicals in the light of literature, we present both studied and novel approaches that can contribute FSHD treatment."}


filter_results(searchterm_list,d)

searchterm_list ['facioscapulohumeral muscular dystrophy', 'muscular dystrophy, facioscapulohumeral', 'facioscapulohumeral muscular dystrophy 1a', 'fshmd1a', 'muscular dystrophy, facioscapulohumeral, type 1a', 'fshd1a', 'landouzy-dejerine muscular dystrophy']

JOINED facioscapulohumeral muscular dystrophy muscular dystrophy, facioscapulohumeral facioscapulohumeral muscular dystrophy 1a fshmd1a muscular dystrophy, facioscapulohumeral, type 1a fshd1a landouzy-dejerine muscular dystrophy

comma_gone facioscapulohumeral muscular dystrophy muscular dystrophy facioscapulohumeral facioscapulohumeral muscular dystrophy 1a fshmd1a muscular dystrophy facioscapulohumeral type 1a fshd1a landouzy-dejerine muscular dystrophy

split {'fshmd1a', 'landouzy-dejerine', '1a', 'dystrophy', 'facioscapulohumeral', 'fshd1a', 'type', 'muscular'}

STOPWORDS {'during', 'm', 'aren', 'when', 'which', 'itself', 'mightn', 'shouldn', 'themselves', 'was', "mustn't", 'wasn', 'at', "haven't", 'you', 'him', 'yours', 'as'

In [4]:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
import re

def filter_results(searchterm_list,pmid_abs):
    print('searchterm_list',searchterm_list)
    print()
    joined = ' '.join(searchterm_list)
    print('JOINED',joined)
    print()
    comma_gone = re.sub(',','',joined)
    print("comma_gone",comma_gone)
    print()
    split = set(comma_gone.split())
    print("split",split)
    print()
    print("STOPWORDS",STOPWORDS)
    print()
    key_words = split.difference(STOPWORDS)
    print("key_words",key_words)
    print()
    search_set = set(searchterm_list)
    print(search_set)
    terms = search_set.union(key_words)
    print('terms',len(terms),terms)
    print()
    for abstract in pmid_abs.values():
        for term in terms:
            print(term)
            if term in abstract:
                print('yes')
            else:
                print('no')
            print()

In [None]:
#Deprecated?
'''
if __name__ == '__main__':
    print('Loading NER Pipeline..')
    path_to_model_folder = input('Input path_to_model_folder. Input "d" to use default model.')
    if path_to_model_folder == 'd':
        NER_pipeline = init_NER_pipeline()
    else:
        NER_pipeline = init_NER_pipeline(path_to_model_folder)
    print('NER Pipeline Loaded')
    
    print('Loading GARD Disease Dictionary....')
    GARD_dict, GARD_firstwd_dict, max_length = load_GARD_diseases()
    
    pmid = input('\nEnter PubMed PMID (or DONE): ')
    while pmid != 'DONE':
        text = getAbs(pmid)
        if len(text)>5:
            sentences = str2sents(text)
            results = [NER_pipeline(sent) for sent in sentences]
            ab_ids, ab_dis, ab_locs, ab_epis, ab_stats = parse_info(sentences, results)
            print(text,
            '\nGARD Disease ID: ',ab_ids, 
            '\nGARD Disease: ',ab_dis,
            \nLocations: ',ab_locs, 
            '\nEpi Identifier: ',ab_epis, 
            '\nEpi Statistics: ',ab_stats)
        else:
            print("Title and abstract not found.")
        pmid = input('\nEnter PubMed PMID (or DONE): ')
'''

In [None]:
#Deprecated?
'''
#This function can be sped up by using the GARD_firstwd_dict, but as this function works rn, I will not be implementing.
#There are most likely many other ways to optimize this function
def get_diseases(sentence, GARD_dict, max_length):   
    tokens = [s.lower().strip() for s in nltk.word_tokenize(sentence)]
    diseases = []
    ids = []
    i=0
    while i <len(tokens):
        if (len(tokens)-i) < max_length:
            compare_length=len(tokens)-i
        else:
            compare_length = max_length
        #Compares longest sequences first and goes down until there is a match
        #print('(start compare_length)',compare_length)
        exit = False
        while compare_length>0:
            s = ' '.join(tokens[i:i+compare_length])
            for key in GARD_dict.keys():
                if key==s.lower():
                    diseases.append(s)
                    ids.append(GARD_dict[key])
                    #Need to skip over the next few indexes
                    i+=compare_length-1
                    exit = True #this allows you to break out of two loops
                    break
            #break out of loop in case there are multiple rare diseases in the same sentence
            if exit:
                break
            else:
                compare_length-=1
        i+=1  
    return diseases,ids
'''

In [None]:
#Deprecated
'''
def tag_diseases(tokens,labels, GARD_dict, GARD_firstwd_dict, max_length):   
    i=0
    while i <len(tokens):
        if (len(tokens)-i) < max_length:
            compare_length=len(tokens)-i
        else:
            compare_length = max_length
        #Compares longest sequences first and goes down until there is a match
        #print('(start compare_length)',compare_length)
        exit = False
        while compare_length>0:
            s = ' '.join(tokens[i:i+compare_length])
            for key in GARD_dict.keys():
                if key==s.lower():
                    labels[i] = 'B-DIS'
                    #print(s)
                    for j in range(i+1,i+compare_length):
                        labels[j] = 'I-DIS'
                    #Need to skip over the next few indexes
                    #print('(compare_length):',compare_length)
                    i+=compare_length-1
                    exit =True #this allows you to break out of two loops
                    break
            #break out of loop in case there are multiple rare diseases in the same sentence
            if exit:
                break
            else:
                compare_length-=1
        i+=1  
    return tokens,labels
'''