Prepare a dataset of abstracts with high epidemiology probability according to Jennifer's model

In [1]:
#Choose how many diseases to sample, how many articles to return for each, 
#and max df size (some primary disease names may not return hits and I am not going through synonyms, so may need extras to fill it)
#Generally num_dz*num_articles should be > max_size
#test_num is how many articles you want to retrieve in test/projection set, set low like 2-5

num_dz = 150 #input("How many diseases to sample? ") 
num_articles = 100 #input("How many abstracts to gather per disease? ")
test_num = 3 #input("How many articles to retrieve in test set? 2-5 is recommended. ")

In [2]:
#Download any necessary datasets & dependencies, only need to do this once
#import sys
#!{sys.executable} -m pip install spacy
#!{sys.executable} -m spacy download en_core_web_lg
#!{sys.executable} -m pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz
#!{sys.executable} -m pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bionlp13cg_md-0.4.0.tar.gz
#!{sys.executable} -m pip install numpy
#!{sys.executable} -m pip install pandas
#!{sys.executable} -m pip install requests

#Installing dependencies for classify_abs.py
#!{sys.executable} -m pip install tensorflow
#!{sys.executable} -m pip install nltk
#import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

import numpy as np
import pandas as pd
import classify_abs
import requests
import xml.etree.ElementTree as ET
import spacy
import time
import datetime

2021-07-01 18:09:45.131954: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-01 18:09:45.131979: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
nlp = spacy.load('en_core_web_lg')
nlpSci = spacy.load("en_ner_bc5cdr_md")
nlpSci2 = spacy.load('en_ner_bionlp13cg_md')

In [4]:
#Read in GARD diseases
df = pd.read_csv('GARD.csv')
df.tail()

Unnamed: 0,d.gard_id,d.name,d.synonyms
6056,GARD:0013731,T-cell prolymphocytic leukemia,[T Cell Prolymphocytic Leukemia]
6057,GARD:0013735,Spastic paraplegia 47,
6058,GARD:0013737,AP-4-Associated Hereditary Spastic Paraplegia,[Severe intellectual disability and progressiv...
6059,GARD:0013743,"Multicentric osteolysis, nodulosis and arthrop...","[Torg-Winchester Syndrome,Torg Syndrome,Nodulo..."
6060,GARD:0013818,Sphingosine phosphate lyase insufficiency synd...,"[SPL insufficiency syndrome,SPLIS,Familial ste..."


In [5]:
#Randomly pick n disease names
names_df = df['d.name'].sample(n=num_dz)
names = names_df.values.tolist()
print(names)

['Fragile X syndrome type 1', 'Sudden infant death syndrome', 'Mycosis fungoides', 'Hymenolepiasis', 'Ossicular Malformations, familial', 'Hereditary hemorrhagic telangiectasia type 2', 'Marek disease', 'Generalized peeling skin syndrome', 'Zunich neuroectodermal syndrome', 'Defective apolipoprotein B-100', 'Orbital lymphangioma', 'Asternia', 'Combined pituitary hormone deficiencies, genetic forms', 'Chronic progressive external ophthalmoplegia', 'Tubulointerstitial nephritis and uveitis', 'Dyschondrosteosis nephritis', 'Tetraamelia-multiple malformations syndrome', 'Congenital myasthenic syndromes', 'Hyaline fibromatosis syndrome', 'Desmoplastic small round cell tumor', 'Genetic reflex epilepsy', 'Brucellosis', 'Smith-Fineman-Myers syndrome', 'Silengo Lerone Pelizza syndrome', 'Familial deafness', 'Paris-Trousseau thrombocytopenia', 'Fetal hydantoin syndrome', 'Feigenbaum Bergeron Richardson syndrome', 'Primary melanoma of the central nervous system', 'Brachydactylous dwarfism Mseleni

In [6]:
def rank_results(dz, maxResults):
    dz_results = pd.DataFrame(columns=['pmid', 'abstract', 'epi_prob', 'is_epi'])

    # get results from searching for disease name through EBI API
    term = ''
    dz_words = dz.split()
    for word in dz_words:
        term += word + '%20'
    query = term[:-3]
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
    r = requests.get(url)
    root = ET.fromstring(r.content)

    pmid_to_abs = {}
    i = 0

    # loop over resulting articles
    for result in root.iter('result'):
        if i >= maxResults:
            break
        pmids = [pmid.text for pmid in result.iter('id')]
        if len(pmids) > 0:
            pmid = pmids[0]
            if pmid[0].isdigit():
                abstracts = [abstract.text for abstract in result.iter('abstractText')]
                if len(abstracts) > 0:
                    pmid_to_abs[pmid] = abstracts[0]
                    i += 1

    # predict on each PubMed ID & add to dataframe
    for pmid in pmid_to_abs:
        prob, isEpi = classify_abs.getPredictions(pmid, nlp, nlpSci, nlpSci2, 'my_model_orphanet_final')
        dz_results = dz_results.append({'pmid':pmid, 'abstract':pmid_to_abs[pmid], 'epi_prob':prob
                                         , 'is_epi':isEpi}, ignore_index=True)
    return dz_results#.sort_values('epi_prob', ascending=False)
    

In [7]:
#Test rank results function and generate estimated time of completion
test_num
t1 = time.time()
df = rank_results(names[1], test_num)
t2 = time.time()
avg_time = (t2-t1)/test_num
#print('Time taken to complete test: ' + str(datetime.timedelta(seconds=(t2-t1))))
print('Average seconds per rank results call: '+ str(avg_time))
print('Est. time to build dataset of '+str(num_dz*num_articles)+' abstracts: ' + str(datetime.timedelta(seconds=(avg_time*num_dz*num_articles))))
print('Est. completion is:',time.ctime(time.time()+avg_time*num_dz*num_articles))
df

2021-07-01 18:10:03.071801: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-07-01 18:10:03.071828: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-07-01 18:10:03.071846: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (pubmed.ncats.io): /proc/driver/nvidia/version does not exist
2021-07-01 18:10:03.072025: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-07-01 18:10:10.539664: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of

Average seconds per rank results call: 9.63810682296753
Est. time to build dataset of 15000 abstracts: 1 day, 16:09:31.602345
Est. completion is: Sat Jul  3 10:20:01 2021


Unnamed: 0,pmid,abstract,epi_prob,is_epi
0,33623412,Sudden Infant Death syndrome (SIDS) is a diagn...,0.009953,False
1,33652660,Sudden infant death syndrome (SIDS) is defined...,0.894104,True
2,33549332,Shi et al. recently identified a brainstem pep...,0.011342,False


In [8]:
#Actually get the dataset
frames = []
t1 = time.time()
for dz in names:
    frames.append(rank_results(dz, num_articles))
df = pd.concat(frames)
t2 = time.time()
print('Completion time is:',time.ctime(time.time()))
print('Time taken to build dataset of '+str(len(df.index))+' abstracts: ' + str(datetime.timedelta(seconds=(t2-t1))))

Completion time is: Fri Jul  2 01:00:50 2021
Time taken to build dataset of 2590 abstracts: 6:50:20.265462


In [9]:
df

Unnamed: 0,pmid,abstract,epi_prob,is_epi
0,33906942,Loss of the fragile X mental retardation prote...,0.015510,False
1,34089433,Trace elements have important functions in sev...,0.011081,False
2,34073785,Breastfeeding is associated with numerous heal...,0.927072,True
3,34068266,Fragile X syndrome (FXS) is a neurodevelopment...,0.020304,False
4,33632309,<h4>Background</h4>Sleep disorders are common ...,0.843288,True
...,...,...,...,...
14,33570643,Fanconi anemia (FA) is a complex genetic disor...,0.017008,False
15,33576801,Meiotic recombination increases genetic divers...,0.012714,False
16,34118472,Fanconi anemia (FA) due to biallelic mutations...,0.017601,False
17,32970355,"<h4>Background</h4>Brain atrophy, abnormal pit...",0.013975,False


In [10]:
#Select only those that are above than 0.5 probability
df_high = df[df.epi_prob > 0.5]
df_high

Unnamed: 0,pmid,abstract,epi_prob,is_epi
2,34073785,Breastfeeding is associated with numerous heal...,0.927072,True
4,33632309,<h4>Background</h4>Sleep disorders are common ...,0.843288,True
10,34121987,The human gut microbiome is the ecosystem of m...,0.994891,True
13,33820803,Prevalence of metabolic disturbances is higher...,0.995352,True
1,33652660,Sudden infant death syndrome (SIDS) is defined...,0.894104,True
...,...,...,...,...
14,33138063,Bardet-Biedl syndrome (BBS) is a rare autosoma...,0.822426,True
4,33803570,High-risk human papillomavirus (HPV) is preval...,0.995949,True
5,34045293,"Fanconi anemia, the most frequent genetic caus...",0.968908,True
12,32947577,Fanconi anemia (FA) is the most common inherit...,0.988090,True


In [11]:
positive_sentence_set = df_high.sort_values('epi_prob', ascending=False)
positive_sentence_set

Unnamed: 0,pmid,abstract,epi_prob,is_epi
6,31218158,In order to ascertain the current status of an...,0.998309,True
15,33985607,Brucellosis remains one of the main zoonoses w...,0.998234,True
3,33754462,<h4>Background</h4>Few data are available on t...,0.998059,True
5,33748277,Pseudoachondroplasia (PSACH) is an autosomal d...,0.997997,True
9,33581733,<h4>Background</h4>Pseudomyxoma peritonei (PMP...,0.997871,True
...,...,...,...,...
14,32708003,Patients with type 1 Gaucher disease (GD1) pre...,0.526981,True
4,33916693,Uveal melanoma (UM) is currently classified by...,0.525522,True
5,26026795,PURPOSE:3-Hydroxyisobutryl-CoA hydrolase (HIBC...,0.522585,True
9,34150882,<b>Background:</b> Brucellosis is an important...,0.518870,True


In [12]:
positive_sentence_set.to_csv('positive_abstract_set-.csv',index=False)

In [13]:
df.to_csv('whole_abstract_set.csv',index=False)