In [1]:
import xml.etree.ElementTree as ET
import re
import requests
from nltk.stem.wordnet import WordNetLemmatizer
from bs4 import BeautifulSoup
import string
import json
import pandas as pd
from nltk.stem import PorterStemmer
import spacy
from spacy.lang.en import English
nlpSci = spacy.load("en_ner_bc5cdr_md")

### Get Orphanet articles

In [16]:
tree = ET.parse('en_product9_prev.xml')
root = tree.getroot()

In [17]:
prev_pmids = set()

for child in root.iter('*'):
    if child.tag == 'Source' and 'PMID' in child.text:
        pmids = re.findall('\d{6,8}', child.text)
        for pmid in pmids:
            if pmid not in prev_pmids:
                prev_pmids.add(pmid)

### Prepare negative examples

1. get all GARD disorders
2. for each disorder, find 5 (or fewer) article pmids resulting from searching it on PubMed
3. filter epidemiology from resulting pmids

In [5]:
def getAbs(pmid):
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'+pmid+'&resulttype=core'
    r = requests.get(url)
    root = ET.fromstring(r.content)
    
    results = [abstract.text for abstract in root.iter('abstractText')]
    
    if len(results) > 0:
        return results[0]
    else:
        return ''
    

Get all GARD dzs from neo4j

In [6]:
with open('records.json') as f:
    records = json.load(f)

In [7]:
disorders = set()

for entry in records:
    disorders.add(entry['GARD_Name'])

Get PubMed results for each dz name

In [8]:
j=0
disorder_to_results = {}
all_results = pd.DataFrame(columns=['pmid', 'abstract', 'mesh'])
for dz in disorders:
    if j%50 == 0:
        print(j, len(all_results))
    j+=1
    
    term = ''
    dz_words = dz.split()
    for word in dz_words:
        term += word + '%20'
    query = term[:-3]
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
    r = requests.get(url)
    root = ET.fromstring(r.content)
    
    pmid_to_abs = {}
    i = 0
    
    for result in root.iter('result'):
        if i >= 5:
            break
        pmids = [pmid.text for pmid in result.iter('id')]
        if len(pmids) > 0:
            pmid = pmids[0]
            if pmid[0].isdigit():
                abstracts = [abstract.text for abstract in result.iter('abstractText')]
                if len(abstracts) > 0:
                    mesh = [mesh.text for mesh in result.iter('descriptorName')]
                    pmid_to_abs[pmid] = [abstracts[0], mesh]
                    i += 1
    
    disorder_to_results[dz] = i
    for pmid in pmid_to_abs:
        all_results = all_results.append({'pmid':pmid, 'abstract':pmid_to_abs[pmid][0], 'mesh':pmid_to_abs[pmid][1]}
                                         , ignore_index=True)
    

0 0
50 240
100 469
150 712
200 950
250 1180
300 1419
350 1652
400 1892
450 2128
500 2360
550 2596
600 2846
650 3081
700 3326
750 3568
800 3797
850 4042
900 4252
950 4492
1000 4733
1050 4968
1100 5203
1150 5444
1200 5672
1250 5902
1300 6135
1350 6364
1400 6604
1450 6848
1500 7095
1550 7334
1600 7554
1650 7795
1700 8023
1750 8250
1800 8474
1850 8717
1900 8950
1950 9196
2000 9424
2050 9656
2100 9898
2150 10124
2200 10362
2250 10584
2300 10819
2350 11049
2400 11269
2450 11513
2500 11762
2550 11970
2600 12203
2650 12441
2700 12673
2750 12908
2800 13147
2850 13385
2900 13614
2950 13854
3000 14088
3050 14312
3100 14543
3150 14762
3200 15000
3250 15236
3300 15484
3350 15716
3400 15953
3450 16189
3500 16423
3550 16656
3600 16885
3650 17129
3700 17367
3750 17595
3800 17840
3850 18071
3900 18298
3950 18533
4000 18780
4050 19017
4100 19256
4150 19496
4200 19735
4250 19968
4300 20203
4350 20448
4400 20688
4450 20919
4500 21156
4550 21393
4600 21622
4650 21858
4700 22098
4750 22331
4800 22567
4850 2

Filter results

In [9]:
len(all_results)

28515

Drop results that are also found in the Orphanet dataset

In [19]:
for i,row in all_results.iterrows():
    if row['pmid'] in prev_pmids:
        all_results = all_results.drop([i])

In [20]:
len(all_results)

28046

Drop results with epidemiology keywords

In [24]:
i = 0
for i,row in all_results.iterrows():
    if keywordSearch(row['abstract']):
        all_results = all_results.drop([i])

In [25]:
len(all_results)

25040

Drop results with epidemiology MeSH terms

In [26]:
i = 0
for i,row in all_results.iterrows():
    for term in row['mesh']:
        if term.lower() in {'epidemiology','prevalence','incidence'}:
            all_results = all_results.drop([i])
            break

In [27]:
len(all_results)

24990

In [23]:
# check for presence of tokenized epidemiology keywords in the text
def keywordSearch(sample):
    if 'preval' in sample or 'incid' in sample or 'epidemio' in sample:
        return True
    return False

In [28]:
all_results.to_csv('negative_dataset.csv', index=False, columns=['pmid','abstract'])