Prepare the positive dataset.
1. Extract PubMed IDs from Orphanet epidemiology sources
2. Get MeSH terms for each PubMed ID
3. Add all articles with epidemiology, prevalence, or incidence MeSH terms to positive set

In [1]:
import xml.etree.ElementTree as ET
import re
import requests
from nltk.stem.wordnet import WordNetLemmatizer
from bs4 import BeautifulSoup
import string
import json
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
import spacy
from spacy.lang.en import English
nlpSci = spacy.load("en_ner_bc5cdr_md")

In [2]:
tree = ET.parse('en_product9_prev.xml')
root = tree.getroot()

Determine number of sources with PubMed IDs compared to the total number of sources

In [21]:
num_pmids = 0
c = 0
for child in root.iter('*'):
    if child.tag == 'Source':
        c+=1
        if 'PMID' in child.text:
            pmids = re.findall('\d{6,8}', child.text)
            for pmid in pmids:
                num_pmids+=1
                c+=1

In [23]:
num_pmids, c

(10845, 26296)

Assemble set of pmids for epidemiology studies (prev_pmids)

In [15]:
prev_pmids = set()
i = 0
any_tags = 0 # number of articles with any MeSH tags (not just epidemiology)
for child in root.iter('*'):
    if child.tag == 'Source' and 'PMID' in child.text:
        pmids = re.findall('\d{6,8}', child.text)
        for pmid in pmids:
            if i % 100 == 0:
                print('num articles:',i, 'num epi mesh:', len(prev_pmids), 'any mesh:',any_tags)
            i += 1
            if pmid not in prev_pmids:
                is_case = False # is case report
                url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'+pmid+'&resulttype=core'
                r = requests.get(url)
                pub_root = ET.fromstring(r.content)
                
                hasMesh = False # has epidemiology MeSH terms
                anyMesh = False # has any MeSH terms
    
                for child in pub_root.iter('*'):
                    if 'mesh' in child.tag:
                        anyMesh = True
                    if child.tag == 'qualifierName' or child.tag == 'descriptorName':
                        if child.text.lower() in {'prevalence','epidemiology','incidence'}:
                            hasMesh = True
                    # exclude case reports
                    if child.tag == 'pubType':
                        if child.text == 'Case Reports':
                            is_case = True
                            break
                if anyMesh:
                    any_tags += 1
                if hasMesh and not is_case:
                    prev_pmids.add(pmid)

num articles: 0 num epi mesh: 0 any mesh: 0
num articles: 100 num epi mesh: 22 any mesh: 48
num articles: 200 num epi mesh: 55 any mesh: 90
num articles: 300 num epi mesh: 66 any mesh: 106
num articles: 400 num epi mesh: 102 any mesh: 160
num articles: 500 num epi mesh: 155 any mesh: 219
num articles: 600 num epi mesh: 187 any mesh: 261
num articles: 700 num epi mesh: 215 any mesh: 303
num articles: 800 num epi mesh: 250 any mesh: 355
num articles: 900 num epi mesh: 279 any mesh: 413
num articles: 1000 num epi mesh: 318 any mesh: 472
num articles: 1100 num epi mesh: 342 any mesh: 519
num articles: 1200 num epi mesh: 355 any mesh: 560
num articles: 1300 num epi mesh: 391 any mesh: 615
num articles: 1400 num epi mesh: 414 any mesh: 659
num articles: 1500 num epi mesh: 451 any mesh: 715
num articles: 1600 num epi mesh: 472 any mesh: 753
num articles: 1700 num epi mesh: 520 any mesh: 813
num articles: 1800 num epi mesh: 574 any mesh: 874
num articles: 1900 num epi mesh: 611 any mesh: 920
n

In [24]:
any_tags

4691

In [18]:
len(prev_pmids)

1506

In [25]:
pos_results_mesh = pd.DataFrame(columns=['pmid'])

In [26]:
pos_results_mesh['pmid'] = list(prev_pmids)

In [19]:
pos_results_mesh.to_csv('orphanet_epi_mesh.csv', index=False)

Add abstract column to the dataframe

In [23]:
def getAbs(pmid):
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'+str(pmid)+'&resulttype=core'
    r = requests.get(url)
    root = ET.fromstring(r.content)
    
    results = [abstract.text for abstract in root.iter('abstractText')]
    
    if len(results) > 0:
        return results[0]
    else:
        return ''
    

In [43]:
orphanet_epi_mesh= pd.read_csv('orphanet_epi_mesh.csv')

In [48]:
abstracts = []
for i,row in orphanet_epi_mesh.iterrows():
    if i%50 == 0:
        print(i)
    abstracts.append(getAbs(row['pmid']))

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500


In [None]:
orphanet_epi_mesh['abstract'] = abstracts
orphanet_epi_mesh['abstract'].replace('', np.nan, inplace=True)
orphanet_epi_mesh.dropna(inplace=True)

In [60]:
orphanet_epi_mesh.to_csv('orphanet_epi_mesh.csv', index=False)

Option to add articles from searches for rare disease names that have epidemiology MeSH terms. (Resulted in poorer performance from my testing.)

In [25]:
# get list of all rare disease names on GARD
with open('records.json') as f:
    records = json.load(f)
    
disorders = set()

for entry in records:
    disorders.add(entry['GARD_Name'])

In [29]:
j=0
keywords = {'prevalence','epidemiology','incidence'}
all_disorders_mesh = pd.DataFrame(columns=['pmid', 'abstract'])
for dz in disorders:
    if j%50 == 0:
        print(j, len(pos_results_disorders))
    j+=1
    
    # get results from searching for rare disease name through EBI API
    term = ''
    dz_words = dz.split()
    for word in dz_words:
        term += word + '%20'
    query = term[:-3]
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
    r = requests.get(url)
    root = ET.fromstring(r.content)
    
    pmid_to_abs = {}
    
    for result in root.iter('result'):
        pmids = [pmid.text for pmid in result.iter('id')]
        if len(pmids) > 0:
            pmid = pmids[0]
            if pmid[0].isdigit():
                abstracts = [abstract.text for abstract in result.iter('abstractText')]
                if len(abstracts) > 0:
                    mesh = set(mesh.text.lower() for mesh in result.iter('descriptorName'))
                    mesh2 = set(mesh.text.lower() for mesh in result.iter('qualifierName'))
                    # add the pmid if its article has epidemiology MeSH terms
                    if len(mesh & keywords) != 0 or len(mesh2 & keywords) != 0:
                        all_disorders_mesh = all_disorders_mesh.append({'pmid':pmid, 'abstract':abstracts[0]}
                                         , ignore_index=True)
    

        
    

0 0
50 25
100 54
150 90
200 113
250 160
300 193
350 222
400 253
450 280
500 315
550 352
600 381
650 414
700 432
750 455
800 479
850 501
900 541
950 581
1000 600
1050 632
1100 656
1150 689
1200 714
1250 764
1300 804
1350 831
1400 851
1450 876
1500 890
1550 912
1600 937
1650 960
1700 991
1750 1018
1800 1045
1850 1065
1900 1091
1950 1142
2000 1167
2050 1202
2100 1229
2150 1279
2200 1309
2250 1328
2300 1361
2350 1392
2400 1419
2450 1447
2500 1498
2550 1535
2600 1556
2650 1592
2700 1628
2750 1658
2800 1694
2850 1721
2900 1771
2950 1816
3000 1846
3050 1875
3100 1896
3150 1926
3200 1950
3250 1983
3300 2012
3350 2055
3400 2079
3450 2103
3500 2129
3550 2156
3600 2185
3650 2202
3700 2227
3750 2255
3800 2278
3850 2310
3900 2340
3950 2372
4000 2410
4050 2437
4100 2459
4150 2490
4200 2523
4250 2545
4300 2563
4350 2596
4400 2623
4450 2649
4500 2682
4550 2705
4600 2741
4650 2773
4700 2805
4750 2830
4800 2856
4850 2892
4900 2918
4950 2939
5000 2961
5050 2987
5100 3015
5150 3065
5200 3105
5250 3129
530

In [41]:
all_disorders_mesh.to_csv('all_disorders_mesh.csv', index=False)

In [63]:
all_disorders_mesh = pd.read_csv('all_disorders_mesh.csv')

In [70]:
combined = pd.concat([all_disorders_mesh, orphanet_epi_mesh]).reset_index(drop=True)

In [71]:
combined.to_csv('all_mesh.csv', index=False)