Prepare the positive dataset.
1. Extract PubMed IDs from Orphanet epidemiology sources
2. Get MeSH terms for each PubMed ID
3. Add all articles with epidemiology, prevalence, or incidence MeSH terms to positive set

In [1]:
import xml.etree.ElementTree as ET
import re
import requests
from nltk.stem.wordnet import WordNetLemmatizer
from bs4 import BeautifulSoup
import string
import json
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
import spacy
from spacy.lang.en import English
#import sys
#!{sys.executable} -m pip install scispacy
#!{sys.executable} -m pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz 
nlpSci = spacy.load("en_ner_bc5cdr_md")

In [2]:
tree = ET.parse('en_product9_prev-6.25.21.xml')
root = tree.getroot()

Determine number of sources with PubMed IDs compared to the total number of sources

In [3]:
num_pmids = 0
c = 0
for child in root.iter('*'):
    if child.tag == 'Source':
        c+=1 #send Jennifer a pull request for the bug below. child.text needs to be str()
        if 'PMID' in str(child.text):
            pmids = re.findall('\d{6,8}', child.text)
            for pmid in pmids:
                num_pmids+=1
                c+=1

In [4]:
num_pmids, c

(10823, 26233)

Assemble set of pmids for epidemiology studies (prev_pmids)

In [5]:
prev_pmids = set()
i = 0
any_tags = 0 # number of articles with any MeSH tags (not just epidemiology)
for child in root.iter('*'):
    if child.tag == 'Source' and 'PMID' in str(child.text):
        pmids = re.findall('\d{6,8}', child.text)
        for pmid in pmids:
            i += 1
            if pmid not in prev_pmids:
                #Need to have a dataset of used pmids because Orphanet cites the same pmids multiple times
                is_case = False # is case report
                hasMesh = False # has epidemiology MeSH terms
                anyMesh = False # has any MeSH terms
            
                #Get publication info(abstract+MeSH) from EBI RESTful API
                url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'+pmid+'&resulttype=core'
                r = requests.get(url)
                pub_root = ET.fromstring(r.content)
                            
                for child in pub_root.iter('*'):
                    if 'mesh' in child.tag:
                        anyMesh = True
                    if child.tag == 'qualifierName' or child.tag == 'descriptorName':
                        if child.text.lower() in {'prevalence','epidemiology','incidence'}:
                            hasMesh = True
                    # exclude case reports
                    if child.tag == 'pubType':
                        if child.text == 'Case Reports':
                            is_case = True
                            break
                if anyMesh:
                    any_tags += 1
                if hasMesh and not is_case:
                    prev_pmids.add(pmid)
                    
            if i % 100 == 0:
                print('num articles:',i, 'num epi mesh:', len(prev_pmids), 'any mesh:',any_tags)

num articles: 100 num epi mesh: 23 any mesh: 50
num articles: 200 num epi mesh: 56 any mesh: 91
num articles: 300 num epi mesh: 67 any mesh: 108
num articles: 400 num epi mesh: 103 any mesh: 161
num articles: 500 num epi mesh: 155 any mesh: 221
num articles: 600 num epi mesh: 188 any mesh: 262
num articles: 700 num epi mesh: 216 any mesh: 304
num articles: 800 num epi mesh: 252 any mesh: 357
num articles: 900 num epi mesh: 282 any mesh: 416
num articles: 1000 num epi mesh: 320 any mesh: 474
num articles: 1100 num epi mesh: 344 any mesh: 520
num articles: 1200 num epi mesh: 357 any mesh: 561
num articles: 1300 num epi mesh: 394 any mesh: 617
num articles: 1400 num epi mesh: 417 any mesh: 662
num articles: 1500 num epi mesh: 453 any mesh: 718
num articles: 1600 num epi mesh: 474 any mesh: 754
num articles: 1700 num epi mesh: 524 any mesh: 816
num articles: 1800 num epi mesh: 577 any mesh: 878
num articles: 1900 num epi mesh: 613 any mesh: 921
num articles: 2000 num epi mesh: 649 any mesh

In [6]:
any_tags

4697

In [7]:
len(prev_pmids)

1510

In [8]:
pos_results_mesh = pd.DataFrame(columns=['pmid'])

In [9]:
pos_results_mesh['pmid'] = list(prev_pmids)

In [10]:
pos_results_mesh.to_csv('orphanet_epi_mesh-6.26.21.csv', index=False, encoding='utf-8')

Add abstract column to the dataframe

In [11]:
def getAbs(pmid):
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'+str(pmid)+'&resulttype=core'
    r = requests.get(url)
    root = ET.fromstring(r.content)
    
    results = [abstract.text for abstract in root.iter('abstractText')]
    
    if len(results) > 0:
        return results[0]
    else:
        return ''

In [12]:
orphanet_epi_mesh= pd.read_csv('orphanet_epi_mesh-6.26.21.csv')

In [13]:
abstracts = []
for i,row in orphanet_epi_mesh.iterrows():
    if i%50 == 0:
        print(i)
    abstracts.append(getAbs(row['pmid']))

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500


In [14]:
orphanet_epi_mesh['abstract'] = abstracts
orphanet_epi_mesh['abstract'].replace('', np.nan, inplace=True)
orphanet_epi_mesh.dropna(inplace=True)

In [15]:
orphanet_epi_mesh.to_csv('positive_dataset.csv', index=False, encoding='utf-8')

Option to add articles from searches for rare disease names that have epidemiology MeSH terms. (Resulted in poorer performance from my testing.)

In [None]:
# get list of all rare disease names on GARD
with open('records.json') as f:
    records = json.load(f)
    
disorders = set()

for entry in records:
    disorders.add(entry['GARD_Name'])

In [None]:
j=0
keywords = {'prevalence','epidemiology','incidence'}
all_disorders_mesh = pd.DataFrame(columns=['pmid', 'abstract'])
for dz in disorders:
    if j%50 == 0:
        print(j, len(pos_results_disorders))
    j+=1
    
    # get results from searching for rare disease name through EBI API
    term = ''
    dz_words = dz.split()
    for word in dz_words:
        term += word + '%20'
    query = term[:-3]
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query='+query+'&resulttype=core'
    r = requests.get(url)
    root = ET.fromstring(r.content)
    
    pmid_to_abs = {}
    
    for result in root.iter('result'):
        pmids = [pmid.text for pmid in result.iter('id')]
        if len(pmids) > 0:
            pmid = pmids[0]
            if pmid[0].isdigit():
                abstracts = [abstract.text for abstract in result.iter('abstractText')]
                if len(abstracts) > 0:
                    mesh = set(mesh.text.lower() for mesh in result.iter('descriptorName'))
                    mesh2 = set(mesh.text.lower() for mesh in result.iter('qualifierName'))
                    # add the pmid if its article has epidemiology MeSH terms
                    if len(mesh & keywords) != 0 or len(mesh2 & keywords) != 0:
                        all_disorders_mesh = all_disorders_mesh.append({'pmid':pmid, 'abstract':abstracts[0]}
                                         , ignore_index=True)
    

        
    

In [None]:
all_disorders_mesh.to_csv('all_disorders_mesh.csv', index=False, encoding='utf-8')

In [None]:
all_disorders_mesh = pd.read_csv('all_disorders_mesh.csv')

In [None]:
combined = pd.concat([all_disorders_mesh, orphanet_epi_mesh]).reset_index(drop=True)

In [None]:
combined.to_csv('all_mesh.csv', index=False, encoding='utf-8')