In [1]:
import xml.etree.ElementTree as ET
import re
import requests
import string
import numpy as np
import pandas as pd
import csv

In [2]:
# Parse the XML export from the working directory and keep a handle on its
# root element; the extraction loop further down walks `root`.
tree = ET.parse('Orpha Epi Data.xml')
root = tree.getroot()

In [3]:
def getAbs(pmid, timeout=30):
    """Fetch one article's abstract from the Europe PMC REST search endpoint.

    Parameters
    ----------
    pmid : str or int
        Article identifier; it is sent as the query ``EXT_ID:<pmid>``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` aborts the call.
        Without it a single stalled connection would block forever.

    Returns
    -------
    str
        Text of the first ``abstractText`` element found in the XML response,
        or ``''`` when the response contains none or the element is empty.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when ``timeout`` elapses.
    xml.etree.ElementTree.ParseError
        If the response body is not well-formed XML.
    """
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search'
    # Let requests build and percent-encode the query string rather than
    # concatenating the identifier into the URL by hand.
    params = {'query': 'EXT_ID:' + str(pmid), 'resulttype': 'core'}
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    # Named `document` so it does not shadow the notebook-level `root`.
    document = ET.fromstring(response.content)

    first = next(document.iter('abstractText'), None)
    if first is None or first.text is None:
        return ''
    return first.text

In [4]:
# Empty frame that fixes the output schema: one row per
# (disorder, prevalence entry, PMID) combination.
# The 'Disease Name' label previously carried a stray leading space, which made
# df['Disease Name'] raise KeyError and leaked into the exported CSV header.
df = pd.DataFrame(columns=['OrphaCode', 'Disease Name', 'GeoLoc', 'PMID', 'Abstract'])

In [5]:
# Flatten the XML tree into one row per (disorder, prevalence entry, PMID).
# Rows are collected in a plain list and turned into a DataFrame once at the
# end; assigning df.loc[i] row by row re-allocates the frame on every insert.
rows = []
# The same PMID shows up repeatedly in the data, so remember each abstract
# after the first download instead of requesting it again.
abstract_by_pmid = {}

for disorder in root.iter('Disorder'):
    # findtext() yields None for a missing element instead of raising
    # AttributeError the way .find(...).text does.
    code = disorder.findtext('./OrphaCode')
    name = disorder.findtext('./Name')
    # Each disorder, with its code and name, has multiple prevalence branches.
    for prevalence in disorder.findall('./PrevalenceList/Prevalence'):
        geoloc = prevalence.findtext('./PrevalenceGeographic/Name')
        source = prevalence.findtext('./Source')
        if not source or 'PMID' not in source:
            # Prevalence entries without a PMID are skipped. To keep them,
            # append [code, name, geoloc, '', ''] to `rows` here instead.
            continue
        # Each prevalence, with its geoloc and source, may cite several PMIDs.
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        for pmid in re.findall(r'\d{6,8}', source):
            if pmid not in abstract_by_pmid:
                abstract_by_pmid[pmid] = getAbs(pmid)
            # Duplicate PMIDs are intentionally kept as separate rows.
            rows.append([code, name, geoloc, pmid, abstract_by_pmid[pmid]])

# Reuse the column labels of the frame created earlier so the schema stays
# defined in a single place.
df = pd.DataFrame(rows, columns=df.columns)

In [6]:
# Preview the first rows of the assembled table as a quick sanity check.
df.head()

Unnamed: 0,OrphaCode,Disease Name,GeoLoc,PMID,Abstract
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Worldwide,11389160,<h4>Background</h4>We have previously describe...
1,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Worldwide,9689990,We report an inbred Omani family with four chi...
2,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Worldwide,11389160,<h4>Background</h4>We have previously describe...
3,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Worldwide,9689990,We report an inbred Omani family with four chi...
4,58,Alexander disease,Japan,21533827,Alexander disease (AxD) is a rare neurodegener...


In [7]:
# Preview the last rows; the highest index shown also indicates how many rows were collected.
df.tail()

Unnamed: 0,OrphaCode,Disease Name,GeoLoc,PMID,Abstract
10818,99791,Dentin dysplasia type II,Worldwide,10397672,"Dentin dysplasia, type II, is an inherited aut..."
10819,99792,Dentin dysplasia-sclerotic bones syndrome,Worldwide,264650,Patients with teeth showing all of the clinica...
10820,99792,Dentin dysplasia-sclerotic bones syndrome,Worldwide,264650,Patients with teeth showing all of the clinica...
10821,99776,Mosaic trisomy 9,Worldwide,22249800,CONTEXT:Mosaic trisomy 9 is considered to be a...
10822,99776,Mosaic trisomy 9,Worldwide,22249800,CONTEXT:Mosaic trisomy 9 is considered to be a...


In [8]:
# Persist the assembled table as UTF-8 CSV: column header row included,
# DataFrame index left out.
df.to_csv(
    'orphanet_epi_data2.csv',
    header=True,
    index=False,
    encoding='utf-8',
)