# Get data for natural history studies

In [45]:
import json
import urllib
from urllib import request
from urllib import parse
import neo4j
from neo4j import GraphDatabase, ServiceUnavailable, basic_auth
import datetime
from datetime import date
import time
import logging, logging.config
import itertools
from itertools import islice
import pandas

### Get the list of diseases from GARD

In [15]:
def get_GARD_diseases_list():
        """Return the GARD diseases as a dict mapping GARD id to disease name.

        Runs a Cypher query against the Neo4j server at ``disease.ncats.io``
        that selects the distinct ``gard_id`` and ``name`` of every ``DATA``
        node related to an ``S_GARD`` node.

        Returns:
                dict: ``{gard_id: name}``, with values exactly as stored on the
                node. An empty dict is returned when the database is
                unavailable; the error is printed rather than raised, so
                callers can always iterate the result.
        """

        # The driver is created inside the ``try`` so that a failure to reach
        # the server while connecting is handled the same way as a failure
        # while running the query.
        try:
                with GraphDatabase.driver("bolt://disease.ncats.io:80") as driver:
                        with driver.session() as session:
                                cypher_query = '''
                                match p = (d:DATA)-[]-(m:S_GARD) return distinct d.gard_id, d.name
                                '''
                                results = session.run(cypher_query, parameters={})
                                # Consume the result while the session is still open.
                                myData = {record['d.gard_id']: record['d.name'] for record in results}

                        return myData
        except ServiceUnavailable as e:
                print(e)
                # Previously fell through and returned None, which broke the
                # caller's ``.items()`` call with an unrelated AttributeError.
                return {}

# Query the GARD graph once and keep the result at module level; the
# collection loop further down iterates over ``gard_diseases.items()``.
gard_diseases = get_GARD_diseases_list()

### Fetch article IDs from PubMed and abstracts from Europe PMC (EBI)

In [83]:
def find_articles(keyword, mindate, maxdate, retmax=5):
        """Search PubMed through the NCBI ESearch endpoint.

        Args:
                keyword: Search expression sent as the ESearch ``term``.
                mindate: Lower bound of the date range, sent as ``mindate``
                        (the caller in this file passes ``"1900/01/01"``).
                maxdate: Upper bound of the date range, sent as ``maxdate``.
                retmax: Maximum number of article ids to request. Defaults to
                        5, the value that used to be hard-coded.

        Returns:
                The decoded JSON response as a Python object. The caller in this
                file reads ``['esearchresult']['count']`` and
                ``['esearchresult']['idlist']`` from it.
        """

        params = urllib.parse.urlencode({'db': 'pubmed', 'term': keyword, 'mindate':mindate, 'maxdate':maxdate, 'retmode': 'json', 'retmax':retmax})
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?%s" % params
        # Use the response as a context manager so the connection is closed
        # promptly; this function is called once per disease in a long loop.
        with request.urlopen(url) as response:
                data = response.read().decode()
        return json.loads(data)

def fetch_abstract(pubmedID):
        """Look up one article in the Europe PMC REST search service.

        Args:
                pubmedID: Article id used in the ``EXT_ID:`` query. It may be a
                        string or an int; it is converted to text and
                        percent-encoded before being placed in the URL.

        Returns:
                The decoded JSON response as a Python object. The caller in this
                file reads the first entry of ``['resultList']['result']``.
        """

        # str() accepts numeric ids; quote() keeps a stray reserved character
        # from corrupting the query string. Plain digit ids are unchanged.
        encodedID = parse.quote(str(pubmedID), safe='')
        ebiUrl = "https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:" + encodedID + "&resulttype=core&format=json"
        # Close the HTTP response promptly instead of leaving it to the GC.
        with request.urlopen(ebiUrl) as response:
                ebiData = response.read().decode()
        return json.loads(ebiData)


### For each disease, search PubMed for natural history studies and save each article's title and abstract from Europe PMC (EBI)

In [93]:
# Collect natural-history articles for a slice of the GARD diseases.
#
# For every disease with a name, PubMed is searched for
# "<disease name> Natural History"; up to five hits are looked up in
# Europe PMC and one row per article is recorded. A disease with no hits
# gets a single placeholder row filled with 'N/A'. The CSV is rewritten
# after every disease so an interrupted run keeps the work done so far.

# (gard_id, name) pairs handled by this run: entries 1000..6323 of the mapping.
gdiseases = list(islice(gard_diseases.items(),1000,6324))
columns = ['gardid','disease','pmid','pubTypes','title','url','abstract']


def _first_ebi_result(ebi):
    """Return the first hit of a Europe PMC response, or {} when there is none."""
    hits = ebi.get('resultList', {}).get('result', [])
    return hits[0] if hits else {}


# Rows are accumulated in a plain list and turned into a DataFrame once per
# disease: DataFrame.append was removed in pandas 2.0 and copied the whole
# frame on every call.
rows = []
df = pandas.DataFrame(columns=columns)
for gard_id, disease in gdiseases:
    print(disease)
    if disease is not None:
        # Pause between requests to stay polite to the remote services.
        time.sleep(0.3)
        pubmed_result = find_articles(disease + ' Natural History',"1900/01/01","2020/01/01")
        esearch = pubmed_result.get('esearchresult', {})
        if int(esearch.get('count', 0)) > 0:
            print (esearch['idlist'])
            top5 = list(islice(esearch['idlist'],5))
            for pmid in top5:
                time.sleep(0.3)
                hit = _first_ebi_result(fetch_abstract(pmid))
                rows.append({
                    'gardid': gard_id,
                    'disease': disease,
                    'pmid': pmid,
                    'pubTypes': hit.get('pubTypeList', {}).get('pubType', 'Not Found'),
                    'title': hit.get('title', 'Not Found'),
                    'url': 'https://pubmed.ncbi.nlm.nih.gov/'+pmid,
                    'abstract': hit.get('abstractText', 'Not Found'),
                })
        else:
            # Placeholder row so diseases without any article still appear.
            # 'title' is filled in as well, matching the other columns (it
            # used to be left out and showed up as an empty cell).
            rows.append({
                'gardid': gard_id,
                'disease': disease,
                'pmid': 'N/A',
                'pubTypes': 'N/A',
                'title': 'N/A',
                'url': 'N/A',
                'abstract': 'N/A',
            })
    # Checkpoint after every disease, including those skipped for having no name.
    df = pandas.DataFrame(rows, columns=columns)
    df.to_csv("nhs_by_disease.csv")


GRACILE syndrome
Ablepharon macrostomia syndrome
['11807864']
Acanthocheilonemiasis
Abetalipoproteinemia
['9600371']
None
Acromesomelic dysplasia
Acromicric dysplasia
['28077185']
Agnosia
['24870331', '23933572', '17028557', '9754238', '1291910']
Alternating hemiplegia of childhood


KeyboardInterrupt: 