In [9]:
import aiohttp
import asyncio
import requests
import time
import pandas as pd

class Logger():
    def __init__(self) -> None:
        self.t0 = time.time()
        self.tl = self.t0

    def __call__(self, msg: str, *args):
        tn = time.time()
        ts = f"{tn - self.t0:9.4f}s"
        if (tn - self.tl) < 0.1:
            ds = "<0.1s"
        else:
            ds = f"{tn - self.tl:4.1f}s"
        self.tl = tn

        msg = f"{ts} ({ds}): {' '.join([msg, *[str(a) for a in args]])}"
        print(msg)

async def fetch(sess, url, log):
    async with sess.get(url) as req:
        if req.status != 200:
            req.raise_for_status()
        log(f"Read {url}.")
        return await req.text()

async def fetch_all(sess, urls, log):
    tasks = []
    for url in urls:
        task = asyncio.create_task(fetch(sess, url, log))
        tasks.append(task)
    res = await asyncio.gather(*tasks)
    return res

async def read_json_from_hdruk_api(endpoints):
    URL = "https://phenotypes.healthdatagateway.org/api/v1/public"
    urls =[f"{URL}/{ep}/?format=json" for ep in endpoints]
    
    log = Logger()
    log(f"Reading {len(urls)} URLs...")

    async with aiohttp.ClientSession() as sess:
        responses = await fetch_all(sess, urls, log)
    
    data = {ep: pd.read_json(res) for ep, res in zip(endpoints, responses)}
    log("Finished reading.")
    
    return data

In [11]:
resources = [
    "phenotypes",
    "concepts",
    "coding-systems",
    "data-sources",
    "tags",
    "collections",
]
data = await read_json_from_hdruk_api(resources)

   0.0000s (<0.1s): Reading 6 URLs...
   0.4706s ( 0.5s): Read https://phenotypes.healthdatagateway.org/api/v1/public/tags/?format=json.
   0.4757s (<0.1s): Read https://phenotypes.healthdatagateway.org/api/v1/public/coding-systems/?format=json.
   0.4907s (<0.1s): Read https://phenotypes.healthdatagateway.org/api/v1/public/collections/?format=json.


ClientResponseError: 500, message='Internal Server Error', url=URL('https://phenotypes.healthdatagateway.org/api/v1/public/phenotypes/?format=json')

In [65]:
df = data["phenotypes"]
df.head()

Unnamed: 0,phenotype_id,version_id,phenotype_name,type,author,owner,tags,collections,clinical_terminologies,data_sources,versions
0,PH1,2,COVID-19 infection,Disease or Syndrome,BHF CVD COVID UK Consortium,ieuan.scanlon,[],"[{'description': 'BHF Data Science Centre', 'i...","[{'name': 'ICD10 codes', 'id': 4}, {'name': 'S...","[{'id': 1, 'name': 'GPES Data for Pandemic Pla...","[{'version_id': 2, 'version_name': 'COVID-19 i..."
1,PH2,4,Anxiety,Disease or Syndrome,"Matthew J Carr, Sarah Steeg, Roger T Webb, Nav...",ieuan.scanlon,[],"[{'description': 'Phenotype Library', 'id': 18...","[{'name': 'Read codes v2', 'id': 5}]","[{'id': 6, 'name': 'CPRD Aurum', 'url': 'https...","[{'version_id': 4, 'version_name': 'Anxiety', ..."
2,PH3,6,Depression,Disease or Syndrome,"Matthew J Carr, Sarah Steeg, Roger T Webb, Nav...",ieuan.scanlon,[],"[{'description': 'Phenotype Library', 'id': 18...","[{'name': 'Read codes v2', 'id': 5}]","[{'id': 6, 'name': 'CPRD Aurum', 'url': 'https...","[{'version_id': 6, 'version_name': 'Depression..."
3,PH4,8,Self Harm,Disease or Syndrome,"Matthew J Carr, Sarah Steeg, Roger T Webb, Nav...",ieuan.scanlon,[],"[{'description': 'Phenotype Library', 'id': 18...","[{'name': 'Read codes v2', 'id': 5}]","[{'id': 6, 'name': 'CPRD Aurum', 'url': 'https...","[{'version_id': 8, 'version_name': 'Self Harm'..."
4,PH5,1509,Cardiovascular Disease,Disease or Syndrome,"Ellie Paige, Jessica Barret, David Stevens, Ru...",ieuan.scanlon,[],"[{'description': 'Phenotype Library', 'id': 18...","[{'name': 'Read codes v2', 'id': 5}]","[{'id': 5, 'name': 'CPRD GOLD', 'url': 'https:...","[{'version_id': 1509, 'version_name': 'Cardiov..."


In [67]:
details = []
log = Logger()
log("Reading from API...")
for ph_id in df.phenotype_id:
    data = requests.get(f"{URL}/phenotypes/{ph_id}/detail/?format=json").content
    details.append(pd.read_json(data.decode('utf-8')).iloc[0])
    log(f"Read {ph_id} details.")

   0.0000s (<0.1s): Reading from API...
   0.6237s (0.6s): Read PH1 details.
   1.3838s (0.8s): Read PH2 details.
   2.2563s (0.9s): Read PH3 details.
   4.5496s (2.3s): Read PH4 details.
   5.9107s (1.4s): Read PH5 details.
   7.1803s (1.3s): Read PH6 details.
   7.7271s (0.5s): Read PH7 details.
   8.8062s (1.1s): Read PH8 details.
   9.6378s (0.8s): Read PH9 details.
  10.0980s (0.5s): Read PH10 details.
  11.2768s (1.2s): Read PH11 details.
  12.9636s (1.7s): Read PH12 details.
  13.6868s (0.7s): Read PH13 details.
  14.4874s (0.8s): Read PH14 details.
  16.0500s (1.6s): Read PH15 details.
  16.6812s (0.6s): Read PH16 details.
  17.1131s (0.4s): Read PH17 details.
  18.3713s (1.3s): Read PH18 details.
  18.9537s (0.6s): Read PH19 details.
  19.6792s (0.7s): Read PH20 details.
  20.1734s (0.5s): Read PH21 details.
  20.6616s (0.5s): Read PH22 details.
  21.1609s (0.5s): Read PH23 details.
  21.6600s (0.5s): Read PH24 details.
  22.1639s (0.5s): Read PH25 details.
  22.6474s (0.5s): 

ValueError: If using all scalar values, you must pass an index

In [None]:
codes = []
log = Logger()
log("Reading from API...")
for ph_id in df.phenotype_id:
    data = requests.get(f"{URL}/phenotypes/{ph_id}/export/codes/?format=json").content
    codes.append(pd.read_json(data.decode('utf-8')).iloc[0])
    log(f"Read {ph_id} code list.")

{'citation_requirements',
 'concepts',
 'definition',
 'implementation',
 'publication_doi',
 'publication_link',
 'publications',
 'source_reference',
 'validation_performed'}

In [64]:
det_df

{'owner'}