In [11]:
import requests
import json
import pandas as pd

from IPython.display import display

## Possible Associated Artifacts

- Currently empty: [TIDBIT 4 - Clinical Asthma - Jared](https://drive.google.com/open?id=1YniQicZYQ0Z-WuUg0_zy5g9Taubeofw5qg46xPl-SXc)

This notebook aims to find the phenotypes that differ most between African American and Caucasian asthma patients, according to JHU Clinical Profiles.

## JHU Clinical Profiles URL patterns

JHU Clinical Profiles contains patient data for 3 diseases (_eds_, _asthma_ and _diabetes_) across 13 populations, which are identified by the following 13 keywords:

- _eds_
    - `jhu-eds-population`
    - `jhu-eds-population-Female`
    - `jhu-eds-population-WhiteorCaucasian`
    - `jhu-eds-population-Female-WhiteorCaucasian`
- _asthma_
    - `jhu-asthma-population`
    - `jhu-asthma-population-Female`
    - `jhu-asthma-population-Male`
    - `jhu-asthma-population-WhiteorCaucasian`
    - `jhu-asthma-population-BlackorAfricanAmerican`
    - `jhu-asthma-population-Other`
- _diabetes_
    - `jhu-diabetes-population`
    - `jhu-diabetes-population-Male`
    - `jhu-diabetes-population-Female`
    
For each population, 5 types of data (_conditions_, _procedures_, _hpos_, _labs_, and _medications_) are available for queries. The URLs follow the patterns below, where `{keywords}` is one of the population keywords listed above:

- https://hapi.clinicalprofiles.org/baseR4/ClinicalProfile/{keywords}-conditions 
- https://hapi.clinicalprofiles.org/baseR4/ClinicalProfile/{keywords}-procedures 
- https://hapi.clinicalprofiles.org/baseR4/ClinicalProfile/{keywords}-hpos 
- https://hapi.clinicalprofiles.org/baseR4/ClinicalProfile/{keywords}-labs 
- https://hapi.clinicalprofiles.org/baseR4/ClinicalProfile/{keywords}-medications

## Util functions for querying JHU Clinical Profiles

In [7]:
# Population keywords accepted by the JHU Clinical Profiles server,
# grouped by disease (eds, asthma, diabetes).
avail_keywords = {
    # eds
    "jhu-eds-population",
    "jhu-eds-population-Female",
    "jhu-eds-population-WhiteorCaucasian",
    "jhu-eds-population-Female-WhiteorCaucasian",
    # asthma
    "jhu-asthma-population",
    "jhu-asthma-population-Female",
    "jhu-asthma-population-Male",
    "jhu-asthma-population-WhiteorCaucasian",
    "jhu-asthma-population-BlackorAfricanAmerican",
    "jhu-asthma-population-Other",
    # diabetes
    "jhu-diabetes-population",
    "jhu-diabetes-population-Male",
    "jhu-diabetes-population-Female",
}

# Data types that can be appended to a population keyword to form a profile id.
avail_data_types = {
    "conditions",
    "procedures",
    "hpos",  # Human Phenotype Ontology
    "labs",
    "medications",
}

def make_hapi_url(keyword, data_type):
    """Build the ClinicalProfile URL for a population keyword and a data type.

    The profile id is ``<keyword>-<data_type>``, appended to the HAPI
    ClinicalProfile base URL.
    """
    base_url = "https://hapi.clinicalprofiles.org/baseR4/ClinicalProfile"
    profile_id = "{}-{}".format(keyword, data_type)
    return base_url + "/" + profile_id

def query_hapi(url, timeout_sec=30):
    """GET a JSON document from `url`, returning None on any failure.

    Parameters
    ----------
    url : str
        The URL to query.
    timeout_sec : int or float, default 30
        Timeout passed to `requests.get`.

    Returns
    -------
    The decoded JSON body when the server answers with status 200 and a valid
    JSON payload; otherwise None. A short diagnostic is printed for every
    failure except a user interrupt.
    """
    try:
        res = requests.get(url, timeout=timeout_sec)
    except requests.exceptions.Timeout:
        print('Timeout for URL: ' + url)
        return None
    except KeyboardInterrupt:
        # Kept from the original: interrupting the request yields None
        # instead of propagating the interrupt.
        return None
    except Exception as e:
        # `Exception` rather than `BaseException`, so SystemExit and
        # GeneratorExit are no longer swallowed.
        print('%s received for URL: %s' % (e, url))
        return None

    status_code = res.status_code
    if status_code != 200:
        print('Status code ' + str(status_code) + ' for url: ' + url)
        return None

    try:
        return res.json()
    except ValueError as e:
        # A 200 response with a non-JSON body would otherwise raise here and
        # break the "None on failure" contract.
        print('Invalid JSON (%s) received for URL: %s' % (e, url))
        return None

def parse_hapi_hpo_response_json(res_json, filter=True):
    """Turn a HAPI "hpos" response into a DataFrame sorted by fraction.

    Parameters
    ----------
    res_json : dict
        Decoded JSON response. It must contain an "hpo" list whose entries
        provide ``code[0]["coding"][0]["code"]``,
        ``code[0]["coding"][0]["display"]`` and ``fractionOfSubjects``.
    filter : bool, default True
        When True, drop phenotypes whose display label starts with
        "EXCLUDED:" and phenotypes whose fraction equals 0.

    Returns
    -------
    pandas.DataFrame
        Columns "HPO_code", "HPO_display" and "fraction", sorted by
        "fraction" in descending order. The index holds each entry's position
        in the response.
    """
    def parse_hapi_hpo_entry(hpo_entry):
        coding = hpo_entry["code"][0]["coding"][0]
        return coding["code"], coding["display"], hpo_entry["fractionOfSubjects"]

    hpo_tuples = [parse_hapi_hpo_entry(hpo_entry) for hpo_entry in res_json["hpo"]]

    hpo_df = pd.DataFrame(hpo_tuples, columns=["HPO_code", "HPO_display", "fraction"])
    hpo_df = hpo_df.sort_values(by="fraction", axis=0, ascending=False)

    if not filter:
        return hpo_df

    # Some phenotypes are marked as "EXCLUDED", indicating that they turned out
    # not to be relevant to the patients' diseases after further treatment.
    # Phenotypes with fraction 0 can be ignored; they may come from recording
    # mistakes.
    #
    # na=False: a missing display label counts as "not excluded". Without it
    # the mask can contain missing values and `~` fails on it.
    excluded_flag = hpo_df.HPO_display.str.startswith("EXCLUDED:", na=False)
    zero_frac_flag = (hpo_df.fraction == 0)
    keep_flag = (~excluded_flag) & (~zero_frac_flag)

    return hpo_df.loc[keep_flag, :]

## Functions for querying African American vs Caucasian asthma phenotypes

In [8]:
def _query_asthma_hpo(keyword, population):
    """Fetch, parse and label the filtered HPO table of one asthma population.

    Raises
    ------
    RuntimeError
        If `query_hapi` returns None, i.e. the request failed. Failing here
        gives a clear message instead of a TypeError inside the parser.
    """
    hpo_url = make_hapi_url(keyword, "hpos")
    res_json = query_hapi(hpo_url)
    if res_json is None:
        raise RuntimeError("No HPO data could be retrieved from: " + hpo_url)

    hpo_df = parse_hapi_hpo_response_json(res_json, filter=True)
    return hpo_df.assign(disease="asthma", population=population)


def query_asthma_cau_hpo():
    """Return the filtered HPO table of the White/Caucasian asthma population.

    The result carries two extra columns: disease="asthma" and
    population="Caucasian".
    """
    return _query_asthma_hpo("jhu-asthma-population-WhiteorCaucasian", "Caucasian")


def query_asthma_aa_hpo():
    """Return the filtered HPO table of the Black/African American asthma population.

    The result carries two extra columns: disease="asthma" and
    population="African-American".
    """
    return _query_asthma_hpo("jhu-asthma-population-BlackorAfricanAmerican", "African-American")

In [9]:
# Fetch the filtered HPO phenotype tables for the two asthma populations.
# Each call issues an HTTP request to the JHU Clinical Profiles server.
asthma_cau_hpo_df = query_asthma_cau_hpo()
asthma_aa_hpo_df = query_asthma_aa_hpo()

In [12]:
# Preview the highest-fraction phenotypes of each population
# (both tables are sorted by fraction, descending).
display(asthma_cau_hpo_df.head(), asthma_aa_hpo_df.head())

Unnamed: 0,HPO_code,HPO_display,fraction,disease,population
48,HP:0003193,Allergic rhinitis,0.22547,asthma,Caucasian
3,HP:0003074,Hyperglycemia,0.223382,asthma,Caucasian
20,HP:0005518,Increased mean corpuscular volume,0.212944,asthma,Caucasian
28,HP:0003124,Hypercholesterolemia,0.162839,asthma,Caucasian
27,HP:0002901,Hypocalcemia,0.152401,asthma,Caucasian


Unnamed: 0,HPO_code,HPO_display,fraction,disease,population
3,HP:0003074,Hyperglycemia,0.268657,asthma,African-American
48,HP:0003193,Allergic rhinitis,0.248756,asthma,African-American
23,HP:0025066,Decreased mean corpuscular volume,0.238806,asthma,African-American
27,HP:0002901,Hypocalcemia,0.231343,asthma,African-American
20,HP:0005518,Increased mean corpuscular volume,0.19403,asthma,African-American


In [None]:
# Save each population's phenotype table as a tab-separated file
# (header row written by default, index column omitted).
asthma_cau_hpo_df.to_csv(
    "JHU-clinical-profile-asthma-white-caucasian-hpo-all.tsv",
    sep="\t",
    index=False,
)
asthma_aa_hpo_df.to_csv(
    "JHU-clinical-profile-asthma-african-american-hpo-all.tsv",
    sep="\t",
    index=False,
)

In [13]:
### Check whether "Asthma" itself is included in the phenotype records.
# The match is case-insensitive: display labels start with a capital letter
# (e.g. "Allergic rhinitis"), so a search for lowercase "asthma" alone could
# never match a label such as "Asthma". na=False treats missing labels as
# non-matching.
print(asthma_cau_hpo_df.HPO_display.str.contains("asthma", case=False, na=False).any())
print(asthma_aa_hpo_df.HPO_display.str.contains("asthma", case=False, na=False).any())

False
False


## Merge the two asthma phenotype tables, calculate the fraction differences and sort by the absolute values of differences

In [14]:
# Outer-join the two populations on the HPO code, so that phenotypes recorded
# for only one population are kept as well.
asthma_aa_cau_hpo_df = asthma_aa_hpo_df.loc[:, ["HPO_code", "HPO_display", "fraction", "disease"]].merge(
    asthma_cau_hpo_df.loc[:, ["HPO_code", "HPO_display", "fraction"]],
    on="HPO_code", how="outer", suffixes=("_AA", "_CAU"))

# Fill the gaps left by the outer join. The filled columns are assigned back
# to the frame: `df.loc[:, col].fillna(..., inplace=True)` acts on a temporary
# object and leaves the frame untouched under pandas copy-on-write.
#   - disease:      rows present only in the Caucasian table have no disease
#   - HPO_display:  prefer the AA label, fall back to the CAU label
#   - fractions:    a phenotype absent from a population counts as fraction 0
asthma_aa_cau_hpo_df = asthma_aa_cau_hpo_df.assign(
    disease=asthma_aa_cau_hpo_df["disease"].fillna("asthma"),
    HPO_display=asthma_aa_cau_hpo_df["HPO_display_AA"].fillna(asthma_aa_cau_hpo_df["HPO_display_CAU"]),
    fraction_AA=asthma_aa_cau_hpo_df["fraction_AA"].fillna(0),
    fraction_CAU=asthma_aa_cau_hpo_df["fraction_CAU"].fillna(0),
).drop(["HPO_display_AA", "HPO_display_CAU"], axis=1)

asthma_aa_cau_hpo_df = asthma_aa_cau_hpo_df.assign(
    fraction_diff=asthma_aa_cau_hpo_df.fraction_AA - asthma_aa_cau_hpo_df.fraction_CAU)

# Order rows by |fraction_diff|, largest first. The sorted index labels are
# passed to the label-based `.loc`, instead of the positional output of
# `argsort()`; the stable sort keeps tied rows in their merged order.
asthma_aa_cau_hpo_df = asthma_aa_cau_hpo_df.loc[
    asthma_aa_cau_hpo_df["fraction_diff"].abs().sort_values(ascending=False, kind="mergesort").index,
    ["disease", "HPO_code", "HPO_display", "fraction_AA", "fraction_CAU", "fraction_diff"]]

In [15]:
# Show the 10 phenotypes with the largest absolute fraction difference.
asthma_aa_cau_hpo_df.iloc[:10]

Unnamed: 0,disease,HPO_code,HPO_display,fraction_AA,fraction_CAU,fraction_diff
2,asthma,HP:0025066,Decreased mean corpuscular volume,0.238806,0.141962,0.096844
62,asthma,HP:0000821,Hypothyroidism,0.017413,0.104384,-0.086971
11,asthma,HP:0003124,Hypercholesterolemia,0.077114,0.162839,-0.085725
21,asthma,HP:0002155,Hypertriglyceridemia,0.047264,0.129436,-0.082173
20,asthma,HP:0003077,Hyperlipidemia,0.049751,0.129436,-0.079685
3,asthma,HP:0002901,Hypocalcemia,0.231343,0.152401,0.078942
6,asthma,HP:0003155,Elevated alkaline phosphatase,0.131841,0.068894,0.062947
5,asthma,HP:0001513,Obesity,0.151741,0.091858,0.059883
0,asthma,HP:0003074,Hyperglycemia,0.268657,0.223382,0.045275
8,asthma,HP:0001894,Thrombocytosis,0.09204,0.048017,0.044023


In [None]:
# Save the AA-vs-CAU comparison table as a tab-separated file
# (header row written by default, index column omitted).
asthma_aa_cau_hpo_df.to_csv(
    "JHU-clinical-profile-asthma-AA-vs-CAU-hpo.tsv",
    sep="\t",
    index=False,
)