In [16]:
import requests
import pandas as pd

from IPython.display import display

## Associated Artifacts

- Doc & talk by Jared: [Which drug should I (not) give to my diabetes patient? - TIDBIT 3 - Clinical Diabetes - Jared](https://drive.google.com/open?id=18v_mg3yaAy22aGFyH7hkwRc1SBf046rm0-x1fWI-xOo)

## Options

In [3]:
# Base URL of the COHD REST API
server = 'http://cohd.nsides.io/api'

# Render long text columns (up to 255 characters) and never truncate rows
pd.set_option('display.max_colwidth', 255)
pd.set_option('display.max_rows', None)

## functions to perform REST API requests

In [4]:
# Utility functions

def json_to_df(results):
    """Build a pandas DataFrame from the ``'results'`` entry of a COHD JSON response."""
    records = results['results']
    return pd.DataFrame(records)

In [12]:
# COHD metadata functions

def datasets(timeout=30):
    """Fetch the datasets listed by the COHD server.

    Sends a GET request to ``{server}/metadata/datasets`` using the
    module-level ``server`` base URL.

    Parameters
    ----------
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30 so an unresponsive server cannot hang the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's ``'results'`` list. When the
        frame has exactly three columns they are ordered as
        ``dataset_id``, ``dataset_name``, ``dataset_description``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    url = '{server}/metadata/datasets'.format(server=server)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error body
    response.raise_for_status()
    df = json_to_df(response.json())

    if len(df.columns) == 3:
        # re-order the columns so that it displays in a more logical order
        df = df[['dataset_id', 'dataset_name', 'dataset_description']]
    return df

def domain_counts(dataset_id=None, timeout=30):
    """Fetch the per-domain counts reported by the COHD server.

    Sends a GET request to ``{server}/metadata/domainCounts`` using the
    module-level ``server`` base URL.

    Parameters
    ----------
    dataset_id : optional
        Sent as the ``dataset_id`` query parameter when not ``None``;
        omitted from the request otherwise.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30 so an unresponsive server cannot hang the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's ``'results'`` list. When the
        frame has exactly three columns they are ordered as
        ``dataset_id``, ``domain_id``, ``count``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    url = '{server}/metadata/domainCounts'.format(server=server)

    # Optional params
    params = {}
    if dataset_id is not None:
        params['dataset_id'] = dataset_id

    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error body
    response.raise_for_status()
    df = json_to_df(response.json())

    if len(df.columns) == 3:
        # re-order the columns so that it displays in a more logical order
        df = df[['dataset_id', 'domain_id', 'count']]
    return df

In [13]:
# COHD OMOP functions

def find_concept(concept_name, dataset_id=None, domain=None, min_count=1, timeout=30):
    """Search the COHD server for concepts matching a name.

    Sends a GET request to ``{server}/omop/findConceptIDs`` using the
    module-level ``server`` base URL.

    Parameters
    ----------
    concept_name : str
        Sent as the ``q`` query parameter.
    dataset_id : optional
        Sent as ``dataset_id`` when not ``None``; omitted otherwise.
    domain : optional
        Sent as ``domain`` when not ``None``; omitted otherwise.
    min_count : int, optional
        Sent as ``min_count`` (default 1).
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30 so an unresponsive server cannot hang the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's ``'results'`` list. When the
        frame has exactly seven columns they are ordered as ``concept_id``,
        ``concept_name``, ``domain_id``, ``concept_class_id``,
        ``vocabulary_id``, ``concept_code``, ``concept_count``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    url = '{server}/omop/findConceptIDs'.format(server=server)

    # Params
    params = {
        'q': concept_name,
        'min_count': min_count
    }
    if dataset_id is not None:
        params['dataset_id'] = dataset_id
    if domain is not None:
        params['domain'] = domain

    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error body
    response.raise_for_status()
    df = json_to_df(response.json())

    if len(df.columns) == 7:
        # re-order the columns so that it displays in a more logical order
        df = df[['concept_id', 'concept_name', 'domain_id', 'concept_class_id',
                 'vocabulary_id', 'concept_code', 'concept_count']]
    return df

In [14]:
# COHD Clinical Frequency functions

def associated_concept_domain_freq(concept_id, domain_id, dataset_id=None, timeout=30):
    """Fetch the concepts of one domain associated with a given concept.

    Sends a GET request to
    ``{server}/frequencies/associatedConceptDomainFreq`` using the
    module-level ``server`` base URL.

    Parameters
    ----------
    concept_id
        Sent as the ``concept_id`` query parameter.
    domain_id
        Sent as the ``domain`` query parameter.
    dataset_id : optional
        Sent as ``dataset_id`` when not ``None``; omitted otherwise.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30 so an unresponsive server cannot hang the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's ``'results'`` list. When the
        frame has exactly seven columns they are ordered as ``dataset_id``,
        ``concept_id``, ``associated_concept_id``,
        ``associated_concept_name``, ``associated_domain_id``,
        ``concept_count``, ``concept_frequency``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    url = '{server}/frequencies/associatedConceptDomainFreq'.format(server=server)

    # Params
    params = {
        'concept_id': concept_id,
        'domain': domain_id
    }
    if dataset_id is not None:
        params['dataset_id'] = dataset_id

    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error body
    response.raise_for_status()
    df = json_to_df(response.json())

    if len(df.columns) == 7:
        # re-order the columns so that it displays in a more logical order
        df = df[['dataset_id', 'concept_id', 'associated_concept_id', 'associated_concept_name',
                 'associated_domain_id', 'concept_count', 'concept_frequency']]
    return df

## General Mapping via COHD

Implementation:

Name (string) => Concept ID (integer, of "from_domain") => Associated Concept (of "to_domain") 

### All available domains

In [17]:
# Domain-level counts for COHD datasets 1 and 2, fetched in that order
ds1_domain_df, ds2_domain_df = (domain_counts(dataset_id=i) for i in (1, 2))

display(ds1_domain_df, ds2_domain_df)

Unnamed: 0,dataset_id,domain_id,count
0,1,Condition,10159
1,1,Device,170
2,1,Drug,10264
3,1,Ethnicity,2
4,1,Gender,4
5,1,Measurement,188
6,1,Observation,870
7,1,Procedure,8270
8,1,Race,32
9,1,Relationship,5


Unnamed: 0,dataset_id,domain_id,count
0,2,Condition,11952
1,2,Device,204
2,2,Drug,12334
3,2,Ethnicity,2
4,2,Gender,4
5,2,Measurement,235
6,2,Observation,993
7,2,Procedure,10816
8,2,Race,32
9,2,Relationship,6


In [18]:
# Both datasets must list the same domains before dataset 1 is used as the reference
assert set(ds1_domain_df["domain_id"]) == set(ds2_domain_df["domain_id"])

# Set of domain names accepted by the mapping functions
avail_domains = set(ds1_domain_df["domain_id"])
avail_domains

{'Condition',
 'Device',
 'Drug',
 'Ethnicity',
 'Gender',
 'Measurement',
 'Observation',
 'Procedure',
 'Race',
 'Relationship'}

### functions for mapping concepts

In [19]:
def map_one_concept_id(concept_id, to_domain, concept_freq_thld=0.005, top_n=5):
    """Map one concept ID to its associated concepts in ``to_domain``.

    Parameters
    ----------
    concept_id
        Concept ID passed to ``associated_concept_domain_freq``.
    to_domain : str
        Target domain. It is capitalized before being checked against the
        module-level ``avail_domains`` set, so ``"drug"`` and ``"Drug"``
        are equivalent.
    concept_freq_thld : float, optional
        When > 0, only rows with ``concept_frequency`` at or above this
        value are kept. When <= 0 no frequency filtering is done.
    top_n : int, optional
        When > 0, at most this many rows are returned. When <= 0 all rows
        are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``associated_concept_name`` and ``concept_frequency``,
        sorted by ``concept_frequency`` in descending order. When the
        lookup yields no rows, an empty frame with these same two columns
        is returned so callers can concatenate and sort it safely.

    Raises
    ------
    ValueError
        If the capitalized ``to_domain`` is not in ``avail_domains``.
    """
    domain = to_domain.capitalize()
    if domain not in avail_domains:
        raise ValueError("Error: available domains are {}. `map_one_concept_id` got {}".format(avail_domains, to_domain))

    columns = ["associated_concept_name", "concept_frequency"]

    df = associated_concept_domain_freq(concept_id, domain)
    if df.empty:
        # An empty API result may carry no columns at all; return a typed
        # frame with the same schema as the non-empty case.
        return pd.DataFrame({
            "associated_concept_name": pd.Series(dtype="object"),
            "concept_frequency": pd.Series(dtype="float64"),
        })

    if concept_freq_thld > 0:
        df = df.loc[df.concept_frequency >= concept_freq_thld, columns]
    else:
        df = df.loc[:, columns]

    df = df.sort_values(by="concept_frequency", ascending=False)

    if top_n > 0:
        # head() already copes with frames shorter than top_n
        df = df.head(top_n)

    return df

def map_multi_concept_ids(concept_ids, to_domain, concept_freq_thld=0.005, top_n=5):
    """Map several concept IDs to their associated concepts in ``to_domain``.

    Each ID is mapped with ``map_one_concept_id``; the per-ID results are
    combined, sorted by ``concept_frequency`` in descending order and
    de-duplicated on ``associated_concept_name`` (the highest-frequency row
    of each name is kept).

    Parameters
    ----------
    concept_ids : iterable
        Concept IDs to map. May be empty.
    to_domain : str
        Target domain. It is capitalized before being checked against the
        module-level ``avail_domains`` set.
    concept_freq_thld : float, optional
        Forwarded to ``map_one_concept_id``.
    top_n : int, optional
        Forwarded to ``map_one_concept_id`` and also applied to the combined
        result: when > 0, at most this many rows are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``associated_concept_name`` and ``concept_frequency``. An
        empty frame with these columns is returned when there are no IDs
        or none of them produced any rows.

    Raises
    ------
    ValueError
        If the capitalized ``to_domain`` is not in ``avail_domains``.
    """
    domain = to_domain.capitalize()
    if domain not in avail_domains:
        raise ValueError("Error: available domains are {}. `map_multi_concept_ids` got {}".format(avail_domains, to_domain))

    maps = [map_one_concept_id(concept_id, domain, concept_freq_thld=concept_freq_thld, top_n=top_n)
            for concept_id in concept_ids]
    # Empty per-ID frames may have no columns; drop them so that concat and
    # sort_values never see a frame without "concept_frequency".
    maps = [one_map for one_map in maps if not one_map.empty]

    if not maps:
        # pd.concat([]) raises ValueError, so build the empty result directly
        return pd.DataFrame({
            "associated_concept_name": pd.Series(dtype="object"),
            "concept_frequency": pd.Series(dtype="float64"),
        })

    df = pd.concat(maps, axis=0)
    df = df.sort_values(by="concept_frequency", ascending=False)
    df = df.drop_duplicates(subset="associated_concept_name", keep='first')

    if top_n > 0:
        df = df.head(top_n)

    return df

def find_concept_ids_by_name(name, domain):
    """Look up the concept IDs matching ``name`` within ``domain``.

    Parameters
    ----------
    name : str
        Concept name passed to ``find_concept``.
    domain : str
        Domain to search in. It is capitalized before being checked against
        the module-level ``avail_domains`` set.

    Returns
    -------
    pandas.Series
        The ``concept_id`` column of the ``find_concept`` result. An empty
        Series named ``concept_id`` is returned when the search result has
        no such column (e.g. no matches).

    Raises
    ------
    ValueError
        If the capitalized ``domain`` is not in ``avail_domains``.
    """
    normalized = domain.capitalize()
    if normalized not in avail_domains:
        raise ValueError("Error: available domains are {}. `find_concept_ids_by_name` got {}".format(avail_domains, domain))

    # No dataset_id is passed, so the server-side default dataset is searched
    df = find_concept(name, domain=normalized, min_count=1)
    if "concept_id" not in df.columns:
        # A search without hits can produce a frame with no columns
        return pd.Series([], dtype="int64", name="concept_id")
    return df["concept_id"]

def map_one_name(name, from_domain, to_domain, freq_thld=0.005, top_n=5):
    """Map a concept name in ``from_domain`` to associated concepts in ``to_domain``.

    The name is resolved to concept IDs with ``find_concept_ids_by_name``
    and those IDs are mapped with ``map_multi_concept_ids``.

    Parameters
    ----------
    name : str
        Concept name to look up.
    from_domain : str
        Domain in which ``name`` is searched.
    to_domain : str
        Domain of the associated concepts to return.
    freq_thld : float, optional
        Forwarded to ``map_multi_concept_ids`` as ``concept_freq_thld``.
    top_n : int, optional
        Forwarded to ``map_multi_concept_ids``; when > 0 the result is also
        capped at this many rows here.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``map_multi_concept_ids`` with three extra
        columns, ``name``, ``from_domain`` and ``to_domain``, holding the
        corresponding arguments on every row.
    """
    concept_ids = find_concept_ids_by_name(name, from_domain)

    df = map_multi_concept_ids(concept_ids, to_domain=to_domain, concept_freq_thld=freq_thld, top_n=top_n)
    df = df.assign(name=name, from_domain=from_domain, to_domain=to_domain)

    if top_n > 0:
        # Previously truncated an undefined `condition_df` (NameError);
        # the frame built above is the one that must be capped.
        df = df.head(top_n)

    return df

def map_multi_names(names, from_domain, to_domain, freq_thld=0.005, top_n=5):
    """Run ``map_one_name`` for every name and stack the results.

    The per-name frames are concatenated row-wise in the order of ``names``
    and the ``name`` column becomes the index of the returned DataFrame.
    ``from_domain``, ``to_domain``, ``freq_thld`` and ``top_n`` are passed
    through to ``map_one_name`` unchanged.
    """
    per_name = []
    for one_name in names:
        per_name.append(
            map_one_name(one_name, from_domain, to_domain, freq_thld=freq_thld, top_n=top_n)
        )

    return pd.concat(per_name, axis=0).set_index("name")

### find all phenotypes associated with given diabetes drugs

In [22]:
# Drugs for diabetes
# Diabetes drugs to look up in COHD
drug_names = [
    "metformin", "repaglinide", "glipizide", "sitagliptin",
    "insulin", "canagliflozin", "exenatide", "rosiglitazone",
]

# freq_thld <= 0 disables filtering on the frequency threshold;
# top_n <= 0 returns all results instead of only the top rows.
drug_condition_df = map_multi_names(
    drug_names,
    from_domain="drug",
    to_domain="condition",
    freq_thld=0,
    top_n=0,
)

In [25]:
# Report the (rows, columns) size of the combined drug-to-condition table
print(drug_condition_df.shape)

(14073, 4)


In [26]:
# Preview the first ten rows of the drug-to-condition table
display(drug_condition_df.head(10))

Unnamed: 0_level_0,associated_concept_name,concept_frequency,from_domain,to_domain
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
metformin,Essential hypertension,0.01035,drug,condition
metformin,Type 2 diabetes mellitus without complication,0.009377,drug,condition
metformin,Type 2 diabetes mellitus,0.007877,drug,condition
metformin,Hyperlipidemia,0.007686,drug,condition
metformin,Chest pain,0.004761,drug,condition
metformin,Coronary arteriosclerosis in native artery,0.004065,drug,condition
metformin,Dyspnea,0.003714,drug,condition
metformin,Electrocardiogram abnormal,0.003208,drug,condition
metformin,Obesity,0.002803,drug,condition
metformin,Abdominal pain,0.002795,drug,condition


### `insulin` <=> `Atelectasis` association comes out as a surprise

In [28]:
# Show the first five rows stored under the "insulin" index label
display(drug_condition_df.loc["insulin"].head(5))

Unnamed: 0_level_0,associated_concept_name,concept_frequency,from_domain,to_domain
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
insulin,Essential hypertension,0.012277,drug,condition
insulin,Type 2 diabetes mellitus without complication,0.009345,drug,condition
insulin,Hyperlipidemia,0.00926,drug,condition
insulin,Atelectasis,0.007841,drug,condition
insulin,Chest pain,0.007805,drug,condition


In [19]:
# Save the table as tab-separated text; the header row and the index column
# are written because that is to_csv's default behaviour.
drug_condition_df.to_csv("COHD_drug_x_phenotype.tsv", sep="\t")