## Manuscript Caffeine Analysis
This notebook presents a meta-analysis of "caffeine clearance rate vs. dosage", stratified into three subgroups. All data are publicly available at www.pk-db.com/api/v1/.

The notebook is structured into three parts. 
1. data pooling
2. data inference
3. data visualization

In [10]:
# import libraries
# for conciseness, basic helper functions are imported from utils

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
from utils import merge, preprocess_interventions,preprocess_outputs,preprocess_groups, preprocess_individuals
import pandas as pd

{'token': '11089428e57df500715342c979d020ef79ba0f81'}


# 1. Data Pooling 

In [12]:
from urllib.parse import urljoin, urlencode
import requests
def flatten_json(y):
    """
    Flatten a nested json object into a single, one-level dictionary.

    Keys of nested dictionaries are joined with '_' to form the key of the
    flat dictionary, e.g. {"a": {"b": 1}} becomes {"a_b": 1}. Only values
    whose type is exactly dict are descended into; every other value
    (lists included) is stored as it is.
    """

    def walk(node, prefix):
        # Yield one (flat key, value) pair per leaf, depth first.
        if type(node) is dict:
            for key in node:
                yield from walk(node[key], prefix + key + '_')
        else:
            # prefix always ends with the separator; strip it for the key.
            yield prefix[:-1], node

    return dict(walk(y, ''))

def get_data(url, headers=None, **kwargs):
    """
    Gets the data from a paginated rest api and returns it as a DataFrame.

    The endpoint is requested once to read the number of pages from the
    "last_page" field of the response. Every page from 1 up to that number
    is then requested, the records found under ["data"]["data"] are
    collected, and each record is flattened with flatten_json.

    :param url: url of the endpoint.
    :param headers: optional dict of HTTP headers sent with every request.
        You might want to add a login to your request, e.g.
        {'Authorization': f'Token {token}'}. Thereby, you get access to data
        from your closed access studies.
    :param kwargs: query parameters, url-encoded and appended to the url.
    :return: pandas DataFrame with one row per collected record.
    :raises requests.HTTPError: if a request returns an error status code.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if headers is None:
        headers = {}

    actual_url = urljoin(url, "?" + urlencode(kwargs))

    # This request only serves to find out how many pages exist. It carries
    # the same headers as the page requests, so that the page count belongs
    # to the same (possibly authenticated) view of the data.
    response = requests.get(actual_url, headers=headers)
    response.raise_for_status()
    num_pages = response.json()["last_page"]

    data = []
    for page in range(1, num_pages + 1):
        # Encoding the page number together with the other parameters keeps
        # the url valid even when no query parameters were given.
        page_query = urlencode({**kwargs, "page": page})
        url_current = urljoin(url, "?" + page_query)
        response = requests.get(url_current, headers=headers)
        response.raise_for_status()
        data += response.json()["data"]["data"]

    flatten_data = [flatten_json(d) for d in data]
    return pd.DataFrame(flatten_data)

In [13]:
# Root url of the api; the endpoint names are appended to it.
entry_point = "https://pk-db.com/api/v1/"

# Parameters that are part of every query.
base_format = dict(format="json", page_size=1000)

# Filters that are specific to the kind of data requested.
subject_params = dict(healthy="true")
output_params = dict(normed="true", measurement_type="clearance", substance="caffeine")
# TODO: this search is not working, check why
intervention_params = dict(normed="true", application="single dose", substance="caffeine")

# One entry per endpoint: the shared parameters followed by its own filters.
query_params = dict(
    outputs_elastic=dict(base_format, **output_params),
    interventions_elastic=dict(base_format, **intervention_params),
    individuals_elastic=dict(base_format, **subject_params),
    groups_elastic=dict(base_format, **subject_params),
)


In [50]:
# Download every endpoint and keep the resulting table under the endpoint
# name without its "_elastic" suffix (e.g. "outputs_elastic" -> "outputs").
pk_data = {}
for query, params in query_params.items():
    name = query[:-len("_elastic")]
    print(f"load {name}")
    pk_data[name] = get_data(f"{entry_point}{query}/", **params)
                                   
    

{'token': '11089428e57df500715342c979d020ef79ba0f81'}
load outputs
load interventions
load individuals
load groups


In [51]:
# preprocess: the function to apply to each table, keyed by table name
preprocess_params = dict(
    outputs=preprocess_outputs,
    interventions=preprocess_interventions,
    individuals=preprocess_individuals,
    groups=preprocess_groups,
)


In [52]:
# Replace every table in pk_data by its preprocessed version.
# NOTE: this overwrites pk_data in place, so running the cell a second time
# applies the preprocessing to already preprocessed tables.
for data_type, preprocess in preprocess_params.items():
    pk_data[data_type] = preprocess(pk_data[data_type])
                                   

None
0        38
1        38
2        38
3        38
4        38
       ... 
3947    142
3948    142
3949    142
3950    142
3951    142
Name: count, Length: 3952, dtype: int64


In [55]:
# Hand the preprocessed tables to merge (imported from utils); every entry of
# pk_data is passed as a keyword argument named after its key.
merged_outputs = merge(**pk_data)

In [56]:
merged_outputs

Unnamed: 0,study,pk,tissue,unit,measurement_type,substance,interventions,calculated,raw_pk,access,...,"(lean body mass, unit)","(cyp2d6 variant, choice)","(cyp2d6 variant, count)","(fat weight, count)","(fat weight, mean)","(fat weight, se)","(fat weight, unit)","(circadian status, choice)","(circadian status, count)",inferred
0,Benowitz2003,14371,plasma,liter / hour,clearance,caffeine,458,True,14363,public,...,,,,,,,,,,False
1,Benowitz2003,14264,plasma,liter / hour,clearance,caffeine,458,False,14259,public,...,,,,,,,,,,False
2,Benowitz2003,14265,plasma,liter / hour / kilogram,clearance,caffeine,458,False,14260,public,...,,,,,,,,,,False
3,Beach1986,13628,plasma,liter / hour,clearance,caffeine,442,False,13418,public,...,,,,,,,,,,False
4,Beach1986,13642,plasma,liter / hour,clearance,caffeine,442,False,13432,public,...,,,,,,,,,,False
5,Beach1986,13644,plasma,liter / hour,clearance,caffeine,442,False,13434,public,...,,,,,,,,,,False
6,Beach1986,13648,plasma,liter / hour,clearance,caffeine,442,False,13438,public,...,,,,,,,,,,False
7,Beach1986,13650,plasma,liter / hour,clearance,caffeine,442,False,13440,public,...,,,,,,,,,,False
8,Beach1986,13876,plasma,liter / hour,clearance,caffeine,442,False,13831,public,...,,,,,,,,,,False
9,Beach1986,13931,plasma,liter / hour,clearance,caffeine,442,True,13923,public,...,,,,,,,,,,False


In [57]:
# Rows whose sex is "M" get the oral contraceptives choice "N".
male_rows = merged_outputs[("sex", "choice")] == "M"
merged_outputs.loc[male_rows, ("oral contraceptives", "choice")] = "N"


In [61]:
# Number of rows per combination of output unit and intervention unit.
# display() is required because this is not the last expression of the cell;
# without it the table is computed and silently discarded.
display(
    merged_outputs
    .groupby(["unit", "unit_intervention"])
    .size()
    .to_frame("count")
)

def filter_out(data, unit_field, units):
    """
    Remove the rows whose value in one column is among the given values.

    :param data: DataFrame to filter.
    :param unit_field: name of the column that is checked.
    :param units: values to exclude.
    :return: the rows of data whose unit_field value is not in units.
    """
    is_excluded = data[unit_field].isin(units)
    return data[~is_excluded]

# Drop the outputs reported in "milliliter / meter ** 2 / minute".
merged_outputs = filter_out(
    data=merged_outputs,
    unit_field="unit",
    units=["milliliter / meter ** 2 / minute"],
)

In [62]:
merged_outputs

Unnamed: 0,study,pk,tissue,unit,measurement_type,substance,interventions,calculated,raw_pk,access,...,"(lean body mass, unit)","(cyp2d6 variant, choice)","(cyp2d6 variant, count)","(fat weight, count)","(fat weight, mean)","(fat weight, se)","(fat weight, unit)","(circadian status, choice)","(circadian status, count)",inferred
0,Benowitz2003,14371,plasma,liter / hour,clearance,caffeine,458,True,14363,public,...,,,,,,,,,,False
1,Benowitz2003,14264,plasma,liter / hour,clearance,caffeine,458,False,14259,public,...,,,,,,,,,,False
2,Benowitz2003,14265,plasma,liter / hour / kilogram,clearance,caffeine,458,False,14260,public,...,,,,,,,,,,False
3,Beach1986,13628,plasma,liter / hour,clearance,caffeine,442,False,13418,public,...,,,,,,,,,,False
4,Beach1986,13642,plasma,liter / hour,clearance,caffeine,442,False,13432,public,...,,,,,,,,,,False
5,Beach1986,13644,plasma,liter / hour,clearance,caffeine,442,False,13434,public,...,,,,,,,,,,False
6,Beach1986,13648,plasma,liter / hour,clearance,caffeine,442,False,13438,public,...,,,,,,,,,,False
7,Beach1986,13650,plasma,liter / hour,clearance,caffeine,442,False,13440,public,...,,,,,,,,,,False
8,Beach1986,13876,plasma,liter / hour,clearance,caffeine,442,False,13831,public,...,,,,,,,,,,False
9,Beach1986,13931,plasma,liter / hour,clearance,caffeine,442,True,13923,public,...,,,,,,,,,,False
