In [5]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from IPython.display import display

# install pkdb_analysis via:
#     git clone https://github.com/matthiaskoenig/pkdb_analysis (private)
#     cd pkdb_analysis
#     pip install -e .
# in future:
#     pip install pkdb_analysis

from pkdb_analysis import PKDB, PKData, PKFilter
from pkdb_analysis.test import TESTDATA_CONCISE_FALSE_ZIP


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
 import sys
 print(sys.executable)
 print(sys.version)
 print(sys.version_info)


/home/janek/.virtualenvs/pkdb_analysis/bin/python
3.7.9 (default, Aug 18 2020, 02:07:21) 
[GCC 9.3.0]
sys.version_info(major=3, minor=7, micro=9, releaselevel='final', serial=0)


# Filter data
A recurring task is to filter the data to answer a specific question, e.g., to compare two groups or to get the subset of data for all healthy smokers.

We work again with our test data set and will filter various subsets from it.

In [7]:
#test_data = PKDB.query()
#TEST_HDF5 ="./test_data.hdf5"

test_data = PKData.from_archive(path=TESTDATA_CONCISE_FALSE_ZIP)

In [8]:
# NOTE(review): `_concise` is underscore-prefixed (private by Python convention) and its
# return value is discarded, so it presumably updates `test_data` in place — confirm, and
# prefer a public equivalent if one exists.
test_data._concise()
print(test_data)

------------------------------
PKData (139906405909968)
------------------------------
studies           435  (  435)
groups            893  ( 8055)
individuals      6014  (55561)
interventions    1175  ( 1806)
outputs         71501  (71501)
timecourses       419  (  419)
------------------------------


In [9]:
# Keep only the studies whose `licence` entry equals "open".
# NOTE(review): in the summary printed by the next cell only the `studies` count changes;
# confirm whether `filter_study` is expected to also reduce groups, interventions and outputs.
test_data1 =  test_data.filter_study(lambda x: x["licence"] == "open")

In [10]:
print(test_data1)

------------------------------
PKData (139906390773136)
------------------------------
studies            57  (   57)
groups            893  ( 8055)
individuals      6014  (55561)
interventions    1175  ( 1806)
outputs         71501  (71501)
timecourses       419  (  419)
------------------------------


In [11]:
list(test_data.study_sids)[:10]

['PKDB00107',
 'PKDB00028',
 'PKDB00026',
 'PKDB00131',
 'PKDB00032',
 'PKDB00277',
 'PKDB00270',
 'PKDB00150',
 'PKDB00176',
 'PKDB00339']

## Filter functions
The main principle for filtering `PKData` is by using the `filter_*` and `exclude_*` functionality.

A key concept here is the filter function, which returns a logical index for a given DataFrame.

Depending on which subset of the information the filter should be applied to, filters exist for `groups`, `individuals`, `subjects` (groups and individuals), `outputs` and `timecourses`.

## Filter by `study_sid`
A first example is filtering by `study_sid`, i.e. we only want the subset of data from a single study.
An overview of the existing study sids in the dataset is available via `test_data.study_sids` (see the cell above).

Filtering works by providing filter/selection functions which select a subset of the data.
The filters are written on the individual tables of the data set (e.g. `groups`, `individuals`); the example below filters on the interventions.

In [12]:
def is_PKDB99999(d):
    """Filter for one specific study_sid.

    Note that, despite the function name, the study_sid compared against
    is "PKDB00198".

    :param d: object exposing a `study_sid` attribute (e.g. a DataFrame column)
    :return: result of comparing `d.study_sid` with "PKDB00198"
    """
    target_sid = "PKDB00198"
    return d.study_sid == target_sid

data = test_data.filter_intervention(is_PKDB99999)
print(data)

------------------------------
PKData (139906390686736)
------------------------------
studies             1  (    1)
groups              4  (   35)
individuals        46  (  400)
interventions       1  (    1)
outputs           147  (  147)
timecourses         1  (    1)
------------------------------


The PKData now only contains data for the given study_sid:

In [13]:
print(data.study_sids)

{'PKDB00198'}


In [14]:
# for instance interventions
display(data.interventions)

Unnamed: 0.1,intervention_pk,Unnamed: 0,study_sid,study_name,raw_pk,normed,name,route,form,application,...,substance,value,mean,median,min,max,sd,se,cv,unit
1781,1153,224,PKDB00198,Abernethy1982,3517,True,paracetamol_iv,iv,solution,single dose,...,paracetamol,0.65,,,,,,,,gram


      intervention_pk  Unnamed: 0  study_sid     study_name  raw_pk  normed  \
1781             1153         224  PKDB00198  Abernethy1982    3517    True   

                name route      form  application  ...    substance value  \
1781  paracetamol_iv    iv  solution  single dose  ...  paracetamol  0.65   

      mean median   min   max    sd    se    cv  unit  
1781  None   None  None  None  None  None  None  gram  

[1 rows x 25 columns]

One could also define this as a simple lambda function

In [15]:
data = test_data.filter_intervention(lambda d: d.study_sid == "PKDB00198")
print(data)

------------------------------
PKData (139906390798224)
------------------------------
studies             1  (    1)
groups              4  (   35)
individuals        46  (  400)
interventions       1  (    1)
outputs           147  (  147)
timecourses         1  (    1)
------------------------------


## Concise data
All operations on `PKData` leave the data in a consistent state.
E.g., if an intervention is filtered out, all outputs using this intervention are filtered out as well.
This behavior is controlled by the `concise` flag on most operations.

In [16]:
t1 = test_data.filter_intervention(is_PKDB99999)
t2 = test_data.filter_intervention(is_PKDB99999, concise=False)
print(t1)
print(t2)

------------------------------
PKData (139906381251280)
------------------------------
studies             1  (    1)
groups              4  (   35)
individuals        46  (  400)
interventions       1  (    1)
outputs           147  (  147)
timecourses         1  (    1)
------------------------------
------------------------------
PKData (139906390513168)
------------------------------
studies           435  (  435)
groups            893  ( 8055)
individuals      6014  (55561)
interventions       1  (    1)
outputs         71501  (71501)
timecourses       419  (  419)
------------------------------


In [17]:
# FIXME: only normed data
t1.interventions_mi

Unnamed: 0_level_0,Unnamed: 0,study_sid,study_name,raw_pk,normed,name,route,form,application,time,...,substance,value,mean,median,min,max,sd,se,cv,unit
intervention_pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1153,224,PKDB00198,Abernethy1982,3517,True,paracetamol_iv,iv,solution,single dose,0,...,paracetamol,0.65,,,,,,,,gram


                 Unnamed: 0  study_sid     study_name  raw_pk  normed  \
intervention_pk                                                         
1153                    224  PKDB00198  Abernethy1982    3517    True   

                           name route      form  application time  ...  \
intervention_pk                                                    ...   
1153             paracetamol_iv    iv  solution  single dose    0  ...   

                   substance value  mean median   min   max    sd    se    cv  \
intervention_pk                                                                 
1153             paracetamol  0.65  None   None  None  None  None  None  None   

                 unit  
intervention_pk        
1153             gram  

[1 rows x 24 columns]

In [18]:
t2.interventions_mi

Unnamed: 0_level_0,Unnamed: 0,study_sid,study_name,raw_pk,normed,name,route,form,application,time,...,substance,value,mean,median,min,max,sd,se,cv,unit
intervention_pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1153,224,PKDB00198,Abernethy1982,3517,True,paracetamol_iv,iv,solution,single dose,0,...,paracetamol,0.65,,,,,,,,gram


                 Unnamed: 0  study_sid     study_name  raw_pk  normed  \
intervention_pk                                                         
1153                    224  PKDB00198  Abernethy1982    3517    True   

                           name route      form  application time  ...  \
intervention_pk                                                    ...   
1153             paracetamol_iv    iv  solution  single dose    0  ...   

                   substance value  mean median   min   max    sd    se    cv  \
intervention_pk                                                                 
1153             paracetamol  0.65  None   None  None  None  None  None  None   

                 unit  
intervention_pk        
1153             gram  

[1 rows x 24 columns]

In [19]:
t2.outputs

Unnamed: 0.1,output_pk,intervention_pk,Unnamed: 0,study_name,measurement_type,tissue,sd,se,min,group_pk,...,max,substance,label,individual_pk,unit,cv,median,mean,time,choice
0,29987,0,31210,Chiew2010,cmax,plasma,,0.021276,,312,...,,paracetamol,paracetamol,-1,gram / liter,0.746605,,0.085492,,
1,29988,0,32535,Chiew2010,cmax,plasma,,0.007244,,312,...,,paracetamol glucuronide,paracetamol,-1,gram / liter,0.198094,,0.109713,,
2,29989,0,31213,Chiew2010,cmax,plasma,,0.002045,,312,...,,paracetamol sulfate,paracetamol,-1,gram / liter,0.276726,,0.022172,,
3,29990,0,33656,Chiew2010,tmax,plasma,,0.404200,,312,...,,paracetamol,paracetamol,-1,hour,0.873002,,1.389000,,
4,29991,0,29717,Chiew2010,tmax,plasma,,0.175700,,312,...,,paracetamol glucuronide,paracetamol,-1,hour,0.148228,,3.556000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71496,186700,1174,99176,Mohammed1993,cmax,plasma,,,,1852,...,,codeine,,-1,gram / liter,,,0.000143,,
71497,186701,1174,99167,Mohammed1993,kel,plasma,,,,1852,...,,codeine,,-1,1 / minute,,,0.004981,,
71498,186702,1174,99156,Mohammed1993,thalf,plasma,,,,1852,...,,codeine,,-1,hour,,,2.319383,,
71499,186703,1174,99191,Mohammed1993,tmax,plasma,,,,1852,...,,codeine,,-1,hour,,,0.750000,,


       output_pk  intervention_pk  Unnamed: 0    study_name measurement_type  \
0          29987                0       31210     Chiew2010             cmax   
1          29988                0       32535     Chiew2010             cmax   
2          29989                0       31213     Chiew2010             cmax   
3          29990                0       33656     Chiew2010             tmax   
4          29991                0       29717     Chiew2010             tmax   
...          ...              ...         ...           ...              ...   
71496     186700             1174       99176  Mohammed1993             cmax   
71497     186701             1174       99167  Mohammed1993              kel   
71498     186702             1174       99156  Mohammed1993            thalf   
71499     186703             1174       99191  Mohammed1993             tmax   
71500     186704             1174       99285  Mohammed1993            vd_ss   

       tissue  sd        se  min  group

## 2 Query groups and individuals
### 2.1 Get data for groups with characteristica/keywords X
Examples: healthy=True, smoking=N, disease=None,
both as individual queries and in combination.


In [20]:
def is_healthy(d):
    """Select rows where the health status is reported and answered with yes.

    :param d: table with `measurement_type` and `choice` columns
    :return: element-wise AND of `measurement_type == "healthy"` and `choice == "Y"`
    """
    reports_healthy = d.measurement_type == "healthy"
    answered_yes = d.choice == "Y"
    return reports_healthy & answered_yes

def disease(d):
    """Select rows that report any disease.

    :param d: table with a `measurement_type` column
    :return: comparison of `measurement_type` with "disease"
    """
    measurement = d.measurement_type
    return measurement == "disease"

def smoking(d):
    """Select rows for which a smoking status was curated.

    The status itself is not inspected here (it could be Y/N/NR).

    :param d: table with a `measurement_type` column
    :return: comparison of `measurement_type` with "smoking"
    """
    measurement = d.measurement_type
    return measurement == "smoking"

def nonsmoker(d):
    """Select rows where smoking is reported and answered with no.

    :param d: table with `measurement_type` and `choice` columns
    :return: `smoking(d)` combined (AND) with `choice == "N"`
    """
    has_smoking_entry = smoking(d)
    answered_no = d.choice == "N"
    return has_smoking_entry & answered_no

def smoker(d):
    """Select rows where smoking is reported and answered with yes.

    :param d: table with `measurement_type` and `choice` columns
    :return: `smoking(d)` combined (AND) with `choice == "Y"`
    """
    has_smoking_entry = smoking(d)
    answered_yes = d.choice == "Y"
    return has_smoking_entry & answered_yes

In [23]:
test_data = PKData.from_archive(TESTDATA_CONCISE_FALSE_ZIP)

`f_idx` can be a single function or a list of functions. A list of functions is applied successively, which is equivalent to "AND" logic. "OR" logic can be applied directly on the index.

In [24]:
healthy_nonsmoker = test_data.filter_subject(f_idx=[is_healthy, nonsmoker])
print(healthy_nonsmoker)
healthy_nonsmoker.groups_mi

------------------------------
PKData (139906403306320)
------------------------------
studies           162  (  162)
groups            256  ( 2658)
individuals      2135  (23075)
interventions     384  (  591)
outputs         22760  (22760)
timecourses       137  (  137)
------------------------------


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,study_name,study_sid,measurement_type,group_count,group_name,max,substance,count,group_parent_pk,sd,unit,se,min,cv,median,mean,choice,value
group_pk,characteristica_pk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
331,6701,4488,Depre1992,PKDB00212,species,12,study one,,,18,330,,,,,,,,homo sapiens,
331,6702,4489,Depre1992,PKDB00212,healthy,12,study one,,,18,330,,,,,,,,Y,
331,6703,4490,Depre1992,PKDB00212,sex,12,study one,,,18,330,,,,,,,,M,
331,6704,4484,Depre1992,PKDB00212,overnight fast,12,study one,,,18,330,,,,,,,,Y,
331,6708,4485,Depre1992,PKDB00212,age,12,study one,25.0,,12,330,,year,,21.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1847,37345,11586,Caraco1999,10381807,sex,8,chinese,,,18,1845,,,,,,,,M,
1847,37346,11587,Caraco1999,10381807,smoking,8,chinese,,,18,1845,,,,,,,,N,
1847,37356,11588,Caraco1999,10381807,weight,8,chinese,,,8,1845,,kilogram,3.7,,,,66.1,,
1847,37357,11589,Caraco1999,10381807,age,8,chinese,,,8,1845,,year,2.4,,,,31.3,,


                             Unnamed: 0  study_name  study_sid  \
group_pk characteristica_pk                                      
331      6701                      4488   Depre1992  PKDB00212   
         6702                      4489   Depre1992  PKDB00212   
         6703                      4490   Depre1992  PKDB00212   
         6704                      4484   Depre1992  PKDB00212   
         6708                      4485   Depre1992  PKDB00212   
...                                 ...         ...        ...   
1847     37345                    11586  Caraco1999   10381807   
         37346                    11587  Caraco1999   10381807   
         37356                    11588  Caraco1999   10381807   
         37357                    11589  Caraco1999   10381807   
         37358                    11590  Caraco1999   10381807   

                            measurement_type  group_count group_name   max  \
group_pk characteristica_pk                                    

Attributes are often mixed within a group, so we have to exclude the opposite values.
In the example, the group `20` consists of 5 smokers and 1 nonsmoker, so smoking is "N" for only a subset of the group.
We can exclude such groups via `exclude_subject`:

In [25]:
healthy_nonsmoker = test_data.filter_subject([is_healthy, nonsmoker]).exclude_subject([smoker])
print(healthy_nonsmoker)
display(healthy_nonsmoker.groups_mi)

------------------------------
PKData (139906396201872)
------------------------------
studies           135  (  135)
groups            182  ( 1781)
individuals      1143  (11368)
interventions     317  (  504)
outputs         17141  (17141)
timecourses       111  (  111)
------------------------------


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,study_name,study_sid,measurement_type,group_count,group_name,max,substance,count,group_parent_pk,sd,unit,se,min,cv,median,mean,choice,value
group_pk,characteristica_pk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
331,6701,4488,Depre1992,PKDB00212,species,12,study one,,,18,330,,,,,,,,homo sapiens,
331,6702,4489,Depre1992,PKDB00212,healthy,12,study one,,,18,330,,,,,,,,Y,
331,6703,4490,Depre1992,PKDB00212,sex,12,study one,,,18,330,,,,,,,,M,
331,6704,4484,Depre1992,PKDB00212,overnight fast,12,study one,,,18,330,,,,,,,,Y,
331,6708,4485,Depre1992,PKDB00212,age,12,study one,25.0,,12,330,,year,,21.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1847,37345,11586,Caraco1999,10381807,sex,8,chinese,,,18,1845,,,,,,,,M,
1847,37346,11587,Caraco1999,10381807,smoking,8,chinese,,,18,1845,,,,,,,,N,
1847,37356,11588,Caraco1999,10381807,weight,8,chinese,,,8,1845,,kilogram,3.7,,,,66.1,,
1847,37357,11589,Caraco1999,10381807,age,8,chinese,,,8,1845,,year,2.4,,,,31.3,,


                             Unnamed: 0  study_name  study_sid  \
group_pk characteristica_pk                                      
331      6701                      4488   Depre1992  PKDB00212   
         6702                      4489   Depre1992  PKDB00212   
         6703                      4490   Depre1992  PKDB00212   
         6704                      4484   Depre1992  PKDB00212   
         6708                      4485   Depre1992  PKDB00212   
...                                 ...         ...        ...   
1847     37345                    11586  Caraco1999   10381807   
         37346                    11587  Caraco1999   10381807   
         37356                    11588  Caraco1999   10381807   
         37357                    11589  Caraco1999   10381807   
         37358                    11590  Caraco1999   10381807   

                            measurement_type  group_count group_name   max  \
group_pk characteristica_pk                                    

In addition, combinations of attributes often have to be used to find the correct subjects.
For instance, a combination of `healthy` being reported and no `disease` being reported:

In [26]:
def is_healthy(d):
    """Select rows where the health status is reported and answered with yes.

    NOTE: a function with the same name and logic is already defined earlier
    in this notebook; this definition replaces it when the cell runs.

    :param d: table with `measurement_type` and `choice` columns
    :return: element-wise AND of `measurement_type == "healthy"` and `choice == "Y"`
    """
    reports_healthy = d.measurement_type == "healthy"
    answered_yes = d.choice == "Y"
    return reports_healthy & answered_yes

def disease(d):
    """Select rows that report any disease.

    NOTE: a function with the same name and logic is already defined earlier
    in this notebook; this definition replaces it when the cell runs.

    :param d: table with a `measurement_type` column
    :return: comparison of `measurement_type` with "disease"
    """
    measurement = d.measurement_type
    return measurement == "disease"

# Three ways of selecting "healthy" subjects from the same data set.
# 1) subjects for which healthy is reported as "Y"
healthy1 = test_data.filter_subject(is_healthy)
# 2) subjects without any reported disease
healthy2 = test_data.exclude_subject(disease)
# 3) both criteria combined
healthy3 = (
    test_data
    .filter_subject(is_healthy)
    .exclude_subject(disease)
)

# one summary per line, in the order 1), 2), 3)
print(healthy1, healthy2, healthy3, sep="\n")

------------------------------
PKData (139906407679376)
------------------------------
studies           403  (  403)
groups            736  ( 6669)
individuals      5015  (45092)
interventions    1067  ( 1676)
outputs         61264  (61264)
timecourses       386  (  386)
------------------------------
------------------------------
PKData (139906407679248)
------------------------------
studies           418  (  418)
groups            777  ( 6969)
individuals      5109  (45738)
interventions    1099  ( 1716)
outputs         63211  (63211)
timecourses       399  (  399)
------------------------------
------------------------------
PKData (139906396183568)
------------------------------
studies           401  (  401)
groups            724  ( 6528)
individuals      4972  (44703)
interventions    1060  ( 1665)
outputs         60645  (60645)
timecourses       383  (  383)
------------------------------


## 3 Query interventions
### 3.1 Get outputs/timecourses for intervention with substance
intervention with measurement_type "dosing" and substance "caffeine"

In [27]:
def dosing_and_caffeine(d):
    """Select rows describing a dosing of caffeine.

    :param d: table with `measurement_type` and `substance` columns
    :return: element-wise AND of `measurement_type == "dosing"` and
        `substance == "caffeine"`
    """
    is_dosing = d["measurement_type"] == "dosing"
    is_caffeine = d["substance"] == "caffeine"
    return is_dosing & is_caffeine

In [29]:
test_data = PKData.from_archive(TESTDATA_CONCISE_FALSE_ZIP)

In [30]:
caffeine_data = test_data.filter_intervention(dosing_and_caffeine)

In [31]:
print(caffeine_data)

------------------------------
PKData (139906405759824)
------------------------------
studies            80  (   80)
groups            132  ( 1292)
individuals       978  ( 9717)
interventions     153  (  261)
outputs          9833  ( 9833)
timecourses        60  (   60)
------------------------------


## 4 Query outputs/timecourses
### 4.1 query by measurement_type
filter all outputs with measurement_type auc_inf

In [33]:
def is_auc_inf(d):
    """Select rows whose measurement type is "auc_inf".

    :param d: table with a `measurement_type` column
    :return: comparison of `measurement_type` with "auc_inf"
    """
    measurement = d["measurement_type"]
    return measurement == "auc_inf"

test_data = PKData.from_archive(TESTDATA_CONCISE_FALSE_ZIP)

test_data = test_data.filter_output(is_auc_inf).delete_timecourses()
print(test_data)

------------------------------
PKData (139906386474896)
------------------------------
studies           308  (  308)
groups            465  ( 4496)
individuals       813  ( 8434)
interventions     692  (  996)
outputs          3418  ( 3418)
timecourses         0  (    0)
------------------------------


## 5 Other queries
### 5.1 Complex
Get the clearance of codeine for all subjects that have been phenotyped for CYP2D6.


In [36]:
def is_cyp2d6_phenotyped(d):
    """Select rows that report a CYP2D6 phenotyping measurement.

    A row matches if its measurement type is "metabolic phenotype" or
    "metabolic ratio" and its substance is one of the ratios listed below.

    :param d: table with `measurement_type` and `substance` columns
    :return: element-wise AND of both membership tests
    """
    phenotype_types = ["metabolic phenotype", "metabolic ratio"]
    probe_ratios = ['spar/(2hspar+5hspar)', 'deb/4hdeb', 'dtf/dmt']

    type_matches = d["measurement_type"].isin(phenotype_types)
    substance_matches = d["substance"].isin(probe_ratios)
    return type_matches & substance_matches

def codeine_clearance(d):
    """Select rows reporting the clearance of codeine.

    :param d: table with `measurement_type` and `substance` columns
    :return: element-wise AND of `measurement_type == "clearance"` and
        `substance == "codeine"`
    """
    is_clearance = d["measurement_type"] == "clearance"
    is_codeine = d["substance"] == "codeine"
    return is_clearance & is_codeine

In [37]:
test_data = PKData.from_archive(TESTDATA_CONCISE_FALSE_ZIP)

phenotyped_data = test_data.filter_output(is_cyp2d6_phenotyped)

In [38]:
# Replace the subjects of the freshly loaded data set with those of `phenotyped_data`
# by overwriting the `groups` and `individuals` attributes; this must happen before the
# filter below is applied.
test_data.groups = phenotyped_data.groups
test_data.individuals = phenotyped_data.individuals
# Keep only the codeine clearance outputs and drop the timecourses.
# NOTE(review): `test_data` is rebound to the filtered result here; the full archive is
# only available again after re-running the `PKData.from_archive` cell above.
test_data = test_data.filter_output(codeine_clearance).delete_timecourses()

In [39]:
print(test_data)

------------------------------
PKData (139906409691088)
------------------------------
studies             3  (    3)
groups              5  (   41)
individuals        14  (   98)
interventions       3  (    3)
outputs            19  (   19)
timecourses         0  (    0)
------------------------------
