In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from IPython.display import display

# install pkdb_analysis via:
#     git clone https://github.com/matthiaskoenig/pkdb_analysis
#     cd pkdb_analysis
#     pip install -e .

from pkdb_analysis import PKDB, PKData, PKFilter

# Filter data
A recurring task is to filter data for a certain question. E.g. to compare two groups, or get the subset of data for all healthy smokers.

We work again with our test data set and will filter various subsets from it.

In [2]:
test_data = PKDB.query()
TEST_HDF5 ="./test_data.hdf5"
test_data.to_hdf5(path=TEST_HDF5)

INFO *** Querying data ***
INFO http://0.0.0.0:8000/api/v1/interventions_analysis/?format=json&page_size=2000&normed=true
INFO http://0.0.0.0:8000/api/v1/interventions_analysis/?format=json&page_size=2000&normed=true&page=1
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=1
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=2
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=3
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=4
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=5
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=6
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=7
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=200

In [27]:
print(test_data)

------------------------------
PKData (140541721754704)
------------------------------
studies           221 
groups            606  ( 4857)
individuals      2903  (26380)
interventions     541  (  804)
outputs         19841  (19841)
timecourses      1376  ( 1376)
------------------------------


In [30]:
list(test_data.study_sids)[:10]

['PKDB00104',
 'PKDB00039',
 'PKDB00013',
 'PKDB00056',
 'PKDB00107',
 'PKDB00226',
 'PKDB00175',
 'PKDB00090',
 'PKDB00266',
 '4079279']

## Filter functions
The main principle for filtering `PKData` is by using the `filter_*` and `exclude_*` functionality.

A key principle are hereby filter functions which for a given DataFrame return a logical index.

Depending on which subset of information this should be applied the `groups`, `individuals`, `subjects` (groups and individuals), `outputs` and `timecourses`.

## Filter by `study_sid`
A first example is filtering by `study_sid`, i.e. we only want the subset of data from a single study.
An overview over the existing study sids in the dataset is available via

Filtering a subset of data works by providing filter/selection functions which select a subset of the data.
The filters are written on the `groups`, `individuals`

In [4]:
def is_PKDB99999(d):
    """Filter for specific study_sid. """
    return d.study_sid == "PKDB00198"

data = test_data.filter_intervention(is_PKDB99999)
print(data)



------------------------------
PKData (140541887720592)
------------------------------
studies             1 
groups              4  (   36)
individuals        46  (  414)
interventions       1  (    1)
outputs            92  (   92)
timecourses         4  (    4)
------------------------------


The PKData now only contains data for the given study_sid:

In [5]:
print(data.study_sids)

{'PKDB00198'}


In [6]:
# for instance interventions
display(data.interventions)

Unnamed: 0,intervention_pk,study_sid,study_name,raw_pk,normed,name,route,form,application,time,...,substance,value,mean,median,min,max,sd,se,cv,unit
0,0,PKDB00198,Abernethy1982,377,True,paracetamol_iv,iv,solution,single dose,0.0,...,paracetamol,0.65,,,,,,,,gram


One could also define this as a simple lambda function

In [7]:
data = test_data.filter_intervention(lambda d: d.study_sid == "PKDB00198")
print(data)



------------------------------
PKData (140541885789776)
------------------------------
studies             1 
groups              4  (   36)
individuals        46  (  414)
interventions       1  (    1)
outputs            92  (   92)
timecourses         4  (    4)
------------------------------


## Concise data
All operations on `PKData` leave the data in a consistent manner. 
E.g. if an intervention is filtered out also all the outputs using this intervention are filtered out.
This behavior is controlled by the `concise` flag on most operations.

In [8]:
t1 = test_data.filter_intervention(is_PKDB99999)
t2 = test_data.filter_intervention(is_PKDB99999, concise=False)
print(t1)
print(t2)



------------------------------
PKData (140541887727632)
------------------------------
studies             1 
groups              4  (   36)
individuals        46  (  414)
interventions       1  (    1)
outputs            92  (   92)
timecourses         4  (    4)
------------------------------
------------------------------
PKData (140541891193360)
------------------------------
studies           221 
groups            606  ( 4857)
individuals      2903  (26380)
interventions       1  (    1)
outputs         19841  (19841)
timecourses      1376  ( 1376)
------------------------------


In [9]:
# FIXME: only normed data
t1.interventions_mi

Unnamed: 0_level_0,study_sid,study_name,raw_pk,normed,name,route,form,application,time,time_end,...,substance,value,mean,median,min,max,sd,se,cv,unit
intervention_pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,PKDB00198,Abernethy1982,377,True,paracetamol_iv,iv,solution,single dose,0.0,,...,paracetamol,0.65,,,,,,,,gram


In [10]:
t2.interventions_mi

Unnamed: 0_level_0,study_sid,study_name,raw_pk,normed,name,route,form,application,time,time_end,...,substance,value,mean,median,min,max,sd,se,cv,unit
intervention_pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,PKDB00198,Abernethy1982,377,True,paracetamol_iv,iv,solution,single dose,0.0,,...,paracetamol,0.65,,,,,,,,gram


In [11]:
t2.outputs

Unnamed: 0,output_pk,intervention_pk,study_sid,study_name,group_pk,individual_pk,normed,calculated,tissue,time,...,substance,value,mean,median,min,max,sd,se,cv,unit
0,9333,0,PKDB00198,Abernethy1982,183,-1,True,False,plasma,,...,paracetamol,,2.5500,,1.9900,3.4700,,,,hour
1,9334,0,PKDB00198,Abernethy1982,183,-1,True,False,plasma,,...,paracetamol,,108.5000,,62.2000,151.4000,,,,liter
2,9335,0,PKDB00198,Abernethy1982,183,-1,True,False,plasma,,...,paracetamol,,0.8100,,0.5300,1.3100,,,,liter / kilogram
3,9336,0,PKDB00198,Abernethy1982,183,-1,True,False,plasma,,...,paracetamol,,29.0400,,19.4400,38.7600,,,,liter / hour
4,9337,0,PKDB00198,Abernethy1982,183,-1,True,False,plasma,,...,paracetamol,,0.2244,,0.1452,0.3156,,,,liter / hour / kilogram
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19836,50554,534,PKDB00114,Yue1991,-1,3643,True,True,plasma,,...,normorphine,0.000071,,,,,,,,gram * hour / liter
19837,50555,534,PKDB00114,Yue1991,-1,3643,True,True,plasma,,...,normorphine,0.000007,,,,,,,,gram / liter
19838,50556,534,PKDB00114,Yue1991,-1,3643,True,True,plasma,,...,normorphine,0.001288,,,,,,,,1 / minute
19839,50557,534,PKDB00114,Yue1991,-1,3643,True,True,plasma,,...,normorphine,8.966857,,,,,,,,hour


## Query groups and individuals
### 2.1 Get data for groups with characteristica/keywords X
healthy=True, smoking=N, disease=None,
individual queries and combinations.


In [12]:
def is_healthy(d): 
    # healthy is reported and True
    return (d.measurement_type == "healthy") & (d.choice == "Y")

def disease(d):
    # any disease is reported
    return  d.measurement_type == "disease"

def smoking(d):
    # smoking status is curated for study (this could by Y/N/NR)
    return  d.measurement_type == "smoking"

def nonsmoker(d):
    # smoking is reported and no
    return smoking(d) & (d.choice == "N")

def smoker(d):
    # smoking is reported and yes
    return smoking(d) & (d.choice == "Y")

In [13]:
test_data = PKData.from_hdf5(TEST_HDF5)

`f_idx` can be a single function, or a list of functions. A list of functions are applied successively and is equivalent to "AND logic". "OR logic" can be directly applied on the index.

In [14]:
healthy_nonsmoker = test_data.filter_subject(f_idx=[is_healthy, nonsmoker])
print(healthy_nonsmoker)
healthy_nonsmoker.groups_mi



------------------------------
PKData (140541914544208)
------------------------------
studies            90 
groups            151  ( 1507)
individuals      1075  (11006)
interventions     183  (  291)
outputs          7246  ( 7246)
timecourses       450  (  450)
------------------------------


Unnamed: 0_level_0,Unnamed: 1_level_0,study_sid,study_name,group_name,group_count,group_parent_pk,count,measurement_type,choice,substance,value,mean,median,min,max,sd,se,cv,unit
group_pk,characteristica_pk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
183,3171,PKDB00198,Abernethy1982,obese men,7,182,42,species,homo sapiens,,,,,,,,,,
183,3172,PKDB00198,Abernethy1982,obese men,7,182,42,healthy,Y,,,,,,,,,,
183,3177,PKDB00198,Abernethy1982,obese men,7,182,21,obesity index,,,,,,133.0,,,,,percent
183,3178,PKDB00198,Abernethy1982,obese men,7,182,21,weight (categorial),obese,,,,,,,,,,
183,3184,PKDB00198,Abernethy1982,obese men,7,182,7,sex,M,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792,15378,PKDB00114,Yue1991,all,8,-1,1,smoking,Y,,,,,,,,,,
792,15379,PKDB00114,Yue1991,all,8,-1,7,smoking,N,,,,,,,,,,
792,15380,PKDB00114,Yue1991,all,8,-1,1,smoking amount (cigarettes),,,,10.000,,,,,,,1 / day
792,15381,PKDB00114,Yue1991,all,8,-1,8,age,,,,34.375,,,,5.47,,,year


Often attributes are mixed for groups so we have to exclude the opposites.
In the example, the group `20` consists of 5 smokers and 1 nonsmoker. So for a subset of the group smoking is No.
We can exclude groups via

In [15]:
healthy_nonsmoker = test_data.filter_subject([is_healthy, nonsmoker]).exclude_subject([smoker])
print(healthy_nonsmoker)
display(healthy_nonsmoker.groups_mi)



------------------------------
PKData (140541914916368)
------------------------------
studies            77 
groups            114  ( 1081)
individuals       627  ( 5805)
interventions     157  (  258)
outputs          6012  ( 6012)
timecourses       389  (  389)
------------------------------


Unnamed: 0_level_0,Unnamed: 1_level_0,study_sid,study_name,group_name,group_count,group_parent_pk,count,measurement_type,choice,substance,value,mean,median,min,max,sd,se,cv,unit
group_pk,characteristica_pk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
191,3444,PKDB00160,Adithan1982,all,10,-1,10,species,homo sapiens,,,,,,,,,,
191,3445,PKDB00160,Adithan1982,all,10,-1,10,sex,M,,,,,,,,,,
191,3446,PKDB00160,Adithan1982,all,10,-1,10,age,,,,21.2,,,,,0.47,,year
191,3447,PKDB00160,Adithan1982,all,10,-1,10,weight,,,,61.6,,,,,1.94,,kilogram
191,3448,PKDB00160,Adithan1982,all,10,-1,10,healthy,Y,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
789,15322,PKDB00112,Yue1989a,Chinese_M_NONSMOKE,84,780,133,weight,,,,63.0,,42.0,90.0,9.0,,,kilogram
789,15325,PKDB00112,Yue1989a,Chinese_M_NONSMOKE,84,780,132,oral contraceptives,N,,,,,,,,,,
789,15326,PKDB00112,Yue1989a,Chinese_M_NONSMOKE,84,780,1,oral contraceptives,Y,,,,,,,,,,
789,15332,PKDB00112,Yue1989a,Chinese_M_NONSMOKE,84,780,110,sex,M,,,,,,,,,,


In addition often combinations of attributes have to be used to find the correct subjects.
For instance a combination of `healthy` and reported `disease`

In [16]:
def is_healthy(d): 
    # healthy is reported and True
    return (d.measurement_type == "healthy") & (d.choice == "Y")

def disease(d):
    # any disease is reported
    return  d.measurement_type == "disease"

healthy1 = test_data.filter_subject(is_healthy)
healthy2 = test_data.exclude_subject(disease)
healthy3 = test_data.filter_subject(is_healthy).exclude_subject(disease)

print(healthy1)
print(healthy2)
print(healthy3)



------------------------------
PKData (140541898257104)
------------------------------
studies           198 
groups            334  ( 2935)
individuals      2164  (19140)
interventions     484  (  741)
outputs         17164  (17164)
timecourses      1220  ( 1220)
------------------------------
------------------------------
PKData (140541914805264)
------------------------------
studies           207 
groups            366  ( 3164)
individuals      2249  (19806)
interventions     503  (  760)
outputs         18416  (18416)
timecourses      1312  ( 1312)
------------------------------
------------------------------
PKData (140541914927504)
------------------------------
studies           197 
groups            334  ( 2935)
individuals      2144  (18991)
interventions     483  (  736)
outputs         17086  (17086)
timecourses      1220  ( 1220)
------------------------------


## 3 Query interventions
### 3.1 Get outputs/timecourses for intervention with substance
intervention with measurement_type "dosing" and substance "caffeine"

In [17]:
def dosing_and_caffeine(d):
    return ((d["measurement_type"]=="dosing") & (d["substance"]=="caffeine"))

In [18]:
test_data = PKData.from_hdf5(TEST_HDF5)

In [19]:
caffeine_data = test_data.filter_intervention(dosing_and_caffeine)



In [20]:
print(caffeine_data)

------------------------------
PKData (140541760368528)
------------------------------
studies            63 
groups            102  (  954)
individuals       726  ( 6435)
interventions     110  (  207)
outputs          5747  ( 5747)
timecourses       330  (  330)
------------------------------


## 4 Query outputs/timecourses
### 4.1 query by measurement_type
filter all outputs with measurement_type auc_inf

In [21]:
def is_auc_inf(d):
    return (d["measurement_type"]=="auc_inf")  

test_data = PKData.from_hdf5(TEST_HDF5)

test_data = test_data.filter_output(is_auc_inf).delete_timecourses()
print(test_data)



------------------------------
PKData (140541761933392)
------------------------------
studies           155 
groups            200  ( 1713)
individuals       512  ( 4527)
interventions     336  (  529)
outputs          1877  ( 1877)
timecourses         0  (    0)
------------------------------


## 5 Other Query others
### 5.1 Complex
get clearance of codeine for all.h5 subjects, which have been phenotyped for cyp2d6. 


In [22]:
def is_cyp2d6_phenotyped(d):
    cyp2d6_phenotype_substances = ['spar/(2hspar+5hspar)', 'deb/4hdeb', 'dtf/dmt']
    return d["measurement_type"].isin(["metabolic phenotype", "metabolic ratio"]) & d["substance"].isin(cyp2d6_phenotype_substances)

def codeine_clearance(d):
    return (d["measurement_type"]=="clearance") & (d["substance"]=="codeine")                                                        

In [23]:
test_data = PKData.from_hdf5(TEST_HDF5)

phenotyped_data = test_data.filter_output(is_cyp2d6_phenotyped)



In [24]:
test_data.groups = phenotyped_data.groups
test_data.individuals = phenotyped_data.individuals
test_data = test_data.filter_output(codeine_clearance).delete_timecourses()



In [25]:
print(test_data)

------------------------------
PKData (140541897941392)
------------------------------
studies            11 
groups             14  (  138)
individuals        13  (  135)
interventions      13  (   21)
outputs            30  (   30)
timecourses         0  (    0)
------------------------------
