In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from IPython.display import display

from pkdb_analysis import PKData, PKFilter
from pkdb_analysis.tests.constants import TEST_HDF5

# Filter data
A recurring task is to filter data for a specific question, e.g. to compare two groups or to get the subset of data for all healthy smokers.

We work again with our test data set and will filter various subsets from it.

In [2]:
# Load the test dataset from the HDF5 file and print its summary
# (one line per table, as shown in the output below).
test_data = PKData.from_hdf5(TEST_HDF5)
print(test_data)

INFO Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO NumExpr defaulting to 8 threads.


------------------------------
PKData (140231641712848)
------------------------------
studies             4 
groups              8  (   86)
individuals       246  ( 3144)
interventions      18  (   18)
outputs          1064  ( 1106)
timecourses        40  (   46)
------------------------------


## Filter functions
The main principle for filtering `PKData` is by using the `filter_*` and `exclude_*` functionality.

The key building blocks are filter functions, which return a logical (boolean) index for a given DataFrame.

Depending on which subset of the information the filter should be applied to, filters exist for `groups`, `individuals`, `subjects` (groups and individuals), `interventions`, `outputs` and `timecourses`.

## Filter by `study_sid`
A first example is filtering by `study_sid`, i.e. we only want the subset of data from a single study.
An overview of the study sids contained in the dataset is available via `study_sids`:

In [3]:
# Set of study sids present in the dataset
test_data.study_sids

{'PKDB99996', 'PKDB99997', 'PKDB99998', 'PKDB99999'}

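Filtering a subset of data works by providing filter/selection functions which select a subset of the data.
The filter functions are written against one of the underlying tables, e.g. the `groups`, `individuals`, `interventions`, `outputs` or `timecourses`; in the following example the filter is applied to the `interventions`.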

In [4]:
def is_PKDB99999(d):
    """Select the rows that belong to study ``PKDB99999``.

    :param d: table exposing a ``study_sid`` column
    :return: element-wise comparison of ``study_sid`` with "PKDB99999"
    """
    target_sid = "PKDB99999"
    return d.study_sid == target_sid

# Apply the filter function to the interventions; the printed summary shows
# that the other tables are reduced to the selected study as well.
data = test_data.filter_intervention(is_PKDB99999)
print(data)



------------------------------
PKData (140229638208080)
------------------------------
studies             1 
groups              1  (    6)
individuals         6  (   42)
interventions       3  (    3)
outputs           194  (  194)
timecourses         4  (    4)
------------------------------


The PKData now only contains data for the given study_sid:

In [5]:
# Study sids remaining after the filter
print(data.study_sids)

{'PKDB99999'}


In [6]:
# For instance, the interventions table now only holds rows of study PKDB99999
display(data.interventions)

Unnamed: 0,study_sid,study_name,intervention_pk,raw_pk,normed,name,route,form,application,time,...,substance,value,mean,median,min,max,sd,se,cv,unit
8,PKDB99999,Test1,93,91,True,po75,oral,tablet,single dose,0.0,...,midazolam,0.0075,,,,,,,,gram
9,PKDB99999,Test1,94,92,True,po15,oral,tablet,single dose,0.0,...,midazolam,0.015,,,,,,,,gram
11,PKDB99999,Test1,96,95,True,iv,iv,solution,single dose,0.0,...,midazolam,7.5e-05,,,,,,,,gram / kilogram


One could also define this as a simple lambda function

In [7]:
# Same study_sid filter, written inline as a lambda instead of a named function
data = test_data.filter_intervention(lambda d: d.study_sid == "PKDB99999")
print(data)



------------------------------
PKData (140229687749904)
------------------------------
studies             1 
groups              1  (    6)
individuals         6  (   42)
interventions       3  (    3)
outputs           194  (  194)
timecourses         4  (    4)
------------------------------


## Concise data
All operations on `PKData` leave the data in a consistent state. 
E.g. if an intervention is filtered out, all outputs using this intervention are filtered out as well.
This behavior is controlled by the `concise` flag on most operations.

In [8]:
# Default call: the printed summary shows all tables reduced to the selected study.
t1 = test_data.filter_intervention(is_PKDB99999)
# concise=False: the printed summary shows only the interventions table reduced;
# studies, groups, individuals, outputs and timecourses keep their original counts.
t2 = test_data.filter_intervention(is_PKDB99999, concise=False)
print(t1)
print(t2)



------------------------------
PKData (140229544119056)
------------------------------
studies             1 
groups              1  (    6)
individuals         6  (   42)
interventions       3  (    3)
outputs           194  (  194)
timecourses         4  (    4)
------------------------------
------------------------------
PKData (140229543983056)
------------------------------
studies             4 
groups              8  (   86)
individuals       246  ( 3144)
interventions       6  (    6)
outputs          1064  ( 1106)
timecourses        40  (   46)
------------------------------


In [9]:
# FIXME: only normed data
# NOTE(review): presumably this refers to the default (concise) result listing
# only `normed == True` interventions, while the `concise=False` result also
# contains the raw rows -- confirm whether this is intended.
t1.interventions_mi

Unnamed: 0_level_0,study_sid,study_name,raw_pk,normed,name,route,form,application,time,time_unit,...,substance,value,mean,median,min,max,sd,se,cv,unit
intervention_pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93,PKDB99999,Test1,91,True,po75,oral,tablet,single dose,0.0,hr,...,midazolam,0.0075,,,,,,,,gram
94,PKDB99999,Test1,92,True,po15,oral,tablet,single dose,0.0,hr,...,midazolam,0.015,,,,,,,,gram
96,PKDB99999,Test1,95,True,iv,iv,solution,single dose,0.0,hr,...,midazolam,7.5e-05,,,,,,,,gram / kilogram


In [10]:
# Interventions of the `concise=False` result, indexed by intervention_pk;
# raw (`normed == False`) and normed rows are both listed.
t2.interventions_mi

Unnamed: 0_level_0,study_sid,study_name,raw_pk,normed,name,route,form,application,time,time_unit,...,substance,value,mean,median,min,max,sd,se,cv,unit
intervention_pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
91,PKDB99999,Test1,-1,False,po75,oral,tablet,single dose,0.0,hr,...,midazolam,7.5,,,,,,,,mg
92,PKDB99999,Test1,-1,False,po15,oral,tablet,single dose,0.0,hr,...,midazolam,15.0,,,,,,,,mg
93,PKDB99999,Test1,91,True,po75,oral,tablet,single dose,0.0,hr,...,midazolam,0.0075,,,,,,,,gram
94,PKDB99999,Test1,92,True,po15,oral,tablet,single dose,0.0,hr,...,midazolam,0.015,,,,,,,,gram
95,PKDB99999,Test1,-1,False,iv,iv,solution,single dose,0.0,hr,...,midazolam,0.075,,,,,,,,mg/kg
96,PKDB99999,Test1,95,True,iv,iv,solution,single dose,0.0,hr,...,midazolam,7.5e-05,,,,,,,,gram / kilogram


In [11]:
# Outputs of the `concise=False` result: rows of other studies are still present
t2.outputs

Unnamed: 0,study_sid,study_name,output_pk,intervention_pk,group_pk,individual_pk,normed,calculated,tissue,time,...,substance,value,mean,median,min,max,sd,se,cv,unit
0,PKDB99996,Test4,2510,106,27,-1,False,False,plasma,,...,caffeine,,0.78000,,,,0.18000,,,hr
1,PKDB99996,Test4,2515,106,27,-1,True,False,plasma,,...,caffeine,,4.80000,,,,1.10000,0.27500,0.229,hour
2,PKDB99996,Test4,2519,108,27,-1,False,False,plasma,,...,caffeine,,4.07000,,,,0.56000,,,µg/ml
3,PKDB99996,Test4,2523,106,27,-1,True,False,plasma,,...,caffeine,,0.00407,,,,0.00056,0.00014,0.138,gram / liter
4,PKDB99996,Test4,2528,106,27,-1,True,False,plasma,24.0,...,caffeine,,0.02970,,,,0.00660,0.00165,0.222,gram * hour / liter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1101,PKDB99998,Test2,2095,98,-1,171,True,False,plasma,,...,paracetamol,0.0577,,,,,,,,gram * hour / liter
1102,PKDB99998,Test2,2105,98,-1,181,True,False,plasma,,...,paracetamol,0.0536,,,,,,,,gram * hour / liter
1103,PKDB99998,Test2,2110,98,-1,186,True,False,plasma,,...,paracetamol,0.0414,,,,,,,,gram * hour / liter
1104,PKDB99998,Test2,2111,98,-1,187,True,False,plasma,,...,paracetamol,0.0502,,,,,,,,gram * hour / liter


## 2 Query groups and individuals
### 2.1 Get data for groups with characteristica/keywords X
Examples are `healthy=True`, `smoking=N` or `disease=None`, either as individual queries or in combination.


In [12]:
def is_healthy(d):
    """Mask of rows where healthy is reported and True.

    A row matches when ``measurement_type`` is "healthy" and ``choice`` is "Y".
    """
    reports_health = d.measurement_type == "healthy"
    answered_yes = d.choice == "Y"
    return reports_health & answered_yes

def disease(d):
    """Mask of rows where any disease is reported (``measurement_type`` is "disease")."""
    kind = d.measurement_type
    return kind == "disease"

def smoking(d):
    """Mask of rows where the smoking status is curated for the study.

    Only ``measurement_type`` is checked; the choice could be Y/N/NR.
    """
    kind = d.measurement_type
    return kind == "smoking"

def nonsmoker(d):
    """Mask of rows where smoking is reported and the choice is "N"."""
    is_smoking_row = smoking(d)
    return is_smoking_row & (d.choice == "N")

def smoker(d):
    """Mask of rows where smoking is reported and the choice is "Y"."""
    is_smoking_row = smoking(d)
    return is_smoking_row & (d.choice == "Y")

In [13]:
# Reload the full test dataset
test_data = PKData.from_hdf5(TEST_HDF5)

`f_idx` can be a single function or a list of functions. A list of functions is applied successively, which is equivalent to "AND logic". "OR logic" can be applied directly on the index (e.g. by combining boolean masks with `|`).

In [14]:
# Apply both filter functions successively ("AND logic"):
# healthy == "Y" and smoking == "N"
healthy_nonsmoker = test_data.filter_subject(f_idx=[is_healthy, nonsmoker])
print(healthy_nonsmoker)
# Last expression of the cell: displays the multi-indexed groups table
healthy_nonsmoker.groups_mi



------------------------------
PKData (140229542701904)
------------------------------
studies             3 
groups              6  (   73)
individuals       244  ( 3136)
interventions       6  (    6)
outputs           944  (  986)
timecourses        40  (   46)
------------------------------


Unnamed: 0_level_0,Unnamed: 1_level_0,study_sid,study_name,group_name,group_count,group_parent_pk,count,measurement_type,choice,substance,value,mean,median,min,max,sd,se,cv,unit
group_pk,characteristica_pk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
20,481,PKDB99999,Test1,all,6,-1,6,species,homo sapiens,,,,,,,,,,
20,482,PKDB99999,Test1,all,6,-1,6,healthy,Y,,,,,,,,,,
20,483,PKDB99999,Test1,all,6,-1,1,smoking,Y,,,,,,,,,,
20,484,PKDB99999,Test1,all,6,-1,5,smoking,N,,,,,,,,,,
20,485,PKDB99999,Test1,all,6,-1,6,age,,,,,,25.0,37.0,,,,yr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,1086,PKDB99996,Test4,all,16,-1,16,alcohol,N,,,,,,,,,,
27,1087,PKDB99996,Test4,all,16,-1,16,weight,,,,76.7,,,,6.8,,,kilogram
27,1088,PKDB99996,Test4,all,16,-1,16,age,,,,27.1,,,,3.1,,,yr
27,1089,PKDB99996,Test4,all,16,-1,16,ethnicity,NR,,,,,,,,,,


Often attributes are mixed within a group, so we have to exclude the opposites.
In the example, the group `20` consists of 1 smoker and 5 nonsmokers, i.e. smoking is "N" only for a subset of the group.
We can exclude such groups via

In [15]:
# Filter for healthy nonsmokers, then exclude every subject that also has a
# smoking == "Y" characteristica (e.g. mixed groups).
healthy_nonsmoker = test_data.filter_subject([is_healthy, nonsmoker]).exclude_subject([smoker])
print(healthy_nonsmoker)
display(healthy_nonsmoker.groups_mi)



------------------------------
PKData (140229540174224)
------------------------------
studies             1 
groups              1  (   11)
individuals         0  (    0)
interventions       2  (    2)
outputs            71  (  113)
timecourses        12  (   18)
------------------------------


Unnamed: 0_level_0,Unnamed: 1_level_0,study_sid,study_name,group_name,group_count,group_parent_pk,count,measurement_type,choice,substance,value,mean,median,min,max,sd,se,cv,unit
group_pk,characteristica_pk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
27,1080,PKDB99996,Test4,all,16,-1,16,species,homo sapiens,,,,,,,,,,
27,1081,PKDB99996,Test4,all,16,-1,16,healthy,Y,,,,,,,,,,
27,1082,PKDB99996,Test4,all,16,-1,16,sex,M,,,,,,,,,,
27,1083,PKDB99996,Test4,all,16,-1,16,smoking,N,,,,,,,,,,
27,1084,PKDB99996,Test4,all,16,-1,16,abstinence,,methylxanthine,,,,,,,,,
27,1085,PKDB99996,Test4,all,16,-1,16,medication,N,,,,,,,,,,
27,1086,PKDB99996,Test4,all,16,-1,16,alcohol,N,,,,,,,,,,
27,1087,PKDB99996,Test4,all,16,-1,16,weight,,,,76.7,,,,6.8,,,kilogram
27,1088,PKDB99996,Test4,all,16,-1,16,age,,,,27.1,,,,3.1,,,yr
27,1089,PKDB99996,Test4,all,16,-1,16,ethnicity,NR,,,,,,,,,,


In addition often combinations of attributes have to be used to find the correct subjects.
For instance a combination of `healthy` and reported `disease`

In [16]:
# `is_healthy` (healthy is reported and "Y") and `disease` (any disease is
# reported) are the filter functions defined earlier in this notebook. They are
# reused here rather than defined a second time, so that a single definition
# of each filter exists.

# keep subjects with a `healthy == "Y"` characteristica
healthy1 = test_data.filter_subject(is_healthy)
# drop subjects with any `disease` characteristica
healthy2 = test_data.exclude_subject(disease)
# combination: reported healthy and no disease reported
healthy3 = test_data.filter_subject(is_healthy).exclude_subject(disease)

print(healthy1)
print(healthy2)
print(healthy3)



------------------------------
PKData (140229542675728)
------------------------------
studies             3 
groups              6  (   73)
individuals       244  ( 3136)
interventions       6  (    6)
outputs           944  (  986)
timecourses        40  (   46)
------------------------------
------------------------------
PKData (140229542192656)
------------------------------
studies             4 
groups              6  (   73)
individuals       246  ( 3144)
interventions       9  (    9)
outputs          1064  ( 1106)
timecourses        40  (   46)
------------------------------
------------------------------
PKData (140229542702160)
------------------------------
studies             3 
groups              6  (   73)
individuals       244  ( 3136)
interventions       6  (    6)
outputs           944  (  986)
timecourses        40  (   46)
------------------------------


## 3 Query interventions
### 3.1 Get outputs/timecourses for intervention with substance
intervention with measurement_type "dosing" and substance "caffeine"

In [17]:
def dosing_and_caffeine(d):
    """Mask of rows with measurement_type "dosing" and substance "caffeine"."""
    is_dosing = d["measurement_type"] == "dosing"
    is_caffeine = d["substance"] == "caffeine"
    return is_dosing & is_caffeine

### 3.2 Get outputs/timecourses where multiple interventions were given

In [18]:
# Reload the full test dataset
test_data = PKData.from_hdf5(TEST_HDF5)

In [19]:
# Restrict the dataset to interventions dosing caffeine
caffeine_data = test_data.filter_intervention(dosing_and_caffeine)



In [20]:
# Summary of the caffeine subset
print(caffeine_data)

------------------------------
PKData (140229546305488)
------------------------------
studies             1 
groups              1  (   11)
individuals         0  (    0)
interventions       1  (    1)
outputs            71  (   71)
timecourses        12  (   12)
------------------------------


## 4 Query outputs/timecourses
### 4.1 query by measurement_type
filter all outputs with measurement_type auc_inf

In [21]:
def is_auc_inf(d):
    """Mask of rows whose measurement_type is "auc_inf"."""
    measurement = d["measurement_type"]
    return measurement == "auc_inf"

test_data = PKData.from_hdf5(TEST_HDF5)

# Bind the filtered subset to its own name instead of overwriting `test_data`,
# so `test_data` keeps referring to the complete dataset that was just loaded.
# Only outputs with measurement_type "auc_inf" are kept and the timecourses
# are removed.
auc_inf_data = test_data.filter_output(is_auc_inf).delete_timecourses()
print(auc_inf_data)



------------------------------
PKData (140229537556752)
------------------------------
studies             3 
groups              6  (   73)
individuals       118  ( 1534)
interventions       5  (    5)
outputs           276  (  278)
timecourses         0  (    0)
------------------------------


## 5 Other queries
### 5.1 Complex
Get the clearance of codeine for all subjects which have been phenotyped for CYP2D6.


In [22]:
def is_cyp2d6_phenotyped(d):
    """Mask of rows that report a CYP2D6 phenotype.

    A row matches when its measurement_type is "metabolic phenotype" or
    "metabolic ratio" and its substance is one of the CYP2D6 phenotype
    substances listed below.
    """
    phenotype_types = ["metabolic phenotype", "metabolic ratio"]
    phenotype_substances = ['spar/(2hspar+5hspar)', 'deb/4hdeb', 'dtf/dmt']
    type_matches = d["measurement_type"].isin(phenotype_types)
    substance_matches = d["substance"].isin(phenotype_substances)
    return type_matches & substance_matches

def codeine_clearance(d):
    """Mask of rows with measurement_type "clearance" and substance "codeine"."""
    is_clearance = d["measurement_type"] == "clearance"
    is_codeine = d["substance"] == "codeine"
    return is_clearance & is_codeine

In [23]:
# Reload the full dataset and select the outputs reporting a CYP2D6 phenotype
test_data = PKData.from_hdf5(TEST_HDF5)
phenotyped_data = test_data.filter_output(is_cyp2d6_phenotyped)



In [24]:
# Replace the groups and individuals of the full dataset with those of the
# phenotyped subset ...
test_data.groups = phenotyped_data.groups
test_data.individuals = phenotyped_data.individuals
# ... then keep only the codeine clearance outputs and remove the timecourses.
test_data = test_data.filter_output(codeine_clearance).delete_timecourses()



In [25]:
# Summary of the query result
print(test_data)

------------------------------
PKData (140229539075856)
------------------------------
studies             0 
groups              0  (    0)
individuals         0  (    0)
interventions       0  (    0)
outputs             0  (    0)
timecourses         0  (    0)
------------------------------


## 6  Pitfalls 

In [26]:
test_data = PKData.from_hdf5(TEST_HDF5)
# Wrong 
def is_healthy_smoker(d): 
    """Wrong: this will yield zero subjects.

    The mask is evaluated per characteristica row, and no single row can
    satisfy measurement_type == 'healthy' and measurement_type == 'smoking'
    at the same time, so the combined mask is always False.
    """
    return ((d["measurement_type"]=="healthy") & (d["choice"]=="Y")) & ((d["measurement_type"]=="smoking") & (d["choice"]=="Y"))
         
# Correct 
def is_healthy_smoker(d): 
    """Return two separate masks: healthy == 'Y' and smoking == 'Y'.

    NOTE(review): the notebook text describes `f_idx` as a single function or
    a list of functions; confirm that a function returning a list of masks is
    accepted, otherwise pass two separate filter functions as a list.
    """
    return [(d["measurement_type"]=="healthy") & (d["choice"]=="Y"), (d["measurement_type"]=="smoking") & (d["choice"]=="Y")]

   
# Wrong 
def not_smoker_y(d):
    """Be careful, this might not do what you expect.

    Negating the mask of a specific characteristica will not eliminate any
    subject unless it is the subject's only characteristica.
    """
    return ~((d["measurement_type"]=="smoking") & (d["choice"]=="Y")) 
not_smoker_y_data = test_data.filter_subject(not_smoker_y)

# Correct
# exclude smoker
def smoker_y(d):
    """Mask of rows where smoking is reported with choice 'Y'."""
    return (d["measurement_type"]=="smoking") & (d["choice"]=="Y")
# NOTE: despite its name, `healthy_data` holds the data with smokers excluded here.
healthy_data = test_data.exclude_subject(smoker_y)


# Wrong 
def not_disease(d):
    """Be careful, this might not do what you expect.

    Negating the mask of a specific characteristica will not eliminate any
    subject unless it is the subject's only characteristica.
    """
    return  ~(d["measurement_type"]=="disease")
healthy_data = test_data.filter_subject(not_disease)

# Correct 
# exclude the disease
def disease(d):
    """Mask of rows where any disease is reported."""
    return  d["measurement_type"]=="disease"
healthy_data = test_data.exclude_subject(disease)


