In [3]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from IPython.display import display

from pkdb_analysis import PKData, PKFilter
from pkdb_analysis.test import TESTDATA_CONCISE_FALSE_ZIP


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Filter data
A recurring task is to filter data for a certain question. E.g. to compare two groups, or get the subset of data for all healthy smokers.

We work again with our test data set and will filter various subsets from it.

In [4]:
# Build a PKData instance from the test archive and print its summary table.
test_data = PKData.from_archive(TESTDATA_CONCISE_FALSE_ZIP)
print(test_data)

INFO NumExpr defaulting to 8 threads.


------------------------------
PKData (139814370682512)
------------------------------
studies           497  (  497)
groups           1404  (11621)
individuals      6449  (59284)
interventions    1175  ( 1806)
outputs         71501  (71501)
timecourses       419  (  419)
------------------------------


## Filter functions
The main principle for filtering `PKData` is by using the `filter_*` and `exclude_*` functionality.

The key building blocks are filter functions, which return a logical (boolean) index for a given DataFrame.

Depending on which subset of the information the filter should be applied to, the functions work on the `groups`, `individuals`, `subjects` (groups and individuals), `interventions`, `outputs` or `timecourses`.

## Filter by `study_sid`
A first example is filtering by `study_sid`, i.e. we only want the subset of data from a single study.
An overview of the existing study sids in the dataset is available via `study_sids`:

In [5]:
test_data.study_sids

{'10381807',
 '10412886',
 '10444424',
 '10460065',
 '10634135',
 '10741630',
 '10934672',
 '10976543',
 '11061578',
 '11112089',
 '11180018',
 '11259986',
 '1133173',
 '11361054',
 '11497338',
 '1154379',
 '11709322',
 '11736870',
 '11753267',
 '11872673',
 '11986393',
 '12189441',
 '12200754',
 '12236852',
 '1246330',
 '12723464',
 '12784317',
 '13053413',
 '14612892',
 '14691614',
 '1484193',
 '1488408',
 '14979606',
 '14982753',
 '15206993',
 '15317833',
 '1551497',
 '15518608',
 '1577043',
 '1613123',
 '16158445',
 '1623898',
 '16261361',
 '170711',
 '17108811',
 '1732127',
 '17541571',
 '18213452',
 '1895958',
 '19094067',
 '2121568',
 '21252240',
 '2198434',
 '22673010',
 '23469684',
 '24517114',
 '25323804',
 '2584298',
 '264686',
 '28063968',
 '2816559',
 '28350522',
 '2857025',
 '2902373',
 '2921843',
 '29230348',
 '29403866',
 '30171779',
 '30729119',
 '3113968',
 '32071850',
 '3335120',
 '3356089',
 '3356110',
 '3437070',
 '3514335',
 '3519643',
 '3522621',
 '3546378',
 '36

Filtering a subset of data works by providing filter/selection functions which select a subset of the data.
The filters are written against the DataFrame they are applied to, e.g. `groups`, `individuals` or `interventions`.

In [6]:
def is_PKDB99999(d):
    """Return a logical index that is True where ``study_sid`` is "PKDB99999"."""
    sid_column = d.study_sid
    return sid_column == "PKDB99999"

# Apply the predicate via filter_intervention and print the summary of the result.
data = test_data.filter_intervention(is_PKDB99999)
print(data)

------------------------------
PKData (139813409755024)
------------------------------
studies             0  (    0)
groups              0  (    0)
individuals         0  (    0)
interventions       0  (    0)
outputs             0  (    0)
timecourses         0  (    0)
------------------------------


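The PKData now only contains data for the given study_sid. In this example the result is empty, because no intervention in the test data set belongs to a study with the sid `PKDB99999`: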

In [7]:
print(data.study_sids)

set()


In [8]:
# for instance interventions
display(data.interventions)

Unnamed: 0.1,intervention_pk,Unnamed: 0,study_sid,study_name,raw_pk,normed,name,route,form,application,...,substance,value,mean,median,min,max,sd,se,cv,unit


Empty DataFrame
Columns: [intervention_pk, Unnamed: 0, study_sid, study_name, raw_pk, normed, name, route, form, application, time, time_end, time_unit, measurement_type, choice, substance, value, mean, median, min, max, sd, se, cv, unit]
Index: []

[0 rows x 25 columns]

One could also define this as a simple lambda function

In [9]:
# Same study_sid comparison as in is_PKDB99999, written inline as a lambda.
data = test_data.filter_intervention(lambda d: d.study_sid == "PKDB99999")
print(data)

------------------------------
PKData (139814370711440)
------------------------------
studies             0  (    0)
groups              0  (    0)
individuals         0  (    0)
interventions       0  (    0)
outputs             0  (    0)
timecourses         0  (    0)
------------------------------


## Concise data
All operations on `PKData` leave the data in a consistent state.
E.g. if an intervention is filtered out also all the outputs using this intervention are filtered out.
This behavior is controlled by the `concise` flag on most operations.

In [10]:
# Run the same intervention filter twice: once with the default settings (t1)
# and once with concise=False (t2), then print both summaries for comparison.
t1 = test_data.filter_intervention(is_PKDB99999)
t2 = test_data.filter_intervention(is_PKDB99999, concise=False)
print(t1)
print(t2)

------------------------------
PKData (139813389324176)
------------------------------
studies             0  (    0)
groups              0  (    0)
individuals         0  (    0)
interventions       0  (    0)
outputs             0  (    0)
timecourses         0  (    0)
------------------------------
------------------------------
PKData (139813389323088)
------------------------------
studies           497  (  497)
groups           1404  (11621)
individuals      6449  (59284)
interventions       0  (    0)
outputs         71501  (71501)
timecourses       419  (  419)
------------------------------


In [11]:
# FIXME: only normed data
t1.interventions_mi

In [12]:
t2.interventions_mi

In [13]:
t2.outputs

Unnamed: 0.1,output_pk,intervention_pk,Unnamed: 0,study_name,measurement_type,tissue,sd,se,min,group_pk,...,max,substance,label,individual_pk,unit,cv,median,mean,time,choice
0,29987,0,31210,Chiew2010,cmax,plasma,,0.021276,,312,...,,paracetamol,paracetamol,-1,gram / liter,0.746605,,0.085492,,
1,29988,0,32535,Chiew2010,cmax,plasma,,0.007244,,312,...,,paracetamol glucuronide,paracetamol,-1,gram / liter,0.198094,,0.109713,,
2,29989,0,31213,Chiew2010,cmax,plasma,,0.002045,,312,...,,paracetamol sulfate,paracetamol,-1,gram / liter,0.276726,,0.022172,,
3,29990,0,33656,Chiew2010,tmax,plasma,,0.404200,,312,...,,paracetamol,paracetamol,-1,hour,0.873002,,1.389000,,
4,29991,0,29717,Chiew2010,tmax,plasma,,0.175700,,312,...,,paracetamol glucuronide,paracetamol,-1,hour,0.148228,,3.556000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71496,186700,1174,99176,Mohammed1993,cmax,plasma,,,,1852,...,,codeine,,-1,gram / liter,,,0.000143,,
71497,186701,1174,99167,Mohammed1993,kel,plasma,,,,1852,...,,codeine,,-1,1 / minute,,,0.004981,,
71498,186702,1174,99156,Mohammed1993,thalf,plasma,,,,1852,...,,codeine,,-1,hour,,,2.319383,,
71499,186703,1174,99191,Mohammed1993,tmax,plasma,,,,1852,...,,codeine,,-1,hour,,,0.750000,,


       output_pk  intervention_pk  Unnamed: 0    study_name measurement_type  \
0          29987                0       31210     Chiew2010             cmax   
1          29988                0       32535     Chiew2010             cmax   
2          29989                0       31213     Chiew2010             cmax   
3          29990                0       33656     Chiew2010             tmax   
4          29991                0       29717     Chiew2010             tmax   
...          ...              ...         ...           ...              ...   
71496     186700             1174       99176  Mohammed1993             cmax   
71497     186701             1174       99167  Mohammed1993              kel   
71498     186702             1174       99156  Mohammed1993            thalf   
71499     186703             1174       99191  Mohammed1993             tmax   
71500     186704             1174       99285  Mohammed1993            vd_ss   

       tissue  sd        se  min  group

## Query groups and individuals
### 2.1 Get data for groups with characteristica/keywords X
healthy=True, smoking=N, disease=None,
individual queries and combinations.


In [14]:
def is_healthy(d):
    """Return a logical index: True where ``healthy`` is reported with choice "Y"."""
    healthy_reported = d.measurement_type == "healthy"
    answered_yes = d.choice == "Y"
    return healthy_reported & answered_yes

def disease(d):
    """Return a logical index: True where any ``disease`` entry is reported."""
    measurement_types = d.measurement_type
    return measurement_types == "disease"

def smoking(d):
    """Return a logical index: True where a ``smoking`` status is curated.

    Only the measurement type is checked; the recorded choice could be Y/N/NR.
    """
    measurement_types = d.measurement_type
    return measurement_types == "smoking"

def nonsmoker(d):
    """Return a logical index: True where smoking is reported and the choice is "N"."""
    smoking_reported = smoking(d)
    answered_no = d.choice == "N"
    return smoking_reported & answered_no

def smoker(d):
    """Return a logical index: True where smoking is reported and the choice is "Y"."""
    smoking_reported = smoking(d)
    answered_yes = d.choice == "Y"
    return smoking_reported & answered_yes

In [16]:
test_data = PKData.from_archive(TESTDATA_CONCISE_FALSE_ZIP)

`f_idx` can be a single function, or a list of functions. A list of functions is applied successively, which is equivalent to "AND logic". "OR logic" can be applied directly on the index.

In [17]:
# Pass both predicates as a list to filter_subject, print the summary,
# and show the groups_mi attribute as the cell output.
healthy_nonsmoker = test_data.filter_subject(f_idx=[is_healthy, nonsmoker])
print(healthy_nonsmoker)
healthy_nonsmoker.groups_mi

------------------------------
PKData (139813403748560)
------------------------------
studies           162  (  162)
groups            256  ( 2658)
individuals      2135  (23075)
interventions     384  (  591)
outputs         22760  (22760)
timecourses       137  (  137)
------------------------------


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,study_name,study_sid,measurement_type,group_count,group_name,max,substance,count,group_parent_pk,sd,unit,se,min,cv,median,mean,choice,value
group_pk,characteristica_pk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
331,6701,4488,Depre1992,PKDB00212,species,12,study one,,,18,330,,,,,,,,homo sapiens,
331,6702,4489,Depre1992,PKDB00212,healthy,12,study one,,,18,330,,,,,,,,Y,
331,6703,4490,Depre1992,PKDB00212,sex,12,study one,,,18,330,,,,,,,,M,
331,6704,4484,Depre1992,PKDB00212,overnight fast,12,study one,,,18,330,,,,,,,,Y,
331,6708,4485,Depre1992,PKDB00212,age,12,study one,25.0,,12,330,,year,,21.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1847,37345,11586,Caraco1999,10381807,sex,8,chinese,,,18,1845,,,,,,,,M,
1847,37346,11587,Caraco1999,10381807,smoking,8,chinese,,,18,1845,,,,,,,,N,
1847,37356,11588,Caraco1999,10381807,weight,8,chinese,,,8,1845,,kilogram,3.7,,,,66.1,,
1847,37357,11589,Caraco1999,10381807,age,8,chinese,,,8,1845,,year,2.4,,,,31.3,,


                             Unnamed: 0  study_name  study_sid  \
group_pk characteristica_pk                                      
331      6701                      4488   Depre1992  PKDB00212   
         6702                      4489   Depre1992  PKDB00212   
         6703                      4490   Depre1992  PKDB00212   
         6704                      4484   Depre1992  PKDB00212   
         6708                      4485   Depre1992  PKDB00212   
...                                 ...         ...        ...   
1847     37345                    11586  Caraco1999   10381807   
         37346                    11587  Caraco1999   10381807   
         37356                    11588  Caraco1999   10381807   
         37357                    11589  Caraco1999   10381807   
         37358                    11590  Caraco1999   10381807   

                            measurement_type  group_count group_name   max  \
group_pk characteristica_pk                                    

Often attributes are mixed for groups so we have to exclude the opposites.
In the example, the group `20` consists of 5 smokers and 1 nonsmoker. So for a subset of the group smoking is No.
We can exclude groups via

In [18]:
# Filter for healthy nonsmokers first, then chain exclude_subject([smoker])
# to exclude subjects matching the `smoker` predicate.
healthy_nonsmoker = test_data.filter_subject([is_healthy, nonsmoker]).exclude_subject([smoker])
print(healthy_nonsmoker)
display(healthy_nonsmoker.groups_mi)

------------------------------
PKData (139813389366224)
------------------------------
studies           135  (  135)
groups            182  ( 1781)
individuals      1143  (11368)
interventions     317  (  504)
outputs         17141  (17141)
timecourses       111  (  111)
------------------------------


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,study_name,study_sid,measurement_type,group_count,group_name,max,substance,count,group_parent_pk,sd,unit,se,min,cv,median,mean,choice,value
group_pk,characteristica_pk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
331,6701,4488,Depre1992,PKDB00212,species,12,study one,,,18,330,,,,,,,,homo sapiens,
331,6702,4489,Depre1992,PKDB00212,healthy,12,study one,,,18,330,,,,,,,,Y,
331,6703,4490,Depre1992,PKDB00212,sex,12,study one,,,18,330,,,,,,,,M,
331,6704,4484,Depre1992,PKDB00212,overnight fast,12,study one,,,18,330,,,,,,,,Y,
331,6708,4485,Depre1992,PKDB00212,age,12,study one,25.0,,12,330,,year,,21.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1847,37345,11586,Caraco1999,10381807,sex,8,chinese,,,18,1845,,,,,,,,M,
1847,37346,11587,Caraco1999,10381807,smoking,8,chinese,,,18,1845,,,,,,,,N,
1847,37356,11588,Caraco1999,10381807,weight,8,chinese,,,8,1845,,kilogram,3.7,,,,66.1,,
1847,37357,11589,Caraco1999,10381807,age,8,chinese,,,8,1845,,year,2.4,,,,31.3,,


                             Unnamed: 0  study_name  study_sid  \
group_pk characteristica_pk                                      
331      6701                      4488   Depre1992  PKDB00212   
         6702                      4489   Depre1992  PKDB00212   
         6703                      4490   Depre1992  PKDB00212   
         6704                      4484   Depre1992  PKDB00212   
         6708                      4485   Depre1992  PKDB00212   
...                                 ...         ...        ...   
1847     37345                    11586  Caraco1999   10381807   
         37346                    11587  Caraco1999   10381807   
         37356                    11588  Caraco1999   10381807   
         37357                    11589  Caraco1999   10381807   
         37358                    11590  Caraco1999   10381807   

                            measurement_type  group_count group_name   max  \
group_pk characteristica_pk                                    

In addition often combinations of attributes have to be used to find the correct subjects.
For instance a combination of `healthy` and reported `disease`

In [19]:
def is_healthy(d):
    """Return a logical index: True where ``healthy`` is reported with choice "Y".

    Same predicate as defined earlier in this notebook; repeated here so the
    cell can be read on its own.
    """
    healthy_reported = d.measurement_type == "healthy"
    answered_yes = d.choice == "Y"
    return healthy_reported & answered_yes

def disease(d):
    """Return a logical index: True where any ``disease`` entry is reported.

    Same predicate as defined earlier in this notebook; repeated here so the
    cell can be read on its own.
    """
    measurement_types = d.measurement_type
    return measurement_types == "disease"

# Three variants to compare:
#   healthy1 - filter_subject with is_healthy only
#   healthy2 - exclude_subject with disease only
#   healthy3 - both chained: filter on is_healthy, then exclude on disease
healthy1 = test_data.filter_subject(is_healthy)
healthy2 = test_data.exclude_subject(disease)
healthy3 = test_data.filter_subject(is_healthy).exclude_subject(disease)

# Print the three summaries one after another.
print(healthy1)
print(healthy2)
print(healthy3)

------------------------------
PKData (139813391202320)
------------------------------
studies           403  (  403)
groups            736  ( 6669)
individuals      5015  (45092)
interventions    1067  ( 1676)
outputs         61264  (61264)
timecourses       386  (  386)
------------------------------
------------------------------
PKData (139813391203344)
------------------------------
studies           418  (  418)
groups            777  ( 6969)
individuals      5109  (45738)
interventions    1099  ( 1716)
outputs         63211  (63211)
timecourses       399  (  399)
------------------------------
------------------------------
PKData (139813389367568)
------------------------------
studies           401  (  401)
groups            724  ( 6528)
individuals      4972  (44703)
interventions    1060  ( 1665)
outputs         60645  (60645)
timecourses       383  (  383)
------------------------------


## 3 Query interventions
### 3.1 Get outputs/timecourses for intervention with substance
intervention with measurement_type "dosing" and substance "caffeine"

In [20]:
def dosing_and_caffeine(d):
    """Return a logical index: True where measurement_type is "dosing" and substance is "caffeine"."""
    is_dosing = d["measurement_type"] == "dosing"
    is_caffeine = d["substance"] == "caffeine"
    return is_dosing & is_caffeine

### 3.2 Get outputs/timecourses where multiple interventions were given

In [22]:
test_data = PKData.from_archive(TESTDATA_CONCISE_FALSE_ZIP)

In [23]:
caffeine_data = test_data.filter_intervention(dosing_and_caffeine)

In [24]:
print(caffeine_data)

------------------------------
PKData (139813389072272)
------------------------------
studies            80  (   80)
groups            132  ( 1292)
individuals       978  ( 9717)
interventions     153  (  261)
outputs          9833  ( 9833)
timecourses        60  (   60)
------------------------------


## 4 Query outputs/timecourses
### 4.1 query by measurement_type
filter all outputs with measurement_type auc_inf

In [26]:
def is_auc_inf(d):
    """Return a logical index: True where measurement_type is "auc_inf"."""
    measurement_types = d["measurement_type"]
    return measurement_types == "auc_inf"

# Start from a freshly loaded PKData instance.
test_data = PKData.from_archive(TESTDATA_CONCISE_FALSE_ZIP)

# Keep only the auc_inf outputs, then call delete_timecourses() on the result.
# NOTE: `test_data` is rebound to the filtered result from here on.
test_data = test_data.filter_output(is_auc_inf).delete_timecourses()
print(test_data)

------------------------------
PKData (139813408432592)
------------------------------
studies           308  (  308)
groups            465  ( 4496)
individuals       813  ( 8434)
interventions     692  (  996)
outputs          3418  ( 3418)
timecourses         0  (    0)
------------------------------


## 5 Other queries
### 5.1 Complex
Get the clearance of codeine for all subjects which have been phenotyped for CYP2D6.


In [27]:
def is_cyp2d6_phenotyped(d):
    """Return a logical index for rows that carry a cyp2d6 phenotyping measurement.

    A row matches when its measurement_type is "metabolic phenotype" or
    "metabolic ratio" and its substance is one of the listed phenotyping ratios.
    """
    phenotype_types = ["metabolic phenotype", "metabolic ratio"]
    phenotype_substances = ['spar/(2hspar+5hspar)', 'deb/4hdeb', 'dtf/dmt']
    type_matches = d["measurement_type"].isin(phenotype_types)
    substance_matches = d["substance"].isin(phenotype_substances)
    return type_matches & substance_matches

def codeine_clearance(d):
    """Return a logical index: True where measurement_type is "clearance" and substance is "codeine"."""
    is_clearance = d["measurement_type"] == "clearance"
    is_codeine = d["substance"] == "codeine"
    return is_clearance & is_codeine

In [30]:
# Reload the test data and filter its outputs with is_cyp2d6_phenotyped.
test_data = PKData.from_archive(TESTDATA_CONCISE_FALSE_ZIP)
phenotyped_data = test_data.filter_output(is_cyp2d6_phenotyped)

In [31]:
# Replace the groups and individuals of test_data with those of phenotyped_data,
# then filter the outputs for codeine clearance and call delete_timecourses().
# NOTE(review): this assigns attributes on test_data and then rebinds the name,
# so re-running this cell on its own does not start from the same state.
test_data.groups = phenotyped_data.groups
test_data.individuals = phenotyped_data.individuals
test_data = test_data.filter_output(codeine_clearance).delete_timecourses()

In [32]:
print(test_data)

------------------------------
PKData (139813386675024)
------------------------------
studies             3  (    3)
groups              5  (   41)
individuals        14  (   98)
interventions       3  (    3)
outputs            19  (   19)
timecourses         0  (    0)
------------------------------


## 6  Pitfalls 

In [34]:
# Fresh PKData instance for the pitfall examples in this cell.
test_data = PKData.from_archive(TESTDATA_CONCISE_FALSE_ZIP)
# Wrong 
def is_healthy_smoker(d):
    """Deliberately wrong: AND-combine both conditions on the same row.

    This will yield zero subjects. A single characteristica row has exactly one
    measurement_type, so no row satisfies measurement_type == 'healthy' and
    measurement_type == 'smoking' at the same time.
    """
    healthy_yes = (d["measurement_type"] == "healthy") & (d["choice"] == "Y")
    smoking_yes = (d["measurement_type"] == "smoking") & (d["choice"] == "Y")
    # Row-wise conjunction of two mutually exclusive conditions: always False.
    return healthy_yes & smoking_yes
         
# Correct 
def is_healthy_smoker(d):
    """Return the healthy and the smoking condition as two separate logical indices.

    The result is a list ``[healthy_yes, smoking_yes]`` instead of one index
    combined row by row.
    NOTE(review): presumably the consumer applies the entries one after another;
    confirm that the filter API accepts a function returning a list.
    """
    healthy_yes = (d["measurement_type"] == "healthy") & (d["choice"] == "Y")
    smoking_yes = (d["measurement_type"] == "smoking") & (d["choice"] == "Y")
    return [healthy_yes, smoking_yes]

   
# Wrong 
def not_smoker_y(d):
    """Negated smoker predicate, shown as a pitfall.

    Be careful: this might not do what you expect. Excluding a specific
    characteristica will not eliminate any subject unless it is the only
    characteristica.
    """
    is_smoker_y = (d["measurement_type"] == "smoking") & (d["choice"] == "Y")
    return ~is_smoker_y
# Using the negated predicate as a filter: the approach labelled wrong above.
not_smoker_y_data = test_data.filter_subject(not_smoker_y)

# Correct
# exclude smoker
def smoker_y(d):
    """Return a logical index: True where measurement_type is "smoking" and choice is "Y"."""
    is_smoking = d["measurement_type"] == "smoking"
    answered_yes = d["choice"] == "Y"
    return is_smoking & answered_yes
# Pass the positive predicate to exclude_subject instead of filtering on its negation.
# NOTE(review): the result is stored as `healthy_data` although the predicate is about smoking.
healthy_data = test_data.exclude_subject(smoker_y)


# Wrong 
def not_disease(d):
    """Negated disease predicate, shown as a pitfall.

    Be careful: this might not do what you expect. Excluding a specific
    characteristica will not eliminate any subject unless it is the only
    characteristica.
    """
    reports_disease = d["measurement_type"] == "disease"
    return ~reports_disease
# Using the negated predicate as a filter: the approach labelled wrong above.
healthy_data = test_data.filter_subject(not_disease)

# Correct 
# exclude the disease
def disease(d):
    """Return a logical index: True where measurement_type is "disease"."""
    measurement_types = d["measurement_type"]
    return measurement_types == "disease"
# Pass the positive predicate to exclude_subject; this rebinds healthy_data.
healthy_data = test_data.exclude_subject(disease)
