In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from IPython.display import display

from pkdb_analysis import PKData, PKFilter
from pkdb_analysis.tests.constants import TEST_HDF5

# Working with PK-DB data
To make working with PK-DB data easy, we provide the `pkdb_analysis` Python library.
It includes helper functions for querying data and for filtering existing data sets. In the following, we give an overview of the typical functionality when working with PK-DB data.

The main class to work with is `PKData`. It is possible to directly query the database or to load data from file.

## Load data from file
PKData can be serialized to HDF5 files. In the following we will load the test data set and print an overview.

In [2]:
# These names are already imported in the first cell; the imports are repeated here.
from pkdb_analysis import PKData, PKFilter
from pkdb_analysis.tests.constants import TEST_HDF5

# Load the test data set from its HDF5 serialization and print the overview
# (one line per data type with its counts).
data = PKData.from_hdf5(TEST_HDF5)
print(data)

INFO Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO NumExpr defaulting to 8 threads.


------------------------------
PKData (140432779705360)
------------------------------
studies             4 
groups              8  (   86)
individuals       246  ( 3144)
interventions      18  (   18)
outputs          1064  ( 1106)
timecourses        40  (   46)
------------------------------


## Load data from database
Alternatively data can be loaded from the database using the `PKDB.query()` function.
This is documented in the `Querying PK-DB` section.

## Accessing groups, individuals, interventions, outputs and timecourses
Every `PKData` instance contains consistent information on:
- `studies`: PK-DB studies, uniquely identified via a `study_sid`
- `groups`: groups, uniquely identified via `group_pk`
- `individuals`: individuals, uniquely identified via `individual_pk`
- `interventions`: interventions, uniquely identified via `intervention_pk`
- `outputs`: outputs, uniquely identified via `output_pk`
- `timecourses`: timecourses, uniquely identified via `timecourse_pk`

The `print` function provides a simple overview of the content:

In [3]:
# Print the text summary of `data`: studies, groups, individuals, interventions,
# outputs and timecourses, each with its count.
print(data)

------------------------------
PKData (140432779705360)
------------------------------
studies             4 
groups              8  (   86)
individuals       246  ( 3144)
interventions      18  (   18)
outputs          1064  ( 1106)
timecourses        40  (   46)
------------------------------


We can access the information via the respective fields, e.g., groups via `data.groups` or the multi-index data via `data.groups_mi`.

In [4]:
# Flat groups table: `group_pk` and `characteristica_pk` are ordinary columns here.
data.groups

Unnamed: 0,study_sid,study_name,group_pk,group_name,group_count,group_parent_pk,characteristica_pk,count,measurement_type,choice,substance,value,mean,median,min,max,sd,se,cv,unit
0,PKDB99997,Test3,26,all,2,-1,1068,2,abstinence,,cocoa,,,,,,,,,
1,PKDB99997,Test3,26,all,2,-1,1067,2,abstinence,,tea,,,,,,,,,
2,PKDB99997,Test3,26,all,2,-1,1066,2,abstinence,,coffee,,,,,,,,,
3,PKDB99997,Test3,26,all,2,-1,1065,2,species,homo sapiens,,,,,,,,,,
4,PKDB99999,Test1,20,all,6,-1,481,6,species,homo sapiens,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,PKDB99998,Test2,25,80-90,10,21,523,40,medication,N,,,,,,,,,,
82,PKDB99998,Test2,25,80-90,10,21,584,10,gfr,,,,36.99422,,,,10.982659,,,milliliter / meter ** 2 / minute
83,PKDB99998,Test2,25,80-90,10,21,526,40,ethnicity,caucasian,,,,,,,,,,
84,PKDB99998,Test2,25,80-90,10,21,527,38,smoking,N,,,,,,,,,,


In [5]:
# Groups table with a (`group_pk`, `characteristica_pk`) multi-index.
data.groups_mi

Unnamed: 0_level_0,Unnamed: 1_level_0,study_sid,study_name,group_name,group_count,group_parent_pk,count,measurement_type,choice,substance,value,mean,median,min,max,sd,se,cv,unit
group_pk,characteristica_pk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
20,481,PKDB99999,Test1,all,6,-1,6,species,homo sapiens,,,,,,,,,,
20,482,PKDB99999,Test1,all,6,-1,6,healthy,Y,,,,,,,,,,
20,483,PKDB99999,Test1,all,6,-1,1,smoking,Y,,,,,,,,,,
20,484,PKDB99999,Test1,all,6,-1,5,smoking,N,,,,,,,,,,
20,485,PKDB99999,Test1,all,6,-1,6,age,,,,,,25.0,37.0,,,,yr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,1086,PKDB99996,Test4,all,16,-1,16,alcohol,N,,,,,,,,,,
27,1087,PKDB99996,Test4,all,16,-1,16,weight,,,,76.7,,,,6.8,,,kilogram
27,1088,PKDB99996,Test4,all,16,-1,16,age,,,,27.1,,,,3.1,,,yr
27,1089,PKDB99996,Test4,all,16,-1,16,ethnicity,NR,,,,,,,,,,


To access the number of items, use the corresponding `*_count` attribute, e.g., `data.groups_count`.

In [6]:
# `groups_count` gives the number of groups in `data`.
print(f"Number of groups: {data.groups_count}")

Number of groups: 8


The `groups`, `individuals`, `interventions`, `outputs` and `timecourses` are `pandas.DataFrame` instances, so all the classical pandas operations can be applied on the data.
For instance to access a single `group` use logical indexing by the `group_pk` field.
E.g. to get the group `20` use

In [7]:
# Boolean mask on the `group_pk` column: keep only the rows of group 20.
data.groups[data.groups["group_pk"] == 20]

Unnamed: 0,study_sid,study_name,group_pk,group_name,group_count,group_parent_pk,characteristica_pk,count,measurement_type,choice,substance,value,mean,median,min,max,sd,se,cv,unit
4,PKDB99999,Test1,20,all,6,-1,481,6,species,homo sapiens,,,,,,,,,,
5,PKDB99999,Test1,20,all,6,-1,482,6,healthy,Y,,,,,,,,,,
6,PKDB99999,Test1,20,all,6,-1,483,1,smoking,Y,,,,,,,,,,
7,PKDB99999,Test1,20,all,6,-1,484,5,smoking,N,,,,,,,,,,
8,PKDB99999,Test1,20,all,6,-1,485,6,age,,,,,,25.0,37.0,,,,yr
9,PKDB99999,Test1,20,all,6,-1,486,6,overnight fast,Y,,,,,,,,,,


In the `groups` table, multiple rows can belong to a single group! This is important for understanding the filtering of the data later on. For instance, in this example the information on `species`, `healthy`, `smoking`, `age` and `overnight fast` is stored in separate rows of the `groups` table, but all of these rows belong to a single group.

This becomes clearer when looking at the multi-index table. We now get the group `20` from `groups_mi`, simply using `.loc` to look up the group by its `group_pk`:

In [8]:
# `.loc[20]` selects on the first index level (`group_pk`); the result stays
# indexed by `characteristica_pk`.
data.groups_mi.loc[20]

Unnamed: 0_level_0,study_sid,study_name,group_name,group_count,group_parent_pk,count,measurement_type,choice,substance,value,mean,median,min,max,sd,se,cv,unit
characteristica_pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
481,PKDB99999,Test1,all,6,-1,6,species,homo sapiens,,,,,,,,,,
482,PKDB99999,Test1,all,6,-1,6,healthy,Y,,,,,,,,,,
483,PKDB99999,Test1,all,6,-1,1,smoking,Y,,,,,,,,,,
484,PKDB99999,Test1,all,6,-1,5,smoking,N,,,,,,,,,,
485,PKDB99999,Test1,all,6,-1,6,age,,,,,,25.0,37.0,,,,yr
486,PKDB99999,Test1,all,6,-1,6,overnight fast,Y,,,,,,,,,,


In a similar manner we can explore the other information, i.e. `individuals`, `interventions`, `outputs` and `timecourses`.

In [9]:
# Individuals table with a (`individual_pk`, `characteristica_pk`) multi-index.
data.individuals_mi

Unnamed: 0_level_0,Unnamed: 1_level_0,study_sid,study_name,individual_name,individual_group_pk,count,measurement_type,choice,substance,value,mean,median,min,max,sd,se,cv,unit
individual_pk,characteristica_pk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
39,481,PKDB99999,Test1,WS,20,6,species,homo sapiens,,,,,,,,,,
39,482,PKDB99999,Test1,WS,20,6,healthy,Y,,,,,,,,,,
39,483,PKDB99999,Test1,WS,20,1,smoking,Y,,,,,,,,,,
39,484,PKDB99999,Test1,WS,20,5,smoking,N,,,,,,,,,,
39,486,PKDB99999,Test1,WS,20,6,overnight fast,Y,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,1068,PKDB99997,Test3,H.C.,26,2,abstinence,,cocoa,,,,,,,,,
284,1065,PKDB99997,Test3,F.M.,26,2,species,homo sapiens,,,,,,,,,,
284,1066,PKDB99997,Test3,F.M.,26,2,abstinence,,coffee,,,,,,,,,
284,1067,PKDB99997,Test3,F.M.,26,2,abstinence,,tea,,,,,,,,,


In [10]:
# Interventions table indexed by `intervention_pk`.
data.interventions_mi

Unnamed: 0_level_0,study_sid,study_name,raw_pk,normed,name,route,form,application,time,time_unit,...,substance,value,mean,median,min,max,sd,se,cv,unit
intervention_pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
91,PKDB99999,Test1,-1,False,po75,oral,tablet,single dose,0.0,hr,...,midazolam,7.5,,,,,,,,mg
92,PKDB99999,Test1,-1,False,po15,oral,tablet,single dose,0.0,hr,...,midazolam,15.0,,,,,,,,mg
93,PKDB99999,Test1,91,True,po75,oral,tablet,single dose,0.0,hr,...,midazolam,0.0075,,,,,,,,gram
94,PKDB99999,Test1,92,True,po15,oral,tablet,single dose,0.0,hr,...,midazolam,0.015,,,,,,,,gram
95,PKDB99999,Test1,-1,False,iv,iv,solution,single dose,0.0,hr,...,midazolam,0.075,,,,,,,,mg/kg
96,PKDB99999,Test1,95,True,iv,iv,solution,single dose,0.0,hr,...,midazolam,7.5e-05,,,,,,,,gram / kilogram
97,PKDB99998,Test2,-1,False,paracetamol1000mg_iv,iv,solution,constant infusion,0.0,min,...,paracetamol,1000.0,,,,,,,,mg
98,PKDB99998,Test2,97,True,paracetamol1000mg_iv,iv,solution,constant infusion,0.0,min,...,paracetamol,1.0,,,,,,,,gram
99,PKDB99997,Test3,-1,False,theobromine,oral,,,0.0,hr,...,theobromine,1.0,,,,,,,,g
100,PKDB99997,Test3,99,True,theobromine,oral,,,0.0,hr,...,theobromine,1.0,,,,,,,,gram


In [11]:
# Outputs table with a (`output_pk`, `intervention_pk`, `group_pk`, `individual_pk`)
# multi-index.
data.outputs_mi

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,study_sid,study_name,normed,calculated,tissue,time,time_unit,measurement_type,choice,substance,value,mean,median,min,max,sd,se,cv,unit
output_pk,intervention_pk,group_pk,individual_pk,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1409,96,-1,39,PKDB99999,Test1,False,False,plasma,,,thalf,,midazolam,2.30,,,,,,,,hr
1410,96,-1,39,PKDB99999,Test1,False,False,plasma,,,vd,,midazolam,0.71,,,,,,,,l/kg
1411,96,-1,39,PKDB99999,Test1,False,False,plasma,,,clearance,,midazolam,292.00,,,,,,,,ml/min
1412,96,-1,39,PKDB99999,Test1,False,False,plasma,,,clearance_unbound,,midazolam,5840.00,,,,,,,,ml/min
1413,96,-1,39,PKDB99999,Test1,False,False,plasma,,,plasma_binding,,midazolam,95.00,,,,,,,,percent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,108,27,-1,PKDB99996,Test4,True,True,plasma,,,thalf,,paraxanthine,,9.36,,,,,,,hour
2605,106,27,-1,PKDB99996,Test4,True,True,plasma,,,vd,,paraxanthine,,90.30,,,,,,,liter
2605,108,27,-1,PKDB99996,Test4,True,True,plasma,,,vd,,paraxanthine,,90.30,,,,,,,liter
2606,106,27,-1,PKDB99996,Test4,True,True,plasma,,,tmax,,paraxanthine,,6.00,,,,,,,hour


In [12]:
# Timecourses table with a (`timecourse_pk`, `intervention_pk`, `group_pk`,
# `individual_pk`) multi-index.
data.timecourses_mi

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,study_sid,study_name,normed,tissue,time,time_unit,measurement_type,choice,substance,value,mean,median,min,max,sd,se,cv,unit
timecourse_pk,intervention_pk,group_pk,individual_pk,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
67,96,20,-1,PKDB99999,Test1,False,plasma,"[0.0, 0.17, 0.33, 0.5, 0.75, 1.0, 1.5, 2.0, 3....",hr,concentration,,midazolam,,"[185.0, 144.0, 121.0, 106.0, 83.8, 76.7, 58.3,...",,,,"[23.3, 18.2, 20.6, 26.8, 10.3, 19.9, 9.37, 13....",,,ng/ml
68,94,20,-1,PKDB99999,Test1,False,plasma,"[0.17, 0.33, 0.75, 1.0, 1.5, 2.0, 2.5, 3.0, 4....",hr,concentration,,midazolam,,"[10.1, 132.0, 134.0, 96.7, 96.6, 67.7, 57.5, 4...",,,,"[4.76, 74.8, 30.1, 42.3, 52.9, 25.1, 11.8, 10....",,,ng/ml
69,96,20,-1,PKDB99999,Test1,True,plasma,"[0.0, 0.17, 0.33, 0.5, 0.75, 1.0, 1.5, 2.0, 3....",hr,concentration,,midazolam,,"[0.000185, 0.000144, 0.000121, 0.000106, 8.38e...",,,,"[2.33e-05, 1.82e-05, 2.06e-05, 2.68e-05, 1.03e...","[9.52e-06, 7.42e-06, 8.4e-06, 1.09e-05, 4.19e-...","[0.126, 0.126, 0.171, 0.253, 0.123, 0.259, 0.1...",gram / liter
70,94,20,-1,PKDB99999,Test1,True,plasma,"[0.17, 0.33, 0.75, 1.0, 1.5, 2.0, 2.5, 3.0, 4....",hr,concentration,,midazolam,,"[1.01e-05, 0.000132, 0.000134, 9.67e-05, 9.66e...",,,,"[4.76e-06, 7.49e-05, 3.01e-05, 4.23e-05, 5.29e...","[1.94e-06, 3.06e-05, 1.23e-05, 1.73e-05, 2.16e...","[0.471, 0.566, 0.224, 0.437, 0.548, 0.37, 0.20...",gram / liter
71,98,22,-1,PKDB99998,Test2,False,plasma,"[0.0, 0.125, 0.25, 0.292, 0.333, 0.417, 0.5, 0...",hr,concentration,,paracetamol,,"[0.0, 7.92, 15.1, 15.2, 14.4, 13.7, 12.3, 11.4...",,,,"[nan, 0.872, 1.48, 1.95, 1.81, 2.15, 1.61, 13....",,,mg/L
72,98,22,-1,PKDB99998,Test2,False,plasma,"[0.0, 0.125, 0.25, 0.292, 0.333, 0.417, 0.5, 0...",hr,concentration,,paracetamol glucuronide,,"[0.0, 0.482, 1.1, 2.27, 2.75, 3.99, 5.92, 9.63...",,,,"[nan, 0.894, 0.688, 0.55, 0.894, 0.619, 1.17, ...",,,mg/L
73,98,22,-1,PKDB99998,Test2,False,plasma,"[0.0, 0.125, 0.25, 0.292, 0.333, 0.417, 0.5, 0...",hr,concentration,,paracetamol sulfate,,"[0.0, 0.547, 1.09, 1.55, 2.44, 3.3, 3.9, 4.12,...",,,,"[nan, 0.0911, 0.228, 0.205, 0.592, 0.592, 0.6,...",,,mg/L
74,98,23,-1,PKDB99998,Test2,False,plasma,"[0.0, 0.125, 0.25, 0.292, 0.333, 0.417, 0.5, 0...",hr,concentration,,paracetamol,,"[0.0, 9.64, 14.6, 15.4, 15.5, 13.9, 13.5, 12.7...",,,,"[nan, 0.357, 0.57, 0.5, 1.2, 1.57, 0.97, 1.1, ...",,,mg/L
75,98,23,-1,PKDB99998,Test2,False,plasma,"[0.0, 0.125, 0.25, 0.292, 0.333, 0.417, 0.5, 0...",hr,concentration,,paracetamol glucuronide,,"[0.0, 0.138, 1.03, 1.79, 2.48, 3.37, 4.61, 8.6...",,,,"[nan, 1.03, 0.55, 1.03, 1.24, 1.44, 1.38, 1.03...",,,mg/L
76,98,23,-1,PKDB99998,Test2,False,plasma,"[0.0, 0.125, 0.25, 0.292, 0.333, 0.417, 0.5, 0...",hr,concentration,,paracetamol sulfate,,"[0.0, 0.501, 0.934, 1.62, 2.07, 2.64, 3.23, 4....",,,,"[nan, 0.137, 0.342, 0.183, 0.364, 0.501, 0.592...",,,mg/L
