# This is some boilerplate code for interacting with the Pioneer 100 data source

The documentation is available at http://isbtranslatorapi.adversary.us/

In [1]:
import json
import urllib2

import pandas
import requests

In [2]:
# Root URL of the ISB translator API; endpoint paths are appended to it.
base_url = 'http://isbtranslatorapi.adversary.us'

## Metadata access

How do I find the different things I am interested in?

In [3]:
# The base metadata query takes about a minute because the server is counting a lot of records.
def query_isb(endpoint, data=None, base_url=base_url):
    """POST a query to the ISB translator API and return the decoded JSON reply.

    Parameters
    ----------
    endpoint : str
        Path below the API root, e.g. ``'v1/analyte'``. A leading slash is
        tolerated, so ``'/v1/analyte'`` addresses the same URL.
    data : dict, optional
        Fields sent in the body of the POST request (filters, ``from``,
        ``size``, ...). Defaults to no fields.
    base_url : str, optional
        Root URL of the API. Defaults to the module-level ``base_url``.

    Returns
    -------
    The JSON-decoded response body, as produced by ``Response.json()``.
    """
    # Use a fresh dict per call: a shared `{}` default would be reused across
    # calls and leak state if it were ever mutated.
    if data is None:
        data = {}
    # Join with exactly one slash so that a leading slash on `endpoint` (or a
    # trailing one on `base_url`) does not produce a '//' in the path.
    url = '%s/%s' % (base_url.rstrip('/'), endpoint.lstrip('/'))
    req = requests.post(url, data=data)
    return req.json()

In [4]:
# Pretty-print what the top-level metadata endpoint returns.
print(json.dumps(query_isb('v1/metadata'), indent=2))

[
  {
    "count": 989596, 
    "key": "analytes"
  }, 
  {
    "count": 16553204, 
    "key": "correlation"
  }
]


## Analytes

Analytes are the nodes in the correlation network

In [5]:
# Ask the metadata endpoint which analyte fields exist and pretty-print the reply.
print("These are the queryable fields.")
print(json.dumps(query_isb('v1/metadata/analytes'), indent=2))

These are the queryable fields.
{
  "fields": [
    "type", 
    "phylum", 
    "vendor", 
    "pubmed_id", 
    "hmdb", 
    "hgnc_id", 
    "gene_location", 
    "_id", 
    "aggregation_level", 
    "description", 
    "uniprot", 
    "tissue_id", 
    "tissue_description", 
    "sub_pathway", 
    "order", 
    "subsystem", 
    "pubchem", 
    "tissue", 
    "nsamples", 
    "health_status", 
    "name", 
    "abbreviation", 
    "kegg", 
    "kingdom", 
    "super_pathway", 
    "study_title", 
    "class", 
    "source_project", 
    "measurement_technology", 
    "family", 
    "category", 
    "peptide"
  ], 
  "Unique Analytes": 989596
}


In [6]:
# Show the values stored in one analyte field ('category' here).
# 'from' and 'size' are the pagination fields sent with the query.
print("You can view see what values are associated with a field")
print("Note most queries are paginated using from and size.")

print(json.dumps(
    query_isb('v1/metadata/analytes/category', data={'from': 0, 'size': 20}),
    indent=2))


You can view see what values are associated with a field
Note most queries are paginated using from and size.
{
  "count": 6, 
  "category": [
    "Proteomics", 
    "Metabolites", 
    "Genetics", 
    "Clinical Labs", 
    "gene expression", 
    "Microbiome"
  ]
}


### The categories associated with the Pioneer 100 dataset are Proteomics, Metabolites, Genetics, Clinical Labs, and Microbiome.

## You can query for an analyte using the `v1/analyte` endpoint.

In [None]:
# Query the analyte endpoint without any filter and pretty-print the reply.
print(json.dumps(query_isb('v1/analyte'), indent=2))

## Some example analytes from each category

In [11]:
# Request two analytes (size=2) for each Pioneer 100 category and print them
# under a small text header.
p100_categories = ["Proteomics", "Metabolites", "Genetics", "Clinical Labs", "Microbiome"]

for category in p100_categories:
    print("Category: %s" % category)
    print("=" * 20)
    print(json.dumps(
        query_isb('v1/analyte', data={'category': category, 'size': 2}),
        indent=2))
    print("")

Category: Proteomics
[
  {
    "category": "Proteomics", 
    "subsystem": "brain", 
    "vendor": "ISB", 
    "description": "Extracellular matrix and cell adhesion protein that plays a role in nervous system development and in synaptic plasticity. Both soluble and membranous forms promote neurite outgrowth of cerebellar and hippocampal neurons and suppress neuronal cell death. Plays a role in neuronal positioning of pyramidal neurons and in regulation of both the number of interneurons and the efficacy of GABAergic synapses. May play a role in regulating cell migration in nerve regeneration and cortical development. Potentiates integrin-dep [...] (1224 aa)", 
    "measurement_technology": "mass spectometry", 
    "abbreviation": "CHL1", 
    "uniprot": "O00533", 
    "peptide": "VIAVNEVGR", 
    "_id": "PROTE.None.brain.CHL1.VIAVNEVGR", 
    "name": "cell adhesion molecule with homology to L1CAM (close homolog of L1)"
  }, 
  {
    "category": "Proteomics", 
    "subsystem": "brain",

## You can use a comma-separated list to query for multiple values in a field

In [13]:
# Two category values in one comma-separated filter.
print(json.dumps(
    query_isb('v1/analyte', data={'category': 'Proteomics,Metabolites'}),
    indent=2))

[
  {
    "category": "Metabolites", 
    "kegg": "c00041", 
    "vendor": "Metabolon", 
    "name": "alanine", 
    "measurement_technology": "lc/ms-neg", 
    "super_pathway": "amino-acid", 
    "pubchem": "5950", 
    "hmdb": "hmdb00161", 
    "_id": "METAB.None.amino-acid.alanine-and-aspartate-metabolism.alanine", 
    "sub_pathway": "alanine-and-aspartate-metabolism"
  }, 
  {
    "category": "Metabolites", 
    "kegg": "c00152", 
    "vendor": "Metabolon", 
    "name": "asparagine", 
    "measurement_technology": "lc/ms-polar", 
    "super_pathway": "amino-acid", 
    "pubchem": "6267", 
    "hmdb": "hmdb00168", 
    "_id": "METAB.None.amino-acid.alanine-and-aspartate-metabolism.asparagine", 
    "sub_pathway": "alanine-and-aspartate-metabolism"
  }, 
  {
    "category": "Metabolites", 
    "kegg": "c02847", 
    "vendor": "Metabolon", 
    "name": "n-acetylalanine", 
    "measurement_technology": "lc/ms-pos", 
    "super_pathway": "amino-acid", 
    "pubchem": "88064", 
    "hmd

## Multiple options are treated as AND operations

In [16]:
# Filter on two different fields at once (super_pathway and sub_pathway).
print(json.dumps(
    query_isb('v1/analyte',
              data={'super_pathway': 'amino-acid', "sub_pathway": "glutamate-metabolism"}),
    indent=2))

[
  {
    "category": "Metabolites", 
    "kegg": "c00025", 
    "vendor": "Metabolon", 
    "name": "glutamate", 
    "measurement_technology": "lc/ms-pos", 
    "super_pathway": "amino-acid", 
    "pubchem": "611", 
    "hmdb": "hmdb00148", 
    "_id": "METAB.None.amino-acid.glutamate-metabolism.glutamate", 
    "sub_pathway": "glutamate-metabolism"
  }, 
  {
    "category": "Metabolites", 
    "kegg": "c00064", 
    "vendor": "Metabolon", 
    "name": "glutamine", 
    "measurement_technology": "lc/ms-neg", 
    "super_pathway": "amino-acid", 
    "pubchem": "5961", 
    "hmdb": "hmdb00641", 
    "_id": "METAB.None.amino-acid.glutamate-metabolism.glutamine", 
    "sub_pathway": "glutamate-metabolism"
  }, 
  {
    "category": "Metabolites", 
    "kegg": "c12270", 
    "vendor": "Metabolon", 
    "name": "n-acetyl-aspartyl-glutamate-(naag)", 
    "measurement_technology": "lc/ms-pos", 
    "super_pathway": "amino-acid", 
    "pubchem": "5255", 
    "hmdb": "hmdb01067", 
    "_id": "M

The `_id` field is what you then use to query the correlations.

# Correlations

In [17]:
# Keep the glutamate-metabolism metabolites around: their `_id` values are used
# below to query correlations.
my_analytes = query_isb(
    'v1/analyte',
    data={'category': 'Metabolites', "sub_pathway": "glutamate-metabolism"})
print(json.dumps(my_analytes, indent=2))

[
  {
    "category": "Metabolites", 
    "kegg": "c00025", 
    "vendor": "Metabolon", 
    "name": "glutamate", 
    "measurement_technology": "lc/ms-pos", 
    "super_pathway": "amino-acid", 
    "pubchem": "611", 
    "hmdb": "hmdb00148", 
    "_id": "METAB.None.amino-acid.glutamate-metabolism.glutamate", 
    "sub_pathway": "glutamate-metabolism"
  }, 
  {
    "category": "Metabolites", 
    "kegg": "c00064", 
    "vendor": "Metabolon", 
    "name": "glutamine", 
    "measurement_technology": "lc/ms-neg", 
    "super_pathway": "amino-acid", 
    "pubchem": "5961", 
    "hmdb": "hmdb00641", 
    "_id": "METAB.None.amino-acid.glutamate-metabolism.glutamine", 
    "sub_pathway": "glutamate-metabolism"
  }, 
  {
    "category": "Metabolites", 
    "kegg": "c12270", 
    "vendor": "Metabolon", 
    "name": "n-acetyl-aspartyl-glutamate-(naag)", 
    "measurement_technology": "lc/ms-pos", 
    "super_pathway": "amino-acid", 
    "pubchem": "5255", 
    "hmdb": "hmdb01067", 
    "_id": "M

In [18]:
# Join the `_id` of every analyte in `my_analytes` into one comma-separated
# string and request up to 50 correlations for those ids.
ids = ','.join(analyte['_id'] for analyte in my_analytes)
print(json.dumps(
    query_isb('v1/correlation', data={'ids1': ids, 'size': 50}),
    indent=2))

[
  {
    "coefficient": 0.174321453409167, 
    "description": "mean value, age and sex adjusted", 
    "study": "Hundred Person Wellness Project", 
    "_id_1": "METAB.None.amino-acid.alanine-and-aspartate-metabolism.alanine", 
    "_id_2": "METAB.None.amino-acid.glutamate-metabolism.glutamate", 
    "bh_adjusted_pvalue": 0.380768641466359, 
    "test": "SPEARMAN", 
    "pvalue": 0.071174338089057
  }, 
  {
    "coefficient": 0.0314956879203117, 
    "description": "mean value, age and sex adjusted", 
    "study": "Hundred Person Wellness Project", 
    "_id_1": "METAB.None.amino-acid.alanine-and-aspartate-metabolism.alanine", 
    "_id_2": "METAB.None.amino-acid.glutamate-metabolism.glutamine", 
    "bh_adjusted_pvalue": 0.927871281472151, 
    "test": "SPEARMAN", 
    "pvalue": 0.746252687001929
  }, 
  {
    "coefficient": -0.0951822953880743, 
    "description": "mean value, age and sex adjusted", 
    "study": "Hundred Person Wellness Project", 
    "_id_1": "METAB.None.amino-ac

In [19]:
# Same query, but also send bh_adjusted_pvalue=0.01 so that the weak
# (high adjusted p-value) correlations are left out.
ids = ','.join(analyte['_id'] for analyte in my_analytes)
print(json.dumps(
    query_isb('v1/correlation', data={'ids1': ids, 'bh_adjusted_pvalue': 0.01}),
    indent=2))

[
  {
    "coefficient": 0.353101450932198, 
    "description": "mean value, age and sex adjusted", 
    "study": "Hundred Person Wellness Project", 
    "_id_1": "METAB.None.amino-acid.alanine-and-aspartate-metabolism.alanine", 
    "_id_2": "METAB.None.amino-acid.glutamate-metabolism.n-acetylglutamate", 
    "bh_adjusted_pvalue": 0.00908972639444744, 
    "test": "SPEARMAN", 
    "pvalue": 0.000178017705823694
  }, 
  {
    "coefficient": 0.441712355846865, 
    "description": "mean value, age and sex adjusted", 
    "study": "Hundred Person Wellness Project", 
    "_id_1": "METAB.None.amino-acid.alanine-and-aspartate-metabolism.asparagine", 
    "_id_2": "METAB.None.amino-acid.glutamate-metabolism.glutamine", 
    "bh_adjusted_pvalue": 0.000202525760180394, 
    "test": "SPEARMAN", 
    "pvalue": 1.70569104408648e-06
  }, 
  {
    "coefficient": -0.38943668009946, 
    "description": "mean value, age and sex adjusted", 
    "study": "Hundred Person Wellness Project", 
    "_id_1": "

In [21]:
# Next page of the bh_adjusted_pvalue-filtered correlations: start at record 10
# and ask for 10 records.
ids = ','.join(analyte['_id'] for analyte in my_analytes)
print(json.dumps(
    query_isb('v1/correlation',
              data={'ids1': ids, 'bh_adjusted_pvalue': 0.01, 'from': 10, 'size': 10}),
    indent=2))

[
  {
    "coefficient": -0.369891889650906, 
    "description": "mean value, age and sex adjusted", 
    "study": "Hundred Person Wellness Project", 
    "_id_1": "METAB.None.amino-acid.glutamate-metabolism.glutamate", 
    "_id_2": "CHEMS.None.Genova.hippuric_acid", 
    "bh_adjusted_pvalue": 0.00725628805829692, 
    "test": "SPEARMAN", 
    "pvalue": 8.14856199971786e-05
  }, 
  {
    "coefficient": 0.553218557133643, 
    "description": "mean value, age and sex adjusted", 
    "study": "Hundred Person Wellness Project", 
    "_id_1": "METAB.None.amino-acid.glutamate-metabolism.glutamate", 
    "_id_2": "CHEMS.None.Genova.homa_ir", 
    "bh_adjusted_pvalue": 3.49060856537096e-07, 
    "test": "SPEARMAN", 
    "pvalue": 5.33388124498582e-10
  }, 
  {
    "coefficient": 0.429884781705221, 
    "description": "mean value, age and sex adjusted", 
    "study": "Hundred Person Wellness Project", 
    "_id_1": "METAB.None.amino-acid.glutamate-metabolism.glutamate", 
    "_id_2": "CHEMS.No

In [22]:
# Leptin looks interesting: fetch that single analyte by putting its `_id`
# directly in the endpoint path.

print(json.dumps(query_isb('v1/analyte/CHEMS.None.Genova.leptin'), indent=2))

[
  {
    "category": "Clinical Labs", 
    "_id": "CHEMS.None.Genova.leptin", 
    "vendor": "Genova", 
    "name": "leptin"
  }
]


In [23]:
# Page through every bh_adjusted_pvalue-filtered correlation for `ids`, keeping
# the raw records in `correlations` and the ids seen on either side of each
# correlation in `sigs`.

sigs = []
correlations = []

frm = 0
size = 1000
while True:
    res = query_isb('v1/correlation',
                    data={'ids1': ids, 'bh_adjusted_pvalue': .01, 'from': frm, 'size': size})
    correlations += res
    # An empty page means the previous page was the last one.
    if len(res) == 0:
        break
    print("Saving records from %i to %i" % (frm, frm + size))
    sigs.extend(x['_id_1'] for x in res)
    sigs.extend(x['_id_2'] for x in res)
    frm += size

Saving records from 0 to 1000


In [25]:
# Collapse repeated ids (going through a set) and report how many distinct
# ids remain.
sigs = list(set(sigs))
print(len(sigs))

100


In [26]:
# Fetch the analyte records for every id in `sigs` by paging through a single
# query that carries the whole id list.
# Note: this relies on the pagination; it would be smarter to partition the
# *sig_ids* set, which would greatly speed up the query.
sig_ids = ','.join(sigs)

size = 1000
frm = 0
meta = []
res = query_isb('/v1/analyte', data={'ids': sig_ids, 'from': frm, 'size': size})
meta.extend(res)
# Keep requesting the following page until one comes back empty.
while res:
    frm = frm + size
    print("Saving records from %i to %i" % (frm, frm + size))
    res = query_isb('/v1/analyte', data={'ids': sig_ids, 'from': frm, 'size': size})
    meta.extend(res)
    

Saving records from 1000 to 2000


In [27]:
# NOTE(review): this cell repeats the work of the previous cell; running it
# simply rebuilds `meta` from scratch.
# It relies on the pagination; it would be smarter to partition the *sig_ids*
# set, which would greatly speed up the query.
sig_ids = ','.join(sigs)

frm, size = 0, 1000
meta = []
while True:
    res = query_isb('/v1/analyte', data={'ids': sig_ids, 'from': frm, 'size': size})
    meta += res
    # Stop once a page comes back empty.
    if len(res) == 0:
        break
    frm += size
    print("Saving records from %i to %i" % (frm, frm + size))
    

Saving records from 1000 to 2000


In [28]:
# Number of analyte records collected in `meta`.
len(meta)

100

In [29]:
# Faster way of doing the same as above: split the id list into chunks of
# `size` ids and send one query per chunk, instead of paging through one query
# that carries every id.
sig_list = sigs
sig_ids = ','.join(sigs)

frm = 0
size = 1000
res = []

# Walk the list chunk by chunk. range() stops at the end of the list, so no
# request is ever sent with an empty 'ids' filter (the old loop always issued
# one final query with ids='' after running off the end of the list), and the
# message below now describes the chunk that is actually being fetched.
for frm in range(0, len(sig_list), size):
    sub_sig = sig_list[frm:frm + size]
    print("Saving records from %i to %i" % (frm, frm + size))
    res += query_isb('/v1/analyte', data={'ids': ','.join(sub_sig), 'size': size})

Saving records from 1000 to 2000


# Pandas 4 life

In [30]:
# Load the collected correlation records into a DataFrame and preview the
# first rows.
corr_net = pandas.DataFrame(correlations)
corr_net.head()

Unnamed: 0,_id_1,_id_2,bh_adjusted_pvalue,coefficient,description,pvalue,study,test
0,METAB.None.amino-acid.alanine-and-aspartate-me...,METAB.None.amino-acid.glutamate-metabolism.n-a...,0.00909,0.353101,"mean value, age and sex adjusted",0.000178,Hundred Person Wellness Project,SPEARMAN
1,METAB.None.amino-acid.alanine-and-aspartate-me...,METAB.None.amino-acid.glutamate-metabolism.glu...,0.000203,0.441712,"mean value, age and sex adjusted",2e-06,Hundred Person Wellness Project,SPEARMAN
2,METAB.None.amino-acid.creatine-metabolism.crea...,METAB.None.amino-acid.glutamate-metabolism.pyr...,0.002303,-0.389437,"mean value, age and sex adjusted",3.1e-05,Hundred Person Wellness Project,SPEARMAN
3,METAB.None.amino-acid.glutamate-metabolism.glu...,CHEMS.None.AGES.aminoadipic acid,0.005164,0.399905,"mean value, age and sex adjusted",4.9e-05,Hundred Person Wellness Project,SPEARMAN
4,METAB.None.amino-acid.glutamate-metabolism.glu...,CHEMS.None.Genova.adiponectin,0.00206,-0.40371,"mean value, age and sex adjusted",1.5e-05,Hundred Person Wellness Project,SPEARMAN


In [31]:
# Build a DataFrame from `res` (whatever the preceding chunked analyte lookup
# left in that variable) and preview the first rows.
metadata = pandas.DataFrame(res)
metadata.head()

Unnamed: 0,_id,abbreviation,category,description,hmdb,kegg,measurement_technology,name,peptide,pubchem,sub_pathway,subsystem,super_pathway,uniprot,vendor
0,CHEMS.None.AGES.aminoadipic acid,,Clinical Labs,,,,,aminoadipic acid,,,,,,,AGES
1,CHEMS.None.Genova.adiponectin,,Clinical Labs,,,,,adiponectin,,,,,,,Genova
2,CHEMS.None.Genova.asparagine_plasma,,Clinical Labs,,HMDB00168,,,asparagine,,,,,,,Genova
3,CHEMS.None.Genova.body_mass_index,,Clinical Labs,,,,,body_mass_index,,,,,,,Genova
4,CHEMS.None.Genova.c_peptide,,Clinical Labs,,,,,c_peptide,,,,,,,Genova
