# This is some boilerplate code for interacting with the TCGA/GTEx data source

The documentation is available at http://http://isbtranslatorapi.adversary.us/

In [1]:
import urllib2
import json
import requests

In [2]:
base_url = 'http://isbtranslatorapi.adversary.us'

## Metadata access

How do I find the different things I am interested in?

In [3]:
# The base metadata takes a minute.  It is counting a lot of stuff
def query_isb(endpoint, data={}, base_url=base_url):
    req = requests.post('%s/%s' % (base_url,endpoint), data=data)
    return req.json()

In [4]:
print json.dumps( query_isb('v1/metadata'), indent=2)

[
  {
    "count": 989596, 
    "key": "analytes"
  }, 
  {
    "count": 16553204, 
    "key": "correlation"
  }
]


## Analytes

Analytes are the nodes in the correlation network

In [5]:
print "These are the queryable fields."
print json.dumps( query_isb('v1/metadata/analytes'), indent=2)

These are the queryable fields.
{
  "fields": [
    "type", 
    "phylum", 
    "vendor", 
    "pubmed_id", 
    "hmdb", 
    "hgnc_id", 
    "gene_location", 
    "_id", 
    "aggregation_level", 
    "description", 
    "uniprot", 
    "tissue_id", 
    "tissue_description", 
    "sub_pathway", 
    "order", 
    "subsystem", 
    "pubchem", 
    "tissue", 
    "nsamples", 
    "health_status", 
    "name", 
    "abbreviation", 
    "kegg", 
    "kingdom", 
    "super_pathway", 
    "study_title", 
    "class", 
    "source_project", 
    "measurement_technology", 
    "family", 
    "category", 
    "peptide"
  ], 
  "Unique Analytes": 989596
}


In [6]:
print "You can view see what values are associated with a field"
print "Note most queries are paginated using from and size."

print json.dumps( query_isb('v1/metadata/analytes/abbreviation', data={'from':0, 'size':20}), indent=2)


You can view see what values are associated with a field
Note most queries are paginated using from and size.
{
  "count": 18385, 
  "abbreviation": [
    "CHPF2", 
    "PARK7", 
    "RETN", 
    "PITX2", 
    "ZNF500", 
    "CASQ1", 
    "MMAA", 
    "WDR37", 
    "EVX1", 
    "ANKRD52", 
    "MRPL10", 
    "ADAT2", 
    "ZFYVE21", 
    "UBE2G2", 
    "APOBEC3H", 
    "FUZ", 
    "PRKAR2A", 
    "DSCR9", 
    "SLC5A4", 
    "SIGLEC1"
  ]
}


## You can query for an analyte using the `v1/analyte` endpoint.

In [7]:
print json.dumps( query_isb('v1/analyte', data={'abbreviation':'CHPF2'}), indent=2)

[
  {
    "category": "gene expression", 
    "nsamples": "577", 
    "name": "chondroitin polymerizing factor 2", 
    "gene_location": "7q36.1", 
    "source_project": "GTEx", 
    "hgnc_id": "HGNC:29270", 
    "abbreviation": "CHPF2", 
    "health_status": "Healthy", 
    "tissue": "Adipose Tissue", 
    "tissue_id": "GTEx - Adipose Tissue", 
    "_id": "gtex.adipose-tissue.CHPF2", 
    "tissue_description": "Adipose Tissue healthy tissue"
  }, 
  {
    "category": "gene expression", 
    "nsamples": "145", 
    "name": "chondroitin polymerizing factor 2", 
    "gene_location": "7q36.1", 
    "source_project": "GTEx", 
    "hgnc_id": "HGNC:29270", 
    "abbreviation": "CHPF2", 
    "health_status": "Healthy", 
    "tissue": "Adrenal Gland", 
    "tissue_id": "GTEx - Adrenal Gland", 
    "_id": "gtex.adrenal-gland.CHPF2", 
    "tissue_description": "Adrenal Gland healthy tissue"
  }, 
  {
    "category": "gene expression", 
    "nsamples": "689", 
    "name": "chondroitin polymerizin

You can use a comma separated list to query for multiple genes

In [8]:
print json.dumps( query_isb('v1/analyte', data={'abbreviation':'CHPF2,BRCA2'}), indent=2)

[
  {
    "category": "gene expression", 
    "nsamples": "577", 
    "name": "BRCA2, DNA repair associated", 
    "gene_location": "13q13.1", 
    "source_project": "GTEx", 
    "hgnc_id": "HGNC:1101", 
    "abbreviation": "BRCA2", 
    "health_status": "Healthy", 
    "tissue": "Adipose Tissue", 
    "tissue_id": "GTEx - Adipose Tissue", 
    "_id": "gtex.adipose-tissue.BRCA2", 
    "tissue_description": "Adipose Tissue healthy tissue"
  }, 
  {
    "category": "gene expression", 
    "nsamples": "577", 
    "name": "chondroitin polymerizing factor 2", 
    "gene_location": "7q36.1", 
    "source_project": "GTEx", 
    "hgnc_id": "HGNC:29270", 
    "abbreviation": "CHPF2", 
    "health_status": "Healthy", 
    "tissue": "Adipose Tissue", 
    "tissue_id": "GTEx - Adipose Tissue", 
    "_id": "gtex.adipose-tissue.CHPF2", 
    "tissue_description": "Adipose Tissue healthy tissue"
  }, 
  {
    "category": "gene expression", 
    "nsamples": "145", 
    "name": "BRCA2, DNA repair associ

In [9]:
print json.dumps( query_isb('v1/analyte', data={'abbreviation':'CHPF2,BRCA2', "tissue":"Brain"}), indent=2)

[
  {
    "category": "gene expression", 
    "nsamples": "1259", 
    "name": "BRCA2, DNA repair associated", 
    "gene_location": "13q13.1", 
    "source_project": "GTEx", 
    "hgnc_id": "HGNC:1101", 
    "abbreviation": "BRCA2", 
    "health_status": "Healthy", 
    "tissue": "Brain", 
    "tissue_id": "GTEx - Brain", 
    "_id": "gtex.brain.BRCA2", 
    "tissue_description": "Brain healthy tissue"
  }, 
  {
    "category": "gene expression", 
    "nsamples": "1259", 
    "name": "chondroitin polymerizing factor 2", 
    "gene_location": "7q36.1", 
    "source_project": "GTEx", 
    "hgnc_id": "HGNC:29270", 
    "abbreviation": "CHPF2", 
    "health_status": "Healthy", 
    "tissue": "Brain", 
    "tissue_id": "GTEx - Brain", 
    "_id": "gtex.brain.CHPF2", 
    "tissue_description": "Brain healthy tissue"
  }, 
  {
    "category": "gene expression", 
    "nsamples": "120", 
    "name": "BRCA2, DNA repair associated", 
    "gene_location": "13q13.1", 
    "source_project": "TCGA",

The `_id` field is what you then use to query the correlations

# Correlations

In [11]:
my_analytes = query_isb('v1/analyte', data={'abbreviation':'CHPF2,BRCA2', "tissue":"Brain", 'source_project':'GTEx'})
print json.dumps(my_analytes, indent=2)

[
  {
    "category": "gene expression", 
    "nsamples": "1259", 
    "name": "BRCA2, DNA repair associated", 
    "gene_location": "13q13.1", 
    "source_project": "GTEx", 
    "hgnc_id": "HGNC:1101", 
    "abbreviation": "BRCA2", 
    "health_status": "Healthy", 
    "tissue": "Brain", 
    "tissue_id": "GTEx - Brain", 
    "_id": "gtex.brain.BRCA2", 
    "tissue_description": "Brain healthy tissue"
  }, 
  {
    "category": "gene expression", 
    "nsamples": "1259", 
    "name": "chondroitin polymerizing factor 2", 
    "gene_location": "7q36.1", 
    "source_project": "GTEx", 
    "hgnc_id": "HGNC:29270", 
    "abbreviation": "CHPF2", 
    "health_status": "Healthy", 
    "tissue": "Brain", 
    "tissue_id": "GTEx - Brain", 
    "_id": "gtex.brain.CHPF2", 
    "tissue_description": "Brain healthy tissue"
  }
]


In [12]:
# join together some interesting ids and look for correlations
ids = ','.join([analyte['_id'] for analyte in my_analytes])
print json.dumps( query_isb('v1/correlation', data={'ids1':ids, 'size':50}), indent=2)

[
  {
    "coefficient": 0.138, 
    "description": "tissue/condition specific gene expression", 
    "study": "GTEx", 
    "_id_1": "gtex.brain.A1BG", 
    "_id_2": "gtex.brain.BRCA2", 
    "bh_adjusted_pvalue": null, 
    "test": "SPEARMAN", 
    "pvalue": 8.637e-07
  }, 
  {
    "coefficient": 0.163, 
    "description": "tissue/condition specific gene expression", 
    "study": "GTEx", 
    "_id_1": "gtex.brain.A1CF", 
    "_id_2": "gtex.brain.BRCA2", 
    "bh_adjusted_pvalue": null, 
    "test": "SPEARMAN", 
    "pvalue": 6.31e-09
  }, 
  {
    "coefficient": 0.153, 
    "description": "tissue/condition specific gene expression", 
    "study": "GTEx", 
    "_id_1": "gtex.brain.A2M", 
    "_id_2": "gtex.brain.BRCA2", 
    "bh_adjusted_pvalue": null, 
    "test": "SPEARMAN", 
    "pvalue": 4.654e-08
  }, 
  {
    "coefficient": 0.025, 
    "description": "tissue/condition specific gene expression", 
    "study": "GTEx", 
    "_id_1": "gtex.brain.A2ML1", 
    "_id_2": "gtex.brain.BRCA

In [13]:
# same thing but drop the crumby pvalues (Bonf sig)
ids = ','.join([analyte['_id'] for analyte in my_analytes])
print json.dumps( query_isb('v1/correlation', data={'ids1':ids, 'pvalue': .05/18385}), indent=2)

[
  {
    "coefficient": 0.138, 
    "description": "tissue/condition specific gene expression", 
    "study": "GTEx", 
    "_id_1": "gtex.brain.A1BG", 
    "_id_2": "gtex.brain.BRCA2", 
    "bh_adjusted_pvalue": null, 
    "test": "SPEARMAN", 
    "pvalue": 8.637e-07
  }, 
  {
    "coefficient": 0.163, 
    "description": "tissue/condition specific gene expression", 
    "study": "GTEx", 
    "_id_1": "gtex.brain.A1CF", 
    "_id_2": "gtex.brain.BRCA2", 
    "bh_adjusted_pvalue": null, 
    "test": "SPEARMAN", 
    "pvalue": 6.31e-09
  }, 
  {
    "coefficient": 0.153, 
    "description": "tissue/condition specific gene expression", 
    "study": "GTEx", 
    "_id_1": "gtex.brain.A2M", 
    "_id_2": "gtex.brain.BRCA2", 
    "bh_adjusted_pvalue": null, 
    "test": "SPEARMAN", 
    "pvalue": 4.654e-08
  }, 
  {
    "coefficient": 0.138, 
    "description": "tissue/condition specific gene expression", 
    "study": "GTEx", 
    "_id_1": "gtex.brain.A4GNT", 
    "_id_2": "gtex.brain.BRCA

In [14]:
# next page of bonf sig correlations
ids = ','.join([analyte['_id'] for analyte in my_analytes])
print json.dumps( query_isb('v1/correlation', data={'ids1':ids, 'pvalue': .05/18385, 'from':10, 'size':10}), indent=2)

[
  {
    "coefficient": 0.17, 
    "description": "tissue/condition specific gene expression", 
    "study": "GTEx", 
    "_id_1": "gtex.brain.AARS2", 
    "_id_2": "gtex.brain.BRCA2", 
    "bh_adjusted_pvalue": null, 
    "test": "SPEARMAN", 
    "pvalue": 1.265e-09
  }, 
  {
    "coefficient": 0.171, 
    "description": "tissue/condition specific gene expression", 
    "study": "GTEx", 
    "_id_1": "gtex.brain.AARSD1", 
    "_id_2": "gtex.brain.BRCA2", 
    "bh_adjusted_pvalue": null, 
    "test": "SPEARMAN", 
    "pvalue": 1.072e-09
  }, 
  {
    "coefficient": 0.293, 
    "description": "tissue/condition specific gene expression", 
    "study": "GTEx", 
    "_id_1": "gtex.brain.AASDH", 
    "_id_2": "gtex.brain.BRCA2", 
    "bh_adjusted_pvalue": null, 
    "test": "SPEARMAN", 
    "pvalue": 2.781e-26
  }, 
  {
    "coefficient": 0.203, 
    "description": "tissue/condition specific gene expression", 
    "study": "GTEx", 
    "_id_1": "gtex.brain.AASDHPPT", 
    "_id_2": "gtex.br

In [15]:
# hmm abca10 looks interesting and I would like to learn more about it

print json.dumps(query_isb('v1/analyte/gtex.brain.ABCA10'), indent=2)

[
  {
    "category": "gene expression", 
    "nsamples": "1259", 
    "name": "ATP binding cassette subfamily A member 10", 
    "gene_location": "17q24.3", 
    "source_project": "GTEx", 
    "hgnc_id": "HGNC:30", 
    "abbreviation": "ABCA10", 
    "health_status": "Healthy", 
    "tissue": "Brain", 
    "tissue_id": "GTEx - Brain", 
    "_id": "gtex.brain.ABCA10", 
    "tissue_description": "Brain healthy tissue"
  }
]


In [18]:
# in fact I would like to learn about all of the bsig genes

sigs = []

frm = 0
size=1000
res = query_isb('v1/correlation', data={'ids1':ids, 'pvalue': .01/18385, 'from':frm, 'size':size})
correlations = res[:]
while len(res) > 0:
    print "Saving records from %i to %i" %(frm, frm+size)
    sigs += [x['_id_1'] for x in res]
    sigs += [x['_id_2'] for x in res]
    frm += size
    res =  query_isb('v1/correlation', data={'ids1':ids, 'pvalue': .01/18385, 'from':frm, 'size':size})
    correlations += res

Saving records from 0 to 1000
Saving records from 1000 to 2000
Saving records from 2000 to 3000
Saving records from 3000 to 4000
Saving records from 4000 to 5000
Saving records from 5000 to 6000
Saving records from 6000 to 7000
Saving records from 7000 to 8000
Saving records from 8000 to 9000
Saving records from 9000 to 10000


In [22]:
sigs = list(set(sigs))
print len(sigs)

9940


In [26]:
sig_ids = ','.join(sigs)

frm = 0
size = 1000
meta = []
res = query_isb('/v1/analyte', data={'ids':sig_ids, 'from':frm, 'size':size})
meta += res
# Note: this is relying on the pagination, it would be smarter to just partition
# the *sig_ids* set which would greatly speed up the query
while len(res) > 0:
    frm += size
    print "Saving records from %i to %i" %(frm, frm+size)
    res = query_isb('/v1/analyte', data={'ids':sig_ids, 'from':frm, 'size':size})
    meta+=res
    

Saving records from 1000 to 2000
Saving records from 2000 to 3000
Saving records from 3000 to 4000
Saving records from 4000 to 5000
Saving records from 5000 to 6000
Saving records from 6000 to 7000
Saving records from 7000 to 8000
Saving records from 8000 to 9000
Saving records from 9000 to 10000
Saving records from 10000 to 11000


In [28]:
len(meta)

9940

In [30]:
#faster less dumb way of doing the same as above
sig_list = sigs
sig_ids = ','.join(sigs)

frm = 0
size = 1000
res = []
sub_sig = sig_list[frm:frm+size]
res += query_isb('/v1/analyte', data={'ids':','.join(sub_sig), 'size':size})

while len(sub_sig) > 0:
    frm += size
    sub_sig = sig_list[frm:frm+size]
    print "Saving records from %i to %i" %(frm, frm+size)
    res += query_isb('/v1/analyte', data={'ids':','.join(sub_sig), 'size':size})

Saving records from 1000 to 2000
Saving records from 2000 to 3000
Saving records from 3000 to 4000
Saving records from 4000 to 5000
Saving records from 5000 to 6000
Saving records from 6000 to 7000
Saving records from 7000 to 8000
Saving records from 8000 to 9000
Saving records from 9000 to 10000
Saving records from 10000 to 11000


# Pandas 4 life

In [31]:
import pandas

corr_net = pandas.DataFrame(correlations)
corr_net.head()

Unnamed: 0,_id_1,_id_2,bh_adjusted_pvalue,coefficient,description,pvalue,study,test
0,gtex.brain.A1CF,gtex.brain.BRCA2,,0.163,tissue/condition specific gene expression,6.31e-09,GTEx,SPEARMAN
1,gtex.brain.A2M,gtex.brain.BRCA2,,0.153,tissue/condition specific gene expression,4.654e-08,GTEx,SPEARMAN
2,gtex.brain.AAAS,gtex.brain.BRCA2,,0.233,tissue/condition specific gene expression,5.877e-17,GTEx,SPEARMAN
3,gtex.brain.AAGAB,gtex.brain.BRCA2,,0.173,tissue/condition specific gene expression,6.602e-10,GTEx,SPEARMAN
4,gtex.brain.AAK1,gtex.brain.BRCA2,,0.152,tissue/condition specific gene expression,6.084e-08,GTEx,SPEARMAN


In [32]:
metadata = pandas.DataFrame(res)
metadata.head()

Unnamed: 0,_id,abbreviation,category,gene_location,health_status,hgnc_id,name,nsamples,source_project,tissue,tissue_description,tissue_id
0,gtex.brain.AAAS,AAAS,gene expression,12q13.13,Healthy,HGNC:13666,aladin WD repeat nucleoporin,1259,GTEx,Brain,Brain healthy tissue,GTEx - Brain
1,gtex.brain.AARS2,AARS2,gene expression,6p21.1,Healthy,HGNC:21022,"alanyl-tRNA synthetase 2, mitochondrial",1259,GTEx,Brain,Brain healthy tissue,GTEx - Brain
2,gtex.brain.ABCA2,ABCA2,gene expression,9q34.3,Healthy,HGNC:32,ATP binding cassette subfamily A member 2,1259,GTEx,Brain,Brain healthy tissue,GTEx - Brain
3,gtex.brain.ABCA5,ABCA5,gene expression,17q24.3,Healthy,HGNC:35,ATP binding cassette subfamily A member 5,1259,GTEx,Brain,Brain healthy tissue,GTEx - Brain
4,gtex.brain.ABCA6,ABCA6,gene expression,17q24.2-q24.3,Healthy,HGNC:36,ATP binding cassette subfamily A member 6,1259,GTEx,Brain,Brain healthy tissue,GTEx - Brain
