In [1]:
# import urllib2
import requests
import json
import requests
import csv
import pandas

# Small example

Big GIM (Gene Interaction Miner) is a Translator Knowledge Source that contains function interaction data for all pairs of genes. Functional interaction data are available from four different sources: 

1. tissue-specific gene expression correlations from healthy tissue samples (GTEx), 
2. tissue-specific gene expression correlations from cancer samples (TCGA), 
3. tissue-specific probabilities of function interaction (GIANT), and 
4. direct interactions (BioGRID). 
   
The data is stored as a Google BigQuery table enabling fast access.

## Swagger api specification

http://biggim.ncats.io/api/

* 1.0 [Query Examples](#query_examples)
    * 1.1 [Simple predefined query](#query_simple)
    * 1.2 [Check the status of the simple predefined query](#query_status)
    * 1.3 [Get the results as dataframe](#query_results)


In [2]:
base_url = 'http://biggim.ncats.io/api'

In [3]:
#a couple of simple helper functions
def post(endpoint, data={}, base_url=base_url):
    req = requests.post('%s/%s' % (base_url,endpoint), data=data)
    req.raise_for_status()
    return req.json()

def get(endpoint, data={}, base_url=base_url):
    req = requests.get('%s/%s' % (base_url,endpoint), data=data)
    req.raise_for_status()
    return req.json()
    

def jprint(dct):
    print(json.dumps(dct, indent=2))

In [4]:
try:
    tables = get('/metadata/table')
    #jprint(tables)
except requests.HTTPError as e:
    print(e)
    print("#returned message")
    print(jprint(e.response.json()))

default_table = [t for t in tables if t['default'] == True][0]['name']
print("Default table name: %s" % default_table)

Default table name: BigGIM_70_v1


<a id="query_examples"></a>
## Query examples

<a id="query_simple"></a>
### Simple predefined query

In [5]:
ids1 = []
ids2 = []
with open('theo_dump.tab') as fh:
    reader = csv.reader(fh, delimiter='\t')
    next(reader, None) # header
    for line in reader:
        (g1,g2,j) = line
        ids1.append(g1)
        ids2.append(g2)    

In [6]:
ids1 = ",".join(ids1)  
ids2 = ",".join(ids2) 

In [7]:
#tt = 

In [8]:


example_query = {
      # The table to select from.
      "table": default_table, 
      # A comma delimited list of column names to return.
      "columns": "TCGA_GBM_Correlation,TCGA_GBM_Pvalue,GTEx_Brain_Correlation,GTEx_Brain_Pvalue", 
      # A comma delimited list of Entrez gene ids to select.
      "ids1": "5111,6996,57697,6815,889,7112,2176,1019,5888,5706,5722,1111,112,3333",
      # Entrez gene ids to select. If not given, the query selects any gene related to a gene in ids 1. 
      # If given, the query only selects relations that contain a gene in ids1 and a gene in ids2.
      "ids2": "5111,6996,57697,6815,889,7112,2176,1019,5888,5706,3333,1111,112,3333", 
      # The type of join made on restrictions. Either intersect or union
      "restriction_join": "intersect", 
      # A list of pairs of values column name,value with which to restrict
      "restriction_gt": "TCGA_GBM_Correlation,.8, GTEx_Brain_Correlation,.8", 
      # the results of the query to rows where the value of the column is greater than the given value.
      #"restriction_lt": "TCGA_GBM_Pvalue,.05, GTEx_Brain_Pvalue,.01",
      # The maximum number of rows to return.
      "limit": 100 
}
print("Query request:")
jprint(example_query)
try:
    query_submit = get('interactions/query', data=example_query)
    jprint(query_submit)
except requests.HTTPError as e:
    print(e)
    print ("#returned message")
    print( jprint(e.response.json()))

Query request:
{
  "ids2": "5111,6996,57697,6815,889,7112,2176,1019,5888,5706,3333,1111,112,3333",
  "restriction_join": "intersect",
  "ids1": "5111,6996,57697,6815,889,7112,2176,1019,5888,5706,5722,1111,112,3333",
  "limit": 100,
  "columns": "TCGA_GBM_Correlation,TCGA_GBM_Pvalue,GTEx_Brain_Correlation,GTEx_Brain_Pvalue",
  "table": "BigGIM_70_v1",
  "restriction_gt": "TCGA_GBM_Correlation,.8, GTEx_Brain_Correlation,.8"
}
{
  "status": "submitted",
  "request_id": "e52d7030-579f-4bb3-bb72-174bb6fa9e19"
}


<a id="query_status"></a>
### Check the status of the simple predefined query

In [9]:
import time
try:
    while True:
        query_status = get('interactions/query/status/%s'% (query_submit['request_id'],))
        jprint(query_status)
        if query_status['status'] !='running':
            # query has finished
            break
        else:
            time.sleep(1)
            print("Checking again")
except requests.HTTPError as e:
    print(e)
    print("#returned message")
    print(jprint(e.response.json()))

{
  "status": "running",
  "message": "Extraction job is running.",
  "request_id": "e52d7030-579f-4bb3-bb72-174bb6fa9e19"
}
Checking again
{
  "status": "complete",
  "size": "95.0 B",
  "rows": 0,
  "request_uri": [
    "https://storage.googleapis.com/ncats_bigquery_results/e52d7030-579f-4bb3-bb72-174bb6fa9e19000000000000.csv"
  ],
  "processed_data": "0B",
  "request_id": "e52d7030-579f-4bb3-bb72-174bb6fa9e19"
}


<a id="query_results"></a>
### Get the results as dataframe

In [10]:
result = pandas.concat(map(pandas.read_csv, query_status['request_uri']))
#result

pvalue_columns = []
column_names = []
for ss in substudies['substudies']:
    print(ss)


NameError: name 'substudies' is not defined

<a id="full_example"></a>
# Full example

Lets get relationships for a certain tissue where any tissue related correlation has a pvalue < .05

 Assume we are interested in issues in lymphoid tissues

In [None]:
# get lymphoid substudies
import numpy as np
query_status = None
# get lymphoid substudies
substudies = get('metadata/tissue/%s' % ("lymphoid_tissue",))

# we only want things in a specific table, we'll use the default
table = get('/metadata/table/%s' %  (default_table,))

print("Using table %s\n%s" % (table['name'], table['description']))

# grab pvalue columns
pvalue_columns = []
column_names = []
for ss in substudies['substudies']:
    for column in ss['columns']:
        # only if column is from our table
        if column['table']['name'] == table['name']:
            # add column to select
            column_names.append(column['name'])
            if column['interactions_type'] == 'P-value (-log10)':
                #add pvalue to where
                pvalue_columns.append(column['name'])
 
# grab columns with lower pvalues
pv = []
for p in pvalue_columns:
    pv.append(p)
    pv.append(str(abs(np.log10(.05))))
query_arg = {}
query_arg['table'] = table['name']
query_arg['columns'] = ','.join(sorted(column_names))
if len(pv):
    query_arg['restriction_gt'] = ','.join(pv)
# get if any of the lymphoid columns have 
query_arg['restriction_join'] = 'union'
query_arg['limit'] = 1000000
print "The constructed query."
jprint(query_arg)

print("Submitting query request.")
try:
    query_submit = get('interactions/query', data=query_arg)
    print("Query request response.")
    jprint(query_submit)
except requests.HTTPError as e:
    print e
    print #returned message
    print jprint(e.response.json())

print("Check query status")
try:
    ctr = 1
    while True:
        
        query_status = get('interactions/query/status/%s'% (query_submit['request_id'],))
        jprint(query_status)
        if query_status['status'] !='running':
            # query has finished
            break
        else:
            time.sleep(ctr)
            ctr += 1
            #linear backoff
            print "Checking again"
except requests.HTTPError as e:
    print e
    print #returned message
    print jprint(e.response.json())


if query_status and query_status['request_uri']:
    print("Query successful. Getting result.")
    result = pandas.concat(map(pandas.read_csv, query_status['request_uri']))
else:
    print "Error see above"

In [None]:
#Full example result
result.head()