In [11]:
import requests
import random
import pandas as pd
import uuid
import json

In [23]:
# This is a useful trick to explore API responses from https://github.com/caldwell/renderjson
from IPython.display import display_javascript, display_html, display

class RenderJSON(object):
    """Notebook display helper that renders JSON with the renderjson JS library.

    The object emits an empty <div> plus a JavaScript snippet that loads
    renderjson through require.js and appends the rendered tree to that <div>.
    """

    def __init__(self, json_data):
        """Store the JSON text to render and a unique id for its container.

        json_data: a dict or list, which is serialised with json.dumps;
        any other value is stored unchanged and is expected to already be
        JSON text.
        """
        if isinstance(json_data, (dict, list)):
            self.json_str = json.dumps(json_data)
        else:
            # Assumed to be JSON text already; keep the caller's value as-is.
            self.json_str = json_data
        # Unique element id so that several renders can coexist on one page.
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        """Emit the container <div> and the script that fills it."""
        display_html('<div id="{}" style="height: 300px; width:100%;"></div>'.format(self.uuid),
            raw=True
        )
        # NOTE(review): rawgit.com was announced as shutting down -- confirm this
        # URL still resolves, or serve renderjson.js from another host.
        display_javascript("""
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
          document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.uuid, self.json_str), raw=True)


In [4]:
# Base URL of the targetvalidation.org REST API (version 1.1, per the path);
# endpoint paths such as '/public/search' are appended to it.
API = 'https://www.targetvalidation.org/api/1.1'

# Find what's known for a list of genes

> e.g. Maria L in the fly group routinely screens lists of human genes for evidence of involvement in neurological diseases. She wants to understand a bit about the biology of these genes. She looks at Open Targets to find what is known about each target and to which diseases it has already been connected.

Compiling evidence relative to a list of genes can be a time-consuming and tedious task. Now the process can be automated, thanks to the targetvalidation.org API.

This tutorial illustrates how to compile all the evidence contained in targetvalidation.org for a list of genes into a compact representation (e.g. a spreadsheet) using Python.
You might have a list of genes being generated from an experiment or a genetic screen, or perhaps you are following a particular set of targets and would like to maintain an updated look at what evidence has been compiled around them.
The process can be useful to reduce bias and increase success rates when prioritizing a list of genes for experimental follow-up. 

For this tutorial, we will use a list of random genes taken from the HGNC catalog. First, let's download the full catalog and create a list of symbols only.

In [3]:
%%capture
%%bash
# Fetch and unpack the HGNC catalogue only when the unpacked file is missing.
# gunzip removes the .gz after unpacking, so testing for the archive alone
# would trigger a needless re-download on the next run.
if [ ! -f ../data/hgnc_complete_set.txt ]; then
    [ -f ../data/hgnc_complete_set.txt.gz ] || wget --directory-prefix=../data/ http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc_complete_set.txt.gz
    gunzip ../data/hgnc_complete_set.txt.gz
fi
# Drop the header row, keep the second column (gene symbols) and discard
# every line containing "withdrawn".
tail -n +2 ../data/hgnc_complete_set.txt | cut -f2 | grep -v withdrawn > ../data/hgnc_symbol_set.txt

We then print ten random gene names to check that our shuffling is working.

In [57]:
# Load the symbols from the file written by the download step
# (../data/hgnc_symbol_set.txt), one symbol per line, skipping blank lines.
with open('../data/hgnc_symbol_set.txt') as f:
    genes = [line.rstrip() for line in f if line.strip()]
# Shuffle in place so that any leading slice is a random sample. No seed is
# set, so the selection differs from run to run.
random.shuffle(genes)
print(genes[:10])

['HCG24', 'DPCD', 'IGAD1', 'FCGRT', 'CAMK2A', 'HMGA1P3', 'FOXD3-AS1', 'HMGN1P25', 'FOXD4L5', 'HMOX1']


To use them to query the Target Validation API, we first have to convert them to Ensembl gene IDs:

In [65]:
def get_ensid(genesymbol):
    '''
    Look up the Ensembl gene ID for a gene symbol via the targetvalidation.org API.

    Sends `genesymbol` to the `/public/search` endpoint, asking for a single
    gene hit, and accepts that hit only when its `approved_symbol` is exactly
    equal to `genesymbol`.

    Returns the `id` of the matching hit, or None when the search yields no
    hit, the top hit belongs to a different symbol, or the response lacks the
    expected fields.

    Raises requests.HTTPError when the API answers with an error status.
    '''
    r = requests.get(API + '/public/search',
                     params={'q': genesymbol, 'size': 1, 'filter': 'gene'})
    # Surface HTTP failures instead of mistaking an error payload for "no match".
    r.raise_for_status()
    result = r.json()
    try:
        top_hit = result["data"][0]
        if top_hit["data"]["approved_symbol"] == genesymbol:
            return top_hit["id"]
    except (IndexError, KeyError):
        # Empty result list, or a hit without the fields read above.
        pass
    return None

# Sanity check against a known symbol/ID pair before the bulk lookup.
assert get_ensid('SOD1') == 'ENSG00000142168'

# Resolve the first 30 shuffled symbols with a single API call per symbol,
# dropping the symbols that could not be resolved.
random_genes = [ensid for ensid in map(get_ensid, genes[:30]) if ensid is not None]
print(random_genes)

['ENSG00000230313', 'ENSG00000166171', 'ENSG00000104870', 'ENSG00000070808', 'ENSG00000258011', 'ENSG00000230798', 'ENSG00000204779', 'ENSG00000100292', 'ENSG00000184844', 'ENSG00000169325', 'ENSG00000204165', 'ENSG00000168959', 'ENSG00000153107', 'ENSG00000232605', 'ENSG00000107984', 'ENSG00000176022', 'ENSG00000254722', 'ENSG00000107105', 'ENSG00000253540', 'ENSG00000214688', 'ENSG00000276550', 'ENSG00000230223', 'ENSG00000153046', 'ENSG00000090581']


## How many diseases in each therapeutic area are associated with each target

In [76]:
# Request the associations for all selected targets at once, asking the API
# to include facet counts in the response.
r = requests.post(
    API + '/public/association/filter',
    json={'target': random_genes, 'facets': True},
)

In [77]:
# Show the decoded response with the RenderJSON helper so it can be explored.
RenderJSON(r.json())

In [83]:
# Flatten each therapeutic-area facet bucket of the response into one record.
therapeuticareas = [
    {
        'target_count': area['unique_target_count']['value'],
        'disease_count': area['unique_disease_count']['value'],
        'therapeutic_area': area['label'],
        'key': area['key'],
    }
    for area in r.json()['facets']['therapeutic_area']['buckets']
]

# Tabulate the records and show the areas covering the most targets first.
df = pd.DataFrame(therapeuticareas)
df.sort_values(by='target_count', ascending=False).head()

Unnamed: 0,disease_count,key,target_count,therapeutic_area
1,146,efo_0000616,16,neoplasm
0,419,efo_0000508,11,genetic disorder
3,97,efo_0000618,11,nervous system disease
4,76,efo_0002461,10,skeletal system disease
7,62,efo_0000701,10,skin disease


### TODO:
* Show the weight of genetic evidence behind the link (divide in mendelian vs gwas)
* Display a list of the top 3 diseases, with name and association score.
* show number of disease associations

* show pathway and .groupby pathway so that you bring the genes with shared biology at the top
