# Workflow 2, Module 1D

## Chemical similarity

The query as written links chemicals to genes via GO terms, but we can link chemicals to genes more directly.

### Set up functions

In [1]:
# Hostname of the ROBOKOP server; it is interpolated into the request URLs
# built by expand() and similarity().
robokop_server = 'robokop.renci.org'

In [2]:
import requests
import pandas as pd

In [3]:
def expand(type1,identifier,type2,rebuild=None,csv=None,predicate=None):
    """Call the ROBOKOP "simple expand" endpoint and return its JSON payload.

    The request URL is built from ``type1``, ``identifier`` and ``type2``.
    ``rebuild``, ``csv`` and ``predicate`` are forwarded as query parameters,
    but only those that are not ``None``.

    Prints the HTTP status code. Returns the decoded JSON body when the
    status is 200, otherwise an empty list.
    """
    endpoint = f'http://{robokop_server}:80/api/simple/expand/{type1}/{identifier}/{type2}'

    # Collect only the options the caller actually supplied.
    query = {}
    for key, value in (('rebuild', rebuild), ('csv', csv), ('predicate', predicate)):
        if value is not None:
            query[key] = value

    reply = requests.get(endpoint, params=query)
    status = reply.status_code
    print(f'Return Status: {status}')

    if status != 200:
        return []
    return reply.json()

In [4]:
def similarity(type1,ident,type2,by_type,threshhold=None,maxresults=None,rebuild=None):
    """Call the ROBOKOP "simple similarity" endpoint and return its JSON payload.

    The request URL is built from ``type1``, ``ident``, ``type2`` and
    ``by_type``. ``threshhold``, ``maxresults`` and ``rebuild`` are forwarded
    as query parameters, but only those that are not ``None``.

    Prints the HTTP status code. Returns the decoded JSON body when the
    status is 200, otherwise an empty list (the same convention as
    ``expand``), so a single failed request does not raise while decoding an
    error body and callers can still build an (empty) DataFrame from it.
    """
    url=f'http://{robokop_server}/api/simple/similarity/{type1}/{ident}/{type2}/{by_type}'
    # NOTE: 'threshhold' (sic) is sent to the server under exactly this
    # spelling; do not "correct" it without confirming the API's parameter name.
    params = { 'threshhold': threshhold, 'maxresults': maxresults, 'rebuild': rebuild }
    params = { k:v for k,v in params.items() if v is not None }
    response=requests.get(url, params = params)
    print( 'Return code:',response.status_code )
    # An error response is not guaranteed to carry JSON; do not try to decode it.
    if response.status_code != 200:
        return []
    return response.json()

In [5]:
def parse_answer(returnanswer):
    """Flatten an answer payload into a DataFrame with one row per answer.

    For every entry of ``returnanswer['answers']`` the second node
    (``nodes[1]``) and the first edge (``edges[0]``) are read, producing the
    columns ``result_id``, ``result_name`` (the node's ``name``, falling back
    to its ``id`` when no name is present), ``type`` and ``source`` (the
    edge's ``edge_source``).

    A falsy ``returnanswer`` (for example the ``[]`` that ``expand`` returns
    when the request fails) or an empty answer list yields an empty DataFrame
    that still has those four columns, so column lookups by callers keep
    working.
    """
    columns = ["result_id", "result_name", "type", "source"]

    # A failed request hands us [] rather than a dict; there is nothing to parse.
    if not returnanswer:
        return pd.DataFrame(columns=columns)

    rows = []
    for answer in returnanswer['answers']:
        node = answer['nodes'][1]
        edge = answer['edges'][0]
        rows.append({"result_id": node["id"],
                     "result_name": node["name"] if 'name' in node else node['id'],
                     "type": edge["type"],
                     "source": edge['edge_source']})

    if not rows:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(rows)

### Expand from disease to genes

In [6]:
# MONDO identifier of the disease to expand from
# (FA: presumably Fanconi anemia; confirm against MONDO).
FA='MONDO:0019391'
# Query the expand endpoint for genes linked to that disease.
fa_genes = expand('disease',FA,'gene')
# Flatten the answers into a DataFrame, one row per answer.
fa_gene_frame = parse_answer(fa_genes)
# Bare expression so the notebook displays the frame.
fa_gene_frame

Return Status: 200


Unnamed: 0,result_id,result_name,source,type
0,HGNC:3585,FANCD2,pharos.gene_get_disease,disease_to_gene_association
1,HGNC:3588,FANCG,pharos.gene_get_disease,disease_to_gene_association
2,HGNC:3582,FANCA,pharos.gene_get_disease,disease_to_gene_association
3,HGNC:23168,FANCM,pharos.disease_get_gene,disease_to_gene_association
4,HGNC:3587,FANCF,pharos.gene_get_disease,disease_to_gene_association
5,HGNC:26144,PALB2,pharos.disease_get_gene,disease_to_gene_association
6,HGNC:3586,FANCE,pharos.disease_get_gene,disease_to_gene_association
7,HGNC:9820,RAD51C,pharos.disease_get_gene,disease_to_gene_association
8,HGNC:3583,FANCB,pharos.disease_get_gene,disease_to_gene_association
9,HGNC:25009,UBE2T,pharos.gene_get_disease,disease_to_gene_association


## Similarity

Now we have a list of genes.  Let's find genes with similar chemical associations.  Here's a query for doing that for one of the genes:

In [8]:
# Identifiers of all genes returned for the disease.
genes = list(fa_gene_frame['result_id'])
# Genes similar to the first FA gene, compared via chemical_substance.
# threshhold=0.3 is forwarded to the API (presumably a minimum similarity
# score; confirm against the API documentation).
similar = similarity('gene',genes[0],'gene','chemical_substance',threshhold=0.3)
# Bare expression so the notebook displays the result as a table.
pd.DataFrame(similar)

Return code: 200


Unnamed: 0,id,name,similarity
0,HGNC:25568,FANCI,0.348837
1,HGNC:24573,NEIL3,0.304348
2,HGNC:17255,KNTC1,0.30303
3,HGNC:12307,TRIP13,0.302632
4,HGNC:14629,CDCA8,0.301205


So we want to do this for each of the FA genes, and we want to include only results that aren't already part of the original gene list.  For demo purposes, let's use only the first 10 rows of fa_gene_frame.

In [11]:
# Keep only the first 10 rows (by position) to limit the number of queries.
subset_fa_gene_frame = fa_gene_frame.iloc[0:10]

In [12]:
# Run the chemical-similarity query once per FA gene in the subset and tag
# each result table with the gene that produced it.
sim_frames = []
for _, gene_row in subset_fa_gene_frame.iterrows():
    query_id = gene_row['result_id']
    similar = similarity('gene',query_id,'gene','chemical_substance',threshhold=0.3)
    sframe = pd.DataFrame(similar)
    sframe['query_id'] = query_id
    sframe['query_name'] = gene_row['result_name']
    sim_frames.append(sframe)
# Stack the per-gene tables into one frame; each keeps its own row index.
all_sim = pd.concat(sim_frames)

Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200


In [13]:
# Names of the genes already associated with the disease (full frame, not
# just the 10-row subset).
v = fa_gene_frame['result_name'].values
# Keep only similarity hits whose name is not one of those genes.
new_genes = all_sim[ ~ all_sim['name'].isin(v) ]
# Bare expression so the notebook displays the filtered frame.
new_genes

Unnamed: 0,id,name,query_id,query_name,similarity
1,HGNC:24573,NEIL3,HGNC:3585,FANCD2,0.304348
2,HGNC:17255,KNTC1,HGNC:3585,FANCD2,0.303030
3,HGNC:12307,TRIP13,HGNC:3585,FANCD2,0.302632
4,HGNC:14629,CDCA8,HGNC:3585,FANCD2,0.301205
0,HGNC:11629,TCF19,HGNC:3588,FANCG,0.410714
1,HGNC:3511,EXO1,HGNC:3588,FANCG,0.366667
2,HGNC:23170,WDHD1,HGNC:3588,FANCG,0.344828
4,HGNC:17228,RAD54B,HGNC:3588,FANCG,0.339623
5,HGNC:16673,ANP32E,HGNC:3588,FANCG,0.333333
6,HGNC:21348,CENPU,HGNC:3588,FANCG,0.328358


Currently the list is sorted by 1) the rank of the original FA gene and then 2) the similarity.  We can instead sort by overall similarity, regardless of which FA gene produced the hit.

In [14]:
# Show the hits ordered from highest to lowest similarity. The sorted frame
# is only displayed, not assigned, so new_genes keeps its original order.
new_genes.sort_values(by=["similarity"],ascending=False)

Unnamed: 0,id,name,query_id,query_name,similarity
0,HGNC:33867,SDHAF1,HGNC:26144,PALB2,0.526316
1,HGNC:12994,ZNF202,HGNC:26144,PALB2,0.523810
2,HGNC:12962,TRIM26,HGNC:26144,PALB2,0.520000
2,HGNC:24575,GINS2,HGNC:25009,UBE2T,0.472222
3,HGNC:16122,FAM83D,HGNC:25009,UBE2T,0.465517
4,HGNC:9369,PRIM1,HGNC:25009,UBE2T,0.463768
3,HGNC:25714,CCDC51,HGNC:26144,PALB2,0.461538
5,HGNC:13026,ZNF232,HGNC:26144,PALB2,0.454545
4,HGNC:21948,ZNF746,HGNC:26144,PALB2,0.454545
6,HGNC:16192,SLC17A9,HGNC:26144,PALB2,0.450000
