# Workflow 2, Module 1A

## Gene functional similarity

### Set up functions

In [3]:
# Hostname of the ROBOKOP server; the request helpers in this notebook build their URLs from it.
robokop_server = 'robokop.renci.org'

In [4]:
import requests
import pandas as pd

In [3]:
def expand(type1,identifier,type2,rebuild=None,csv=None,predicate=None):
    """Call the "expand" endpoint for one node and return its JSON answer.

    Sends a GET request to ``/api/simple/expand/{type1}/{identifier}/{type2}``
    on ``robokop_server``. ``rebuild``, ``csv`` and ``predicate`` are forwarded
    as query parameters only when they are not ``None``.

    The HTTP status code is printed. Returns the decoded JSON body for a 200
    response and an empty list for any other status.
    """
    endpoint = f'http://{robokop_server}:80/api/simple/expand/{type1}/{identifier}/{type2}'
    # Forward only the options the caller actually supplied.
    query = {}
    for key, value in (('rebuild', rebuild), ('csv', csv), ('predicate', predicate)):
        if value is not None:
            query[key] = value
    reply = requests.get(endpoint, params=query)
    print( f'Return Status: {reply.status_code}' )
    if reply.status_code != 200:
        return []
    return reply.json()

In [4]:
def similarity(type1,ident,type2,by_type,threshhold=None,maxresults=None,rebuild=None):
    """Call the "similarity" endpoint and return the decoded JSON result.

    Sends a GET request to
    ``/api/simple/similarity/{type1}/{ident}/{type2}/{by_type}`` on
    ``robokop_server``. ``threshhold``, ``maxresults`` and ``rebuild`` are
    forwarded as query parameters only when they are not ``None``. The
    spelling ``threshhold`` is kept on purpose: it is both the keyword callers
    pass and the query-parameter name sent to the server.

    The HTTP status code is printed. Returns the decoded JSON body for a 200
    response and an empty list for any other status, matching ``expand``, so
    that ``pd.DataFrame(similarity(...))`` yields an empty frame instead of
    failing on an error payload.
    """
    url=f'http://{robokop_server}/api/simple/similarity/{type1}/{ident}/{type2}/{by_type}'
    params = { 'threshhold': threshhold, 'maxresults': maxresults, 'rebuild': rebuild }
    # Drop options the caller left unset so they are not sent at all.
    params = { k:v for k,v in params.items() if v is not None }
    response=requests.get(url, params = params)
    print( 'Return code:',response.status_code )
    # Only a 200 reply is decoded; anything else is reported as "no results".
    if response.status_code == 200:
        return response.json()
    return []

In [6]:
def parse_answer(returnanswer):
    """Flatten an answer set into a DataFrame with one row per answer.

    For every entry in ``returnanswer['answers']`` the second node
    (``nodes[1]``) and the first edge (``edges[0]``) are read and turned into
    a row with these columns:

    - ``result_id``:   the node's ``id``
    - ``result_name``: the node's ``name``, or its ``id`` when it has no name
    - ``type``:        the edge's ``type``
    - ``source``:      the edge's ``edge_source``

    A falsy ``returnanswer`` (for example the ``[]`` that ``expand`` returns
    for a non-200 response) produces an empty DataFrame with the same columns
    instead of raising.
    """
    columns = ["result_id", "result_name", "type", "source"]
    if not returnanswer:
        return pd.DataFrame(columns=columns)
    rows = []
    for answer in returnanswer['answers']:
        node = answer['nodes'][1]
        edge = answer['edges'][0]
        rows.append({"result_id": node["id"],
                     "result_name": node["name"] if 'name' in node else node['id'],
                     "type": edge["type"],
                     "source": edge['edge_source']})
    # Explicit columns keep the frame's shape stable even when there are no rows.
    return pd.DataFrame(rows, columns=columns)

### Expand from disease to genes

In [8]:
# Disease identifier to start from; the notebook's prose calls its genes the "FA genes".
FA='MONDO:0019391'
# Expand from the disease to gene nodes, then tabulate the answers.
fa_genes = expand('disease',FA,'gene')
fa_gene_frame = parse_answer(fa_genes)
fa_gene_frame

Return Status: 200


Unnamed: 0,result_id,result_name,source,type
0,HGNC:3585,FANCD2,pharos.gene_get_disease,disease_to_gene_association
1,HGNC:3588,FANCG,pharos.gene_get_disease,disease_to_gene_association
2,HGNC:3582,FANCA,pharos.gene_get_disease,disease_to_gene_association
3,HGNC:23168,FANCM,pharos.disease_get_gene,disease_to_gene_association
4,HGNC:3587,FANCF,pharos.gene_get_disease,disease_to_gene_association
5,HGNC:26144,PALB2,pharos.disease_get_gene,disease_to_gene_association
6,HGNC:3586,FANCE,pharos.disease_get_gene,disease_to_gene_association
7,HGNC:9820,RAD51C,pharos.disease_get_gene,disease_to_gene_association
8,HGNC:3583,FANCB,pharos.disease_get_gene,disease_to_gene_association
9,HGNC:25009,UBE2T,pharos.gene_get_disease,disease_to_gene_association


## Similarity

Now we have a list of genes.  Let's find genes with similar biological processes.  Here's a query for doing that for one of the genes:

In [17]:
# Identifiers of every gene returned for the disease.
genes = list(fa_gene_frame['result_id'])
# Query genes similar to the first one, compared by biological_process_or_activity.
# 0.1 is sent as the service's `threshhold` parameter (presumably a minimum similarity score; confirm).
similar = similarity('gene',genes[0],'gene','biological_process_or_activity',threshhold=0.1)
pd.DataFrame(similar)

Return code: 200


Unnamed: 0,id,name,similarity
0,HGNC:3584,FANCC,0.333333
1,HGNC:3582,FANCA,0.315789
2,HGNC:25568,FANCI,0.214286
3,HGNC:20748,FANCL,0.142857
4,HGNC:30705,SPATA22,0.142857
5,UniProtKB:D9MXF4,UniProtKB:D9MXF4,0.117647
6,HGNC:11738,TEX15,0.111111
7,HGNC:19849,C14orf39,0.105263
8,HGNC:3588,FANCG,0.105263


So we want to do this for each of the FA genes, and we want to only include results that aren't already part of the original gene list.

In [23]:
# Run the similarity query once per gene in fa_gene_frame and tag each batch of
# hits with the id and name of the gene that produced it.
sim_frames = []
for row in fa_gene_frame.iterrows():
    # iterrows() yields (index, Series) pairs, so the row data is at position 1.
    similar = similarity('gene', row[1]['result_id'], 'gene',
                         'biological_process_or_activity', threshhold=0.1)
    sframe = pd.DataFrame(similar).assign(query_id=row[1]['result_id'],
                                          query_name=row[1]['result_name'])
    sim_frames.append(sframe)
# Stack the per-gene frames; each frame's own row labels are kept as-is.
all_sim = pd.concat(sim_frames)

Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200
Return code: 200


In [24]:
# Display the combined similarity hits for all query genes.
all_sim

Unnamed: 0,id,name,similarity,query_id,query_name
0,HGNC:3584,FANCC,0.333333,HGNC:3585,FANCD2
1,HGNC:3582,FANCA,0.315789,HGNC:3585,FANCD2
2,HGNC:25568,FANCI,0.214286,HGNC:3585,FANCD2
3,HGNC:20748,FANCL,0.142857,HGNC:3585,FANCD2
4,HGNC:30705,SPATA22,0.142857,HGNC:3585,FANCD2
5,UniProtKB:D9MXF4,UniProtKB:D9MXF4,0.117647,HGNC:3585,FANCD2
6,HGNC:11738,TEX15,0.111111,HGNC:3585,FANCD2
7,HGNC:19849,C14orf39,0.105263,HGNC:3585,FANCD2
8,HGNC:3588,FANCG,0.105263,HGNC:3585,FANCD2
0,UniProtKB:A1L461,UniProtKB:A1L461,0.333333,HGNC:3588,FANCG


In [39]:
# Names of the genes that were used as similarity queries.
v = all_sim['query_name'].values
# Keep only the hits whose name is not one of those query-gene names.
# NOTE(review): this matches on display names rather than identifiers
# (`id` vs `query_id`), and `v` only contains query genes that produced at
# least one hit. Confirm that this is the intended "original gene list".
new_genes = all_sim[ ~ all_sim['name'].isin(v) ]
new_genes

Unnamed: 0,id,name,similarity,query_id,query_name
4,HGNC:30705,SPATA22,0.142857,HGNC:3585,FANCD2
5,UniProtKB:D9MXF4,UniProtKB:D9MXF4,0.117647,HGNC:3585,FANCD2
6,HGNC:11738,TEX15,0.111111,HGNC:3585,FANCD2
7,HGNC:19849,C14orf39,0.105263,HGNC:3585,FANCD2
0,UniProtKB:A1L461,UniProtKB:A1L461,0.333333,HGNC:3588,FANCG
1,UniProtKB:D9MXF4,UniProtKB:D9MXF4,0.272727,HGNC:3588,FANCG
3,UniProtKB:B3KVC6,UniProtKB:B3KVC6,0.250000,HGNC:3588,FANCG
4,UniProtKB:A0A2P9AC96,UniProtKB:A0A2P9AC96,0.250000,HGNC:3588,FANCG
6,UniProtKB:A0A2P9A9L1,UniProtKB:A0A2P9A9L1,0.250000,HGNC:3588,FANCG
7,UniProtKB:B4DN49,UniProtKB:B4DN49,0.250000,HGNC:3588,FANCG


Currently the list is sorted by 1) the rank of the original FA gene and then 2) the similarity.  We can sort by overall similarity.

In [41]:
# Show the new genes from most to least similar; the sorted frame is only displayed, not stored.
new_genes.sort_values(by=["similarity"],ascending=False)

Unnamed: 0,id,name,similarity,query_id,query_name
0,UniProtKB:B7Z6Y4,UniProtKB:B7Z6Y4,1.000000,HGNC:3586,FANCE
1,UniProtKB:Q6MZN0,UniProtKB:Q6MZN0,1.000000,HGNC:3586,FANCE
5,UniProtKB:A0A0S2Z3N5,UniProtKB:A0A0S2Z3N5,1.000000,HGNC:3586,FANCE
4,UniProtKB:A0A024R8W1,UniProtKB:A0A024R8W1,1.000000,HGNC:3586,FANCE
3,UniProtKB:B3KNQ3,UniProtKB:B3KNQ3,1.000000,HGNC:3586,FANCE
2,UniProtKB:Q53FK7,UniProtKB:Q53FK7,1.000000,HGNC:3586,FANCE
0,HGNC:13060,ZNF267,0.750000,HGNC:20994,ZSCAN2
1,HGNC:13139,ZNF7,0.750000,HGNC:20994,ZSCAN2
1,UniProtKB:Q93000,UniProtKB:Q93000,0.666667,HGNC:37101,DDX11L8
2,HGNC:4928,HKR1,0.666667,HGNC:20994,ZSCAN2


### Probe Similarity

We're getting back answers that we have said are similar by `biological_process_or_activity`.  But the similarity service doesn't tell us which processes the genes actually have in common, just that they have some in common.

We can reconstruct what these common processes are by using a query to the quick service, where we will construct a query that goes `(gene1)-(set of biological processes)-(gene2)`.

In the following snippet, we'll show how to do this for one particular pair of genes: ZNF267 and ZSCAN2.

In [2]:
def quick(question):
    """POST a question to the "quick" endpoint on ``robokop_server``.

    ``question`` is sent as the JSON request body. The HTTP status code is
    printed. Returns the decoded JSON body for a 200 response; for any other
    status the response object itself is returned.
    """
    reply = requests.post(f'http://{robokop_server}:80/api/simple/quick/', json=question)
    print( f"Return Status: {reply.status_code}" )
    return reply.json() if reply.status_code == 200 else reply

In [7]:
def make_question(types,curies,sets):
    """Build a linear "machine question" dict from parallel lists.

    The three arguments are indexed in parallel, one entry per node:

    - ``types[i]``:  the node's type; the node is given the id ``n{i}``.
    - ``curies[i]``: identifier to pin the node to, or ``None`` to leave it open.
    - ``sets[i]``:   truthy to mark the node with ``'set': True``; ``None`` or
      ``False`` leaves the key out.

    Consecutive nodes are joined by an edge ``e{i}`` from ``n{i-1}`` to
    ``n{i}``. Returns ``{'machine_question': {'nodes': [...], 'edges': [...]}}``.
    """
    nodes = []
    edges = []
    for i,t in enumerate(types):
        newnode = {'id': f'n{i}', 'type': t}
        if curies[i] is not None:
            newnode['curie'] = curies[i]
        # Truthiness rather than "is not None": an explicit False must not
        # mark the node as a set.
        if sets[i]:
            newnode['set'] = True
        nodes.append(newnode)
        # Every node after the first is linked to its predecessor.
        if i > 0:
            edges.append({'id': f'e{i}', 'source_id': f'n{i-1}', 'target_id': f'n{i}'})
    return {'machine_question': {'nodes': nodes, 'edges': edges}}

In [17]:
import json
# Path: gene -> (set of biological_process_or_activity nodes) -> gene, with both genes pinned by identifier.
question = make_question(['gene','biological_process_or_activity','gene'],
                         ['HGNC:13060',None,'HGNC:20994'],
                         [None,True,None])
# Pretty-print the question so the request body is visible in the notebook.
print(json.dumps(question,indent=4))
# Submit the question; `a` is the decoded JSON answer when the request returns 200.
a = quick(question)

{
    "machine_question": {
        "nodes": [
            {
                "id": "n0",
                "type": "gene",
                "curie": "HGNC:13060"
            },
            {
                "id": "n1",
                "type": "biological_process_or_activity",
                "set": true
            },
            {
                "id": "n2",
                "type": "gene",
                "curie": "HGNC:20994"
            }
        ],
        "edges": [
            {
                "id": "e1",
                "source_id": "n0",
                "target_id": "n1"
            },
            {
                "id": "e2",
                "source_id": "n1",
                "target_id": "n2"
            }
        ]
    }
}
Return Status: 200


Because we used sets on the intermediate node, we should get back a single answer, but let's double check that:

In [18]:
# Number of answers returned for the question.
len(a['answers'])

1

Now, let's make it a bit nicer to look at by extracting the process nodes from our answer and putting them in a data frame:

In [19]:
import pandas as pd
# Keep only the process/activity nodes of the first answer. A node without a
# 'name' falls back to its id (the same fallback parse_answer uses) instead of
# raising KeyError.
processes = [ {'id': n['id'], 'name': n.get('name', n['id'])} for n in a['answers'][0]['nodes'] if n['type'] == 'biological_process_or_activity']
df = pd.DataFrame(processes)
df

Unnamed: 0,id,name
0,GO:0000981,"DNA-binding transcription factor activity, RNA..."
1,GO:0003677,DNA binding
2,GO:0006351,"transcription, DNA-templated"
3,GO:0006357,regulation of transcription by RNA polymerase II
4,GO:0007275,multicellular organism development
5,GO:0046872,metal ion binding
