# Workflow 1, Module 2, Question 4

## What genes are involved in [pathway/process]?

### Define some functions for use below

In [1]:
# Hostname of the ROBOKOP server; the request helpers in this notebook
# interpolate it into the URLs they call.
robokop_server = 'robokop.renci.org'

In [2]:
import requests
import json
import pandas as pd

In [3]:
def expand(type1,identifier,type2,rebuild=None,csv=None,predicate=None):
    """Call the ROBOKOP simple "expand" endpoint for a single identifier.

    Issues a GET to
    ``http://{robokop_server}:80/api/simple/expand/{type1}/{identifier}/{type2}``
    and prints the HTTP status code of the reply.

    Parameters
    ----------
    type1, identifier, type2
        Values placed into the URL path, in that order.
    rebuild, csv, predicate
        Optional query parameters. Each one is sent only when it is not
        None; their meaning is defined by the server.

    Returns
    -------
    The decoded JSON body when the status code is 200, otherwise ``[]``.
    """
    endpoint = f'http://{robokop_server}:80/api/simple/expand/{type1}/{identifier}/{type2}'
    # Forward only the options the caller actually supplied.
    query = {}
    for key, value in (('rebuild', rebuild), ('csv', csv), ('predicate', predicate)):
        if value is not None:
            query[key] = value
    reply = requests.get(endpoint, params=query)
    print( f'Return Status: {reply.status_code}' )
    if reply.status_code != 200:
        return []
    return reply.json()

In [4]:
def parse_answer(returnanswer):
    """Flatten an expand() response into a DataFrame with one row per answer.

    Parameters
    ----------
    returnanswer : dict or list
        Decoded JSON holding an 'answers' list. Every answer must carry at
        least two 'nodes' entries and one 'edges' entry. An empty value
        (such as the ``[]`` that expand() returns on a non-200 status) is
        accepted and treated as "no answers".

    Returns
    -------
    pandas.DataFrame
        Columns ``result_id``, ``result_name``, ``type`` and ``source``.
        ``result_name`` falls back to the node id when the node has no
        'name' key. With no answers the frame is empty but still has these
        columns.
    """
    columns = ["result_id", "result_name", "type", "source"]
    # expand() signals failure with [], which cannot be indexed by 'answers'.
    if not returnanswer:
        return pd.DataFrame(columns=columns)
    rows = []
    for answer in returnanswer['answers']:
        # Second node and first edge of each answer; presumably the result
        # node and the edge linking it to the query node -- TODO confirm
        # against the API's answer layout.
        node = answer['nodes'][1]
        edge = answer['edges'][0]
        rows.append({"result_id": node["id"],
                     "result_name": node.get("name", node["id"]),
                     "type": edge["type"],
                     "source": edge["edge_source"]})
    return pd.DataFrame(rows, columns=columns)

In [5]:
def enrichment(type1,identlist,type2,threshhold=None,maxresults=None,numtype1=None,include_descendants=None,rebuild=None):
    """Call the ROBOKOP simple "enriched" endpoint for a list of identifiers.

    POSTs a JSON body to
    ``http://{robokop_server}/api/simple/enriched/{type1}/{type2}`` and
    prints the HTTP status code of the reply.

    Parameters
    ----------
    type1, type2
        Values placed into the URL path.
    identlist
        Sent in the body under the key 'identifiers'.
    threshhold, maxresults, numtype1, include_descendants, rebuild
        Optional settings. Each one is sent only when it is not None.
        ``numtype1`` is sent under the key 'num_type1' and ``threshhold``
        under the key 'threshhold' (spelled as the request expects it).
        Their meaning is defined by the server.

    Returns
    -------
    The decoded JSON body when the status code is 200, otherwise ``[]``.
    """
    endpoint = f'http://{robokop_server}/api/simple/enriched/{type1}/{type2}'
    # (body key, value) pairs in the order they appear in the request body.
    candidates = (
        ('threshhold', threshhold),
        ('maxresults', maxresults),
        ('num_type1', numtype1),
        ('identifiers', identlist),
        ('include_descendants', include_descendants),
        ('rebuild', rebuild),
    )
    # Leave out anything the caller did not set.
    payload = {key: value for key, value in candidates if value is not None}
    reply = requests.post(endpoint, json=payload)
    print( f'Return Status: {reply.status_code}' )
    if reply.status_code != 200:
        return []
    return reply.json()

### Get the inputs to question 4

We'll start question 4 with the set of genes from question 3 and run the enrichment version of that question on them; the enriched biological processes it returns are the inputs to question 4.

In [6]:
# Gene identifiers written as HGNC CURIEs; each variable is named after its gene symbol.
PRKAA1 = 'HGNC:9376'
XBP1 = 'HGNC:12801'
MTATP1 = 'HGNC:7414'
NEIL1 = 'HGNC:18448'
FRK = 'HGNC:3955'

# Starting gene set handed to the enrichment call.
# NOTE(review): the name suggests these are MODY-associated genes -- confirm.
mody_genes = [PRKAA1,XBP1,MTATP1,NEIL1,FRK]

In [7]:
# Ask the server which biological processes/activities are enriched for the starting genes.
enriched_processes = enrichment('gene',mody_genes,'biological_process_or_activity')
# Tabulate the response; the recorded output has one row per term with id, name and p columns.
process_frame = pd.DataFrame(enriched_processes)
process_frame

Return Status: 200


Unnamed: 0,id,name,p
0,GO:0055089,fatty acid homeostasis,0.000004
1,GO:0006633,fatty acid biosynthetic process,0.000025
2,GO:0042149,cellular response to glucose starvation,0.000025
3,GO:0071333,cellular response to glucose stimulus,0.000064
4,GO:0009631,cold acclimation,0.000425
5,GO:0062028,regulation of stress granule assembly,0.000425
6,GO:0050405,[acetyl-CoA carboxylase] kinase activity,0.000425
7,GO:2000758,positive regulation of peptidyl-lysine acetyla...,0.000425
8,GO:0047322,[hydroxymethylglutaryl-CoA reductase (NADPH)] ...,0.000425
9,GO:0035404,histone-serine phosphorylation,0.000638


### Strategy 1: expand with loop

If you want to know the genes involved in a process, you can use expand. If you have a list of processes and you want to know the union of all genes involved with any of them, this is the best approach.

Expand takes a single process, so you need to call it once per process. Here we combine the results into a single frame, but you may wish to handle them differently. We'll (arbitrarily) limit ourselves to the top 10 processes by enrichment score.

In [10]:
# Keep the first 10 rows of the enrichment table, in the order they were returned.
top_10 = process_frame.head(10)

In [11]:
# Expand every selected process to its genes and stack the per-process tables.
gframes = []
for process in top_10['id']:
    genes = expand('biological_process_or_activity',process,'gene')
    # expand() returns [] on a non-200 status; skip that process rather than
    # letting parse_answer fail on a response without 'answers'.
    if not genes:
        continue
    gene_frame = parse_answer(genes)
    # Remember which process each gene row came from.
    gene_frame['process'] = process
    gframes.append(gene_frame)
# ignore_index gives every row a unique label instead of restarting at 0 for
# each process; the guard avoids pd.concat raising on an empty list.
all_genes = pd.concat(gframes, ignore_index=True) if gframes else pd.DataFrame()
all_genes

Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200


Unnamed: 0,result_id,result_name,source,type,process
0,HGNC:7967,NR1H4,biolink.gene_get_process_or_function,actively_involved_in,GO:0055089
1,HGNC:9376,PRKAA1,biolink.gene_get_process_or_function,actively_involved_in,GO:0055089
2,HGNC:12744,MLXIPL,biolink.gene_get_process_or_function,actively_involved_in,GO:0055089
3,HGNC:61,ABCD1,biolink.gene_get_process_or_function,actively_involved_in,GO:0055089
4,HGNC:2843,DGAT1,biolink.gene_get_process_or_function,actively_involved_in,GO:0055089
5,HGNC:14929,SIRT1,biolink.gene_get_process_or_function,actively_involved_in,GO:0055089
6,HGNC:613,APOE,biolink.gene_get_process_or_function,actively_involved_in,GO:0055089
7,HGNC:9175,POLD1,biolink.gene_get_process_or_function,actively_involved_in,GO:0055089
8,HGNC:9377,PRKAA2,biolink.gene_get_process_or_function,actively_involved_in,GO:0055089
9,HGNC:24865,GPAM,biolink.gene_get_process_or_function,actively_involved_in,GO:0055089


### Strategy 2: enrichment

If you don't care about every gene that every process involves, but are instead trying to find the genes most commonly shared across the processes, you can use enrichment and do a single call. Since biological processes are part of an ontology, let's also include their descendants.

In [16]:
# One enrichment call over all selected process ids, asking the server to include descendants.
enriched_genes = enrichment(
    'biological_process_or_activity',
    top_10['id'].tolist(),
    'gene',
    include_descendants=True,
)

Return Status: 200


In [17]:
# Show the enrichment response as a table; the recorded output has id, name and p columns.
pd.DataFrame(enriched_genes)

Unnamed: 0,id,name,p
0,HGNC:9377,PRKAA2,1.826679e-15
1,HGNC:9376,PRKAA1,3.483596e-15
2,HGNC:15829,ELOVL6,7.183826e-12
3,HGNC:21308,ELOVL5,1.193760e-11
4,HGNC:14415,ELOVL4,1.193760e-11
5,HGNC:26292,ELOVL7,5.404678e-10
6,HGNC:14418,ELOVL1,1.358189e-09
7,HGNC:18047,ELOVL3,1.358189e-09
8,HGNC:14416,ELOVL2,1.358189e-09
9,HGNC:429,ALOX12,1.352447e-08
