In [122]:
from BioLink.biolink_client import BioLinkWrapper
import requests
from pprint import pprint
import graphviz as gv
import pandas as pd
import cache_magic
import numpy as np

In [2]:
# Instantiate the BioLink client wrapper imported at the top of the notebook.
# NOTE(review): `blw` is not referenced by any later cell visible here — confirm it is still needed.
blw = BioLinkWrapper()

In [36]:
robokop_server = 'robokop.renci.org'
def expand(type1,identifier,type2,rebuild=None,csv=None,predicate=None,timeout=None):
    """Call the ROBOKOP "simple expand" endpoint for one starting node.

    Issues ``GET http://{robokop_server}:80/api/simple/expand/{type1}/{identifier}/{type2}``
    and prints the HTTP status code of the reply.

    Parameters
    ----------
    type1 : str
        Node type of the starting identifier (first path segment).
    identifier : str
        Identifier of the starting node (second path segment).
    type2 : str
        Node type to expand to (third path segment).
    rebuild, csv, predicate : optional
        Forwarded as query parameters of the same name; any that are
        ``None`` are omitted from the request entirely.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. ``None`` (the default) waits
        indefinitely, which is the behaviour this function always had.

    Returns
    -------
    The decoded JSON body when the server answers with HTTP 200;
    otherwise an empty ``dict``. An empty mapping (rather than a list)
    keeps membership tests and ``.keys()`` safe for callers that treat
    the result as a mapping.
    """
    url=f'http://{robokop_server}:80/api/simple/expand/{type1}/{identifier}/{type2}'
    optional_params = {'rebuild': rebuild,
                       'csv'    : csv,
                       'predicate': predicate}
    # Only send the options the caller actually supplied.
    params = { k:v for k,v in optional_params.items() if v is not None }
    response = requests.get(url,params=params,timeout=timeout)
    print( f'Return Status: {response.status_code}' )
    if response.status_code == 200:
        return response.json()
    # Non-200: hand back an empty mapping so `'answers' in result` and
    # `result.keys()` both work on the failure path.
    return {}
            

In [37]:
from Modules.Mod0_lookups import DiseaseGeneLookUp

# Create the lookup helper first; the same instance is reused by later cells.
lu = DiseaseGeneLookUp()
# The workflow is driven by a single disease identifier.
input_disease = 'MONDO:0019391'
input_object = lu.input_object_lookup(input_curie=input_disease)
# Leave the resolved record as the cell's last expression so it is displayed.
input_object

{'id': 'MONDO:0019391',
 'label': 'Fanconi anemia',
 'description': 'Fanconi anemia (FA) is a hereditary DNA repair disorder characterized by progressive pancytopenia with bone marrow failure, variable congenital malformations and predisposition to develop hematological or solid tumors.'}

In [38]:
# Module specification: describes the input (one disease id), the expected
# output (a set of HGNC ids) and the source/predicate used for the lookup.
mod0_config = dict(
    id=input_object['id'],
    data_type='disease',
    input_type=dict(
        complexity='single',
        id_type=['MONDO', 'DO', 'OMIM'],
    ),
    output_type=dict(
        complexity='set',
        id_type='HGNC',
    ),
    taxon='human',
    limit=None,
    source='Monarch Biolink',
    predicate='blm:gene associated with condition',
)

In [164]:
# Look up the genes associated with the input disease, driven by the module
# specification in `mod0_config`.
disease_associated_genes = lu.disease_geneset_lookup(disease2genes_object=mod0_config)
# Flatten the 'hit_id' column into a plain list of gene CURIEs for downstream module input.
# NOTE(review): the later cells visible here re-derive this list from
# `disease_associated_genes` instead of reading `input_curie_set` — confirm which is intended.
input_curie_set = disease_associated_genes['hit_id'].tolist()

## Get all chemicals that increase expression or abundance of disease-associated genes

In [40]:
# Edge predicates to request for every disease-associated gene.
preds=['increases_expression_of',
    'increases_abundance_of',
    'increases_activity_of',
    'increases_stability_of',
    'decreases_degradation_of',
    'increases_response_to',
    'increases_molecular_modification_of']


# One response per (gene, predicate) pair. Keying on the predicate alone would
# let each gene overwrite the previous gene's responses, leaving only the last
# gene's results in the dict. Consumers that iterate `.items()` / `.values()`
# and only read the response value are unaffected by the tuple key.
results = {}
for gene in disease_associated_genes['hit_id'].tolist():
    for p in preds:
        results[(gene, p)] = expand('gene',gene,'chemical_substance',predicate=p)

Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200


KeyboardInterrupt: 

In [41]:
# Bind `robo_results` through the `%cache` line magic (provided by the imported
# `cache_magic` package). Per the recorded output, this run loaded a previously
# cached value instead of re-evaluating `results`.
# NOTE(review): a stale cache will hide changes made to `results` — confirm the cache is reset when the query cell changes.
%cache robo_results = results

Loading cached value for variable 'robo_results'. Time since caching: 2:19:53.762008


In [42]:
# Flatten every answer of every response into two flat lists of records.
all_edges = []
all_nodes = []
for result in robo_results.values():
    # A failed request yields an empty container with no 'answers' entry.
    # A plain membership test works whether that container is a list or a
    # dict, whereas calling `.keys()` on a list would raise.
    if 'answers' not in result:
        continue
    for answer in result['answers']:
        # extend() appends in place; rebuilding the list with `+` on every
        # answer is quadratic in the number of records.
        all_edges.extend(answer['edges'])
        all_nodes.extend(answer['nodes'])

In [43]:
# Tabulate the collected edge records (presumably one dict per edge — TODO confirm).
# NOTE(review): `edges_df` is not read by any later cell visible here.
edges_df = pd.DataFrame(all_edges)

In [45]:
# Tabulate the collected node records; later cells filter this frame on its 'type' column.
nodes_df = pd.DataFrame(all_nodes)

In [53]:
# Keep only the nodes typed 'chemical_substance'.
# NOTE(review): nodes are collected once per answer and nothing here de-duplicates
# them, so the same chemical may appear on several rows — confirm downstream cells expect that.
increase_chemicals = nodes_df[nodes_df['type'] == 'chemical_substance']

In [57]:
# Pair each chemical's identifier with its name, row by row, as (id, name) tuples.
increases_chemicals_list = [
    (chem_id, chem_name)
    for chem_id, chem_name in zip(
        increase_chemicals['id'].tolist(),
        increase_chemicals['name'].tolist(),
    )
]

In [138]:
# Display just the identifier and name of each chemical found above.
increase_chemicals[['id', 'name']]

Unnamed: 0,id,name
1,CHEBI:25255,Methyl Methanesulfonate
3,CHEBI:44423,Hydroxyurea
5,CHEBI:16842,Formaldehyde
7,CHEBI:35456,Cadmium Chloride
9,MESH:D002945,Cisplatin
11,CHEBI:82160,Polychlorinated Biphenyls
13,CHEBI:3908,Coumestrol
15,CHEBI:88217,"N,N,N',N'-tetrakis(2-pyridylmethyl)ethylenedia..."
17,CHEBI:37537,Tetradecanoylphorbol Acetate
19,CHEBI:27684,Phenylmercuric Acetate


## Get all genes that decrease or consume the chemicals

In [91]:
kegg_decreases_results = []
# `increases_chemicals_list` holds (id, name) tuples and may list the same
# chemical more than once. Query each distinct id a single time, in first-seen
# order, to avoid repeating identical network requests.
unique_chemical_ids = dict.fromkeys(chem_id for chem_id, _name in increases_chemicals_list)
for chem_id in unique_chemical_ids:
    kegg_decreases_results.append(expand('chemical_substance',chem_id,'gene',predicate='negatively_regulates__entity_to_entity'))

Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200
Return Status: 200


In [104]:
# Flatten every answer of every negative-regulation response into flat record lists.
kegg_edges = []
kegg_nodes = []
for kegg_result in kegg_decreases_results:
    # A failed request yields an empty container with no 'answers' entry.
    # Membership works for a list or a dict; `.keys()` would raise on a list.
    if 'answers' not in kegg_result:
        continue
    for answer in kegg_result['answers']:
        # In-place extend avoids rebuilding the lists on every answer.
        kegg_edges.extend(answer['edges'])
        kegg_nodes.extend(answer['nodes'])

In [105]:
# Tabulate the edge records gathered from the negative-regulation queries.
kegg_edge_df = pd.DataFrame(kegg_edges)

In [106]:
# Keep only the edges of the requested predicate type. The filter reads the
# frame built from `kegg_edges` in the preceding cell; the previous source name
# (`kegg_results`) is not defined anywhere in this notebook and fails on a
# fresh kernel. Re-running this cell is harmless: filtering twice gives the same rows.
kegg_edge_df = kegg_edge_df[kegg_edge_df['type'] == 'negatively_regulates__entity_to_entity']

In [107]:
# Tabulate the node records gathered from the negative-regulation queries.
kegg_node_df = pd.DataFrame(kegg_nodes)

In [110]:
# Reduce the node table to gene rows and to the two columns used for the
# later join: the gene identifier and its name.
kegg_node_df = kegg_node_df.loc[kegg_node_df['type']=='gene', ['id', 'name']]

In [127]:
# Frame of the (id, name) chemical tuples with default integer column labels.
# NOTE(review): `chem_df` is not read by any later cell visible here; `icd`
# builds the same data with named columns — confirm this cell can be dropped.
chem_df = pd.DataFrame(increases_chemicals_list)

In [118]:
# Attach each gene's name to its edges: inner-join the edge table to the gene
# node table, matching the edge's 'source_id' against the node's 'id'.
merged = kegg_edge_df.merge(
    kegg_node_df,
    how='inner',
    left_on='source_id',
    right_on='id',
)

In [121]:
# Narrow to the columns of interest and drop every row that has a missing
# value in any of them.
merged = merged[['source_database', 'source_id', 'name', 'target_id', 'weight', 'type' ]].dropna()

In [130]:
# Chemical lookup table built from the (id, name) tuples, with the identifier
# column labelled 'chebi' so it can serve as the right-hand join key.
# NOTE(review): not every identifier is a ChEBI id (the displayed list includes a MESH id) — confirm the column name is acceptable.
icd = pd.DataFrame(increases_chemicals_list, columns=['chebi', 'name'])

In [139]:
# Attach each chemical's name to the gene/edge rows: inner-join on the edge's
# 'target_id' against the chemical table's 'chebi' column.
final_kegg = merged.merge(
    icd,
    how='inner',
    left_on='target_id',
    right_on='chebi',
)

In [142]:
# Display the distinct gene/chemical pairs with their weight. 'name_x' and
# 'name_y' are the default suffixes pandas gives the two colliding 'name'
# columns of the preceding merge: name_x from `merged` (left), name_y from `icd` (right).
final_kegg[['source_id', 'name_x', 'target_id', 'name_y', 'weight']].drop_duplicates()

Unnamed: 0,source_id,name_x,target_id,name_y,weight
0,HGNC:2596,CYP1A2,CHEBI:16842,Formaldehyde,0.511471
2,HGNC:25531,FAR2,CHEBI:16842,Formaldehyde,0.616381
3,HGNC:2596,CYP1A2,CHEBI:16469,Estradiol,0.48452
5,HGNC:12530,UGT1A1,CHEBI:16469,Estradiol,0.547286
6,HGNC:20233,COQ6,CHEBI:29865,Benzo(a)pyrene,0.606531
9,HGNC:20233,COQ6,CHEBI:33216,bisphenol A,0.606531
12,HGNC:20233,COQ6,CHEBI:39867,Valproic Acid,0.606531
15,HGNC:932,BAAT,CHEBI:15891,Taurine,0.592436
16,HGNC:4259,GGT7,CHEBI:15891,Taurine,0.592436
17,HGNC:4175,GATM,CHEBI:15891,Taurine,0.592436


## Get all proteins that consume chemicals through Rhea

In [10]:
from Rhea.Rhea import RheaMethods

In [11]:
# Instantiate the Rhea helper; the next cell calls its `substrate2gene` method.
rhea = RheaMethods()

In [162]:
from collections import defaultdict

# Distinct chemical ids, in first-seen order. The node table can list the same
# chemical on several rows; querying and then summing each duplicate would
# count that chemical's proteins more than once.
chemical_ids = nodes_df.loc[nodes_df['type'] == 'chemical_substance', 'id'].drop_duplicates().tolist()

# For each chemical, gather the proteins of every reaction Rhea returns for it.
reactions = []
for chebi in chemical_ids:
    proteins = []
    for rxn in rhea.substrate2gene(chebi):
        # In-place extend avoids rebuilding the list for every reaction.
        proteins.extend(rxn['proteins'])
    reactions.append(
        {
            'chemical': chebi,
            'proteins': proteins,
            'total_proteins': len(proteins)
        }
    )

# Protein total per chemical (one entry per distinct chemical id).
rhea_chems = defaultdict(int)
for rxn in reactions:
    rhea_chems[rxn['chemical']] += rxn['total_proteins']
rhea_chems_df = pd.DataFrame(list(rhea_chems.items()), columns=['chebi', 'total_proteins'])

# Left-join onto the distinct (id, name) chemical rows so chemicals without any
# Rhea hit are still listed (with a missing total) and none is repeated.
final_rhea_df = pd.merge(increase_chemicals[['id', 'name']].drop_duplicates(), rhea_chems_df, how='left', left_on='id', right_on='chebi')

Unnamed: 0,name,chebi,total_proteins
1,Hydroxyurea,CHEBI:44423,15
2,Formaldehyde,CHEBI:16842,9727
16,Estradiol,CHEBI:16469,630
