In [None]:
from BioLink.biolink_client import BioLinkWrapper
import pandas as pd
from pprint import pprint
from collections import defaultdict
from mygene import MyGeneInfo
from GraphVisuals.graphviz_wrapper import PathGraph

## Module declaration

In [None]:
# Ordered workflow modules as (node id, display label) pairs; each node id is
# the first word of its label.
# NOTE: the misspelled variable name is kept because later cells reference it.
worklow_modules = [
    (label.split(' ', 1)[0], label)
    for label in (
        'Mod1A Functional Similarity',
        'Mod1B Phenotype Similarity',
        'Mod1C Coexpression',
        'Mod1D Gene Chemical Interaction',
        'Mod1E Gene Interaction',
        'Mod3 Candidate Merging',
    )
]

In [None]:
# Accumulator for the candidate records that each module cell appends to.
results = []

## Workflow input

# Mod0 Input Lookup
## Get disease-associated genes from the BioLink API (api.monarchinitiative.org)

In [None]:
from Modules.Mod0_lookups import LookUp

# Mod0 lookup helper. The workflow input is a disease identifier, which is
# loaded into this object in the next cell.
lu = LookUp()

In [None]:
# Placeholder disease CURIE: replace with a real MONDO identifier before running.
input_disease = 'MONDO:xxxxxxx'  # Some Disease

# Hand the Mod0 request to the lookup helper, then keep the object the helper
# exposes; later cells read its 'id' and 'label' entries.
lu.load_input_object(
    input_object={
        'input': input_disease,
        'parameters': {'taxon': 'human', 'threshold': None},
    }
)
input_object = lu.input_object

In [None]:
# Create the workflow graph for the input disease, then register one node per
# workflow module.
path_graph = PathGraph(
    input_curie=input_object['id'],
    input_label=input_object['label'],
)
path_graph.load_nodes(node_list=worklow_modules)

In [None]:
# Look up the genes associated with the input disease (Mod0 lookup via BioLink).
disease_associated_genes = lu.disease_geneset_lookup()
# Gene CURIEs used as input for the downstream modules.
# NOTE: despite its name, input_curie_set is a plain list, not a set.
input_curie_set = disease_associated_genes['hit_id'].tolist()
# Show the disease-associated gene table.
disease_associated_genes

In [None]:
# Graph the disease genes: add them as nodes, attach them as outputs of the
# input disease, and feed them into every module except the last entry (Mod3).
path_graph.load_nodes(
    list(zip(disease_associated_genes['hit_id'], disease_associated_genes['hit_label']))
)
path_graph.module_outputs(
    output_gene_set=input_curie_set,
    module_id=path_graph.conv_pref(input_disease),
)
for module_id, _module_label in worklow_modules[:-1]:
    path_graph.module_inputs(input_gene_set=input_curie_set, module_id=module_id)
path_graph.path_graph

In [None]:
# add Mod0 results to output
# Drop Mod0 records left over from an earlier run of this cell so that
# re-running it does not duplicate rows in `results`.
results[:] = [record for record in results if record['module'] != 'Mod0']
# Rows are read as plain tuples and mapped by position, exactly as before.
# Integer keys on a string-labelled row Series (`row[0]`) are a deprecated
# positional lookup in pandas, so iterrows() is avoided here.
# TODO(review): confirm the column order returned by lu.disease_geneset_lookup().
for row in disease_associated_genes.itertuples(index=False, name=None):
    results.append({
        'input_curie': row[0],
        'input_name': row[1],
        'output_curie': row[2],
        'output_name': row[3],
        'module': 'Mod0',
        'score': None,
    })

# Mod1A Functional Similarity
## Find similar genes based on GO functional annotations using OntoBio Jaccard similarity

In [None]:
from Modules.Mod1A_functional_sim import FunctionalSimilarity

# Module specification
# The Mod1A input object is built in the next cell, immediately before it is
# loaded. A second copy that used to live here (threshold 0.3) was always
# overwritten before use, so it has been removed to leave one effective value.
func_sim = FunctionalSimilarity()

In [None]:
# Mod1A request: the disease gene list plus the taxon / similarity threshold
# parameters handed to the module.
mod1a_input_object = {
    'input': input_curie_set,
    'parameters': {'taxon': 'human', 'threshold': 0.75},
}

# Load the request, then the gene set and the associations, in that order.
func_sim.load_input_object(mod1a_input_object)
func_sim.load_gene_set()
func_sim.load_associations()

In [None]:
# Run the similarity computation and collect what it returns in a DataFrame.
Mod1A_results = pd.DataFrame(func_sim.compute_similarity())

In [None]:
# Discard hits whose CURIE is already one of the input (disease-associated) genes.
Mod1A_results = Mod1A_results.loc[
    ~Mod1A_results['hit_curie'].isin(input_curie_set)
]

In [None]:
# Show the Mod1A hits that remain after removing the input genes.
Mod1A_results

In [None]:
# Graph the Mod1A hits: add them as nodes, feed them into Mod3, and attach them
# as outputs of Mod1A.
mod1a_hit_curies = Mod1A_results['hit_curie']
path_graph.load_nodes(list(zip(mod1a_hit_curies, Mod1A_results['hit_name'])))
path_graph.module_inputs(
    input_gene_set=mod1a_hit_curies.tolist(),
    module_id='Mod3',
)
path_graph.module_outputs(
    output_gene_set=mod1a_hit_curies.tolist(),
    module_id='Mod1A',
)
path_graph.path_graph

In [None]:
# add Mod1A results to output
# Drop Mod1A records left over from an earlier run of this cell so that
# re-running it does not duplicate rows in `results`.
results[:] = [record for record in results if record['module'] != 'Mod1A']
# Rows are read as plain tuples and mapped by position, exactly as before.
# Integer keys on a string-labelled row Series (`row[0]`) are a deprecated
# positional lookup in pandas, so iterrows() is avoided here.
# TODO(review): confirm the column order produced by func_sim.compute_similarity().
for row in Mod1A_results.itertuples(index=False, name=None):
    results.append({
        'input_curie': row[0],
        'input_name': row[1],
        'output_curie': row[2],
        'output_name': row[3],
        'module': 'Mod1A',
        'score': row[4],
    })

# Mod1B Phenotype Similarity
## Find similar genes based on OwlSim-calculated phenotype similarity

In [None]:
from Modules.Mod1B_phenotype_similarity import PhenotypeSimilarity

# Module specification: the disease gene list plus the taxon / threshold
# parameters handed to the module.
mod1b_input_object = {
    'input': input_curie_set,
    'parameters': {'taxon': 'human', 'threshold': 100},
}

# Module instance; the specification above is loaded into it in the next cell.
pheno_sim = PhenotypeSimilarity()

In [None]:
# Load the Mod1B request, then the gene set and the associations, in that order.
pheno_sim.load_input_object(mod1b_input_object)
pheno_sim.load_gene_set()
pheno_sim.load_associations()

In [None]:
# Compute phenotype similarity (requested with sim_type='disease') and collect
# what the module returns in a DataFrame.
Mod1B_results = pd.DataFrame(pheno_sim.compute_similarity(sim_type='disease'))

In [None]:
# Show the Mod1B phenotype-similarity hits.
Mod1B_results

In [None]:
# Graph the Mod1B hits: add them as nodes, attach them as outputs of Mod1B, and
# feed them into Mod3.
mod1b_hit_ids = Mod1B_results['id']
path_graph.load_nodes(list(zip(mod1b_hit_ids, Mod1B_results['label_x'])))
path_graph.module_outputs(
    output_gene_set=mod1b_hit_ids.tolist(),
    module_id='Mod1B',
)
path_graph.module_inputs(
    input_gene_set=mod1b_hit_ids.tolist(),
    module_id='Mod3',
)
path_graph.path_graph

In [None]:
# add Mod1B results to output
# Drop Mod1B records left over from an earlier run of this cell so that
# re-running it does not duplicate rows in `results`.
results[:] = [record for record in results if record['module'] != 'Mod1B']
# Rows are read as plain tuples and mapped by position, exactly as before
# (last column -> input curie, columns 1-3 -> output curie, output name, score).
# Integer keys on a string-labelled row Series (`row[-1]`) are a deprecated
# positional lookup in pandas, so iterrows() is avoided here.
# TODO(review): confirm the column order produced by pheno_sim.compute_similarity().
for row in Mod1B_results.itertuples(index=False, name=None):
    results.append({
        'input_curie': row[-1],
        'input_name': None,
        'output_curie': row[1],
        'output_name': row[2],
        'module': 'Mod1B',
        'score': row[3]
    })

# Mod1C Coexpression

In [None]:
# TODO: implement the Mod1C coexpression module

# Mod1D Chemical Gene Interactions

In [None]:
# TODO: implement the Mod1D chemical-gene interaction module

# Mod1E Gene Interactions

In [None]:
from Modules.Mod1E_interactions import GeneInteractions
# Mod1E module instance; its input object is loaded in the next cell.
gene_inter = GeneInteractions()

In [None]:
# Mod1E request: the disease gene list plus taxon / threshold parameters
# (no threshold is set for this module).
mod1e_input_object = {
    'input': input_curie_set,
    'parameters': {'taxon': 'human', 'threshold': None},
}

# Load the request first, then the gene set.
gene_inter.load_input_object(input_object=mod1e_input_object)
gene_inter.load_gene_set()

In [None]:
# Fetch the interactions and collect what the module returns in a DataFrame.
Mod1E_results = pd.DataFrame(gene_inter.get_interactions())

In [None]:
# Show the raw Mod1E interaction hits (before any filtering).
Mod1E_results

In [None]:
# Discard hits whose name matches the label of a disease-associated gene.
# (This matches on names/labels, not on CURIEs.)
Mod1E_results = Mod1E_results.loc[
    ~Mod1E_results['hit_name'].isin(disease_associated_genes['hit_label'])
]

In [None]:
# Collapse the interactions to one row per hit gene, with the interacting input
# genes comma-joined into a single 'input_curie' string.
grouped_Mod1E_results = Mod1E_results.groupby(['hit_curie', 'hit_name'])['input_curie'].apply(', '.join).reset_index()

# Keep only hits whose joined string lists more than 3 input genes. The count is
# taken from the joined string, as before, but with a vectorized mask instead
# of iterrows() and the deprecated positional `row[-1]` lookup.
input_gene_counts = grouped_Mod1E_results['input_curie'].map(
    lambda joined_curies: len(joined_curies.split(','))
)
trimmed_Mod1E_results = grouped_Mod1E_results.loc[
    input_gene_counts > 3, ['hit_curie', 'hit_name', 'input_curie']
]

# reset_index() (without drop) deliberately keeps the old row label as a leading
# 'index' column, preserving the column layout this cell has always produced.
Mod1E_results = trimmed_Mod1E_results.reset_index()
Mod1E_results

In [None]:
# Graph only the first 10 Mod1E hits: add them as nodes, attach them as outputs
# of Mod1E, and feed them into Mod3.
mod1e_top_hits = Mod1E_results.head(10)
path_graph.load_nodes(list(zip(mod1e_top_hits['hit_curie'], mod1e_top_hits['hit_name'])))
path_graph.module_outputs(
    output_gene_set=mod1e_top_hits['hit_curie'].tolist(),
    module_id='Mod1E',
)
path_graph.module_inputs(
    input_gene_set=mod1e_top_hits['hit_curie'].tolist(),
    module_id='Mod3',
)
path_graph.path_graph

In [None]:
# add Mod1E results to output
# Drop Mod1E records left over from an earlier run of this cell so that
# re-running it does not duplicate rows in `results`.
results[:] = [record for record in results if record['module'] != 'Mod1E']
# Columns are read by name rather than by position. The positional lookups used
# before (`row[-1]`, `row[1]`, `row[2]`) are deprecated in pandas and only lined
# up because of the extra leading 'index' column in Mod1E_results.
for hit_curie, hit_name, input_curies in zip(
    Mod1E_results['hit_curie'],
    Mod1E_results['hit_name'],
    Mod1E_results['input_curie'],
):
    results.append({
        'input_curie': input_curies,  # comma-joined input gene CURIEs
        'input_name': None,
        'output_curie': hit_curie,
        'output_name': hit_name,
        'module': 'Mod1E',
        'score': None
    })

In [None]:
# Build one DataFrame from the records accumulated by all module cells.
results_df = pd.DataFrame(results)

In [None]:
# Show the combined per-module candidate records.
results_df

In [None]:
# Summarise, per output gene, which modules reported it (module names are
# comma-joined). The second reset_index() adds the running row number as an
# extra 'index' column.
(
    results_df
    .groupby(['output_name', 'output_curie'])['module']
    .apply(', '.join)
    .reset_index()
    .reset_index()
)