In [1]:
from BioLink.biolink_client import BioLinkWrapper
import pandas as pd
from pprint import pprint
from collections import defaultdict
from mygene import MyGeneInfo
from GraphVisuals.graphviz_wrapper import PathGraph

# Workflow II Rare Disease Candidates

In [2]:
from Modules.Mod0_lookups import LookUp

# Workflow input is a disease identifier (MONDO, DO or OMIM per the module
# metadata printed below). `lu` is the lookup helper reused by later cells.
lu = LookUp()

Mod O DiseaseGeneLookup metadata:
{'data_type': 'disease',
 'input_type': {'complexity': 'single', 'id_type': ['MONDO', 'DO', 'OMIM']},
 'limit': None,
 'output_type': {'complexity': 'set', 'id_type': 'HGNC'},
 'predicate': 'blm:gene associated with condition',
 'source': 'Monarch Biolink',
 'taxon': 'human'}


In [3]:
# Seed disease for the whole workflow.
input_disease = 'MONDO:0019391'  # Fanconi anemia

# Lookup request: the disease curie plus the lookup parameters.
lookup_parameters = {'taxon': 'human', 'threshold': None}
input_object = {'input': input_disease, 'parameters': lookup_parameters}

# Hand the request to the lookup module, then rebind `input_object` to the
# module's own view of it; later cells read input_object['id'] / ['label'].
lu.load_input_object(input_object=input_object)
input_object = lu.input_object

{'description': 'Fanconi anemia (FA) is a hereditary DNA repair disorder '
                'characterized by progressive pancytopenia with bone marrow '
                'failure, variable congenital malformations and predisposition '
                'to develop hematological or solid tumors.',
 'id': 'MONDO:0019391',
 'label': 'Fanconi anemia'}


In [31]:
# Accumulator for the final candidates contributed by each module.
results = []

# One (short name, full title) pair per workflow module.
# NOTE(review): the variable name is misspelled ("worklow"); it is kept
# unchanged here because cells outside this view may reference it.
worklow_modules = [
    ('Mod1A', 'Mod1A Functional Similarity'),
    ('Mod1B', 'Mod1B Phenotype Similarity'),
    ('Mod1C', 'Mod1C Coexpression'),
    ('Mod1D', 'Mod1D Gene Chemical Interaction'),
    ('Mod1E', 'Mod1E Gene Interaction'),
    ('Mod3', 'Mod3 Candidate Merging'),
]

# Build the workflow graph from the input disease and register the modules.
path_graph = PathGraph(
    input_curie=input_object['id'],
    input_label=input_object['label'],
)
path_graph.load_nodes(node_list=worklow_modules)

## Disease Associated Genes

In [4]:
# get genes associated with disease from Biolink
disease_associated_genes = lu.disease_geneset_lookup()
# create list of gene curies for downstream module input
input_curie_set = disease_associated_genes['hit_id'].tolist()
# show the disease-associated gene table
disease_associated_genes

Unnamed: 0,input_id,input_label,hit_id,hit_label,ncbi,sources
0,MONDO:0019391,Fanconi anemia,HGNC:1100,BRCA1,NCBIGene:672,clinvar
1,MONDO:0019391,Fanconi anemia,HGNC:1101,BRCA2,NCBIGene:675,"orphane, omim, ctd, orphane, clinvar"
2,MONDO:0019391,Fanconi anemia,HGNC:11892,TNF,NCBIGene:7124,ctd
3,MONDO:0019391,Fanconi anemia,HGNC:12829,XRCC2,NCBIGene:7516,"orphane, ctd, omim"
4,MONDO:0019391,Fanconi anemia,HGNC:20473,BRIP1,NCBIGene:83990,"ctd, orphane, omim, ctd, clinvar"
5,MONDO:0019391,Fanconi anemia,HGNC:20748,FANCL,NCBIGene:55120,"ctd, orphane, ctd, omim, clinvar"
6,MONDO:0019391,Fanconi anemia,HGNC:23168,FANCM,NCBIGene:57697,"ctd, orphane"
7,MONDO:0019391,Fanconi anemia,HGNC:23845,SLX4,NCBIGene:84464,"ctd, orphane, ctd, omim, clinvar"
8,MONDO:0019391,Fanconi anemia,HGNC:25009,UBE2T,NCBIGene:29089,"omim, ctd, clinvar, orphane"
9,MONDO:0019391,Fanconi anemia,HGNC:25539,RFWD3,NCBIGene:55159,orphane


In [5]:
## Get mouse orthologs from Monarch Initiative
from Modules.ortholog_traversal import OrthologTraversal

ot = OrthologTraversal()

# Request the mouse orthologs of every disease-associated human gene curie,
# then tabulate the returned records.
human_gene_curies = disease_associated_genes['hit_id'].tolist()
mouse_ortholog_records = ot.ortholog_set_by_taxid(
    gene_set=human_gene_curies,
    taxon_name='mouse',
)
mouse_orthologs = pd.DataFrame(mouse_ortholog_records)
mouse_orthologs

Unnamed: 0,hit_id,hit_label,input_id,input_label,sources
0,MGI:104537,Brca1,HGNC:1100,BRCA1,[panther]
1,MGI:109337,Brca2,HGNC:1101,BRCA2,[panther]
2,MGI:104798,Tnf,HGNC:11892,TNF,[panther]
3,MGI:1927345,Xrcc2,HGNC:12829,XRCC2,[panther]
4,MGI:2442836,Brip1,HGNC:20473,BRIP1,[panther]
5,MGI:1914280,Fancl,HGNC:20748,FANCL,[panther]
6,MGI:2442306,Fancm,HGNC:23168,FANCM,[panther]
7,MGI:106299,Slx4,HGNC:23845,SLX4,[panther]
8,MGI:1914446,Ube2t,HGNC:25009,UBE2T,[panther]
9,MGI:2384584,Rfwd3,HGNC:25539,RFWD3,[panther]


# Mod1A Functional Similarity
## Find similar genes based on GO functional annotations using OntoBio Jaccard similarity

In [6]:
from Modules.Mod1A_functional_sim import FunctionalSimilarity

## Mod1A Human

In [7]:
# Module specification (human): input gene curies plus similarity parameters.
mod1a_parameters_human = {
    'taxon': 'human',
    'threshold': 0.75,  # jaccard index threshold
}
mod1a_input_object_human = {
    'input': input_curie_set,
    'parameters': mod1a_parameters_human,
}

# Instantiating the module prints its metadata (shown below the cell).
func_sim_human = FunctionalSimilarity()

Mod1A Functional Similarity metadata:
{'input_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'output_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'predicate': ['blm:macromolecular machine to biological process association',
               'macromolecular machine to molecular activity association'],
 'source': 'Monarch Biolink'}


In [8]:
# Register the human request with the module, then load its gene set.
# Order matters: the request must be loaded before the gene set.
# NOTE(review): ordering requirement inferred from the call sequence — confirm.
func_sim_human.load_input_object(mod1a_input_object_human)
func_sim_human.load_gene_set() 


In [9]:
# Load the associations the similarity computation works on (presumably the
# GO functional annotations named in the section heading — TODO confirm).
func_sim_human.load_associations()

In [10]:
# Score candidates, then drop every hit that is itself one of the input
# disease genes so only new candidates remain.
mod1a_human_scores = pd.DataFrame(func_sim_human.compute_similarity())
is_input_gene = mod1a_human_scores['hit_curie'].isin(input_curie_set)
Mod1A_results_human = mod1a_human_scores.loc[~is_input_gene]
Mod1A_results_human

Unnamed: 0,hit_curie,hit_name,hit_score,input_curie,input_name
8,HGNC:20922,SLX1A,0.785235,HGNC:23845,SLX4
15,HGNC:26171,FAAP100,0.866667,HGNC:3583,FANCB
16,HGNC:25021,ASTE1,0.806452,HGNC:3583,FANCB
17,HGNC:21700,RAD9B,0.925926,HGNC:3583,FANCB
18,HGNC:22223,EEPD1,0.8,HGNC:3583,FANCB
20,HGNC:28467,FAAP24,0.83871,HGNC:3583,FANCB
22,HGNC:24994,INIP,0.862069,HGNC:3583,FANCB
25,HGNC:26171,FAAP100,0.766667,HGNC:3586,FANCE
26,HGNC:21700,RAD9B,0.814815,HGNC:3586,FANCE
27,HGNC:22223,EEPD1,0.758621,HGNC:3586,FANCE


## Mod1A Mouse

In [11]:
# Module specification (mouse): ortholog curies plus similarity parameters.
mod1a_parameters_mouse = {
    'taxon': 'mouse',
    'threshold': 0.65,  # jaccard index threshold
}
mod1a_input_object_mouse = {
    'input': mouse_orthologs['hit_id'].tolist(),
    'parameters': mod1a_parameters_mouse,
}

# Instantiating the module prints its metadata (shown below the cell).
func_sim_mouse = FunctionalSimilarity()

Mod1A Functional Similarity metadata:
{'input_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'output_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'predicate': ['blm:macromolecular machine to biological process association',
               'macromolecular machine to molecular activity association'],
 'source': 'Monarch Biolink'}


In [12]:
# Same loading sequence as the human run: request, gene set, associations.
func_sim_mouse.load_input_object(mod1a_input_object_mouse)
func_sim_mouse.load_gene_set() 
func_sim_mouse.load_associations()

In [13]:
# Tabulate the raw mouse similarity records; filtering happens in the next cell.
Mod1A_results_mouse = pd.DataFrame(func_sim_mouse.compute_similarity())

In [14]:
### filter results and traverse back to human orthologs
# Drop hits whose name matches one of the input mouse orthologs, then rank
# the remaining hits by descending similarity score.
is_input_ortholog = Mod1A_results_mouse['hit_name'].isin(mouse_orthologs['hit_label'])
Mod1A_results_mouse = (
    Mod1A_results_mouse
    .loc[~is_input_ortholog]
    .sort_values('hit_score', ascending=False)
)

def trim_mgi_prefix(curie):
    """Return ``curie`` with one leading ``'MGI:'`` prefix removed.

    Only an actual ``'MGI:'`` prefix is stripped; a string that does not
    start with it is returned unchanged instead of losing its first four
    characters. A doubled prefix (``'MGI:MGI:123'``) loses only the first one.

    Args:
        curie: identifier string, e.g. ``'MGI:104537'``.

    Returns:
        The identifier without its leading ``'MGI:'``, or ``curie`` itself
        when the prefix is absent.
    """
    prefix = 'MGI:'
    if curie.startswith(prefix):
        return curie[len(prefix):]
    return curie

# Derive the `hit_id` join key by running trim_mgi_prefix over every hit curie.
Mod1A_results_mouse = Mod1A_results_mouse.assign(
    hit_id=Mod1A_results_mouse['hit_curie'].apply(trim_mgi_prefix),
)

def back2human(gene_curie):
    """Look up the human ortholog hits for a single gene curie.

    Delegates to the module-level ``ot`` traversal object with
    ``taxon_name='human'`` and returns its result unchanged; the calling
    cell concatenates these results into one list.
    """
    return ot.single_gene_ortholog(gene=gene_curie, taxon_name='human')

# Collect the human ortholog records for every mouse hit into one flat list.
# `extend` appends in place; the previous `human_hits = human_hits + ...`
# rebuilt the whole list on each iteration (quadratic in the number of hits).
human_hits = []
for term in Mod1A_results_mouse['hit_id'].tolist():
    human_hits.extend(back2human(gene_curie=term))
    
# Attach human ortholog records to the mouse hits. The outer join keeps
# mouse hits for which no human ortholog record came back.
human_hits_table = pd.DataFrame(human_hits)
Mod1A_results_mouse_merged = Mod1A_results_mouse.merge(
    human_hits_table,
    how='outer',
    left_on='hit_id',
    right_on='gene_id',
)

# Link each hit back to the original human disease gene through the mouse
# ortholog that served as the similarity input.
Mod1A_results_mouse_traversed = mouse_orthologs.merge(
    Mod1A_results_mouse_merged,
    left_on='hit_id',
    right_on='input_curie',
)

# Keep only the columns reported as the Mod1A mouse result.
final_columns = ['input_id', 'input_label', 'gene_id', 'hit_name', 'hit_score', 'orth_id', 'orth_label']
Mod1A_results_final = Mod1A_results_mouse_traversed[final_columns]

In [15]:
# Mouse-derived Mod1A candidates with their human orthologs (orth_id /
# orth_label); rows with no ortholog record have those fields blank.
Mod1A_results_final

Unnamed: 0,input_id,input_label,gene_id,hit_name,hit_score,orth_id,orth_label
0,HGNC:23168,FANCM,MGI:894324,Cenpx,0.651786,HGNC:11422,CENPX
1,HGNC:25009,UBE2T,MGI:1920568,Ube2d2b,0.685714,HGNC:12475,UBE2D2
2,HGNC:25009,UBE2T,MGI:1930715,Ube2d2a,0.671429,HGNC:12475,UBE2D2
3,HGNC:25009,UBE2T,MGI:1914049,Ube2w,0.670213,HGNC:25616,UBE2W
4,HGNC:25009,UBE2T,MGI:1914865,Ube2r2,0.657143,HGNC:19907,UBE2R2
5,HGNC:7532,MX1,,Tgtp1,0.727273,,
6,HGNC:7532,MX1,MGI:109493,Rab33a,0.697674,HGNC:9773,RAB33A
7,HGNC:7532,MX1,MGI:109493,Rab33a,0.697674,HGNC:9768,RAB28


# Mod1B Phenotype Similarity
## Find similar genes based on OwlSim calculated Phenotype Similarity

## Mod1B Human

In [16]:
from Modules.Mod1B1_phenotype_similarity import PhenotypeSimilarity

In [17]:
# Module specification (human): input gene curies plus similarity parameters.
mod1b_parameters_human = {
    'taxon': 'human',
    'threshold': 0.45,
}
mod1b_input_object_human = {
    'input': input_curie_set,
    'parameters': mod1b_parameters_human,
}

# Instantiating the module prints its metadata (shown below the cell).
pheno_sim_human = PhenotypeSimilarity()

Mod1B Phenotype Similarity metadata:
{'input_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'output_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'predicate': ['blm:has phenotype'],
 'source': 'Monarch Biolink'}


In [18]:
# Loading sequence for the human phenotype run: request, gene set, associations.
pheno_sim_human.load_input_object(mod1b_input_object_human)
pheno_sim_human.load_gene_set()
pheno_sim_human.load_associations()

In [19]:
# compute phenotype similarity, discard hits that are input disease genes,
# and rank what is left by descending score
mod1b_human_scores = pd.DataFrame(pheno_sim_human.compute_similarity())
is_input_gene = mod1b_human_scores['hit_curie'].isin(input_curie_set)
Mod1B_results = (
    mod1b_human_scores
    .loc[~is_input_gene]
    .sort_values('hit_score', ascending=False)
)

In [20]:
# Human phenotype-similarity hits, highest hit_score first.
Mod1B_results

Unnamed: 0,hit_curie,hit_name,hit_score,input_curie,input_name
23,HGNC:6709,LTA,0.967532,HGNC:11892,TNF
69,HGNC:7499,MT-TT,0.508346,HGNC:20473,BRIP1
60,HGNC:8143,OPCML,0.508346,HGNC:20473,BRIP1
62,HGNC:17271,RRAS2,0.508346,HGNC:20473,BRIP1
216,HGNC:7499,MT-TT,0.503003,HGNC:26144,PALB2
207,HGNC:8143,OPCML,0.503003,HGNC:26144,PALB2
209,HGNC:17271,RRAS2,0.503003,HGNC:26144,PALB2
241,HGNC:20820,TPCN2,0.5,HGNC:3582,FANCA
242,HGNC:15880,CDK5RAP1,0.5,HGNC:3582,FANCA
243,HGNC:1770,CDK10,0.5,HGNC:3582,FANCA


## Mod1B Mouse

In [21]:
# Module specification (mouse): ortholog curies plus similarity parameters.
mod1b_parameters_mouse = {
    'taxon': 'mouse',
    'threshold': 0.75,
}
mod1b_input_object_mouse = {
    'input': mouse_orthologs['hit_id'].tolist(),
    'parameters': mod1b_parameters_mouse,
}

# Instantiating the module prints its metadata (shown below the cell).
pheno_sim_mouse = PhenotypeSimilarity()

Mod1B Phenotype Similarity metadata:
{'input_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'output_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'predicate': ['blm:has phenotype'],
 'source': 'Monarch Biolink'}


In [22]:
# Loading sequence for the mouse phenotype run: request, gene set, associations.
pheno_sim_mouse.load_input_object(mod1b_input_object_mouse)
pheno_sim_mouse.load_gene_set()
pheno_sim_mouse.load_associations()

In [23]:
# Score mouse candidates, discard hits whose name matches an input mouse
# ortholog, and rank what is left by descending score.
mod1b_mouse_scores = pd.DataFrame(pheno_sim_mouse.compute_similarity())
input_ortholog_labels = mouse_orthologs['hit_label'].tolist()
is_input_ortholog = mod1b_mouse_scores['hit_name'].isin(input_ortholog_labels)
Mod1B_results_mouse = (
    mod1b_mouse_scores
    .loc[~is_input_ortholog]
    .sort_values('hit_score', ascending=False)
)

In [24]:
# Mouse phenotype-similarity hits, highest hit_score first.
Mod1B_results_mouse

Unnamed: 0,hit_curie,hit_name,hit_score,input_curie,input_name
52,MGI:1922567,Parpbp,1.000000,MGI:2384790,
72,MGI:1915582,Sdhaf1,1.000000,MGI:2384790,
70,MGI:99207,Zfp60,1.000000,MGI:2384790,
69,MGI:2142810,Ppip5k2,1.000000,MGI:2384790,
68,MGI:1918319,Stox2,1.000000,MGI:2384790,
67,MGI:2384301,Mettl22,1.000000,MGI:2384790,
66,MGI:2449143,Gbgt1,1.000000,MGI:2384790,
64,MGI:2681306,Tas2r138,1.000000,MGI:2384790,
63,MGI:1924311,4931406P16Rik,1.000000,MGI:2384790,
61,MGI:3608413,Rhbdl2,1.000000,MGI:2384790,


In [25]:
def number_of_annotations(mgi_curie):
    """Fetch the annotations recorded for one mouse gene curie.

    Despite the name, this returns the annotation collection itself, not a
    count: the columns built from it render as lists such as
    ``[MP:0011110]``. Reads the module-level ``pheno_sim_mouse``.
    """
    annotations = pheno_sim_mouse.associations.annotations(mgi_curie)
    return annotations

In [26]:
# Attach the annotations behind each score for both the hit gene and the
# input gene, so the basis of each similarity value can be inspected.
Mod1B_results_mouse = Mod1B_results_mouse.assign(
    hit_annotations=Mod1B_results_mouse['hit_curie'].apply(number_of_annotations),
    input_annotations=Mod1B_results_mouse['input_curie'].apply(number_of_annotations),
)

In [27]:
# Same table, now including the hit_annotations / input_annotations columns.
Mod1B_results_mouse

Unnamed: 0,hit_curie,hit_name,hit_score,input_curie,input_name,hit_annotations,input_annotations
52,MGI:1922567,Parpbp,1.000000,MGI:2384790,,[MP:0011110],[MP:0011110]
72,MGI:1915582,Sdhaf1,1.000000,MGI:2384790,,[MP:0011110],[MP:0011110]
70,MGI:99207,Zfp60,1.000000,MGI:2384790,,[MP:0011110],[MP:0011110]
69,MGI:2142810,Ppip5k2,1.000000,MGI:2384790,,[MP:0011110],[MP:0011110]
68,MGI:1918319,Stox2,1.000000,MGI:2384790,,[MP:0011110],[MP:0011110]
67,MGI:2384301,Mettl22,1.000000,MGI:2384790,,[MP:0011110],[MP:0011110]
66,MGI:2449143,Gbgt1,1.000000,MGI:2384790,,[MP:0011110],[MP:0011110]
64,MGI:2681306,Tas2r138,1.000000,MGI:2384790,,[MP:0011110],[MP:0011110]
63,MGI:1924311,4931406P16Rik,1.000000,MGI:2384790,,[MP:0011110],[MP:0011110]
61,MGI:3608413,Rhbdl2,1.000000,MGI:2384790,,[MP:0011110],[MP:0011110]


In [28]:
# Re-display the ortholog table for comparison with the hits above.
# NOTE(review): the input_curie shown in those hits (MGI:2384790, blank
# input_name) does not appear among the rows rendered here — worth checking
# whether it is a genuine input ortholog.
mouse_orthologs

Unnamed: 0,hit_id,hit_label,input_id,input_label,sources
0,MGI:104537,Brca1,HGNC:1100,BRCA1,[panther]
1,MGI:109337,Brca2,HGNC:1101,BRCA2,[panther]
2,MGI:104798,Tnf,HGNC:11892,TNF,[panther]
3,MGI:1927345,Xrcc2,HGNC:12829,XRCC2,[panther]
4,MGI:2442836,Brip1,HGNC:20473,BRIP1,[panther]
5,MGI:1914280,Fancl,HGNC:20748,FANCL,[panther]
6,MGI:2442306,Fancm,HGNC:23168,FANCM,[panther]
7,MGI:106299,Slx4,HGNC:23845,SLX4,[panther]
8,MGI:1914446,Ube2t,HGNC:25009,UBE2T,[panther]
9,MGI:2384584,Rfwd3,HGNC:25539,RFWD3,[panther]
