In [1]:
from BioLink.biolink_client import BioLinkWrapper
import pandas as pd
from pprint import pprint
from collections import defaultdict
from mygene import MyGeneInfo
from GraphVisuals.graphviz_wrapper import PathGraph

# Workflow II Rare Disease Candidates

In [2]:
from Modules.Mod0_lookups import LookUp

# The workflow input is a disease identifier. LookUp (Mod0) is the helper the
# following cells load that disease into and query for its associated genes.
lu = LookUp()

Mod O DiseaseGeneLookup metadata:
{'data_type': 'disease',
 'input_type': {'complexity': 'single', 'id_type': ['MONDO', 'DO', 'OMIM']},
 'limit': None,
 'output_type': {'complexity': 'set', 'id_type': 'HGNC'},
 'predicate': 'blm:gene associated with condition',
 'source': 'Monarch Biolink',
 'taxon': 'human'}


In [3]:
# Workflow input: one disease CURIE (Fanconi anemia).
input_disease = 'MONDO:0019391'

# Mod0 specification: the disease to look up plus the run parameters.
input_object = dict(
    input=input_disease,
    parameters=dict(taxon='human', threshold=None),
)

# Hand the specification to the lookup, then rebind the name to the object the
# lookup holds afterwards (the cell output shows the disease id, label and
# description being reported at this point).
lu.load_input_object(input_object=input_object)
input_object = lu.input_object

{'description': 'Fanconi anemia (FA) is a hereditary DNA repair disorder '
                'characterized by progressive pancytopenia with bone marrow '
                'failure, variable congenital malformations and predisposition '
                'to develop hematological or solid tumors.',
 'id': 'MONDO:0019391',
 'label': 'Fanconi anemia'}


## Disease Associated Genes

In [4]:
# get genes associated with the input disease from Biolink
disease_associated_genes = lu.disease_geneset_lookup()
# create list of gene curies for downstream module input
input_curie_set = disease_associated_genes['hit_id'].tolist()
# show the disease-associated gene table
disease_associated_genes

Unnamed: 0,input_id,input_label,hit_id,hit_label,ncbi,sources
0,MONDO:0019391,Fanconi anemia,HGNC:1100,BRCA1,NCBIGene:672,clinvar
1,MONDO:0019391,Fanconi anemia,HGNC:1101,BRCA2,NCBIGene:675,"orphane, omim, ctd, orphane, clinvar"
2,MONDO:0019391,Fanconi anemia,HGNC:11892,TNF,NCBIGene:7124,ctd
3,MONDO:0019391,Fanconi anemia,HGNC:12829,XRCC2,NCBIGene:7516,"orphane, ctd, omim"
4,MONDO:0019391,Fanconi anemia,HGNC:20473,BRIP1,NCBIGene:83990,"ctd, orphane, omim, ctd, clinvar"
5,MONDO:0019391,Fanconi anemia,HGNC:20748,FANCL,NCBIGene:55120,"ctd, orphane, ctd, omim, clinvar"
6,MONDO:0019391,Fanconi anemia,HGNC:23168,FANCM,NCBIGene:57697,"ctd, orphane"
7,MONDO:0019391,Fanconi anemia,HGNC:23845,SLX4,NCBIGene:84464,"ctd, orphane, ctd, omim, clinvar"
8,MONDO:0019391,Fanconi anemia,HGNC:25009,UBE2T,NCBIGene:29089,"omim, ctd, clinvar, orphane"
9,MONDO:0019391,Fanconi anemia,HGNC:25539,RFWD3,NCBIGene:55159,orphane


# Mod1A Functional Similarity
## Find similar genes based on GO functional annotations using OntoBio Jaccard similarity

In [5]:
from Modules.Mod1A_functional_sim import FunctionalSimilarity



## Mod1A Human

In [6]:
# Mod1A specification for the human run: the disease-associated gene CURIEs
# are the input set, compared within the human taxon.
mod1a_input_object_human = dict(
    input=input_curie_set,
    parameters=dict(
        taxon='human',
        threshold=0.75,  # jaccard index threshold
    ),
)

# Similarity engine for the human run; the specification is loaded into it in
# the following cell.
func_sim_human = FunctionalSimilarity()

Mod1A Functional Similarity metadata:
{'input_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'output_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'predicate': ['blm:macromolecular machine to biological process association',
               'macromolecular machine to molecular activity association'],
 'source': 'Monarch Biolink'}


In [7]:
# Prepare the human similarity run. The calls are kept in this order: register
# the module specification, then load the gene set, then the associations.
# (Presumably the associations are the GO annotations named in the section
# heading -- confirm in Modules.Mod1A_functional_sim.)
func_sim_human.load_input_object(mod1a_input_object_human)
func_sim_human.load_gene_set() 
func_sim_human.load_associations()

In [8]:
# Score candidate genes, then drop every hit that is itself one of the input
# (disease-associated) genes so only new candidates remain.
Mod1A_results_human = (
    pd.DataFrame(func_sim_human.compute_similarity())
    .loc[lambda hits: ~hits['hit_curie'].isin(input_curie_set)]
)
Mod1A_results_human

Unnamed: 0,hit_curie,hit_name,hit_score,input_curie,input_name
8,HGNC:20922,SLX1A,0.782313,HGNC:23845,SLX4
15,HGNC:26171,FAAP100,0.857143,HGNC:3583,FANCB
16,HGNC:25021,ASTE1,0.793103,HGNC:3583,FANCB
17,HGNC:21700,RAD9B,0.92,HGNC:3583,FANCB
18,HGNC:22223,EEPD1,0.785714,HGNC:3583,FANCB
20,HGNC:28467,FAAP24,0.827586,HGNC:3583,FANCB
22,HGNC:24994,INIP,0.851852,HGNC:3583,FANCB
25,HGNC:26171,FAAP100,0.785714,HGNC:3586,FANCE
26,HGNC:21700,RAD9B,0.84,HGNC:3586,FANCE
27,HGNC:22223,EEPD1,0.777778,HGNC:3586,FANCE


## Mod1A Mouse

In [9]:
## Get mouse orthologs from Monarch Initiative
from Modules.ortholog_traversal import OrthologTraversal

# One traversal helper is shared by this cell and the later mouse -> human step.
ot = OrthologTraversal()

# Ask for the mouse orthologs of every disease-associated gene and tabulate
# whatever records come back.
mouse_orthologs = pd.DataFrame(
    ot.ortholog_set_by_taxid(
        gene_set=disease_associated_genes['hit_id'].tolist(),
        taxon_name='mouse',
    )
)

In [10]:
# Mod1A specification for the mouse run: the mouse ortholog ids are the input
# set, compared within the mouse taxon (note the lower threshold than the
# human run).
mod1a_input_object_mouse = dict(
    input=mouse_orthologs['hit_id'].tolist(),
    parameters=dict(
        taxon='mouse',
        threshold=0.65,  # jaccard index threshold
    ),
)

# Separate similarity engine for the mouse run; the specification is loaded
# into it in the following cell.
func_sim_mouse = FunctionalSimilarity()

Mod1A Functional Similarity metadata:
{'input_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'output_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'predicate': ['blm:macromolecular machine to biological process association',
               'macromolecular machine to molecular activity association'],
 'source': 'Monarch Biolink'}


In [11]:
# Prepare the mouse similarity run, mirroring the human run: register the
# module specification, then load the gene set, then the associations.
func_sim_mouse.load_input_object(mod1a_input_object_mouse)
func_sim_mouse.load_gene_set() 
func_sim_mouse.load_associations()

In [81]:
# Run the mouse similarity computation and tabulate the raw results; they are
# filtered and mapped back to human genes in the next cell.
Mod1A_results_mouse = pd.DataFrame(func_sim_mouse.compute_similarity())

In [83]:
### filter results and traverse back to human orthologs
# Drop hits whose name matches the label of one of the input mouse orthologs,
# then order the remaining hits from highest to lowest score.
Mod1A_results_mouse = (
    Mod1A_results_mouse
    .loc[lambda hits: ~hits['hit_name'].isin(mouse_orthologs['hit_label'])]
    .sort_values('hit_score', ascending=False)
)

def trim_mgi_prefix(curie):
    """Return ``curie`` with its first four characters removed.

    The cut is purely positional: nothing checks that the removed characters
    actually spell a prefix such as ``MGI:``.
    NOTE(review): presumably the incoming hit CURIEs carry a redundant leading
    ``MGI:`` -- confirm against the similarity module's output.
    """
    prefix_length = 4
    return curie[prefix_length:]

# Derive the id used for the ortholog lookups by trimming every hit CURIE.
Mod1A_results_mouse['hit_id'] = [
    trim_mgi_prefix(hit_curie) for hit_curie in Mod1A_results_mouse['hit_curie']
]
def back2human(gene_curie):
    """Look up the human ortholog records for a single gene.

    Delegates to the notebook-level ``ot`` traversal helper and returns its
    result unchanged; the caller concatenates these results, so a list is
    expected.
    """
    return ot.single_gene_ortholog(gene=gene_curie, taxon_name='human')
# Look up human orthologs once per distinct mouse hit. The same mouse gene can
# be a hit for several input genes; querying it once per row would repeat its
# ortholog records in human_hits, and the many-to-many merge below would then
# multiply the matching result rows (and issue redundant service calls).
human_hits = []
for term in Mod1A_results_mouse['hit_id'].drop_duplicates().tolist():
    # extend() appends in place instead of rebuilding the list each iteration
    human_hits.extend(back2human(gene_curie=term))

# When no hit has a human ortholog, pd.DataFrame([]) has no columns and the
# merge on 'gene_id' would raise; fall back to an empty frame that still
# carries the ortholog columns used below.
human_orthologs = pd.DataFrame(human_hits)
if human_orthologs.empty:
    human_orthologs = pd.DataFrame(columns=['gene_id', 'orth_id', 'orth_label'])

# Outer merge keeps mouse hits that have no human ortholog (their ortholog
# columns are left empty).
Mod1A_results_mouse_merged = pd.merge(Mod1A_results_mouse, human_orthologs, how='outer', left_on='hit_id', right_on='gene_id')
# Attach the original human input gene by joining each result's input mouse
# gene back onto the mouse ortholog table.
Mod1A_results_mouse_traversed = pd.merge(mouse_orthologs, Mod1A_results_mouse_merged, left_on='hit_id', right_on='input_curie')
# Keep only the columns that describe input gene -> mouse hit -> human ortholog.
Mod1A_results_final = Mod1A_results_mouse_traversed[['input_id', 'input_label', 'gene_id', 'hit_name', 'hit_score', 'orth_id', 'orth_label']]

In [84]:
# Show the final table: input gene, mouse hit with its score, and the human
# ortholog columns (empty where the traversal found none).
Mod1A_results_final

Unnamed: 0,input_id,input_label,gene_id,hit_name,hit_score,orth_id,orth_label
0,HGNC:25009,UBE2T,MGI:1920568,Ube2d2b,0.676471,HGNC:12475,UBE2D2
1,HGNC:25009,UBE2T,MGI:1914049,Ube2w,0.663043,HGNC:25616,UBE2W
2,HGNC:25009,UBE2T,MGI:1930715,Ube2d2a,0.661765,HGNC:12475,UBE2D2
3,HGNC:7532,MX1,,Tgtp1,0.716981,,
4,HGNC:7532,MX1,MGI:109493,Rab33a,0.682927,HGNC:9773,RAB33A
5,HGNC:7532,MX1,MGI:109493,Rab33a,0.682927,HGNC:9768,RAB28
6,HGNC:7532,MX1,MGI:1923805,Mmaa,0.651163,HGNC:18871,MMAA
