In [1]:
from BioLink.biolink_client import BioLinkWrapper
import pandas as pd
from pprint import pprint
from collections import defaultdict
from mygene import MyGeneInfo
from GraphVisuals.graphviz_wrapper import PathGraph

In [2]:
from Modules.Mod0_lookups import LookUp

# Workflow input is a disease identifier. Instantiating LookUp appears to print
# the Mod0 DiseaseGeneLookup metadata shown in this cell's output.
lu = LookUp()

Mod O DiseaseGeneLookup metadata:
{'data_type': 'disease',
 'input_type': {'complexity': 'single', 'id_type': ['MONDO', 'DO', 'OMIM']},
 'limit': None,
 'output_type': {'complexity': 'set', 'id_type': 'HGNC'},
 'predicate': 'blm:gene associated with condition',
 'source': 'Monarch Biolink',
 'taxon': 'human'}


In [3]:
# Disease under study: Fanconi anemia.
input_disease = 'MONDO:0019391'

# Mod0 input specification: the disease CURIE plus the lookup parameters
# (human taxon, no threshold).
input_object = dict(
    input=input_disease,
    parameters=dict(taxon='human', threshold=None),
)

# Hand the specification to the lookup, then rebind the name to whatever the
# lookup stored as its own input_object.
lu.load_input_object(input_object=input_object)
input_object = lu.input_object

{'description': 'Fanconi anemia (FA) is a hereditary DNA repair disorder '
                'characterized by progressive pancytopenia with bone marrow '
                'failure, variable congenital malformations and predisposition '
                'to develop hematological or solid tumors.',
 'id': 'MONDO:0019391',
 'label': 'Fanconi anemia'}


In [4]:
# Query Biolink (via the Mod0 lookup) for the genes associated with the disease.
disease_associated_genes = lu.disease_geneset_lookup()

# Collect the gene CURIEs from the hit_id column as a plain list, intended as
# input for downstream modules.
input_curie_set = disease_associated_genes.loc[:, 'hit_id'].tolist()

# Show the disease-gene association table.
disease_associated_genes

Unnamed: 0,input_id,input_label,hit_id,hit_label,ncbi,sources
0,MONDO:0019391,Fanconi anemia,HGNC:1100,BRCA1,NCBIGene:672,clinvar
1,MONDO:0019391,Fanconi anemia,HGNC:1101,BRCA2,NCBIGene:675,"orphane, omim, ctd, orphane, clinvar"
2,MONDO:0019391,Fanconi anemia,HGNC:11892,TNF,NCBIGene:7124,ctd
3,MONDO:0019391,Fanconi anemia,HGNC:12829,XRCC2,NCBIGene:7516,"orphane, ctd, omim"
4,MONDO:0019391,Fanconi anemia,HGNC:20473,BRIP1,NCBIGene:83990,"ctd, orphane, omim, ctd, clinvar"
5,MONDO:0019391,Fanconi anemia,HGNC:20748,FANCL,NCBIGene:55120,"ctd, orphane, ctd, omim, clinvar"
6,MONDO:0019391,Fanconi anemia,HGNC:23168,FANCM,NCBIGene:57697,"ctd, orphane"
7,MONDO:0019391,Fanconi anemia,HGNC:23845,SLX4,NCBIGene:84464,"ctd, orphane, ctd, omim, clinvar"
8,MONDO:0019391,Fanconi anemia,HGNC:25009,UBE2T,NCBIGene:29089,"omim, ctd, clinvar, orphane"
9,MONDO:0019391,Fanconi anemia,HGNC:25539,RFWD3,NCBIGene:55159,orphane


In [5]:
from Modules.ortholog_traversal import OrthologTraversal

In [6]:
# Look up the mouse orthologs of the disease-associated human genes and
# tabulate the returned records.
ot = OrthologTraversal()
mouse_ortholog_records = ot.ortholog_set_by_taxid(
    gene_set=disease_associated_genes['hit_id'].tolist(),
    taxon_name='mouse',
)
mouse_orthologs = pd.DataFrame(mouse_ortholog_records)

In [7]:
# Show the human-to-mouse ortholog table (hit_id holds the mouse gene CURIE).
mouse_orthologs

Unnamed: 0,hit_id,hit_label,input_id,input_label,sources
0,MGI:104537,Brca1,HGNC:1100,BRCA1,[panther]
1,MGI:109337,Brca2,HGNC:1101,BRCA2,[panther]
2,MGI:104798,Tnf,HGNC:11892,TNF,[panther]
3,MGI:1927345,Xrcc2,HGNC:12829,XRCC2,[panther]
4,MGI:2442836,Brip1,HGNC:20473,BRIP1,[panther]
5,MGI:1914280,Fancl,HGNC:20748,FANCL,[panther]
6,MGI:2442306,Fancm,HGNC:23168,FANCM,[panther]
7,MGI:106299,Slx4,HGNC:23845,SLX4,[panther]
8,MGI:1914446,Ube2t,HGNC:25009,UBE2T,[panther]
9,MGI:2384584,Rfwd3,HGNC:25539,RFWD3,[panther]


# Mod1A Functional Similarity
## Find similar genes based on GO functional annotations using OntoBio Jaccard similarity

In [8]:
from Modules.Mod1A_functional_sim import FunctionalSimilarity

# Mod1A specification: the mouse ortholog CURIEs as the input gene set, with
# a similarity threshold of 0.55.
# NOTE(review): the metadata printed by FunctionalSimilarity() advertises HGNC
# ids as input, whereas MGI ids are supplied here — confirm this is supported.
mod1a_input_object = dict(
    input=mouse_orthologs['hit_id'].tolist(),
    parameters=dict(taxon='mouse', threshold=0.55),
)

func_sim = FunctionalSimilarity()



Mod1A Functional Similarity metadata:
{'input_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'output_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'predicate': ['blm:macromolecular machine to biological process association',
               'macromolecular machine to molecular activity association'],
 'source': 'Monarch Biolink'}


In [9]:
# Pass the Mod1A specification to the module, then load its gene set.
# (What load_gene_set fetches is defined in Modules.Mod1A_functional_sim,
# which is not visible from this notebook.)
func_sim.load_input_object(mod1a_input_object)
func_sim.load_gene_set()

In [10]:
# Load the associations used for the similarity computation — presumably the
# GO annotations named in the section heading; confirm in the module source.
func_sim.load_associations()

In [11]:
# Run the Mod1A functional-similarity computation and keep the raw results
# (described in the section heading as Jaccard similarity over GO annotations).
mouse_Mod1A_results = func_sim.compute_similarity()

In [12]:
def trim_mgi(curie):
    """Strip a redundant 4-character prefix from a doubled MGI CURIE.

    The raw Mod1A hits are assumed to look like ``'MGI:MGI:98734'`` (TODO
    confirm against the FunctionalSimilarity output). The leading four
    characters are removed only when what remains is itself an ``MGI:`` CURIE.
    This keeps the helper safe to apply to identifiers that are already in
    canonical form, where a blind ``curie[4:]`` would yield ``'98734'`` and
    break later comparisons against ``'MGI:...'`` input ids.

    Parameters
    ----------
    curie : str
        Identifier to normalise. Non-string values (e.g. ``None`` or NaN from
        a missing cell) are returned unchanged.

    Returns
    -------
    The identifier without its redundant prefix, or the input unchanged when
    there is nothing to strip.
    """
    if not isinstance(curie, str):
        return curie
    remainder = curie[4:]
    # Only accept the slice when it still carries the MGI namespace.
    return remainder if remainder.startswith('MGI:') else curie
# Tabulate the raw Mod1A hits, normalise their CURIEs with trim_mgi, and drop
# every hit that is itself one of the input genes.
Mod1A_results = (
    pd.DataFrame(mouse_Mod1A_results)
    .assign(hit_curie=lambda frame: frame['hit_curie'].apply(trim_mgi))
    .loc[lambda frame: ~frame['hit_curie'].isin(mod1a_input_object['input'])]
)

In [14]:
# Rank the filtered Mod1A hits from highest to lowest hit_score (display only;
# the sorted frame is not stored).
Mod1A_results.sort_values('hit_score', ascending=False)

Unnamed: 0,hit_curie,hit_name,hit_score,input_curie,input_name
69,MGI:98734,Tgtp1,0.716981,MGI:97243,
64,MGI:109493,Rab33a,0.682927,MGI:97243,
22,MGI:1920568,Ube2d2b,0.676471,MGI:1914446,
30,MGI:1914049,Ube2w,0.663043,MGI:1914446,
21,MGI:1930715,Ube2d2a,0.661765,MGI:1914446,
60,MGI:1923805,Mmaa,0.651163,MGI:97243,
28,MGI:1914865,Ube2r2,0.647059,MGI:1914446,
11,MGI:894324,Cenpx,0.645455,MGI:2442306,
24,MGI:107412,Ube2e3,0.638889,MGI:1914446,
8,MGI:103234,Herc2,0.634146,MGI:1914280,


In [26]:
# Show the filtered Mod1A hits in their original (unsorted) row order.
Mod1A_results

Unnamed: 0,hit_curie,hit_name,hit_score,input_curie,input_name
4,MGI:1923051,4930447C04Rik,0.628272,MGI:2442836,
8,MGI:103234,Herc2,0.634146,MGI:1914280,
9,MGI:1914961,Nsmce1,0.573333,MGI:1914280,
10,MGI:2140313,AI481877,0.565657,MGI:2442306,
11,MGI:894324,Cenpx,0.645455,MGI:2442306,
13,MGI:3036246,Hfm1,0.608696,MGI:2442306,
14,MGI:101845,Mcm3,0.56701,MGI:2442306,
15,MGI:3045334,Mcmdc2,0.57554,MGI:2442306,
16,MGI:1915220,Slx1b,0.565517,MGI:106299,
18,MGI:1917581,Shprh,0.575758,MGI:1914446,


# Mod1B Phenotype Similarity

In [15]:
from Modules.Mod1B1_phenotype_similarity import PhenotypeSimilarity
# Mod1B specification: the mouse ortholog CURIEs as the input gene set, with
# a similarity threshold of 0.20.
# NOTE(review): the metadata printed by PhenotypeSimilarity() advertises HGNC
# ids as input, whereas MGI ids are supplied here — confirm this is supported.
mod1b_input_object = dict(
    input=mouse_orthologs['hit_id'].tolist(),
    parameters=dict(taxon='mouse', threshold=0.20),
)

pheno_sim = PhenotypeSimilarity()

Mod1B Phenotype Similarity metadata:
{'input_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'output_type': {'complexity': 'set', 'data_type': 'gene', 'id_type': 'HGNC'},
 'predicate': ['blm:has phenotype'],
 'source': 'Monarch Biolink'}


In [16]:
# Pass the Mod1B specification (mouse gene set, taxon and threshold) to the module.
pheno_sim.load_input_object(mod1b_input_object)

In [17]:
# Load the module's gene set (behaviour defined in
# Modules.Mod1B1_phenotype_similarity, not visible from this notebook).
pheno_sim.load_gene_set()

In [18]:
# Load the gene-to-phenotype associations; these are inspected further down
# through pheno_sim.associations.association_map.
pheno_sim.load_associations()

In [19]:
# Run the Mod1B phenotype-similarity computation and keep the raw results.
mouse_Mod1B_results = pheno_sim.compute_similarity()

In [20]:
# Tabulate the raw Mod1B hits, drop hits whose name matches one of the mouse
# ortholog labels (i.e. the input genes), and rank by descending hit_score.
# NOTE(review): this filters on gene symbol, whereas the Mod1A cell filters on
# CURIE — consider matching on hit_curie for consistency.
mouse_Mod1B_df = (
    pd.DataFrame(mouse_Mod1B_results)
    .loc[lambda frame: ~frame['hit_name'].isin(mouse_orthologs['hit_label'].tolist())]
    .sort_values('hit_score', ascending=False)
)

In [21]:
# Show the filtered Mod1B hits, highest hit_score first.
mouse_Mod1B_df

Unnamed: 0,hit_curie,hit_name,hit_score,input_curie
1417,MGI:1925584,Cdk19,1.000000,MGI:2384790
1956,MGI:1920774,1700088E04Rik,1.000000,MGI:2384790
2049,MGI:1914199,Trim59,1.000000,MGI:2384790
2038,MGI:2141920,Wtip,1.000000,MGI:2384790
1962,MGI:1919233,Tmco6,1.000000,MGI:2384790
1498,MGI:2444946,Tmco3,1.000000,MGI:2384790
1764,MGI:1098568,Tmem222,1.000000,MGI:2384790
1446,MGI:1922896,Rai14,1.000000,MGI:2384790
1765,MGI:1922567,Parpbp,1.000000,MGI:2384790
1882,MGI:2449143,Gbgt1,1.000000,MGI:2384790


In [22]:
# Inspect the phenotype terms recorded for MGI:2384790, the input_curie behind
# the top-scoring Mod1B hits shown above.
pheno_sim.associations.association_map['MGI:2384790']

['MP:0011110']

In [23]:
# Inspect the phenotype terms recorded for MGI:1196464.
# NOTE(review): this identifier does not appear in the table rows shown in
# this notebook — confirm which gene it refers to.
pheno_sim.associations.association_map['MGI:1196464']

['MP:0011110']

In [24]:
# Re-display the filtered Mod1B hits (same frame as shown earlier).
mouse_Mod1B_df

Unnamed: 0,hit_curie,hit_name,hit_score,input_curie
1417,MGI:1925584,Cdk19,1.000000,MGI:2384790
1956,MGI:1920774,1700088E04Rik,1.000000,MGI:2384790
2049,MGI:1914199,Trim59,1.000000,MGI:2384790
2038,MGI:2141920,Wtip,1.000000,MGI:2384790
1962,MGI:1919233,Tmco6,1.000000,MGI:2384790
1498,MGI:2444946,Tmco3,1.000000,MGI:2384790
1764,MGI:1098568,Tmem222,1.000000,MGI:2384790
1446,MGI:1922896,Rai14,1.000000,MGI:2384790
1765,MGI:1922567,Parpbp,1.000000,MGI:2384790
1882,MGI:2449143,Gbgt1,1.000000,MGI:2384790


In [25]:
# Inspect the phenotype terms recorded for MGI:2448558.
# NOTE(review): this identifier does not appear in the table rows shown in
# this notebook — confirm which gene it refers to.
pheno_sim.associations.association_map['MGI:2448558']

['MP:0006378',
 'HP:0001508',
 'MP:0008392',
 'MP:0001265',
 'HP:0003251',
 'MP:0008280',
 'HP:0000798',
 'MP:0004901',
 'MP:0010948',
 'MP:0002776',
 'MP:0001154',
 'HP:0008734',
 'MP:0006380',
 'MP:0001153']