Load datasets used in the manuscript "A Swiss-Army Knife for Hierarchical Modeling of Biological Systems" (Yu et al.)

In [1]:
# Enable the autoreload extension in mode 2, which re-imports modules before each
# cell executes so that edits to module source files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
import os
import sys

# Make a local checkout of ddot importable.
# The location can be overridden with the DDOT_PATH environment variable; when it
# is unset, the original hard-coded path is used so existing setups keep working.
ddot_path = os.environ.get('DDOT_PATH', '/cellar/users/mikeyu/DeepTranslate/ddot')

# Only insert when the path is not already first, so re-running this cell does
# not keep stacking duplicate entries at the front of sys.path.
if sys.path[:1] != [ddot_path]:
    sys.path.insert(0, ddot_path)

In [2]:
import pandas as pd

import ddot
from ddot import Ontology

# Gene-disease associations from the Monarch Initiative

In [3]:
# Download the gene-disease association table from the Monarch Initiative
# (a reformatted copy stored on NDEx). Only the first returned value is kept.
monarch, _ = ddot.ndex_to_sim_matrix(
    ddot.MONARCH_DISEASE_GENE_SLIM_URL,
    input_fmt='cx',
    output_fmt='sparse',
    similarity=None,
)
monarch.head()




Unnamed: 0,Node1,Node2,disease,gene,disease_original_name,similarity
0,MIR6084,parkinson_disease_6__autosomal_recessive_early...,parkinson_disease_6__autosomal_recessive_early...,MIR6084,"Parkinson Disease 6, Autosomal Recessive Early...",1.0
1,ELANE,autosomal_dominant_severe_congenital_neutropenia,autosomal_dominant_severe_congenital_neutropenia,ELANE,Autosomal dominant severe congenital neutropenia,1.0
2,B2M,bare_lymphocyte_syndrome_type_1,bare_lymphocyte_syndrome_type_1,B2M,Bare lymphocyte syndrome type 1,1.0
3,PDCD10,familial_cerebral_cavernous_malformation,familial_cerebral_cavernous_malformation,PDCD10,Familial cerebral cavernous malformation,1.0
4,PDCD10,cerebral_cavernous_malformation,cerebral_cavernous_malformation,PDCD10,Cerebral cavernous malformation,1.0


In [5]:
# Example: collect the genes whose 'disease' entry is "Caffey Disease"
# (stored in the table as 'caffey_disease') and show them as a plain list.
seed = monarch.loc[monarch['disease'].eq('caffey_disease'), 'gene'].tolist()
print('Seed:', seed)

Seed: ['COL1A1', 'A4GALT']


# Human gene-gene similarity network

In [6]:
# Install the simplejson package (it is recommend you run this in a separate bash terminal, not in this Jupyter notebook. If you want to use a conda virtual environment, then you first need to activate the environment)
! pip install simplejson

[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [None]:
## Download human gene-gene similarity network from NDEx
## -- WARNING: This network is very large (19,009-by-19,009 matrix). Downloading will take ~10 min for a fast internet connection.
## -- This cell is optional and slow, so pandas is imported in the import cell at the
##    top of the notebook rather than here; later cells that use `pd` therefore
##    still work when this download is skipped.
sim, sim_names = ddot.ndex_to_sim_matrix(
    ndex_url=ddot.HUMAN_GENE_SIMILARITIES_URL,
    input_fmt='cx_matrix',
    output_fmt='matrix',
    subset=None)

# Wrap the downloaded matrix in a DataFrame, using the returned names for both rows and columns
sim = pd.DataFrame(sim, columns=sim_names, index=sim_names)

sim.head()

NDEx download time (sec): 0.07912540435791016


# The Gene Ontology

In [8]:
# Load the Gene Ontology from NDEx.
# -- This copy was pre-processed into a non-redundant set of GO terms and connections
#    relevant to human genes (see Process_the_Gene_Ontology.ipynb).
go_human = Ontology.from_ndex(ddot.GO_HUMAN_URL)
print(str(go_human))


19015 genes, 19343 terms, 215488 gene-term relations, 36362 term-term relations
node_attributes: ['Vis:Shape', 'Vis:Fill Color', 'Vis:Border Paint', 'Term_Description', 'name', 'Branch']
edge_attributes: ['Vis:Visible']


# Fanconi Anemia gene ontology (FanGO)

In [9]:
# Load the Fanconi Anemia gene ontology (FanGO) from NDEx and print it.
# NOTE(review): the saved output of this cell is "IndexError: list index out of range".
# The cause is not visible here -- confirm that ddot.FANGO_URL still resolves to a
# valid network and that this cell runs cleanly on a fresh kernel.
fango = Ontology.from_ndex(ddot.FANGO_URL)
print(fango)

IndexError: list index out of range

# Other disease gene ontologies (based on gene-disease associations in the Monarch Initiative)

In [17]:
# Read the tab-delimited summary table of disease gene ontologies.
# The first line of the file holds the column names, and no column is used as the index.
df = pd.read_csv(
    'disease_gene_ontologies.txt',
    sep='\t',
    header=0,
    index_col=False,
)
df.head()

Unnamed: 0,Disease,Total_Genes,Seed_Genes,Subsystems,Subsystems_Aligned_to_GO,NDEx_URL,HiView_URL
0,rickets,481,4,209,109,http://public.ndexbio.org/v2/network/b2494349-...,http://hiview.ucsd.edu/b2494349-f775-11e8-aaa6...
1,hyperlipoproteinemias,434,5,185,107,http://public.ndexbio.org/v2/network/25726bc9-...,http://hiview.ucsd.edu/25726bc9-f775-11e8-aaa6...
2,bronchitis__chronic,487,2,207,106,http://public.ndexbio.org/v2/network/caba1662-...,http://hiview.ucsd.edu/caba1662-f774-11e8-aaa6...
3,aplastic_anemia__aa_,474,2,209,101,http://public.ndexbio.org/v2/network/9a11b9c8-...,http://hiview.ucsd.edu/9a11b9c8-f774-11e8-aaa6...
4,hydronephrosis,411,3,157,97,http://public.ndexbio.org/v2/network/1b21bc44-...,http://hiview.ucsd.edu/1b21bc44-f775-11e8-aaa6...


In [None]:
# Example: get the URL to view the disease "hydronephrosis" on HiView.
# Rows and column are selected in a single .loc call, and each matching URL is
# printed as a plain string. Printing the Series itself would also show the row
# index and a Name/dtype footer, and may abbreviate long strings with "...".
# Nothing is printed if the disease is not in the table.
for hiview_url in df.loc[df['Disease'] == 'hydronephrosis', 'HiView_URL']:
    print(hiview_url)