# Use DDOT to download and process a human-specific Gene Ontology
1. Download the GO structure and gene-term annotations
2. Concatenate all three branches (biological process, molecular function, cellular component) into a unified ontology with an artificial new root 'GO:00SUPER'
3. Remove redundant terms that are not relevant for humans
4. Convert gene IDs and symbols using mygene.info

# Import libraries

In [1]:
import requests
import gzip
import pandas as pd
import networkx as nx

import sys
sys.path = ['/cellar/users/mikeyu/DeepTranslate/ddot'] + sys.path

import ddot
from ddot import Ontology, get_gene_name_converter, parse_gaf, parse_obo, nx_to_NdexGraph, read_term_descriptions

# Set NDEx server, username, and password

In [None]:
# NDEx connection settings used by the upload cells.
# Each value can be overridden with an environment variable (NDEX_SERVER,
# NDEX_USER, NDEX_PASS) so credentials do not have to be edited into the
# notebook. When a variable is unset, the original value is used.
# NOTE(review): a plaintext password is still present as the fallback below;
# rotate it and remove the default once NDEX_PASS is set wherever this runs.
import os

ndex_server = os.environ.get('NDEX_SERVER', 'http://test.ndexbio.org')
ndex_user = os.environ.get('NDEX_USER', 'mikeyu_testacct')
ndex_pass = os.environ.get('NDEX_PASS', 'GoHejVeg8')

# Download and Parse Gene Ontology files

In [3]:
# Download GO obo file.
# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of saving the error
# page to disk as if it were the OBO file.
r = requests.get('http://purl.obolibrary.org/obo/go/go-basic.obo', timeout=60)
r.raise_for_status()
with open('go-basic.obo', 'wb') as f:
    f.write(r.content)

# Parse OBO file. The remaining arguments are table file names; 'go.tab' and
# 'goID_2_name.tab' are read back in later cells.
parse_obo('go-basic.obo', 'go.tab', 'goID_2_name.tab', 'goID_2_namespace.tab', 'goID_2_alt_id.tab')

# Download gene-term annotations for human (kept gzip-compressed on disk).
r = requests.get('http://geneontology.org/gene-associations/goa_human.gaf.gz', timeout=60)
r.raise_for_status()
with open('goa_human.gaf.gz', 'wb') as f:
    f.write(r.content)

# Read Gene Ontology for human

In [3]:
# Load the term-term table from 'go.tab' (no header row) and keep only the
# (Child, Parent) pairs, in that column order, as a list of lists.
go_table = pd.read_table('go.tab',
                         sep='\t',
                         header=None,
                         names=['Parent', 'Child', 'Relation', 'Namespace'])
hierarchy = go_table.loc[:, ['Child', 'Parent']].values.tolist()

# Parse the gzip-compressed human annotation file into the gene-term mapping.
with gzip.open('goa_human.gaf.gz', 'rb') as f:
    mapping = parse_gaf(f)

# Build the ontology without propagating annotations; 'GO:00SUPER' is the
# name given to the added root.
go_human = Ontology(hierarchy, mapping, propagate=False, add_root_name='GO:00SUPER')
assert go_human.is_dag(), 'GO hierarchy is expected to be a directed acyclic graph'

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(go_human.summary())

Unifying 3 roots into one super-root
19445 genes, 44626 terms, 263478 gene-term relations, 91619 term-term relations


# Collapse GO with respect to human UniProt IDs

In [5]:
# Replace go_human with its collapsed version and report the new size.
go_human = go_human.collapse_ontology()

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(go_human.summary())

collapse command: /cellar/users/mikeyu/alignOntology/collapseRedundantNodes /tmp/tmpfviKTE
19445 genes, 19211 terms, 263657 gene-term relations, 43410 term-term relations


# Convert UniProt IDs to Ensembl, HUGO, and Entrez IDs

In [6]:
# Look up one converter per target ID type for the genes in go_human. The
# lookups run in the order entrezgene, symbol, ensembl.
uniprot_2_entrezgene, uniprot_2_symbol, uniprot_2_ensembl = (
    get_gene_name_converter(go_human.genes, scopes='uniprot', fields=target, species='human')
    for target in ('entrezgene', 'symbol', 'ensembl')
)

# Each Ensembl entry is joined into a single comma-separated string so that it
# fits in one table cell.
ensembl_joined = {uniprot: ','.join(ids) for uniprot, ids in uniprot_2_ensembl.items()}

# One row per UniProt ID, one column per alternative identifier.
gene_attr = pd.DataFrame({
        'entrez_gene' : pd.Series(uniprot_2_entrezgene),
        'gene_symbol' : pd.Series(uniprot_2_symbol),
        'ensembl_gene' : pd.Series(ensembl_joined)
})
gene_attr.index.name = 'uniprot'

# Trailing expression so the notebook displays the table.
gene_attr

# Write GO to local files and upload GO to NDEx

In [9]:
# Base title used when naming each network in the cells below
name = 'Human-specific Gene Ontology'

# Load the GO ID -> term description table from 'goID_2_name.tab'
go_descriptions = read_term_descriptions('goID_2_name.tab')

## GO with UniProt IDs

In [10]:
# Save the UniProt-keyed ontology to 'collapsed_go.uniprot'
go_human.to_3col_table('collapsed_go.uniprot')

# Assemble the NdexGraph inputs: network title and the term descriptions as a
# one-column table named 'Label'
network_title = '{}, {}'.format(name, 'UniProt')
term_labels = go_descriptions.rename('Label').to_frame()
go_human_ndex = go_human.to_NdexGraph(name=network_title,
                                      gene_attr=gene_attr,
                                      term_attr=term_labels)

# Push the network to the NDEx server (last expression, so its result is shown)
go_human_ndex.upload_to(ndex_server, ndex_user, ndex_pass)

u'http://dev2.ndexbio.org/v2/network/577927ed-55ec-11e7-a2e2-0660b7976219'

## GO with HUGO symbols

In [13]:
# Relabel the genes from UniProt IDs to gene symbols
go_human_symbol = go_human.rename(genes=uniprot_2_symbol)

# Save the symbol-keyed ontology to 'collapsed_go.symbol'
go_human_symbol.to_3col_table('collapsed_go.symbol')

# Assemble the NdexGraph inputs: network title, gene attributes re-indexed by
# symbol (the UniProt ID becomes a regular column), and the term descriptions
# as a one-column table named 'Label'
network_title = '{}, {}'.format(name, 'Symbol')
symbol_attr = gene_attr.reset_index().set_index('gene_symbol')
term_labels = go_descriptions.rename('Label').to_frame()
go_human_ndex = go_human_symbol.to_NdexGraph(name=network_title,
                                             gene_attr=symbol_attr,
                                             term_attr=term_labels)

# Push the network to the NDEx server (last expression, so its result is shown)
go_human_ndex.upload_to(ndex_server, ndex_user, ndex_pass)

u'http://dev2.ndexbio.org/v2/network/8bfa8318-55ed-11e7-a2e2-0660b7976219'

## GO with Entrez gene IDs

In [11]:
# Relabel the genes from UniProt IDs to Entrez gene IDs
go_human_entrez = go_human.rename(genes=uniprot_2_entrezgene)

# Save the Entrez-keyed ontology to 'collapsed_go.entrez'
go_human_entrez.to_3col_table('collapsed_go.entrez')

# Assemble the NdexGraph inputs: network title, gene attributes re-indexed by
# Entrez ID (the UniProt ID becomes a regular column), and the term
# descriptions as a one-column table named 'Label'
network_title = '{}, {}'.format(name, 'Entrez')
entrez_attr = gene_attr.reset_index().set_index('entrez_gene')
term_labels = go_descriptions.rename('Label').to_frame()
go_human_ndex = go_human_entrez.to_NdexGraph(name=network_title,
                                             gene_attr=entrez_attr,
                                             term_attr=term_labels)

# Push the network to the NDEx server (last expression, so its result is shown)
go_human_ndex.upload_to(ndex_server, ndex_user, ndex_pass)

u'http://dev2.ndexbio.org/v2/network/8784cee1-55ec-11e7-a2e2-0660b7976219'

## GO with Ensembl gene IDs

In [12]:
# Relabel the genes from UniProt IDs to Ensembl gene IDs
go_human_ensembl = go_human.rename(genes=uniprot_2_ensembl)

# Save the Ensembl-keyed ontology to 'collapsed_go.ensembl'
go_human_ensembl.to_3col_table('collapsed_go.ensembl')

# Assemble the NdexGraph inputs: network title, gene attributes re-indexed by
# the 'ensembl_gene' column (the UniProt ID becomes a regular column), and the
# term descriptions as a one-column table named 'Label'.
# NOTE(review): 'ensembl_gene' holds comma-joined strings, while the rename
# above uses the un-joined uniprot_2_ensembl values -- confirm the index keys
# match the renamed gene names.
network_title = '{}, {}'.format(name, 'Ensembl')
ensembl_attr = gene_attr.reset_index().set_index('ensembl_gene')
term_labels = go_descriptions.rename('Label').to_frame()
go_human_ndex = go_human_ensembl.to_NdexGraph(name=network_title,
                                              gene_attr=ensembl_attr,
                                              term_attr=term_labels)

# Push the network to the NDEx server (last expression, so its result is shown)
go_human_ndex.upload_to(ndex_server, ndex_user, ndex_pass)

u'http://dev2.ndexbio.org/v2/network/ac1d8d54-55ec-11e7-a2e2-0660b7976219'