# Use DDOT to download and process a human-focused Gene Ontology
1. Download the GO structure and gene-term annotations
2. Concatenate all three branches (biological process, molecular function, cellular component) into a unified ontology with an artificial new root 'GO:00SUPER'
3. Collapse redundant terms with respect to the human gene annotations
4. Convert gene IDs and symbols using mygene.info
5. Upload ontology to NDEx

# Import libraries

In [3]:
import requests
import gzip
import pandas as pd
import networkx as nx
import sys

%load_ext autoreload
%autoreload 2

sys.path.insert(0, '/cellar/users/mikeyu/DeepTranslate/ddot')
import ddot
from ddot import Ontology, get_gene_name_converter, parse_gaf, parse_obo, nx_to_NdexGraph





# Set NDEx username and password

In [4]:
# NDEx connection settings.
# The server and user can be overridden with the NDEX_SERVER / NDEX_USER
# environment variables. The password is deliberately NOT stored in the
# notebook: it is read from NDEX_PASS, or prompted for when that is unset/empty.
import os
from getpass import getpass

ndex_server = os.environ.get('NDEX_SERVER', 'http://test.ndexbio.org')
ndex_user = os.environ.get('NDEX_USER', 'mikeyu_testacct3')
ndex_pass = os.environ.get('NDEX_PASS') or getpass('NDEx password for %s: ' % ndex_user)

# Download and Parse Gene Ontology files

In [3]:
# Download GO obo file
r = requests.get('http://purl.obolibrary.org/obo/go/go-basic.obo')
# Fail here on an HTTP error instead of saving an error page as the OBO file.
r.raise_for_status()
with open('go-basic.obo', 'wb') as f:
    f.write(r.content)

# Parse OBO file into the tab-delimited tables read by the later cells
# (go.tab, goID_2_name.tab, goID_2_namespace.tab); goID_2_alt_id.tab is also written.
parse_obo('go-basic.obo', 'go.tab', 'goID_2_name.tab', 'goID_2_namespace.tab', 'goID_2_alt_id.tab')

# Download gene-term annotations for human
r = requests.get('http://geneontology.org/gene-associations/goa_human.gaf.gz')
# Same guard: do not write a failed response to disk as if it were the GAF archive.
r.raise_for_status()
with open('goa_human.gaf.gz', 'wb') as f:
    f.write(r.content)

# Read Gene Ontology for human

In [5]:
# Term-term hierarchy written by parse_obo: one headerless, tab-separated row per relation.
hierarchy_columns = ['Parent', 'Child', 'Relation', 'Namespace']
hierarchy = pd.read_table('go.tab', sep='\t', header=None, names=hierarchy_columns)

# Gene-term annotations are parsed directly from the gzipped GAF download.
with gzip.open('goa_human.gaf.gz', 'rb') as gaf_file:
    mapping = parse_gaf(gaf_file)

In [6]:
# Column names telling Ontology.from_table where to find the term-term
# edges (in `hierarchy`) and the gene-term edges (in `mapping`).
from_table_options = dict(
    parent='Parent',
    child='Child',
    mapping=mapping,
    mapping_child='DB Object ID',
    mapping_parent='GO ID',
    add_root_name='GO:00SUPER',  # artificial root placed above the GO branches
    ignore_orphan_terms=True,
)
go_human = Ontology.from_table(table=hierarchy, **from_table_options)

# Start from an attribute-free ontology; attributes are attached in a later cell.
go_human.clear_node_attr()
go_human.clear_edge_attr()
go_human

Unifying 3 roots into one super-root


19468 genes, 44978 terms, 266539 gene-term relations, 91366 term-term relations
node_attributes: []
edge_attributes: []

# Collapse GO with respect to human UniProt IDs

In [7]:
%time go_human = go_human.collapse_ontology(method='mhkramer')
if 'GO:00SUPER' not in go_human.terms: go_human.add_root('GO:00SUPER', inplace=True)
print go_human

collapse command: /cellar/users/mikeyu/DeepTranslate/ddot/ddot/alignOntology/collapseRedundantNodes /tmp/tmpcw5qMv
CPU times: user 19.5 s, sys: 356 ms, total: 19.9 s
Wall time: 32 s
19468 genes, 19343 terms, 216482 gene-term relations, 43296 term-term relations
node_attributes: []
edge_attributes: []


# Add descriptions of GO terms

In [8]:
def _read_term_attr(path, column):
    """Read a headerless two-column table (term ID, value) indexed by the 'Term' column."""
    return pd.read_table(path, header=None, names=['Term', column], index_col=0)


# Term names
go_descriptions = _read_term_attr('goID_2_name.tab', 'Term_Description')
go_human.update_node_attr(go_descriptions)

# GO branch (namespace) of each term
go_branches = _read_term_attr('goID_2_namespace.tab', 'Branch')
go_human.update_node_attr(go_branches)

# Add size
sizes = pd.DataFrame({'Size': go_human.term_sizes}, index=go_human.terms)
go_human.update_node_attr(sizes)

# Use mygene.info to convert UniProt IDs to Ensembl, HUGO, and Entrez IDs

In [9]:
# Client used by the following cells (mg.querymany) to convert UniProt IDs via mygene.info.
import mygene
mg = mygene.MyGeneInfo()

In [10]:
# Look up the Entrez gene ID of every UniProt accession in the ontology.
uniprot_2_entrezgene_df = mg.querymany(
    go_human.genes,
    scopes='uniprot',
    fields='entrezgene',
    species='human',
    as_dataframe=True)

def f(x):
    """Reduce the hits of one query to its Entrez gene ID(s).

    Parameters
    ----------
    x : pandas.DataFrame
        Rows for a single query; must contain an 'entrezgene' column
        with no missing values.

    Returns
    -------
    str or list of str
        The ID as a string when there is exactly one hit, otherwise the
        list of IDs in row order.
    """
    x = x['entrezgene'].astype(int).astype(str)
    if len(x) == 1:
        # The index holds query strings, not integers, so select the first
        # row by position explicitly instead of relying on x[0].
        return x.iloc[0]
    else:
        return x.tolist()
# Keep only hits that carry an Entrez ID, then reduce each query's hits with f.
entrez_hits = uniprot_2_entrezgene_df.dropna(subset=['entrezgene'])
uniprot_2_entrezgene = entrez_hits.groupby('query').apply(f)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-19468...done.
Finished.
298 input query terms found dup hits:
	[(u'G5E9R7', 2), (u'Q6ZTI6', 2), (u'P62807', 6), (u'P62805', 10), (u'P31995', 2), (u'P50391', 3), (u
653 input query terms found no hit:
	[u'A0A075B734', u'A0A087WSY4', u'A0A087WUL8', u'A0A087WV96', u'A0A087WX78', u'A0A087X1C1', u'A0A087X
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [11]:
# Look up the gene symbol of every UniProt accession in the ontology.
uniprot_2_symbol_df = mg.querymany(
    go_human.genes,
    scopes='uniprot',
    fields='symbol',
    species='human',
    as_dataframe=True)

def f(x):
    """Reduce the hits of one query to its gene symbol(s).

    Parameters
    ----------
    x : pandas.DataFrame
        Rows for a single query; must contain a 'symbol' column.

    Returns
    -------
    object or list
        The single symbol when there is exactly one hit, otherwise the
        list of symbols in row order.
    """
    x = x['symbol']
    if len(x) == 1:
        # The index holds query strings, not integers, so select the first
        # row by position explicitly instead of relying on x[0].
        return x.iloc[0]
    else:
        return x.tolist()
# Keep only hits that carry a symbol, then reduce each query's hits with f.
symbol_hits = uniprot_2_symbol_df.dropna(subset=['symbol'])
uniprot_2_symbol = symbol_hits.groupby('query').apply(f)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-19468...done.
Finished.
298 input query terms found dup hits:
	[(u'G5E9R7', 2), (u'Q6ZTI6', 2), (u'P62807', 6), (u'P62805', 10), (u'P31995', 2), (u'P50391', 3), (u
653 input query terms found no hit:
	[u'A0A075B734', u'A0A087WSY4', u'A0A087WUL8', u'A0A087WV96', u'A0A087WX78', u'A0A087X1C1', u'A0A087X
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [12]:
# Look up the Ensembl record(s) of every UniProt accession in the ontology.
uniprot_2_ensembl_df = mg.querymany(
    go_human.genes,
    scopes='uniprot',
    fields='ensembl',
    species='human',
    as_dataframe=True)

# def f(x):
#     x = x['ensembl']
#     if len(x)==1:
#         return x[0]['gene']
#     else:
#         return [y['gene'] for y in x.tolist()]
# uniprot_2_ensembl = uniprot_2_ensembl_df.dropna(subset=['ensembl']).groupby('query').apply(f)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-19468...done.
Finished.
298 input query terms found dup hits:
	[(u'G5E9R7', 2), (u'Q6ZTI6', 2), (u'P62807', 6), (u'P62805', 10), (u'P31995', 2), (u'P50391', 3), (u
653 input query terms found no hit:
	[u'A0A075B734', u'A0A087WSY4', u'A0A087WUL8', u'A0A087WV96', u'A0A087WX78', u'A0A087X1C1', u'A0A087X
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [35]:
def f(x):
    """Collect the Ensembl gene ID(s) from the hits of one query.

    Each entry of x['ensembl'] is either a single record exposing a
    'gene' key, or a list / pandas Series of such records.

    Returns the single gene ID when exactly one was collected, otherwise
    the list of collected IDs in encounter order.
    """
    genes = []
    for entry in x['ensembl']:
        # Treat a lone record as a one-element collection so both shapes share one path.
        records = entry if isinstance(entry, (pd.Series, list)) else [entry]
        for record in records:
            genes.append(record['gene'])
    return genes[0] if len(genes) == 1 else genes
# Keep only hits that carry an Ensembl record, then reduce each query's hits with f.
ensembl_hits = uniprot_2_ensembl_df.dropna(subset=['ensembl'])
uniprot_2_ensembl = ensembl_hits.groupby('query').apply(f)

# Write GO to local files and upload GO to NDEx

In [46]:
# Base title; each export cell appends the gene ID type (e.g. 'UniProt') to form the NDEx network name.
name = 'Human-specific Gene Ontology'

## GO with UniProt IDs

In [73]:
# Work on a copy so go_human itself stays available to the other export cells.
go_human_uniprot = go_human.copy()

# Write GO to file
go_human_uniprot.to_table('collapsed_go.uniprot', clixo_format=True)
go_human_uniprot.to_pickle('collapsed_go.uniprot.pkl')

# Upload to NDEx as a PUBLIC network, using the server and account set earlier.
url, G = go_human_uniprot.to_ndex(name='%s, %s' % (name, 'UniProt'),
                                  ndex_server=ndex_server,
                                  ndex_user=ndex_user,
                                  ndex_pass=ndex_pass,
                                  layout=None,
                                  visibility='PUBLIC')
# Call form of print: same output for a single argument on Python 2, valid on Python 3.
print(url)

http://dev2.ndexbio.org/v2/network/2d4dc26f-00a3-11e8-bd69-0660b7976219


## GO with HUGO symbols

In [47]:
# Drop genes that have no symbol mapping, then rename the remaining genes to their symbol(s).
go_human_symbol = go_human.delete(to_delete=set(go_human.genes) - set(uniprot_2_symbol.keys()))
go_human_symbol = go_human_symbol.rename(genes=uniprot_2_symbol.to_dict())
print(go_human_symbol)

# Write GO to file
go_human_symbol.to_table('collapsed_go.symbol', clixo_format=True)
go_human_symbol.to_pickle('collapsed_go.symbol.pkl')

# Upload to NDEx as a PUBLIC network, using the server and account set earlier.
url, G = go_human_symbol.to_ndex(name='%s, %s' % (name, 'Symbol'),
                                 ndex_server=ndex_server,
                                 ndex_user=ndex_user,
                                 ndex_pass=ndex_pass,
                                 layout=None,
                                 visibility='PUBLIC')
# Call form of print, matching the print(go_human_symbol) above; valid on Python 2 and 3.
print(url)

19205 genes, 19343 terms, 215376 gene-term relations, 43296 term-term relations
node_attributes: ['Term_Description', 'Branch', 'Size']
edge_attributes: []
http://dev2.ndexbio.org/v2/network/3030845c-00d5-11e8-bd69-0660b7976219


## GO with Entrez gene IDs

In [48]:
# Drop genes that have no Entrez mapping, then rename the remaining genes to their Entrez ID(s).
go_human_entrez = go_human.delete(to_delete=set(go_human.genes) - set(uniprot_2_entrezgene.keys()))
go_human_entrez = go_human_entrez.rename(genes=uniprot_2_entrezgene.to_dict())
# Call form of print: same output for a single argument on Python 2, valid on Python 3.
print(go_human_entrez)

# Write GO to file
go_human_entrez.to_table('collapsed_go.entrez', clixo_format=True)
go_human_entrez.to_pickle('collapsed_go.entrez.pkl')

# Upload to NDEx as a PUBLIC network, using the server and account set earlier.
url, G = go_human_entrez.to_ndex(name='%s, %s' % (name, 'Entrez'),
                                 ndex_server=ndex_server,
                                 ndex_user=ndex_user,
                                 ndex_pass=ndex_pass,
                                 layout=None,
                                 visibility='PUBLIC')
print(url)

18617 genes, 19343 terms, 212832 gene-term relations, 43296 term-term relations
node_attributes: ['Term_Description', 'Branch', 'Size']
edge_attributes: []
http://dev2.ndexbio.org/v2/network/44980eef-00d5-11e8-bd69-0660b7976219


## GO with Ensembl gene IDs

In [49]:
# Drop genes that have no Ensembl mapping, then rename the remaining genes to their Ensembl gene ID(s).
go_human_ensembl = go_human.delete(to_delete=set(go_human.genes) - set(uniprot_2_ensembl.keys()))
go_human_ensembl = go_human_ensembl.rename(genes=uniprot_2_ensembl.to_dict())
# Call form of print: same output for a single argument on Python 2, valid on Python 3.
print(go_human_ensembl)

# Write GO to file
go_human_ensembl.to_table('collapsed_go.ensembl', clixo_format=True)
go_human_ensembl.to_pickle('collapsed_go.ensembl.pkl')

# Upload to NDEx as a PUBLIC network, using the server and account set earlier.
url, G = go_human_ensembl.to_ndex(name='%s, %s' % (name, 'ENSEMBL'),
                                  ndex_server=ndex_server,
                                  ndex_user=ndex_user,
                                  ndex_pass=ndex_pass,
                                  layout=None,
                                  visibility='PUBLIC')
print(url)

21585 genes, 19343 terms, 238383 gene-term relations, 43296 term-term relations
node_attributes: ['Term_Description', 'Branch', 'Size']
edge_attributes: []
http://dev2.ndexbio.org/v2/network/5b269832-00d5-11e8-bd69-0660b7976219
