In [19]:
import os, sys
import urllib
import zipfile
import gzip
import pandas as pd

from collections import defaultdict
from goatools.anno.gaf_reader import GafReader
from goatools.obo_parser import GODag

import networkx as nx

data_dir = "../data/"

In [20]:
def download_and_unzip(download_url_link, dir_path, zipped_filename,destination_dir_name, unzip=True):
    #https://www.tutorialsbuddy.com/download-and-unzip-a-zipped-file-in-python
    print("Download starting")

    urllib.request.urlretrieve(
        download_url_link, os.path.join(dir_path, zipped_filename)
    )
    print("Download complete")

    if unzip:
        print("unzipping file starting")
    
        if zipped_filename.endswith(".zip"):
            with zipfile.ZipFile(os.path.join(dir_path, zipped_filename), "r") as zip_file:
                zip_file.extractall(os.path.join(dir_path, destination_dir_name))
        elif zipped_filename.endswith(".gz"):
            print("zipfile")
            with gzip.GzipFile(os.path.join(dir_path, zipped_filename), "rb") as zip_file:
                with open(os.path.join(dir_path, destination_dir_name, zipped_filename.replace(".gz", "")), "wb") as fout:
                    fout.write(zip_file.read())
        else:
            raise NotImplementedError("NO CASE")
            
    
    print("unzipping complete")

In [21]:
if not os.path.exists(os.path.join(data_dir, "goa_human.gaf")):
    download_and_unzip("http://geneontology.org/gene-associations/goa_human.gaf.gz", ".", os.path.join(data_dir, "goa_human.gaf.gz"), ".")
        

In [22]:
if not os.path.exists(os.path.join(data_dir, "go-basic.obo")):
    download_and_unzip("http://geneontology.org/ontology/go-basic.obo", ".", os.path.join(data_dir, "go-basic.obo"), ".", unzip=False)
        

In [23]:
genenamesURL = 'https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_pub_acc_ids&col=gd_pub_refseq_ids&col=md_prot_id&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit'

if not os.path.exists(os.path.join(data_dir, "hgnc_annot.tsv")):
    download_and_unzip(genenamesURL, ".", os.path.join(data_dir, "hgnc_annot.tsv"), ".", unzip=False)

In [24]:
ogaf = GafReader(os.path.join(data_dir, "goa_human.gaf"))
obodag = GODag(os.path.join(data_dir, "go-basic.obo"))

HMS:0:00:11.102329 626,136 annotations READ: ../data/goa_human.gaf 
../data/go-basic.obo: fmt(1.2) rel(2023-01-01) 46,739 Terms


In [25]:
hgncDF = pd.read_csv(os.path.join(data_dir, "hgnc_annot.tsv"), sep="\t")
uniprot2hgnc = defaultdict(set)
all_genes = set()

for ri, row in hgncDF.iterrows():
    
    status = row["Status"]
    
    if status == "Symbol Withdrawn":
        continue
    
    symbol = row["Approved symbol"]    
    uniprot= row["UniProt ID(supplied by UniProt)"]
    
    all_genes.add(symbol)
    
    if pd.isna(uniprot):
        continue

    uniprot2hgnc[uniprot].add(symbol)
        

In [26]:
len(all_genes)

45406

In [27]:
go2gene = defaultdict(set)

for assoc in ogaf.get_associations():

    geneID = assoc.DB_ID
    goID = assoc.GO_ID
    
    if not geneID in uniprot2hgnc:
        geneID = assoc.DB_Symbol
        geneIDs = [geneID]
        
        if not geneID in all_genes:
            print(geneID, assoc.Taxon, assoc)
            continue
    else:
        geneIDs = uniprot2hgnc[geneID]
    
    for geneSym in geneIDs:
        go2gene[goID].add((geneSym, tuple(assoc.Qualifier)))


LOC107984156 [9606] ntgafobj(DB='UniProtKB', DB_ID='A0A0G2JMH3', DB_Symbol='LOC107984156', Qualifier={'enables'}, GO_ID='GO:0003924', DB_Reference={'GO_REF:0000002'}, Evidence_Code='IEA', With_From={'InterPro:IPR006689'}, NS='MF', DB_Name={'ARL17 domain-containing protein'}, DB_Synonym={'LOC107984156'}, DB_Type='protein', Taxon=[9606], Date=datetime.date(2022, 11, 9), Assigned_By='InterPro', Extension=None, Gene_Product_Form_ID=set())
A0A0G2JMS6 [9606] ntgafobj(DB='UniProtKB', DB_ID='A0A0G2JMS6', DB_Symbol='A0A0G2JMS6', Qualifier={'enables'}, GO_ID='GO:0004867', DB_Reference={'GO_REF:0000043'}, Evidence_Code='IEA', With_From={'UniProtKB-KW:KW-0722'}, NS='MF', DB_Name={'Ovostatin'}, DB_Synonym=set(), DB_Type='protein', Taxon=[9606], Date=datetime.date(2022, 11, 9), Assigned_By='UniProt', Extension=None, Gene_Product_Form_ID=set())
A0A0G2JMS6 [9606] ntgafobj(DB='UniProtKB', DB_ID='A0A0G2JMS6', DB_Symbol='A0A0G2JMS6', Qualifier={'involved_in'}, GO_ID='GO:0010466', DB_Reference={'GO_REF:00

In [28]:
g = nx.DiGraph()

In [29]:
for gene in all_genes:
    g.add_node(gene, type="gene", score=0)

In [30]:
#add all nodes with attributes
for goEntry in obodag:
    
    termID = goEntry
    termObj = obodag[goEntry]
    
    if termObj.is_obsolete:
        continue
    
    termName = termObj.name
    termNS = termObj.namespace
    
    g.add_node(termID, id=termID, name=termName, ns=termNS, score=0)
    
for goEntry in obodag:
    
    termID = goEntry
    termObj = obodag[goEntry]
    
    for child in termObj.children:
        if child in g.nodes and not (child, goEntry) in g.edges:
            g.add_edge(child, goEntry, type="part_of", score=0)
            
    for parent in termObj.parents:
        if parent.id in g.nodes and not (goEntry, parent.id) in g.edges:
            g.add_edge(goEntry, parent.id, type="part_of", score=0)

In [31]:
print(g)

DiGraph with 92145 nodes and 74934 edges


In [32]:
[x for x in g.nodes][:5]

['DOC2GP', 'METTL26', 'ADIPOR2', 'MUSK', 'POC1B']

In [33]:
all_interactions = set()
for goID in list(go2gene):   
    for gene, interaction in go2gene[goID]:
        for x in interaction:
            all_interactions.add(x)
all_interactions

{'NOT',
 'acts_upstream_of',
 'acts_upstream_of_negative_effect',
 'acts_upstream_of_or_within',
 'acts_upstream_of_or_within_negative_effect',
 'acts_upstream_of_or_within_positive_effect',
 'acts_upstream_of_positive_effect',
 'colocalizes_with',
 'contributes_to',
 'enables',
 'involved_in',
 'is_active_in',
 'located_in',
 'part_of'}

In [34]:
interaction_harmonize = {
    'NOT': "interacts",
    'acts_upstream_of': "interacts",
    'acts_upstream_of_negative_effect': "interacts",
    'acts_upstream_of_or_within': "interacts",
    'acts_upstream_of_or_within_negative_effect': "interacts",
    'acts_upstream_of_or_within_positive_effect': "interacts",
    'acts_upstream_of_positive_effect': "interacts",
    'colocalizes_with': "interacts",
    'contributes_to': "activates",
    'enables': "activates",
    'involved_in': "activates",
    'is_active_in': "activates",
    'located_in': "interacts",
    'part_of': "interacts"
    }

In [35]:
for goID in list(go2gene)[:5]:
    print(goID, go2gene[goID])
    
    for gene, interaction in go2gene[goID]:
        g.add_edge( gene, goID, type=interaction_harmonize[interaction[0]], go_interaction = interaction[0])

GO:0003723 {('KHDRBS1', ('enables',)), ('YY1', ('enables',)), ('FIP1L1', ('enables',)), ('H1-5', ('enables',)), ('LBR', ('enables',)), ('MECP2', ('enables',)), ('NPM2', ('enables',)), ('TOP1', ('enables',)), ('ELAVL2', ('enables',)), ('ZNF622', ('enables',)), ('DUSP14', ('enables',)), ('HDGF', ('enables',)), ('UHMK1', ('enables',)), ('HNRNPU', ('enables',)), ('RBMS2', ('enables',)), ('TES', ('enables',)), ('RBFOX1', ('enables',)), ('EIF2AK2', ('enables',)), ('HNRNPF', ('enables',)), ('ZNF207', ('enables',)), ('TUT7', ('enables',)), ('MBNL2', ('enables',)), ('SPATS2L', ('enables',)), ('JUN', ('enables',)), ('ALG13', ('enables',)), ('ZNF74', ('enables',)), ('MYEF2', ('enables',)), ('SLC25A5', ('enables',)), ('MRPL12', ('enables',)), ('PDCD4', ('enables',)), ('MTCL1', ('enables',)), ('IPO5', ('enables',)), ('RPL23A', ('enables',)), ('PHAX', ('enables',)), ('FSCN1', ('enables',)), ('LRRC47', ('enables',)), ('OASL', ('enables',)), ('TFB1M', ('enables',)), ('NAF1', ('enables',)), ('GTPBP10',

In [36]:
print(g)

DiGraph with 92145 nodes and 89524 edges
