In [1]:
import os
import pickle
import urllib.request

import networkx as nx
import pandas as pd

from data.gene_graphs import GeneManiaGraph

# Move to the parent directory before loading: without this, reading
# gene_code_map.txt fails with FileNotFoundError.
# NOTE: not idempotent -- re-running this cell moves up one more directory.
os.chdir(os.pardir)
genemania = GeneManiaGraph().nx_graph

Torrent name: genemania.pkl, Size: 9.61MB


Download Hetionet graph data

In [8]:
# Directory that holds the Hetionet downloads and the derived graph pickle.
DATADIR = './data/hetionet/'
os.makedirs(DATADIR, exist_ok=True)

urls = ['https://media.githubusercontent.com/media/hetio/hetionet/master/hetnet/tsv/hetionet-v1.0-edges.sif.gz',
        'https://raw.githubusercontent.com/hetio/hetionet/master/hetnet/tsv/hetionet-v1.0-nodes.tsv']

# Check every file on its own, so a missing nodes file is fetched even when the
# edges file is already present.
for link in urls:
    target = os.path.join(DATADIR, os.path.basename(link))
    if os.path.isfile(target):
        continue
    # Download to a temporary name and rename once complete: an interrupted
    # transfer then never leaves a truncated file that passes the check above.
    # urlretrieve raises on HTTP/network errors instead of failing silently.
    partial = target + '.part'
    urllib.request.urlretrieve(link, partial)
    os.replace(partial, target)

Create a pandas Series that maps each Hetionet node ID to its gene name, keeping only genes that are also nodes in the GeneMANIA graph.

In [10]:
# Lookup from Hetionet node id -> name, restricted to names that are nodes of
# the GeneMANIA graph. After dropping 'kind' and indexing by 'id', squeeze()
# collapses the remaining single-column frame.
node_ids = (
    pd.read_csv(DATADIR + 'hetionet-v1.0-nodes.tsv', sep='\t')
    .loc[lambda nodes: nodes['name'].isin(genemania.nodes)]
    .drop(columns='kind')
    .set_index('id')
    .squeeze()
)

# Edge list
- Only edges of type 'GiG' (Gene interacts with Gene) are retained
- Hetionet gene IDs are converted to their respective gene names
- Rows with an NA endpoint (a gene with no match in GeneMANIA) are dropped so that no NaN nodes or edges are added to the graph

In [11]:
# Build the gene-gene edge table in one pass:
#   1. keep only rows whose metaedge is 'GiG', then drop that column;
#   2. stack the remaining columns into one Series so every endpoint id can be
#      translated through node_ids with a single map, then unstack;
#   3. drop rows where an endpoint had no entry in node_ids (mapped to NaN).
edges = (
    pd.read_csv(DATADIR + 'hetionet-v1.0-edges.sif.gz', sep='\t', compression='gzip')
    .loc[lambda sif: sif['metaedge'] == 'GiG']
    .drop(columns='metaedge')
    .stack()
    .map(node_ids)
    .unstack()
    .dropna()
)

Create an empty copy of the GeneMANIA graph (same nodes, no edges) so that the Hetionet graph retains GeneMANIA's node order.

In [12]:
# Start from GeneMANIA's node set with no edges and no attribute data, then add
# the Hetionet gene-gene edges on top of it.
hetnet = nx.create_empty_copy(genemania, with_data=False)
hetnet.add_edges_from(edges.values.tolist())

# nx.write_gpickle was removed in NetworkX 3.0. It was a thin wrapper around
# pickle.dump with the highest protocol, so pickling directly writes the same
# format and works on both old and new NetworkX versions.
with open(DATADIR + 'hetnet.pkl', 'wb') as handle:
    pickle.dump(hetnet, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# Number of Hetionet edges that are absent from GeneMANIA.
# has_edge ignores endpoint order on undirected graphs, so the count does not
# rely on both graphs yielding their edge tuples in the same (u, v) orientation,
# which a set difference over raw edge tuples would.
sum(not genemania.has_edge(u, v) for u, v in hetnet.edges)

25573