In [1]:
import os
import sys
import numpy as np
import networkx as nx
import itertools as it
import random as rd
import pickle as pk
import os.path
import pandas as pd
from collections import (defaultdict,Counter)
import time
import matplotlib.pyplot as plt


In [3]:
#This function returns the symbol names of a genelist with entrez ids
def conversion_entrez_to_symbol_all_species(gene_list, name_species, email=None):
    """Map Entrez gene IDs to gene symbols through the NCBI E-utilities.

    Each ID is posted to the "gene" database with ``Entrez.epost`` and its
    summary is fetched with ``Entrez.esummary``; the ``Name`` field of the
    first document summary is stored as the symbol.

    The lookup is best-effort: an ID whose request or parsing fails is left
    out of the result instead of aborting the whole run. Skipped IDs are
    reported once, at the end, through a ``RuntimeWarning``.

    Parameters
    ----------
    gene_list : iterable
        Entrez gene IDs to convert. One pair of requests is issued per ID,
        so large lists take a long time.
    name_species : str
        Official Entrez species name. Kept for interface compatibility; the
        body does not use it (the lookup is done purely by gene ID).
    email : str, optional
        If given, assigned to ``Entrez.email`` before any request is sent.
        When omitted, whatever ``Entrez.email`` currently holds is used.

    Returns
    -------
    dict
        ``{entrez_id: symbol}`` for every ID that could be resolved.
    """
    import warnings

    from Bio import Entrez

    if email is not None:
        Entrez.email = email

    entrez_to_sym_dict = {}    #create a dictionary entrez to symbol
    failed_genes = []
    first_error = None
    for gene in gene_list:
        try:
            # Post the ID, then read its summary back using the returned
            # WebEnv / QueryKey pair.
            request = Entrez.epost("gene", id=gene)
            result = Entrez.read(request)
            data = Entrez.esummary(
                db="gene",
                webenv=result["WebEnv"],
                query_key=result["QueryKey"],
            )
            annotations = Entrez.read(data)
            symbol = annotations['DocumentSummarySet']['DocumentSummary'][0]['Name']
            entrez_to_sym_dict[gene] = symbol
        except Exception as error:
            # Only ordinary errors are skipped; KeyboardInterrupt and
            # SystemExit propagate so a long-running loop can be stopped.
            failed_genes.append(gene)
            if first_error is None:
                first_error = error

    if failed_genes:
        warnings.warn(
            "%d Entrez ID(s) could not be converted and were skipped "
            "(first few: %r; first error: %r)"
            % (len(failed_genes), failed_genes[:5], first_error),
            RuntimeWarning,
            stacklevel=2,
        )

    return entrez_to_sym_dict

In [2]:
# Load the Autocore PPI edge list (a merge of the Menche interactome, Hippie and Huri).
autocore_df = pd.read_csv("input/PPI/autocore_ppi.txt", sep="\t")
# Build a graph whose edges connect the "node1" and "node2" columns of the table.
G_autocore = nx.from_pandas_edgelist(autocore_df, source="node1", target="node2")
# Report the size of the raw network (order = node count, size = edge count).
print("The number of nodes in the PPI is %s" % G_autocore.order())
print("The number of edges in the PPI is %s" % G_autocore.size())

The number of nodes in the PPI is 18853
The number of edges in the PPI is 483037


In [4]:
# Collect the node labels of the PPI (the Entrez IDs to be translated).
node_list = list(G_autocore.nodes)
# Look up a gene symbol for each ID; IDs that cannot be resolved are absent from the dict.
node_symbol_dict = conversion_entrez_to_symbol_all_species(node_list, "Homo sapiens")
# Relabel into a new graph. relabel_nodes accepts a partial mapping, so nodes
# missing from the dict keep their original label.
# NOTE(review): IDs that resolve to the same symbol are presumably merged into a
# single node, which would explain a lower node count after relabeling - confirm.
G_autocore_symbol = nx.relabel_nodes(G_autocore, mapping=node_symbol_dict, copy=True)

Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.


In [6]:
# Size of the network after relabeling nodes with gene symbols.
print("The number of nodes in the PPI is %s" % len(G_autocore_symbol))
print("The number of edges in the PPI is %s" % G_autocore_symbol.size())

The number of nodes in the PPI is 18831
The number of edges in the PPI is 482950


In [7]:
# Restrict the symbol-labelled network to its largest connected component (LCC).
largest_component_nodes = max(nx.connected_components(G_autocore_symbol), key=len)
G_autocore_symbol_lcc = G_autocore_symbol.subgraph(largest_component_nodes)
# Self-loops are deliberately kept: no removal step is applied to the LCC.
# Show the node and edge counts of the LCC.
print(G_autocore_symbol_lcc.number_of_nodes())
print(G_autocore_symbol_lcc.number_of_edges())
# Export the LCC edge list as a CSV with columns 'symbol1' and 'symbol2'.
autocore_ppi_symbol = nx.to_pandas_edgelist(
    G_autocore_symbol_lcc, source='symbol1', target='symbol2'
)
autocore_ppi_symbol.to_csv('input/PPI/autocore_symbol_lcc.csv', index=False)

18815
482935
