<a href="https://colab.research.google.com/github/MatSpad/Advanced-Coding-for-Data-Analytics-Project-2024/blob/main/ACfDA_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
# Script settings           > Usage:
reloadRepository = False     # if True, the existing clone is deleted and the repository is cloned again

# Importing libraries       > Useful for:
import os                   # managing directories
import shutil               # deleting directories
import time                 # checking code run times
from tqdm.auto import tqdm  # generating fancy progress bars
import gzip                 # decompressing and reading .gz files
import networkx as nx       # generating and analyzing graph objects
import pandas as pd         # managing dataframes

# Importing our functions   > Useful for:
def section(x: str):
  """Print a banner line that visually delimits a section of the output.

  Passing "done" (in any letter case) prints the closing banner: 74
  underscores followed by " DONE!" and two extra newlines. Any other text
  is printed as an opening banner: the text, one space, then underscores
  padding the line to 80 characters (no padding once the text alone is
  79 characters or longer).
  """
  if x.lower() != "done":
    filler = '_' * (79 - len(x))
    print(' '.join((x, filler)))
    return
  print('_' * 74 + " DONE!\n\n")

# Cloning GitHub repository as Colab directory
repositoryLoaded = os.getcwd()[-17:] == "cloned-repository"
if repositoryLoaded and reloadRepository:
  os.chdir("..")
  shutil.rmtree("cloned-repository")
  repositoryLoaded = False
if not repositoryLoaded:
  section("Loading repository:")
  ! git clone -l -s https://github.com/MatSpad/Advanced-Coding-for-Data-Analytics-Project-2024.git cloned-repository
  %cd cloned-repository
  section("Done")
else:
  print("Using previously loaded repository!\n\n")

# Building a Pandas dataframe that holds every node's information, as listed in the nodes file
section("Loading nodes' info as dataframe:")
nodesDF = pd.read_csv("Inputs/hetionet-v1.0-nodes.tsv", sep='\t')
display(nodesDF)
nodesFileLen = len(nodesDF)  # number of rows read from the nodes file
section("Done")

# Building a Pandas dataframe that holds every edge, as listed in the (gzip-compressed) edges file
section("Loading edges list as dataframe:")
edgesDF = pd.read_csv("Inputs/hetionet-v1.0-edges.sif.gz", sep='\t', compression='gzip')
display(edgesDF)
edgesFileLen = len(edgesDF)  # number of rows read from the edges file
section("Done")

# TO WRITE AS PRE-ANALYSIS:
# Row counts of the two files, then distinct (source, target) pairs and distinct node ids
print(edgesFileLen, nodesFileLen)
print(len(edgesDF[['source', 'target']].drop_duplicates()), len(nodesDF['id'].unique()))
# Rows whose (source, target) pair already appeared earlier in the edge list
display(edgesDF[edgesDF[['source', 'target']].duplicated()])
# All rows for one specific (source, target) pair
display(edgesDF[(edgesDF['source'] == 'Compound::DB08865') & (edgesDF['target'] == 'Gene::25')])

# Building a NetworkX directed multigraph by streaming the edges file (with progress bar)
# [2250197 edges, 45158 nodes]
section("Loading the graph blending both nodes' and edges' data:")
G = nx.MultiDiGraph()
# The file is opened in text mode so every line arrives already decoded and can be split on
# real tab characters. Slicing the repr of a bytes line with a fixed-length tail only works for
# one specific line ending and would silently truncate ids otherwise.
with gzip.open("Inputs/hetionet-v1.0-edges.sif.gz", 'rt', encoding='utf-8') as edgesFile:
  edgesFile.readline()  # skip the header row
  # edgesFileLen only sizes the progress bar; the loop itself ends at end of file
  for line in tqdm(edgesFile, total=edgesFileLen):
    if not line.strip():
      continue  # tolerate blank lines (e.g. a trailing newline at end of file)
    lineElements = line.rstrip('\r\n').split('\t')  # columns: source, metaedge, target
    G.add_edge(lineElements[0], lineElements[2], edgeType=lineElements[1])
edgesNum = G.number_of_edges()
nodesNum = G.number_of_nodes()
print(edgesNum, nodesNum)
section("Done")

# Disabled alternative loader: it is wrapped in a bare string literal, so it never runs.
# It times the same edges-file read without a progress bar and builds an nx.DiGraph
# rather than the nx.MultiDiGraph used by the active code.
'''
# Bulding a NetworkX directed graph by reading the edges file directly (w/o progress bar -> slightly faster than progress-bar-enriched graph building)
start = time.time()
G = nx.DiGraph()
with gzip.open('Inputs/hetionet-v1.0-edges.sif.gz','r') as edgesFile:
  edgesFile.readline()
  for line in edgesFile:
    lineElements = str(line)[2:-5].split('\\t')
    G.add_edge(lineElements[0], lineElements[2], edgeType=lineElements[1])
print(G.number_of_edges())
end = time.time()
print(end - start)
'''

# Building a NetworkX directed multigraph from the two dataframes
# [2250197 edges, 45158 nodes: only ids that appear in at least one edge become nodes,
#  whereas the nodes file lists 47031 distinct ids]
G = nx.from_pandas_edgelist(edgesDF, 'source', 'target', edge_attr='metaedge', create_using=nx.MultiDiGraph())
edgesNum = G.number_of_edges()
nodesNum = G.number_of_nodes()
# The mapping is a dict of dicts keyed by node id, so each node receives one attribute per
# remaining nodesDF column (name, kind). No attribute name is passed on purpose: when one is
# given, networkx stores the whole per-node dict under that single key instead of unpacking it.
# NOTE(review): ids from the nodes file that are not nodes of G are expected to be skipped - confirm.
nx.set_node_attributes(G, nodesDF.set_index('id').to_dict('index'))
print(edgesNum, nodesNum)

Using previously loaded repository!


Loading nodes' info as dataframe: ______________________________________________


Unnamed: 0,id,name,kind
0,Anatomy::UBERON:0000002,uterine cervix,Anatomy
1,Anatomy::UBERON:0000004,nose,Anatomy
2,Anatomy::UBERON:0000006,islet of Langerhans,Anatomy
3,Anatomy::UBERON:0000007,pituitary gland,Anatomy
4,Anatomy::UBERON:0000010,peripheral nervous system,Anatomy
...,...,...,...
47026,Symptom::D064250,Hypertriglyceridemic Waist,Symptom
47027,Symptom::D065634,Cerebrospinal Fluid Leak,Symptom
47028,Symptom::D065635,Benign Paroxysmal Positional Vertigo,Symptom
47029,Symptom::D065906,Hyperlactatemia,Symptom


__________________________________________________________________________ DONE!


Loading edges list as dataframe: _______________________________________________


Unnamed: 0,source,metaedge,target
0,Gene::9021,GpBP,Biological Process::GO:0071357
1,Gene::51676,GpBP,Biological Process::GO:0098780
2,Gene::19,GpBP,Biological Process::GO:0055088
3,Gene::3176,GpBP,Biological Process::GO:0010243
4,Gene::3039,GpBP,Biological Process::GO:0006898
...,...,...,...
2250192,Anatomy::UBERON:0000057,AeG,Gene::65009
2250193,Anatomy::UBERON:0000474,AeG,Gene::80279
2250194,Anatomy::UBERON:0002048,AeG,Gene::1211
2250195,Anatomy::UBERON:0002048,AeG,Gene::8843


__________________________________________________________________________ DONE!


2250197 47031
2110272 47031


Unnamed: 0,source,metaedge,target
741027,Compound::DB08865,CuG,Gene::25
741979,Compound::DB01628,CuG,Gene::5743
742578,Compound::DB00233,CuG,Gene::5743
742761,Compound::DB00773,CuG,Gene::5743
742855,Compound::DB00570,CuG,Gene::3725
...,...,...,...
2250177,Anatomy::UBERON:0000955,AeG,Gene::1121
2250181,Anatomy::UBERON:0000955,AeG,Gene::134266
2250185,Anatomy::UBERON:0003126,AeG,Gene::1499
2250195,Anatomy::UBERON:0002048,AeG,Gene::8843


Unnamed: 0,source,metaedge,target
736323,Compound::DB08865,CbG,Gene::25
741027,Compound::DB08865,CuG,Gene::25


Loading the graph blending both nodes' and edges' data: ________________________


  0%|          | 0/2250197 [00:00<?, ?it/s]

2250197 45158
__________________________________________________________________________ DONE!


2250197 45158
