### Import libraries

In [2]:
import os
import pickle
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

### Variable names

In [21]:
# --- Input / cache paths (relative to the notebook's working directory) ---
interactome_file_name = "data/BIOGRID-ORGANISM-Homo_sapiens-4.4.240.tab3.txt"  # PPI FILE
seeds_file_name = "data/DISEASES_Summary_GDA_CURATED_C0025202.tsv"  # seed-gene table
PPI_list_variable = "data/PPI_of_interest.pk1"  # pickle cache of the filtered PPI table

# --- Row filters applied to the raw interactome ---
human_ID = 9606  # value required in both "Organism ID Interactor" columns
interaction_of_interest = "physical"  # value required in "Experimental System Type"

# --- Columns used as graph endpoints ---
protein_A_identifier = "Official Symbol Interactor A"
protein_B_identifier = "Official Symbol Interactor B"

# --- Columns used for de-duplication and self-loop removal ---
first_gene_symbol_indicator = "Official Symbol Interactor A"
second_gene_symbol_indicator = "Official Symbol Interactor B"

### Import files and preprocess

#### PPI file

In [4]:
# The filtered interaction table is cached on disk as a pickle; the raw
# interactome export is only parsed and filtered when that cache is missing.
# NOTE(review): pickle.load can execute arbitrary code -- only load cache
# files that were written by this notebook.

## First time reading the file
if not os.path.exists(PPI_list_variable):
    print("The file does not exist yet. Processing... ")
    complete_interactome = pd.read_csv(interactome_file_name, delimiter='\t')

    PPI = (
        complete_interactome
        # keep rows where both interactors carry the human organism ID
        .loc[lambda df: (df["Organism ID Interactor A"] == human_ID)
                        & (df["Organism ID Interactor B"] == human_ID)]
        # keep only the experimental system type of interest
        .loc[lambda df: df["Experimental System Type"] == interaction_of_interest]
        # keep the first row for every ordered (A, B) symbol pair
        .drop_duplicates(
            subset=[first_gene_symbol_indicator, second_gene_symbol_indicator])
        # drop rows where a gene interacts with itself
        .loc[lambda df: df[first_gene_symbol_indicator]
                        != df[second_gene_symbol_indicator]]
    )

    # Persist the result so later runs can skip the filtering above.
    with open(PPI_list_variable, 'wb') as file:
        pickle.dump(PPI, file)

    # The raw table is no longer needed; release it.
    del complete_interactome

## File already preprocessed
else:
    print("The file already exists! Importing information from the file ...")
    with open(PPI_list_variable, 'rb') as file:
        PPI = pickle.load(file)

The file already exists! Importing information from the file ...


#### Seed genes file

In [15]:
# Load the tab-separated seed-gene table and keep its "Gene" column for later use.
seed_genes_df = pd.read_csv(seeds_file_name, delimiter='\t')
seed_genes_symbols = seed_genes_df.loc[:, "Gene"]

Verify the gene names against HGNC: report any deviations or inconsistencies and resolve conflicting information, if any.

In [14]:
# Show the distinct identifier rows of the seed genes so the symbols can be cross-checked.
(
    seed_genes_df
    .loc[:, ['Gene', 'UnitProt', 'geneEnsemblIDs', 'geneNcbiID', 'geneNcbiType']]
    .drop_duplicates()
)

Unnamed: 0,Gene,UnitProt,geneEnsemblIDs,geneNcbiID,geneNcbiType
0,BRAF,P15056,ENSG00000157764,673,protein-coding
1,NRAS,"Q5U091,P01111",ENSG00000213281,4893,protein-coding
2,MITF,"Q8WYR3,A0A087WXU1,B4DNC7,O75030",ENSG00000187098,4286,protein-coding
3,MC1R,"Q01726,Q1JUL4",ENSG00000258839,4157,protein-coding
4,CDKN2A,"P42771,K7PML8,Q8N726",ENSG00000147889,1029,protein-coding
5,PTEN,"F6KD01,P60484",ENSG00000171862,5728,protein-coding
6,TYR,"P14679,L8B082",ENSG00000077498,7299,protein-coding
7,TP53,"K7PPA8,A0A087WT22,Q53GA5,P04637,A0A087X1Q1,A0A...",ENSG00000141510,7157,protein-coding
8,CDK4,P11802,ENSG00000135446,1019,protein-coding
9,CTLA4,P16410,ENSG00000163599,1493,protein-coding


The gene names are all approved in HGNC

### Create the Interactome

In [20]:
# Inspect the filtered interaction table (pandas abbreviates the rendering of large frames).
PPI

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types,Organism Name Interactor A,Organism Name Interactor B
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,Q59H94,NP_001120959|NP_001449,-,-,-,-,-,-,Homo sapiens,Homo sapiens
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,Q59FD9|F6THM6,NP_001094|NP_001265272|NP_001265273,-,-,-,-,-,-,Homo sapiens,Homo sapiens
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,-,NP_002018,-,-,-,-,-,-,Homo sapiens,Homo sapiens
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,-,NP_150250|NP_150253|NP_150252|NP_150247|NP_150...,-,-,-,-,-,-,Homo sapiens,Homo sapiens
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,-,NP_644805|NP_003141|NP_001356447|NP_001356443|...,-,-,-,-,-,-,Homo sapiens,Homo sapiens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1265579,3757040,3376,3945,109605,110137,RP11-62C3.1,-,IARS,LDHB,IARS1|ILERS|ILRS|IRS|PRO0785,...,Q5U077,NP_001302466|NP_002291|NP_001167568,-,-,-,-,-,-,Homo sapiens,Homo sapiens
1265581,3757042,5339,3855,111355,110053,-,-,PLEC,KRT7,EBS1|EBSO|HD1|LGMD2Q|PCN|PLEC1|PLEC1b|PLTN,...,-,NP_005547,-,-,-,-,-,-,Homo sapiens,Homo sapiens
1265582,3757043,1728,80070,108072,123100,-,-,NQO1,ADAMTS20,DHQU|DIA4|DTD|NMOR1|NMORI|QR1,...,-,NP_079279,-,-,-,-,-,-,Homo sapiens,Homo sapiens
1265583,3757044,3007,6132,109262,112052,-,-,HIST1H1D,RPL8,H1.3|H1D|H1F3|H1s-2,...,-,NP_001304700|NP_001304711|NP_150644|NP_000964,-,-,-,-,-,-,Homo sapiens,Homo sapiens


In [22]:
# Build the interaction graph from the rows of PPI, using the two configured
# identifier columns as edge endpoints, then print its summary line.
graph_of_PPI = nx.from_pandas_edgelist(
    PPI,
    source=protein_A_identifier,
    target=protein_B_identifier,
)
print('Complete graph:', graph_of_PPI, sep='\n')

Complete graph:
Graph with 19972 nodes and 861240 edges


#### Find the Largest Connected Component (LCC)

In [6]:
# Select the connected component with the most nodes and materialise it as an
# independent graph (the .copy() detaches it from graph_of_PPI).
connected_components = nx.connected_components(graph_of_PPI)
largest_cc = max(connected_components, key=len)
LCC_subgraph = graph_of_PPI.subgraph(largest_cc).copy()
print('LCC graph:', LCC_subgraph, sep='\n')

LCC graph:
Graph with 33021 nodes and 871153 edges


#### Find the subgraph of the disease genes present in the LCC

In [19]:
# Peek at the attribute dict of one node of the LCC.
# The graph is built from the gene-symbol columns, so node keys are symbols
# rather than integers: a hard-coded key such as 1 raises KeyError on a fresh
# run. Look up a node that is guaranteed to be in the graph instead.
LCC_subgraph.nodes[next(iter(LCC_subgraph.nodes))]

{}