In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the dataset cells below
# read every input file from paths under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pyvis

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network



# **Dataset**

In [None]:
# Root folder (on the mounted Google Drive) that holds every input dataset.
# Change this single constant to run the notebook against another location.
DATA_DIR = '/content/drive/MyDrive/MS Thesis/Final'

# DEPOD phosphatase list. header=3 makes pandas take the column names from the
# fourth row of the sheet. 'GeneName' is renamed to 'Phosphatase' so the column
# has the same name as the phosphatase column of the dephosphosite table.
depod = pd.read_excel(
    f'{DATA_DIR}/DEPOD_2019/PPases_in_DEPOD_201906.xls', header=3
).rename(columns={'GeneName': 'Phosphatase'})

# DEPOD phosphatase -> substrate -> dephosphorylation-site table and substrate list.
dephosphosite = pd.read_csv(f'{DATA_DIR}/DEPOD_2019/phosphatase_substrate_dephosphosite.csv')
substrate = pd.read_excel(f'{DATA_DIR}/DEPOD_2019/PPase_protSubtrates_201903.xls')

# PhosphoSitePlus kinase -> substrate -> phosphosite table and disease annotations.
phosphoSitePlus = pd.read_csv(f'{DATA_DIR}/PhosphoSitePlus_2025/kinase_substrate_phosphosite.csv')
GenePhosphositeDisease = pd.read_csv(f'{DATA_DIR}/PhosphoSitePlus_2025/disease_gene_phosphosite.csv')

# Interaction pairs (columns 'gene1' / 'gene2' are read later when building edges).
ppi = pd.read_csv(f'{DATA_DIR}/String PPI/gene_protein.csv')

# Gene-level annotations: drugs, diseases and pathways.
drug = pd.read_csv(f'{DATA_DIR}/DrugMap_2023/drug_gene.csv')
disease = pd.read_csv(f'{DATA_DIR}/DrugMap_2023/disease_gene.csv')
path = pd.read_csv(f'{DATA_DIR}/MSigDB/gene_pathway.csv')

# All substrates: the 'Substrate entry names' of the `substrate` table followed
# by the 'Substrate' column of PhosphoSitePlus, reduced to one row per distinct
# value. The combined list is deliberately NOT bound to the name `all`: doing so
# would shadow Python's built-in all() for the rest of the kernel session.
substrates1 = substrate['Substrate entry names'].tolist()
substrates2 = phosphoSitePlus['Substrate'].tolist()
all_substrates = pd.DataFrame(
    {'all_substrates': substrates1 + substrates2}
).drop_duplicates()

# All phosphosites: dephosphorylation sites followed by PhosphoSitePlus
# phosphosites, reduced to one row per distinct value. `site` is only ever
# bound to the final DataFrame (never to the intermediate list).
site1 = dephosphosite['Dephosphosite'].tolist()
site2 = phosphoSitePlus['Phosphosite'].tolist()
site = pd.DataFrame({'Phosphosite': site1 + site2}).drop_duplicates()

def _right_join(annotations, entities, annotation_key, entity_key):
    """Right-join an annotation table onto an entity table.

    Rows are matched where `annotations[annotation_key]` equals
    `entities[entity_key]`; every row of `entities` is kept (how='right').
    """
    return pd.merge(
        annotations,
        entities,
        how='right',
        left_on=annotation_key,
        right_on=entity_key,
    )


# Drug annotations joined to phosphatases, kinases and substrates.
merge1 = _right_join(drug, depod, 'gene', 'Phosphatase')
merge2 = _right_join(drug, phosphoSitePlus, 'gene', 'Kinase')
merge3 = _right_join(drug, all_substrates, 'gene', 'all_substrates')

# Disease annotations (from the `disease` table) joined to the same three groups.
merge4 = _right_join(disease, depod, 'gene', 'Phosphatase')
merge5 = _right_join(disease, phosphoSitePlus, 'gene', 'Kinase')
merge6 = _right_join(disease, all_substrates, 'gene', 'all_substrates')

# Disease annotations (from GenePhosphositeDisease) joined to the same three groups.
merge7 = _right_join(GenePhosphositeDisease, depod, 'Gene', 'Phosphatase')
merge8 = _right_join(GenePhosphositeDisease, phosphoSitePlus, 'Gene', 'Kinase')
merge9 = _right_join(GenePhosphositeDisease, all_substrates, 'Gene', 'all_substrates')

# Substrate -> phosphosite pairs from both sources, stacked under a shared
# 'Phosphosite' column name.
select1 = (
    dephosphosite[['Substrate', 'Dephosphosite']]
    .dropna()
    .drop_duplicates()
    .rename(columns={'Dephosphosite': 'Phosphosite'})
)
select2 = phosphoSitePlus[['Substrate', 'Phosphosite']].dropna().drop_duplicates()
merge12 = pd.concat([select1, select2], ignore_index=True)

# Pathway annotations joined to phosphatases, kinases and substrates.
merge13 = _right_join(path, depod, 'gene', 'Phosphatase')
merge14 = _right_join(path, phosphoSitePlus, 'gene', 'Kinase')
merge15 = _right_join(path, all_substrates, 'gene', 'all_substrates')

def _unique_pairs(frame, columns):
    """Return `columns` of `frame` with NaN-containing and duplicate rows removed."""
    return frame[columns].dropna().drop_duplicates()


# Entity -> drug pairs.
Pdrug = _unique_pairs(merge1, ['Phosphatase', 'Drug'])
Kdrug = _unique_pairs(merge2, ['Kinase', 'Drug'])
Sdrug = _unique_pairs(merge3, ['all_substrates', 'Drug'])

# Entity -> disease pairs, one set per disease source.
Pdisease1 = _unique_pairs(merge4, ['Phosphatase', 'Disease'])
Kdisease1 = _unique_pairs(merge5, ['Kinase', 'Disease'])
Sdisease1 = _unique_pairs(merge6, ['all_substrates', 'Disease'])
Pdisease2 = _unique_pairs(merge7, ['Phosphatase', 'Disease'])
Kdisease2 = _unique_pairs(merge8, ['Kinase', 'Disease'])
Sdisease2 = _unique_pairs(merge9, ['all_substrates', 'Disease'])

# Stack both disease sources and remove pairs that appear in both.
Pdisease = pd.concat([Pdisease1, Pdisease2], ignore_index=True).drop_duplicates()
Kdisease = pd.concat([Kdisease1, Kdisease2], ignore_index=True).drop_duplicates()
Sdisease = pd.concat([Sdisease1, Sdisease2], ignore_index=True).drop_duplicates()

# Entity -> phosphosite pairs.
PPsite = _unique_pairs(dephosphosite, ['Phosphatase', 'Dephosphosite'])
KPsite = _unique_pairs(phosphoSitePlus, ['Kinase', 'Phosphosite'])
SPsite = merge12.dropna().drop_duplicates()

# Pathway -> entity pairs.
Ppath = _unique_pairs(merge13, ['Pathway', 'Phosphatase'])
Kpath = _unique_pairs(merge14, ['Pathway', 'Kinase'])
Spath = _unique_pairs(merge15, ['Pathway', 'all_substrates'])

# **Initial Unique Node Counts**

In [None]:
# Number of distinct values per node category before the graph is built.
initial_node_columns = {
    "Phosphatase": depod['Phosphatase'],
    "Kinase": phosphoSitePlus['Kinase'],
    "Substrate": all_substrates['all_substrates'],
    "Drug": drug['Drug'],
    "Disease": disease['Disease'],
    "Pathway": path['Pathway'],
    "Phosphosite": site['Phosphosite'],
}
for category, column in initial_node_columns.items():
    print(f"{category}:", column.nunique())

Phosphatase: 259
Kinase: 421
Substrate: 3286
Drug: 29407
Disease: 2497
Pathway: 4762
Phosphosite: 10499


# **Graph**

In [None]:
# Undirected graph: every distinct value of each column below becomes one node.
# Nodes are keyed by the raw value, so a name that occurs in several columns
# (e.g. a gene listed both as kinase and as substrate) is a single shared node.
G = nx.Graph()

# NOTE(review): disease nodes come only from disease['Disease']. Disease names
# that occur only in GenePhosphositeDisease never become nodes, so add_edge()
# will skip their edges — confirm this is intended.
node_columns = [
    all_substrates['all_substrates'],
    depod['Phosphatase'],
    phosphoSitePlus['Kinase'],
    site['Phosphosite'],
    drug['Drug'],
    disease['Disease'],
    path['Pathway'],
]
for column in node_columns:
    G.add_nodes_from(column.unique())

In [None]:
def add_edge(G, node1, node2, relationship):
    """Connect two existing nodes unless they are already connected.

    The edge is added with a ``relationship`` attribute set to `relationship`.
    Nothing happens when either endpoint is not a node of `G`, or when an edge
    between the two nodes already exists (the existing edge and its
    relationship label are left untouched).
    """
    known_nodes = G.nodes
    if node1 not in known_nodes or node2 not in known_nodes:
        return
    if G.has_edge(node1, node2):
        return
    G.add_edge(node1, node2, relationship=relationship)

# (pair table, first endpoint column, second endpoint column, relationship label)
# The order matters: add_edge() keeps the first relationship recorded for a
# node pair, so tables listed earlier take precedence.
annotation_edge_tables = [
    (Pdisease, 'Phosphatase', 'Disease', "P-Disease"),
    (Kdisease, 'Kinase', 'Disease', "K-Disease"),
    (Sdisease, 'all_substrates', 'Disease', "S-Disease"),
    (Kdrug, 'Kinase', 'Drug', "K-Drug"),
    (Pdrug, 'Phosphatase', 'Drug', "P-Drug"),
    (Sdrug, 'all_substrates', 'Drug', "S-Drug"),
    (PPsite, 'Phosphatase', 'Dephosphosite', "P-Psite"),
    (KPsite, 'Kinase', 'Phosphosite', "K-Psite"),
    (SPsite, 'Substrate', 'Phosphosite', "S-Psite"),
    (Kpath, 'Pathway', 'Kinase', "K-path"),
    (Ppath, 'Pathway', 'Phosphatase', "P-path"),
    (Spath, 'Pathway', 'all_substrates', "S-path"),
]

for pair_table, first_column, second_column, label in annotation_edge_tables:
    for _, row in pair_table.iterrows():
        add_edge(G, row[first_column], row[second_column], label)

def _ppi_relationship(gene1, gene2, phosphatases, kinases, substrates):
    """Return the relationship label for one interaction pair, or None.

    `phosphatases`, `kinases` and `substrates` are containers supporting `in`.
    The checks run in a fixed priority order, and the first match wins:
    K-P, P-P, K-K, S-S, K-S, P-S. Same-category pairs (P-P, K-K, S-S) are
    only reported when the two genes differ.

    NOTE(review): the cross-category branches have no gene1 != gene2 check, so
    a self-pair of a gene that belongs to two categories would yield a
    self-loop — confirm whether the interaction table can contain self-pairs.
    """
    g1_is_p, g2_is_p = gene1 in phosphatases, gene2 in phosphatases
    g1_is_k, g2_is_k = gene1 in kinases, gene2 in kinases
    g1_is_s, g2_is_s = gene1 in substrates, gene2 in substrates
    distinct = gene1 != gene2

    if (g1_is_p and g2_is_k) or (g2_is_p and g1_is_k):
        return "K-P"
    if g1_is_p and g2_is_p and distinct:
        return "P-P"
    if g1_is_k and g2_is_k and distinct:
        return "K-K"
    if g1_is_s and g2_is_s and distinct:
        return "S-S"
    if (g1_is_s and g2_is_k) or (g2_is_s and g1_is_k):
        return "K-S"
    if (g1_is_p and g2_is_s) or (g2_is_p and g1_is_s):
        return "P-S"
    return None


# Build each membership set once. Calling Series.unique() and scanning the
# resulting array inside the loop repeats the same work for every interaction
# row; set lookups give the same answers in constant time. NaN is dropped so a
# missing gene name never counts as a member.
phosphatase_names = set(depod['Phosphatase'].dropna().unique())
kinase_names = set(phosphoSitePlus['Kinase'].dropna().unique())
substrate_names = set(all_substrates['all_substrates'].dropna().unique())

for gene1, gene2 in zip(ppi['gene1'], ppi['gene2']):
    relationship = _ppi_relationship(
        gene1, gene2, phosphatase_names, kinase_names, substrate_names
    )
    if relationship is not None:
        add_edge(G, gene1, gene2, relationship)

# Nodes that ended up without any edge are dropped. The list is materialised
# first so the graph is not modified while it is being iterated.
isolated_nodes = [node for node in nx.isolates(G)]
G.remove_nodes_from(isolated_nodes)

# Persist the pruned graph as GraphML on the mounted Drive.
graphml_path = '/content/drive/MyDrive/MS Thesis/Final/graph.graphml'
nx.write_graphml(G, graphml_path)

# **Final Unique Node and Edge Counts**

In [None]:
# One zero-initialised counter per relationship label, in reporting order.
edge = dict.fromkeys(
    [
        "K-K", "S-S", "P-P",
        "K-P", "K-S", "P-S",
        "P-Drug", "P-Disease", "P-path",
        "K-Drug", "K-Disease", "K-path",
        "S-Drug", "S-Disease", "S-path",
        "P-Psite", "K-Psite", "S-Psite",
    ],
    0,
)

# Tally edges by their 'relationship' attribute; edges without the attribute
# read as '' and, like any unknown label, are not counted.
for _, _, relationship_type in G.edges(data='relationship', default=''):
    if relationship_type in edge:
        edge[relationship_type] += 1

# Node Counts
def _count_nodes_in(column):
    """Count the nodes of G whose value occurs in the pandas Series `column`.

    The distinct non-NaN values are collected into a set once, so each node is
    checked with a constant-time lookup instead of recomputing
    `column.unique()` and scanning it for every node.
    """
    members = set(column.dropna().unique())
    return sum(1 for node in G.nodes if node in members)


# A node is counted in every category whose source column contains it, so the
# categories can overlap (e.g. a gene that is both a kinase and a substrate).
Total_node = {
    "Phosphatases": _count_nodes_in(depod['Phosphatase']),
    "Kinases": _count_nodes_in(phosphoSitePlus['Kinase']),
    "Substrates": _count_nodes_in(all_substrates['all_substrates']),
    "Drugs": _count_nodes_in(drug['Drug']),
    "Diseases": _count_nodes_in(disease['Disease']),
    "Pathways": _count_nodes_in(path['Pathway']),
    "Phosphosites": _count_nodes_in(site['Phosphosite']),
}

def _print_counts(heading, counts):
    """Print `heading` after a blank line, then one 'name: count' line per entry."""
    print(f"\n{heading}")
    for name, count in counts.items():
        print(f"{name}: {count}")


# Edge totals per relationship label, then node totals per category.
_print_counts("Edge Counts:", edge)
_print_counts("Final Node Counts:", Total_node)


Edge Counts:
K-K: 1663
S-S: 24972
P-P: 359
K-P: 752
K-S: 1797
P-S: 891
P-Drug: 345
P-Disease: 47
P-path: 7102
K-Drug: 3208
K-Disease: 711
K-path: 20977
S-Drug: 6269
S-Disease: 1433
S-path: 76965
P-Psite: 921
K-Psite: 14646
S-Psite: 10039

Final Node Counts:
Phosphatases: 250
Kinases: 421
Substrates: 3286
Drugs: 9799
Diseases: 1097
Pathways: 4612
Phosphosites: 10499
