In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import networkx as nx
import json
from networkx.readwrite import json_graph

In [8]:
def read_json_file(filename):
    """Load a graph stored as node-link JSON.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The graph built by ``json_graph.node_link_graph`` from the parsed
        JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON text is UTF-8 by specification; pin the encoding so reading does
    # not depend on the platform's locale default.
    with open(filename, encoding="utf-8") as f:
        js_graph = json.load(f)
    return json_graph.node_link_graph(js_graph)

# Load the graph from its node-link JSON file; later cells read each
# node's 'bipartite' attribute from this graph.
G = read_json_file('graphJson.json')

In [9]:
# Split the nodes by their 'bipartite' attribute:
# 0 marks the peptide side, 1 marks the protein side.
peptides = {n for n in G.nodes if G.nodes[n]['bipartite'] == 0}
proteins = {n for n in G.nodes if G.nodes[n]['bipartite'] == 1}

# Single pass over the peptides: record each peptide's neighbour count and,
# at the same time, collect those linked to more than one neighbour.
peptide_counts = {}
multi_protein_peptides = []
for peptide in peptides:
    n_linked = len(list(G.neighbors(peptide)))
    peptide_counts[peptide] = n_linked
    if n_linked > 1:
        multi_protein_peptides.append(peptide)

# Report the peptides shared by several proteins
print('Peptides used to construct more than one protein:', multi_protein_peptides)

Peptides used to construct more than one protein: []


In [10]:
# Identify hub proteins
# A protein is reported as a "hub" when its degree in G is greater than 1.
hub = input("Do you want to see the hub proteins? (yes/no)")
# Normalise the answer so that "Yes", "YES" or " yes " are accepted as well;
# an exact comparison would silently skip the analysis for those inputs.
if hub.strip().lower() == 'yes':
    proteins = {node for node in G.nodes() if G.nodes[node]['bipartite'] == 1}
    degree_dict = dict(G.degree(proteins))
    hub_proteins = [protein for protein, degree in degree_dict.items() if degree > 1]
    print('Hub proteins:', hub_proteins)

Hub proteins: ['O00533']


In [11]:
# Identify overlapping peptides
# The prompt text now names what the cell actually lists (peptides).
overlapping = input("Do you want to see if there are overlapping peptides? (yes/no)")
peptides = {node for node in G.nodes() if G.nodes[node]['bipartite'] == 0}

# A peptide "overlaps" when it has more than one neighbour in G.
# The list is always computed so the variable exists whatever the answer is.
overlapping_peptides = [
    peptide for peptide in peptides if len(list(G.neighbors(peptide))) > 1
]

# Honour the answer: previously it was read but never checked, so the
# result was printed even when the user declined.
if overlapping.strip().lower() == 'yes':
    print('Overlapping peptides:', overlapping_peptides)

Overlapping peptides: []


In [12]:
# NOTE(review): this rebinds G to the graph read from the GraphML file,
# replacing whatever graph G referred to before this cell ran. Re-running
# earlier cells afterwards will operate on this graph instead.
G = nx.read_graphml('AMP_Parkinsons_Protein_Peptide.graphml')

# Betweenness centrality of every node; displayed at the end of the cell.
centrality = nx.betweenness_centrality(G)

# Strip surrounding whitespace so a pasted name with a stray space or
# newline still matches a node id.
peptide_source = input('Which peptide do you want to choose as a source?').strip()
protein_target = input('Which protein do you want to choose as a target?').strip()

# Find shortest path between a peptide and a protein.
# nx.shortest_path raises NodeNotFound when an endpoint is not in G and
# NetworkXNoPath when the endpoints are not connected; report that instead
# of aborting the cell with a traceback.
try:
    shortest_path = nx.shortest_path(G, source=peptide_source, target=protein_target)
except (nx.NodeNotFound, nx.NetworkXNoPath) as err:
    shortest_path = None
    print("Could not find a path:", err)
else:
    print("Shortest path:", " -> ".join(shortest_path))

    # Find all paths between a peptide and a protein (at most 4 edges long).
    all_paths_vis = input("Do you want to see all the paths in the graph? (yes/no)")
    if all_paths_vis.strip().lower() == 'yes':
        all_paths = nx.all_simple_paths(G, source=peptide_source, target=protein_target, cutoff=4)
        for path in all_paths:
            print("Path:", " -> ".join(path))

centrality

Shortest path: IEIPSSVQQVPTIIK -> O00533 -> VIAVNEVGR
Path: IEIPSSVQQVPTIIK -> O00533 -> VIAVNEVGR


{'C(UniMod_4)TSTGWIPAPR': 0.0,
 'TVVQPSVGAAAGPVVPPC(UniMod_4)PGR': 0.0,
 'VDNALQSGNSQESVTEQDSK': 0.0,
 'C(UniMod_4)LVEKGDVAFVKHQTVPQNTGGK': 0.0,
 'NLHGDGIALWYTR': 0.0,
 'VTLTC(UniMod_4)VAPLSGVDFQLR': 0.0,
 'TTPEPC(UniMod_4)ELDDEDFR': 0.0,
 'IC(UniMod_4)LEDNVLMSGVK': 0.0,
 'QLKEHAVEGDC(UniMod_4)DFQLLKLDGK': 0.0,
 'FTQVTPTSLSAQWTPPNVQLTGYR': 0.0,
 'FSALEVDETYVPK': 0.0,
 'INHC(UniMod_4)RFDEFFSEGC(UniMod_4)APGSKK': 0.0,
 'ADLSGITGAR': 0.0,
 'LLRDPADASEAHESSSR': 0.0,
 'QIGSVYR': 0.0,
 'DTDTGALLFIGK': 0.0,
 'THLAPYSDELR': 0.0,
 'FLATTPNSLLVSWQPPR': 0.0,
 'FNALQYLR': 0.0,
 'MDYPKQTQVSVLPEGGETPLFK': 0.0,
 'ASYGVKPR': 0.0,
 'AQC(UniMod_4)GGGLLGVR': 0.0,
 'FQNALLVR': 0.0,
 'NFGYTLR': 0.0,
 'WKNFPSPVDAAFR': 0.0,
 'GAYPLSIEPIGVR': 0.0,
 'VLLDGVQNPR': 0.0,
 'SLAPYAQDTQEK': 0.0,
 'VSTLPAITLK': 0.0,
 'LRENELTYYC(UniMod_4)C(UniMod_4)K': 0.0,
 'SLEDLQLTHNKITK': 0.0,
 'LVWEEAMSR': 0.0,
 'DTVIKPLLVEPEGLEK': 0.0,
 'LLDNWDSVTSTFSK': 0.0,
 'LVAYYTLIGASGQR': 0.0,
 'C(UniMod_4)PNPPVQENFDVNK': 0.0,
 'RLEAGDHPV

This graph analysis concludes that `no two proteins are made up of the same set of peptides, and that a peptide used for one protein is never used to construct another protein`. This follows from the overlapping-peptides check run above. It is crucial information for the ML algorithm, because it saves time and helps choose a model efficiently.