In [8]:
import os
import pandas as pd
import networkx as nx
from collections import Counter

In [2]:
# Input/output locations used throughout the notebook.
fname_in = "data/reactome.homo_sapiens.interactions.tab-delimited.txt"
fname_out = "data/reactome.homo_sapiens.ppi.striped.tsv"
fname_processed_tab = "data/reactome.homo_sapiens.ppi.processed.tsv"


# Remove the leading "#" from the first (header) line of the input file, if it
# is still there, so the header can be parsed as ordinary column names.
with open(fname_in, 'r') as src:
    lines = src.readlines()

# `lines` is empty for an empty file; guard before touching the first line.
if lines and lines[0].startswith("#"):
    # Drop the "#" and any whitespace that followed it.
    lines[0] = lines[0][1:].lstrip()

# Always (re)write the stripped copy: the next cell reads `fname_out`
# unconditionally, so it must exist and be in sync with `fname_in` even when
# the header had no "#" to remove.
with open(fname_out, 'w') as dst:
    dst.writelines(lines)


In [12]:
# Read the stripped table produced by the previous cell.
df = pd.read_csv(fname_out, sep='\t')
print("Original table size", df.shape)

# Shorter, code-friendly names for the columns used below.
rename_dict = {
    "Interactor 1 uniprot id": "protein_A_id",
    "Interactor 2 uniprot id": "protein_B_id",
    "Interaction type": "type",
    "Interaction context": "context",
    "Pubmed references": "references",
}

df = df.rename(columns=rename_dict)

# Keep only rows where BOTH interactors carry a uniprot identifier.
# na=False makes a missing id count as "no match" instead of producing a NaN
# mask, which cannot be used for boolean indexing.
mask = (
    df["protein_A_id"].str.contains('uniprot', na=False)
    & df["protein_B_id"].str.contains('uniprot', na=False)
)

cols = ['protein_A_id', 'protein_B_id', 'type']

# Filter rows, keep the three columns of interest and drop exact duplicates.
# The explicit copy makes the column assignments below operate on an
# independent frame rather than on a slice of the raw table.
df = df.loc[mask, cols].drop_duplicates().copy()

# Map every prefixed id ("<prefix>:<accession>") to the part after the last ":".
protein_ids = df["protein_A_id"].values.tolist() + df["protein_B_id"].values.tolist()
protein_ids = set(protein_ids)
id_mapping = {i: i.split(":")[-1] for i in protein_ids}
print("Total proteins:", len(id_mapping))

# Every id in both columns is a key of id_mapping, so a plain dictionary
# lookup via map() is exact and avoids the slow dict path of Series.replace.
df['protein_A_id'] = df['protein_A_id'].map(id_mapping)
df['protein_B_id'] = df['protein_B_id'].map(id_mapping)

# Discard self-interactions (same protein on both sides).
mask = df['protein_A_id'] != df['protein_B_id']
df = df[mask]

# Index by the interacting pair; 'type' remains as the only column.
df = df.set_index(['protein_A_id', 'protein_B_id'])

print("Processed table size", df.shape)

Original table size (107634, 9)
Total proteins: 5788
Processed table size (25601, 1)


In [14]:
# Collect the distinct proteins found on either side of the interaction index.
side_a, side_b = (df.index.get_level_values(level).tolist() for level in (0, 1))
protein_ids = set(side_a + side_b)

# The (protein_A_id, protein_B_id) index entries, as an array of pairs.
interactions = df.index.values

print(f"- Total proteins: {len(protein_ids)}")
print(f"- Total interactions: {df.shape[0]}")

# Counting the different types of interaction
print(f"- Total interactions by type")
type_counts = df['type'].value_counts()
print(type_counts)

- Total proteins: 5151
- Total interactions: 25601
- Total interactions by type
type
physical association                                  21250
enzymatic reaction                                     2982
cleavage reaction                                       656
dephosphorylation reaction                              346
oxidoreductase activity electron transfer reaction      100
acetylation reaction                                     75
gtpase reaction                                          34
phospholipase reaction                                   30
nucleoside triphosphatase reaction                       28
glycosylation reaction                                   26
demethylation reaction                                   18
deubiquitination reaction                                15
deacetylation reaction                                   12
carboxylation reaction                                    9
amidation reaction                                        9
deneddylation r

In [17]:
# Build an undirected protein-protein interaction graph.
# nx.Graph is a simple graph: a pair that appears more than once in
# `interactions` (in either orientation) ends up as a single edge.
G = nx.Graph()
# Nodes are added first, then edges; only the (A, B) pairs are passed, so the
# interaction 'type' column is not attached to the edges.
G.add_nodes_from(protein_ids)
G.add_edges_from(interactions)

# Persist the full graph in GraphML format.
fname = "data/reactome.homo_sapiens.ppi.graphml"
nx.write_graphml(G, fname)

In [18]:
# Node sets of every connected component of G, ordered from largest to smallest.
connected_components = sorted(nx.connected_components(G), key=len, reverse=True)

# Tally how many components exist for each component size
# (Series index = component size, value = number of components of that size).
print("Distribution size for connected components:")
s = pd.Series(Counter(map(len, connected_components)), name="size")
print(s)

# After the descending sort the first entry is the largest component;
# take the subgraph of G induced by its nodes.
largest_nodes = connected_components[0]
giant_comp = G.subgraph(largest_nodes)
print()
print(f"Size of the giagant component: nodes={len(giant_comp.nodes())}, interactions/edges={len(giant_comp.edges())}")

# Persist the giant component in GraphML format.
fname = "data/reactome.homo_sapiens.ppi.giant_comp.graphml"
nx.write_graphml(giant_comp, fname)

Distribution size for connected components:
4109      1
25        1
15        1
12        1
11        5
10        3
9         3
8         9
7         9
6         6
5        14
4        38
3        63
2       148
Name: size, dtype: int64

Size of the giagant component: nodes=4109, interactions/edges=19865


In [24]:
# Degree centrality of every node in the giant component; show the ten
# highest-scoring proteins.
degree_centrality = nx.degree_centrality(giant_comp)
(
    pd.Series(degree_centrality)
    .sort_values(ascending=False)
    .head(10)
)

P62993    0.044791
P62805    0.036514
P12931    0.036027
Q07889    0.034567
P27986    0.034323
P42336    0.032863
Q06124    0.029211
P63279    0.029211
P29353    0.028481
P63165    0.026534
dtype: float64

In [28]:
# Raw degree (number of neighbours) per node in the giant component; show the
# ten best-connected proteins.
degree_dict = dict(giant_comp.degree())
(
    pd.Series(degree_dict)
    .sort_values(ascending=False)
    .head(10)
)

P62993    184
P62805    150
P12931    148
Q07889    142
P27986    141
P42336    135
Q06124    120
P63279    120
P29353    117
P63165    109
dtype: int64

In [26]:
# Betweenness centrality of every node in the giant component; show the ten
# highest-scoring proteins.
# NOTE: the computation must stay enabled. With it commented out the name
# `betweenness_centrality` only exists if left over from an earlier kernel
# session, and the cell raises NameError on Restart & Run All.
# This is the exact (non-sampled) computation and can take a while on a graph
# of this size.
betweenness_centrality = nx.centrality.betweenness_centrality(giant_comp)
pd.Series(betweenness_centrality).sort_values(ascending=False).head(10)

P62805    0.052600
P12931    0.049405
P03372    0.049404
Q09472    0.037075
P07900    0.037000
P63279    0.036803
P0DP23    0.035652
P62877    0.035496
P35222    0.035236
P04637    0.033808
dtype: float64

In [29]:
# Closeness centrality of every node in the giant component; show the ten
# highest-scoring proteins.
closeness_centrality = nx.closeness_centrality(giant_comp)
(
    pd.Series(closeness_centrality)
    .sort_values(ascending=False)
    .head(10)
)

P12931    0.306179
P07900    0.304567
P27986    0.303263
P29353    0.302392
P42336    0.302037
P62993    0.296671
P03372    0.295348
P28482    0.293869
Q09472    0.292739
Q07889    0.291244
dtype: float64