# Protein-Protein Interaction Graph Preprocessing
Author: Cleverson Matiolli, Ph.D.

This notebook focuses on preprocessing of Protein-Protein Interaction (PPI) networks into torch graph format [PyTorch Geometric](http://www.pyg.org) for use in the Aid2GO Heterogeneous Graph Attention Network (Aid2GO-HAN).

**Key Steps:**
1. Load and Preprocess the PPI Dataset
2. Filter PPI dataset into annotated proteins (ground-truth GO terms) vs non-annotated
3. Extract PPI metadata (PPI detection methods, )
4. Construct a GO graph representation for use in the main Aid2GO model
5. Perform Exploratory Data Analysis (EDA) to analyze embedding quality


In [1]:
# Standard library imports
import os
import pickle
from pathlib import Path
import requests
import sys

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# Bioinformatics and deep learning
import networkx as nx
import torch
from torch_geometric.data import Data
from torch_geometric.utils import degree, to_undirected

# Custom imports
import aid2go.prot as aidprot

# Configuration
pd.options.mode.copy_on_write = True

# UniProt REST API endpoint
UNIPROT_API = "https://rest.uniprot.org"

## 1. Load and Preprocess the PPI Dataset

### 1.2. Anc2Vec PPI Dataset
>Edera, A.A., Milone, D.H. and Stegmayer, G. (2022) 'Anc2vec: embedding gene ontology terms by preserving ancestors relationships', Briefings in Bioinformatics, 23(2), bbac003. Available at: https://doi.org/10.1093/bib/bbac003 (Accessed: 14 October 2024).

In [72]:
# Load anc2vec PPI dataset

ppi_df = pd.read_csv("./data/anc2vec/protein.actions.v11.0.txt", sep="\t", header=None)
print(f"PPI data shape: {ppi_df.shape}")
ppi_df.head()

PPI data shape: (3740874, 4)


Unnamed: 0,0,1,2,3
0,F6T1W7!O54952,GO:0006261!GO:0043625,GO:0032355!GO:0005515!GO:0007420!GO:0005759!GO...,1
1,Q9V9M7!Q9VHE5,GO:0002181!GO:0022625!GO:0003735,GO:0002181!GO:0022625!GO:0003735,1
2,Q9V3E7!Q9VAX8,GO:0005634!GO:0005654!GO:0031965!GO:0000398,GO:0010628!GO:0030532!GO:0000398!GO:0005737,1
3,P48181!Q95XG8,GO:0046662!GO:0043051!GO:0007274!GO:0006937!GO...,GO:0055071!GO:0031090!GO:0010038!GO:0016324!GO...,1
4,O15379!Q9NQB0,GO:0045892!GO:0006325!GO:0051059!GO:0000122!GO...,GO:0045892!GO:0000122!GO:0007050!GO:0044334!GO...,1


In [73]:
# Split and rename columns

# Split interactors
ppi_df[["uniprot_id_1", "uniprot_id_2"]] = ppi_df[0].str.split("!", expand=True)

# Get GO terms into lists
ppi_df["go_terms_1"] = ppi_df[1].str.split("!", expand=False)
ppi_df["go_terms_2"] = ppi_df[2].str.split("!", expand=False)

# Drop original columns
ppi_df.drop(columns=[0, 1, 2], inplace=True)

# Rename label column
ppi_df.rename(columns={3: "label"}, inplace=True)

# Reorder columns
ppi_df = ppi_df[["uniprot_id_1", "uniprot_id_2", "go_terms_1", "go_terms_2", "label"]]

# Check for NA values again
print("NA's:")
print(ppi_df.isna().any())

print(f"PPI data shape: {ppi_df.shape}")
ppi_df.head()

NA's:
uniprot_id_1    False
uniprot_id_2    False
go_terms_1      False
go_terms_2      False
label           False
dtype: bool
PPI data shape: (3740874, 5)


Unnamed: 0,uniprot_id_1,uniprot_id_2,go_terms_1,go_terms_2,label
0,F6T1W7,O54952,"[GO:0006261, GO:0043625]","[GO:0032355, GO:0005515, GO:0007420, GO:000575...",1
1,Q9V9M7,Q9VHE5,"[GO:0002181, GO:0022625, GO:0003735]","[GO:0002181, GO:0022625, GO:0003735]",1
2,Q9V3E7,Q9VAX8,"[GO:0005634, GO:0005654, GO:0031965, GO:0000398]","[GO:0010628, GO:0030532, GO:0000398, GO:0005737]",1
3,P48181,Q95XG8,"[GO:0046662, GO:0043051, GO:0007274, GO:000693...","[GO:0055071, GO:0031090, GO:0010038, GO:001632...",1
4,O15379,Q9NQB0,"[GO:0045892, GO:0006325, GO:0051059, GO:000012...","[GO:0045892, GO:0000122, GO:0007050, GO:004433...",1


In [74]:
# Get basic information about the data

# Unique protein's set
protein_ids_set = set(pd.concat([ppi_df["uniprot_id_1"] + ppi_df["uniprot_id_2"]]))

# Unique GO terms set
go_terms_set = set()
pd.concat([ppi_df["go_terms_1"], ppi_df["go_terms_2"]]).apply(
    lambda x: go_terms_set.update(x)
)

# Label counts (interaction: 1, no interaction:0)
label_counts = ppi_df["label"].value_counts()

print(f"Protein counts: {len(protein_ids_set)}")
print(f"GO term counts: {len(go_terms_set)}")
print(f"Label counts: {label_counts}")

Protein counts: 3740874
GO term counts: 25285
Label counts: label
1    1870437
0    1870437
Name: count, dtype: int64


In [75]:
# Check duplicated PPI pairs (both directions A-B and B-A)

# Sort ids ((A-B and B-A are now identical))
ppi_df["sorted_A"] = ppi_df[["uniprot_id_1", "uniprot_id_2"]].min(axis=1)
ppi_df["sorted_B"] = ppi_df[["uniprot_id_1", "uniprot_id_2"]].max(axis=1)

# Count duplicates
duplicated_count = (
    ppi_df.shape[0] - ppi_df.drop_duplicates(subset=["sorted_A", "sorted_B"]).shape[0]
)

print(f"Duplicated PPI pairs: {duplicated_count}")

# Check for duplicates on the sorted pairs
ppi_df = ppi_df[~ppi_df.duplicated(subset=["sorted_A", "sorted_B"])]

# Drop the temporary sorted columns
ppi_df.drop(columns=["sorted_A", "sorted_B"], inplace=True)

# Save the dataset
ppi_df.reset_index(drop=True, inplace=True)
ppi_df.to_csv("./data/anc2vec/ppi_anc2vec.tsv", sep="\t", index=False)

# Display the DataFrame without the duplicates
print(f"Duplicated PPI pair counts: {duplicated_count}")
print(f"PPI data shape: {ppi_df.shape}")
ppi_df.head()

Duplicated PPI pairs: 0
Duplicated PPI pair counts: 0
PPI data shape: (3740874, 5)


Unnamed: 0,uniprot_id_1,uniprot_id_2,go_terms_1,go_terms_2,label
0,F6T1W7,O54952,"[GO:0006261, GO:0043625]","[GO:0032355, GO:0005515, GO:0007420, GO:000575...",1
1,Q9V9M7,Q9VHE5,"[GO:0002181, GO:0022625, GO:0003735]","[GO:0002181, GO:0022625, GO:0003735]",1
2,Q9V3E7,Q9VAX8,"[GO:0005634, GO:0005654, GO:0031965, GO:0000398]","[GO:0010628, GO:0030532, GO:0000398, GO:0005737]",1
3,P48181,Q95XG8,"[GO:0046662, GO:0043051, GO:0007274, GO:000693...","[GO:0055071, GO:0031090, GO:0010038, GO:001632...",1
4,O15379,Q9NQB0,"[GO:0045892, GO:0006325, GO:0051059, GO:000012...","[GO:0045892, GO:0000122, GO:0007050, GO:004433...",1


In [76]:
# Get only positive interactions
ppi_df_pos = ppi_df[ppi_df["label"] == 1]
ppi_df_pos.shape

(1870437, 5)

In [78]:
# Extract protein-GO associations

associations_df = pd.concat(
    [
        ppi_df_pos[["uniprot_id_1", "go_terms_1"]],
        ppi_df_pos[["uniprot_id_2", "go_terms_2"]].rename(
            columns={"uniprot_id_2": "uniprot_id_1", "go_terms_2": "go_terms_1"}
        ),
    ],
    axis=0,
    ignore_index=True,
    join="outer",
    verify_integrity=True,
)

associations_df.rename(
    columns={"uniprot_id_1": "uniprot_id", "go_terms_1": "go_terms"}, inplace=True
)

# Drop duplicated protein identifiers
associations_df.drop_duplicates(subset="uniprot_id", inplace=True)

# Save the associations dataset
associations_df.to_csv("./data/anc2vec/associations_anc2vec.tsv", sep="\t", index=False)

print(f"associations data shape: {associations_df.shape}")
associations_df.head()

associations data shape: (70081, 2)


Unnamed: 0,uniprot_id,go_terms
0,F6T1W7,"[GO:0006261, GO:0043625]"
1,Q9V9M7,"[GO:0002181, GO:0022625, GO:0003735]"
2,Q9V3E7,"[GO:0005634, GO:0005654, GO:0031965, GO:0000398]"
3,P48181,"[GO:0046662, GO:0043051, GO:0007274, GO:000693..."
4,O15379,"[GO:0045892, GO:0006325, GO:0051059, GO:000012..."


In [24]:
# PPI counts (unique)

# Extract interactions
ppi_pairs = list(
    tuple(sorted(x))
    for x in ppi_df_pos[["uniprot_id_1", "uniprot_id_2"]].to_numpy()
)

# Extract unique interactions
ppi_unique_pairs = list(
    set(
        tuple(sorted(x))
        for x in ppi_df_pos[["uniprot_id_1", "uniprot_id_2"]].to_numpy()
    )
)

# Extract unique uniprot ids in interactions
ppi_unique_ids = associations_df["uniprot_id"].unique()
print(f"# of unique UniProt IDs: {ppi_unique_ids.shape[0]}")
print(f"# of PPIs: {len(ppi_pairs)}")
print(f"# of unique PPIs: {len(ppi_unique_pairs)}")

# of unique UniProt IDs: 70081
# of PPIs: 1870437
# of unique PPIs: 1870437


## Extract Largest Connected Component from PPI graph (ensures connectivity)

In [57]:
def get_lcc(ppi_pairs, degree_threshold, graph_type="undirected"):
    """
    Function to create a graph (directed or undirected) from PPI pairs, filter nodes by degree (keeping nodes with degree <= degree_threshold),
    and return the largest connected component (LCC), printing graph statistics.

    Parameters:
    ppi_pairs (list of tuples): List of PPI (protein-protein interaction) pairs to form the graph edges.
    degree_threshold (int): Maximum degree of nodes to keep.
    graph_type (str): Type of graph to create, either "directed" or "undirected" (default).

    Returns:
    ppi_graph_lcc (Graph): Largest connected component (LCC) of the filtered graph.
    """

    # Create the graph based on the graph_type parameter
    if graph_type == "directed":
        ppi_graph = nx.DiGraph(ppi_pairs)  # Directed graph
    else:
        ppi_graph = nx.Graph(ppi_pairs)  # Undirected graph (default)

    # Filter nodes based on the degree threshold (keeping nodes with degree <= threshold)
    filtered_nodes = [
        node for node, degree in ppi_graph.degree() if degree <= degree_threshold
    ]

    # Create a subgraph with the filtered nodes
    ppi_graph_filtered = ppi_graph.subgraph(filtered_nodes).copy()

    # Get the largest connected component (LCC)
    if graph_type == "directed":
        # Use weakly connected components for directed graphs
        lcc_filtered = max(nx.weakly_connected_components(ppi_graph_filtered), key=len)
    else:
        # Use connected components for undirected graphs
        lcc_filtered = max(nx.connected_components(ppi_graph_filtered), key=len)

    ppi_graph_lcc = ppi_graph_filtered.subgraph(lcc_filtered).copy()

    # Extract nodes and edges for the filtered LCC graph
    ppi_nodes_filtered = list(ppi_graph_lcc.nodes)
    ppi_edges_filtered = list(ppi_graph_lcc.edges)

    # Calculate average degree
    avr_degree = (
        sum(dict(ppi_graph_lcc.degree()).values()) / ppi_graph_lcc.number_of_nodes()
    )

    # Print-out basic graph attributes after filtering by degree
    print("\nFiltered PPI graph (by degree) attributes:")
    print(
        f"# of nodes (after filtering by degree <= {degree_threshold}): {ppi_graph_filtered.number_of_nodes()}"
    )
    print(f"# of edges (after filtering): {ppi_graph_filtered.number_of_edges()}")

    print("\nLargest Connected Component (LCC) attributes:")
    print(f"# of nodes (LCC): {len(ppi_nodes_filtered)}")
    print(f"# of edges (LCC): {len(ppi_edges_filtered)}")
    print(
        f"Is connected? (LCC): {nx.is_connected(ppi_graph_lcc) if graph_type == 'undirected' else nx.is_weakly_connected(ppi_graph_lcc)}"
    )
    print(f"Average degree: {avr_degree:.2f}")

    return ppi_graph_lcc

In [58]:
ppi_graph_lcc = get_lcc(
    ppi_pairs=ppi_unique_pairs, graph_type="undirected", degree_threshold=20
)


Filtered PPI graph (by degree) attributes:
# of nodes (after filtering by degree <= 20): 33479
# of edges (after filtering): 38321

Largest Connected Component (LCC) attributes:
# of nodes (LCC): 1514
# of edges (LCC): 3094
Is connected? (LCC): True
Average degree: 4.09


## Filter-out disconnected components

In [61]:
# Filter protein-go associations with LCC nodes

# Keep full protein-GO associations for plotting
associations_full = associations_df.copy()

# Remove associations with large proteins
associations_df = associations_df[
    associations_df["uniprot_id"].isin(list(ppi_graph_lcc.nodes()))
].copy()
associations_df.reset_index(drop=True, inplace=True)

# Save filtered df
file_path = "./data/ppi/associations.tsv"
associations_df.to_csv(file_path, sep="\t", index=False)


print(f"Filtered Protein-GO associations dims: {associations_df.shape}")
print(f"# of unique UniProt IDs: {len(associations_df['uniprot_id'].unique())}")
associations_df.head()

Filtered Protein-GO associations dims: (1514, 2)
# of unique UniProt IDs: 1514


Unnamed: 0,uniprot_id,go_terms
0,P08390,"[GO:0005515, GO:0005829]"
1,P0AA37,"[GO:0120159, GO:0009982, GO:0031118, GO:000803..."
2,P60546,"[GO:0004385, GO:0042802, GO:0005829]"
3,P11553,"[GO:0008737, GO:0019571, GO:0042355]"
4,P0A867,[GO:0005829]


## Create PPI graph

In [69]:
# Create PPI graph dataset


def PPIDataset(
    source_embeddings_dir,
    dest_embeddings_dir,
    associations_df,
    ppi_edges,
    dataset="train",
):
    """
    Process protein embeddings, create a Protein-Protein Interaction (PPI) graph, and save the data.

    This function loads protein embeddings from .pt files, filters them based on the provided associations,
    constructs a PPI graph, computes degrees, plots the degree distribution, and saves the processed data.

    Args:
        data_folder (Path): Path to the main data directory.
        ppi_folder (Path): Path to the PPI-specific folder where outputs will be saved.
        associations_df (pd.DataFrame): DataFrame containing protein associations with at least a 'uniprot_id' column.
        ppi_edges (list): List of tuples representing edges in the PPI network.
        dataset (str, optional): Identifier for the dataset (e.g., 'train', 'test'). Defaults to "train".

    Returns:
        torch_geometric.data.Data: The processed PPI data containing node features and edge indices.
    """
    
    source_embeddings_dir = Path(source_embeddings_dir)
    dest_embeddings_dir = Path(dest_embeddings_dir)

    # Load embeddings directly from the folder
    source_embeddings_dir = Path(source_embeddings_dir)  # Ensures Path Object
    source_embedding_files = [
        f for f in os.listdir(source_embeddings_dir) if f.endswith(".pt")
    ]
    print(
        f"# embeddings in {source_embeddings_dir.as_posix()}: {len(source_embedding_files)}"
    )

    # Collect embeddings
    unique_proteins = set(associations_df["uniprot_id"].unique())
    protein_embed_dict = {}
    for file in tqdm(source_embedding_files, desc="Loading Protein Embeddings"):
        uniprot_id = file.split(".")[0]
        if uniprot_id in unique_proteins:
            tensor_path = source_embeddings_dir / file
            tensor = torch.load(tensor_path, weights_only=True)
            assert tensor.shape == (
                1,
                1024,
            ), f"Tensor shape mismatch: {tensor.shape} != (1, 1024)"
            protein_embed_dict[uniprot_id] = tensor.numpy().flatten()

    # Save embeddings
    with open(dest_embeddings_dir / "protein_embeddings.pkl", "wb") as file:
        pickle.dump(protein_embed_dict, file)

    # Process embeddings into a feature matrix
    protein_ids = list(protein_embed_dict.keys())
    protein_feats = torch.stack(
        [
            torch.tensor(protein_embed_dict[pid], dtype=torch.float32)
            for pid in protein_ids
        ]
    )
    protein_ids_map = {pid: idx for idx, pid in enumerate(protein_ids)}

    print(f"# of protein IDs: {len(protein_ids)}")
    print(f"Protein features shape: {protein_feats.shape}")

    print(protein_ids_map)

    # Create edge index
    edges_to_tensor = []
    missing_proteins = set()
    for e in ppi_edges:
        if e[0] in protein_ids_map and e[1] in protein_ids_map:
            edges_to_tensor.append((protein_ids_map[e[0]], protein_ids_map[e[1]]))
        else:
            missing_proteins.update([e[0], e[1]])

    if missing_proteins:
        print(f"{len(missing_proteins)} missing proteins: {missing_proteins}")

    if edges_to_tensor:
        edge_index_ppi = (
            torch.tensor(edges_to_tensor, dtype=torch.long).t().contiguous()
        )
        print(f"Edge index shape: {edge_index_ppi.shape}")
    else:
        print("No valid edges found!")

    # Convert to undirected graph and compute degrees
    edge_index_ppi = to_undirected(edge_index_ppi)
    degrees = degree(edge_index_ppi[0])

    # Plot degree distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(degrees.numpy(), bins=50, kde=True)
    plt.title("Distribution of Degrees")
    plt.xlabel("Degree")
    plt.ylabel("Frequency")
    plt.savefig(dest_embeddings_dir / f"degree_distribution_{dataset}.png")
    plt.close()

    # Create and save Data object
    ppi_data = Data(x=protein_feats, edge_index=edge_index_ppi)
    torch.save(ppi_data, dest_embeddings_dir / f"ppi_dataset_{dataset}.pt")

    return ppi_data, missing_proteins

In [70]:
source_embeddings_dir = "./data/protein/embeddings_goa-hc/per_protein/"
dest_embeddings_dir = "./data/ppi"
ppi_data_human, missing_proteins = PPIDataset(
    source_embeddings_dir=source_embeddings_dir,
    dest_embeddings_dir= "./data/ppi/",
    associations_df=associations_df,
    ppi_edges=ppi_graph_lcc.edges(),
    dataset="train"
)

# Save missing proteins list
np.save("missing_proteins.npy", np.array(list(missing_proteins)), allow_pickle=True)

print(f"Average degree: {degree(ppi_data_human.edge_index[0]).mean():.2f}")
print(f"Max degree: {degree(ppi_data_human.edge_index[0]).max():.2f}")
print(f"Min degree: {degree(ppi_data_human.edge_index[0]).min():.2f}")
ppi_data_human

# embeddings in data/protein/embeddings_goa-hc/per_protein: 78020


Loading Protein Embeddings:   0%|          | 0/78020 [00:00<?, ?it/s]

# of protein IDs: 1508
Protein features shape: torch.Size([1508, 1024])
{'P0ADV5': 0, 'P76192': 1, 'P0AGJ9': 2, 'P77608': 3, 'P0ABV6': 4, 'P0A9S7': 5, 'P04395': 6, 'P0A710': 7, 'P04995': 8, 'P27247': 9, 'P39377': 10, 'P77308': 11, 'P33129': 12, 'P52006': 13, 'P68398': 14, 'P68567': 15, 'P0A9J0': 16, 'P0ABJ3': 17, 'P0AEY5': 18, 'P0ADR8': 19, 'P0AE70': 20, 'P08179': 21, 'P03004': 22, 'P37353': 23, 'P45756': 24, 'Q6BF16': 25, 'P08190': 26, 'P30864': 27, 'P09099': 28, 'P0A6V8': 29, 'P0A7F3': 30, 'P75733': 31, 'P32684': 32, 'P0A6W5': 33, 'P52599': 34, 'P77329': 35, 'P0AF24': 36, 'P0AFU8': 37, 'P60546': 38, 'P39353': 39, 'P77165': 40, 'Q46927': 41, 'P0ABY4': 42, 'P37338': 43, 'P0AEL3': 44, 'P0AAW9': 45, 'P0A6N4': 46, 'P0A6N8': 47, 'P16431': 48, 'P76049': 49, 'P16678': 50, 'P0AEJ2': 51, 'P32057': 52, 'P0A944': 53, 'P39452': 54, 'P15977': 55, 'P0AC81': 56, 'P21437': 57, 'P37769': 58, 'P37146': 59, 'P15723': 60, 'P0A991': 61, 'P0AFW2': 62, 'P06610': 63, 'P0AGH3': 64, 'P11557': 65, 'P39411': 66,

Data(x=[1508, 1024], edge_index=[2, 6136])

### 1.1. STRING DB

In [None]:
# Load STRING PPI data
string_hs_df = pd.read_csv("./data/ppi/string/9606.protein.physical.links.v12.0.txt", sep=" ")
print(string_hs_df.info())
string_hs_df.head()

In [None]:
# Filter by score
string_hs_df = string_hs_df[string_hs_df["combined_score"] >= 800]
string_hs_df.sort_values(by="combined_score", inplace=True)
string_hs_df.reset_index(drop=True, inplace=True)
print(f"shape after filtering: {string_hs_df.shape}")
string_hs_df.head()

In [None]:
# Split column IDs
string_hs_df["protein1"] = string_hs_df["protein1"].str.extract(r"\.(.*)$")[0]
string_hs_df["protein2"] = string_hs_df["protein2"].str.extract(r"\.(.*)$")[0]

# Remove self-loops
string_hs_df = string_hs_df[string_hs_df["protein1"]!= string_hs_df["protein2"]]
print(f"shape after removing self-loops: {string_hs_df.shape}")

# Remove redundant entries
string_hs_df = string_hs_df.drop_duplicates()
print(f"shape after dropping duplicates: {string_hs_df.shape}")

# Remove repeated entries
string_hs_df = string_hs_df[~string_hs_df.duplicated(subset=["protein1", "protein2"])]
print(f"shape after removing duplicated entries: {string_hs_df.shape}")

print(string_hs_df.shape)
string_hs_df.head()

In [None]:
# Drop duplicated ppi pairs (A-B and B-A)

# Create a new DataFrame with sorted IDs to handle A-B and B-A equivalency
string_hs_df["sorted_A"] = string_hs_df[["protein1", "protein2"]].min(axis=1)
string_hs_df["sorted_B"] = string_hs_df[["protein1", "protein2"]].max(axis=1)

# Check for duplicates on the sorted pairs (A-B and B-A are now identical)
string_hs_df = string_hs_df[~string_hs_df.duplicated(subset=["sorted_A", "sorted_B"])]

# Drop the temporary sorted columns
string_hs_df.drop(columns=["sorted_A", "sorted_B"], inplace=True)

print(f"shape after removing duplicated pairs: {string_hs_df.shape}")
string_hs_df

In [None]:
# Check score distribution
string_hs_df["combined_score"].iloc[:].plot(kind="hist")
plt.title("Combined Score Distribution")
plt.xlabel("Combined Score")
plt.show()

In [None]:
# Get unique protein identifiers (ENSEMBLE)
unique_protein_ids = pd.concat(
    [string_hs_df["protein1"], string_hs_df["protein2"]], ignore_index=True
).unique()
unique_protein_ids = list(unique_protein_ids)

print(f"# unique proteins in PPI data: {len(unique_protein_ids)}")

In [None]:
# Submit job to UniProt REST API
job_id = aidprot.submit_id_mapping(
    from_db="Ensembl_Protein", to_db="UniProtKB-Swiss-Prot", ids=unique_protein_ids
)
print(job_id)

In [None]:
# Fetch all mapping results (paginated results)
mappings = aidprot.fetch_all_results(job_id, UNIPROT_API)
mappings

In [None]:
# Save mappings to df
protein_map_df = pd.DataFrame.from_dict(mappings)
protein_map_df.columns = ["ensembl_id", "uniprot_id"]
protein_map_df.to_csv("./data/ppi/uniprot_map.csv", index=False)
protein_map_df

In [None]:
protein_map_df = pd.read_csv("./data/ppi/uniprot_map.csv")
# Check duplicate mapping
duplicate_mapping_df = protein_map_df[protein_map_df.duplicated(subset="uniprot_id")].sort_values(
    by="uniprot_id"
)

print(f"# duplicated UniProt ids: {len(duplicate_mapping_df["uniprot_id"].unique())}")

counts = pd.DataFrame(duplicate_mapping_df.groupby("uniprot_id").value_counts()).sort_values(by="uniprot_id")
counts

In [None]:
# Map Uniprot IDs to PPI dataset
string_hs_df_mapped = pd.merge(
    string_hs_df, protein_map_df, left_on="protein1", right_on="ensembl_id", how="left"
)
string_hs_df_mapped = pd.merge(
    string_hs_df_mapped,
    protein_map_df,
    left_on="protein2",
    right_on="ensembl_id",
    how="left",
)

# Drop NAs (unmapped ids)
string_hs_df_mapped.dropna(subset=["uniprot_id_x", "uniprot_id_y"], inplace=True)
string_hs_df_mapped.reset_index(drop=True, inplace=True)

# Remove ununsed columns and rename
string_hs_df_mapped.drop(
    columns=["ensembl_id_x", "ensembl_id_y", "protein1", "protein2"], inplace=True
)
string_hs_df_mapped.rename(
    columns={"uniprot_id_x": "uniprot_id_A", "uniprot_id_y": "uniprot_id_B"},
    inplace=True,
)

# Rename and reorder columns
string_hs_df_mapped = string_hs_df_mapped[
    ["uniprot_id_A", "uniprot_id_B", "combined_score"]
]

# Save mapped PPI data
string_hs_df_mapped.to_csv("./data/ppi/ppi_string_hs_mapped.tsv", sep="\t", index=False)

print(string_hs_df_mapped.info())
string_hs_df_mapped.head()

In [None]:
# Load ground-truth annotations (SwissProt)
associations_df = pd.read_csv("./data/goa/goa_hc_annot_prop.tsv", sep="\t")
print(f"Annotations df shape: {associations_df.shape}")
associations_df.head()

In [None]:
# Extract unique protein ids
protein_ids_annot = associations_df["EntryID"].unique()  # Get all annotated proteins

# Extract pairs where both aren't annotated
unknown_df = string_hs_df_mapped[
    ~((string_hs_df_mapped["uniprot_id_A"].isin(protein_ids_annot))
    & (string_hs_df_mapped["uniprot_id_B"].isin(protein_ids_annot)))
]
unknown_df.reset_index(drop=True, inplace=True)
unknown_df.to_csv("./data/ppi/unknown.tsv", sep="\t", index=False)
print(unknown_df.shape)

# Extract pairs where both are annotated
string_hs_df_mapped_annot = string_hs_df_mapped[
    ((string_hs_df_mapped["uniprot_id_A"].isin(protein_ids_annot))
    & (string_hs_df_mapped["uniprot_id_B"].isin(protein_ids_annot)))
]
string_hs_df_mapped_annot

In [None]:
# Remove self-loops
string_hs_df_mapped_annot = string_hs_df_mapped_annot[
    (string_hs_df_mapped_annot["uniprot_id_A"] != string_hs_df_mapped_annot["uniprot_id_B"])
]  # Drop self-interactions (A = B)

string_hs_df_mapped_annot.dropna(inplace=True)  # Drop unpaired ids

print(string_hs_df_mapped_annot.shape)

In [None]:
# Drop duplicated ppi pairs

# Step 1: Create a new DataFrame with sorted IDs to handle A-B and B-A equivalency
string_hs_df_mapped_annot["sorted_A"] = string_hs_df_mapped_annot[
    ["uniprot_id_A", "uniprot_id_B"]
].min(axis=1)
string_hs_df_mapped_annot["sorted_B"] = string_hs_df_mapped_annot[
    ["uniprot_id_A", "uniprot_id_B"]
].max(axis=1)

# Step 2: Check for duplicates on the sorted pairs (A-B and B-A are now identical)
string_hs_df_mapped_annot = string_hs_df_mapped_annot[
    ~string_hs_df_mapped_annot.duplicated(subset=["sorted_A", "sorted_B"])
]

# Step 3: Drop the temporary sorted columns if you no longer need them
string_hs_df_mapped_annot.drop(columns=["sorted_A", "sorted_B"], inplace=True)

# Display the DataFrame without the duplicates
print(string_hs_df_mapped_annot.shape)
string_hs_df_mapped_annot

In [None]:
# Save the final PPI data
string_hs_df_mapped_annot.reset_index(drop=True, inplace=True)
string_hs_df_mapped_annot.to_csv("./data/ppi/ppi_string_hs.tsv", sep="\t", index=False)
string_hs_df_mapped_annot.shape

In [None]:
# PPI counts (unique)

# Extract interactions
ppi_pairs = list(
    tuple(sorted(x))
    for x in string_hs_df_mapped_annot[["uniprot_id_A", "uniprot_id_B"]].to_numpy()
)

# Extract unique interactions
ppi_unique_pairs = list(
    set(
        tuple(sorted(x))
        for x in string_hs_df_mapped_annot[["uniprot_id_A", "uniprot_id_B"]].to_numpy()
    )
)

# Extract unique uniprot ids in interactions
ppi_unique_ids = pd.concat(
    [
        string_hs_df_mapped_annot["uniprot_id_A"],
        string_hs_df_mapped_annot["uniprot_id_B"],
    ]
).unique()

print(f"# of unique UniProt IDs: {ppi_unique_ids.shape[0]}")
print(f"# of PPIs: {len(ppi_pairs)}")
print(f"# of unique PPIs: {len(ppi_unique_pairs)}")