# Download and Generation of Protein-Protein Interaction (PPI) Networks

This notebook demonstrates the process of downloading, processing, and filtering PPI networks from the STRING and BioGRID databases.

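The following packages are required for this notebook: `pandas`, `numpy`, `requests`, `networkx`, `mygene`, `matplotlib`, and `seaborn`, together with the standard-library modules `zipfile`, `gzip`, and `shutil` and the local helper module `BioNetTools`.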


In [None]:
# Import necessary packages
import pandas as pd
import numpy as np
import requests
import zipfile
import gzip
import shutil
import mygene
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

import BioNetTools as tools  # Custom module for handling network operations


## BioGRID PPI Network

We download the complete BioGRID PPI network, version 4.4.241, from:
[BioGRID Download](https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.241/BIOGRID-ALL-4.4.241.mitab.zip)

For further documentation, visit [BioGRID](https://thebiogrid.org/).


In [None]:
# Download the BioGRID MITAB release, keep human physical associations only,
# reduce them to (Gene_Name_A, Gene_Name_B, Score) and save the result as CSV.

# Pin the release once so the download URL and the extracted file name cannot drift apart
BIOGRID_VERSION = "4.4.241"
biogrid_url = (
    "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/"
    f"BIOGRID-{BIOGRID_VERSION}/BIOGRID-ALL-{BIOGRID_VERSION}.mitab.zip"
)

# Define paths for temporary files
biogrid_tmp_dir = "./tmp_biogrid"
biogrid_zip_path = f"{biogrid_tmp_dir}/biogrid.zip"
biogrid_data_dir = f"{biogrid_tmp_dir}/biogrid_data"

# Download, extract and read the BioGRID data; the temporary directory is removed
# even when one of the steps fails, so a broken run leaves nothing behind.
try:
    print("Downloading BioGRID dataset...")
    tools.download_file(biogrid_url, biogrid_zip_path)

    print("Uncompressing BioGRID dataset...")
    tools.unzip_file(biogrid_zip_path, biogrid_data_dir)

    print("Reading BioGRID dataset...")
    biogrid_df = pd.read_csv(
        f"{biogrid_data_dir}/BIOGRID-ALL-{BIOGRID_VERSION}.mitab.txt",
        sep="\t",
        low_memory=False,
    )
finally:
    # ignore_errors: the directory may not exist yet if the download itself failed
    shutil.rmtree(biogrid_tmp_dir, ignore_errors=True)

# Filter for human physical interaction associations
print("Filtering human-specific interactions...")
is_human_pair = (
    (biogrid_df["Taxid Interactor A"] == "taxid:9606")
    & (biogrid_df["Taxid Interactor B"] == "taxid:9606")
)
# na=False: rows without an interaction type are treated as non-matching instead of
# producing an NA mask, which boolean indexing would reject.
is_physical = biogrid_df["Interaction Types"].str.contains(
    "physical association", case=False, na=False
)

# Select relevant columns; .copy() detaches the result from the raw frame so the
# column assignments below do not trigger SettingWithCopyWarning.
biogrid_df = biogrid_df.loc[
    is_human_pair & is_physical,
    ['Alt IDs Interactor A', 'Alt IDs Interactor B', 'Aliases Interactor A', 'Aliases Interactor B', 'Confidence Values'],
].copy()

# Extract HGNC gene symbols and drop rows where both partners resolve to the same symbol
biogrid_df['Gene_Name_A'] = biogrid_df['Alt IDs Interactor A'].apply(tools.extract_hgnc_biogrid)
biogrid_df['Gene_Name_B'] = biogrid_df['Alt IDs Interactor B'].apply(tools.extract_hgnc_biogrid)
biogrid_df = biogrid_df.query("Gene_Name_A != Gene_Name_B").copy()

# Extract BioGRID confidence scores
biogrid_df['Score'] = biogrid_df['Confidence Values'].apply(tools.extract_score_biogrid)
biogrid_df = biogrid_df[['Gene_Name_A', 'Gene_Name_B', 'Score']]

# Save the processed PPI network
print("Saving filtered BioGRID dataset...")
biogrid_df.to_csv("../../sup_data/alternative_ppi/ppi_biogrid.csv", index=False)


## Convert BioGRID DataFrame to NetworkX Graph


In [None]:
# NetworkX reads the edge attribute from a column, so expose the score as 'weight'
biogrid_df = biogrid_df.rename(columns={"Score": "weight"})

# Build the undirected BioGRID graph, one edge per gene pair carrying its weight
G = nx.from_pandas_edgelist(
    biogrid_df,
    source='Gene_Name_A',
    target='Gene_Name_B',
    edge_attr='weight',
    create_using=nx.Graph(),
)

# Report the size of the resulting network
print(f"Nodes: {len(G.nodes)}")
print(f"Edges: {len(G.edges)}")


## Degree Distribution of BioGRID Network


In [None]:
# Rank-degree plot of the BioGRID graph on log-log axes.
degree_sequence = [d for _, d in G.degree()]
sorted_degrees = sorted(degree_sequence, reverse=True)

# Ranks start at 1: the x-axis is logarithmic and a rank of 0 cannot be placed on
# a log scale, which would drop the highest-degree node from the figure.
node_ranks = np.arange(1, len(sorted_degrees) + 1)

plt.figure(figsize=(7, 5))
sns.scatterplot(
    x=node_ranks,
    y=sorted_degrees,
    edgecolor=None,
    legend=False
)
plt.xlabel("Node Rank")
plt.ylabel("Degree")
plt.title("Degree Distribution in BioGRID PPI Network")
plt.xscale("log")
plt.yscale("log")
plt.show()


## Score Distribution of BioGRID Network


In [None]:
# Histogram (with KDE overlay) of the log10-transformed BioGRID edge weights;
# counts are shown on a logarithmic y-axis.
log_scores = np.log10(biogrid_df['weight'])

fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(log_scores, bins=50, kde=True, ax=ax)
ax.set_xlabel(r"$log_{10}$(Score)")
ax.set_ylabel("Frequency")
ax.set_title("Score Distribution in BioGRID PPI Network")
ax.set_yscale("log")
plt.show()


## STRING PPI Network

We download the complete STRING PPI network, version 12.0, from:
[STRING Download](https://stringdb-downloads.org/download/protein.physical.links.v12.0/9606.protein.physical.links.v12.0.txt.gz)

For further documentation, visit [STRING Database](https://string-db.org/).


In [None]:
# Download the human STRING physical-links table, strip the taxon prefix from the
# protein identifiers, attach HGNC symbols and save the result as CSV.

# Define the URL for the STRING PPI dataset
string_url = "https://stringdb-downloads.org/download/protein.physical.links.v12.0/9606.protein.physical.links.v12.0.txt.gz"

# Define paths for temporary files
string_tmp_dir = "./tmp_string"
string_gz_path = './tmp_string/string.gz'

# Download, extract and read STRING data; the temporary directory is removed even
# when one of the steps fails, so a broken run leaves nothing behind.
try:
    print("Downloading STRING dataset...")
    tools.download_file(string_url, string_gz_path)
    tools.ungz_file(string_gz_path, "./tmp_string/string_data")

    print("Reading STRING dataset...")
    # Raw string avoids the invalid "\s" escape warning. r"\s+" is handled by the
    # default C parser, so the slower Python engine is not needed.
    string_df = pd.read_csv("./tmp_string/string_data/string", sep=r"\s+")
finally:
    # ignore_errors: the directory may not exist yet if the download itself failed
    shutil.rmtree(string_tmp_dir, ignore_errors=True)

# Remove prefixes from protein names
print("Processing protein names...")
string_df["protein1"] = string_df["protein1"].str.replace("9606.", "", regex=False)
string_df["protein2"] = string_df["protein2"].str.replace("9606.", "", regex=False)

# Convert Ensembl IDs to HGNC symbols
ens_to_hgnc = tools.ensembl_to_hgnc(string_df)
string_df["HGNC1"] = string_df["protein1"].map(ens_to_hgnc)
string_df["HGNC2"] = string_df["protein2"].map(ens_to_hgnc)

# Remove entries with unknown gene mappings
# NOTE(review): identifiers absent from ens_to_hgnc map to NaN and pass this filter;
# presumably the helper returns 'Unknown' for every unmapped ID - confirm.
string_df = string_df.query("HGNC1 != 'Unknown' and HGNC2 != 'Unknown'")

# Save the processed PPI network
print("Saving filtered STRING dataset...")
string_df.to_csv("../../sup_data/alternative_ppi/ppi_string.csv", index=False)


## Convert STRING DataFrame to NetworkX Graph


In [None]:
# NetworkX reads the edge attribute from a column, so expose the combined score as 'weight'
string_df = string_df.rename(columns={"combined_score": "weight"})

# Build the undirected STRING graph, one edge per gene pair carrying its weight
G_string = nx.from_pandas_edgelist(
    string_df,
    source='HGNC1',
    target='HGNC2',
    edge_attr='weight',
    create_using=nx.Graph(),
)

# Report the size of the resulting network
print(f"Nodes: {len(G_string.nodes)}")
print(f"Edges: {len(G_string.edges)}")


## Degree Distribution of STRING Network


In [None]:
# Rank-degree plot of the STRING graph on log-log axes.
degree_sequence = [d for _, d in G_string.degree()]
sorted_degrees = sorted(degree_sequence, reverse=True)

# Ranks start at 1: the x-axis is logarithmic and a rank of 0 cannot be placed on
# a log scale, which would drop the highest-degree node from the figure.
node_ranks = np.arange(1, len(sorted_degrees) + 1)

plt.figure(figsize=(7, 5))
sns.scatterplot(
    x=node_ranks,
    y=sorted_degrees,
    edgecolor=None,
    legend=False
)
plt.xlabel("Node Rank")
plt.ylabel("Degree")
plt.title("Degree Distribution in STRING PPI Network")
plt.xscale("log")
plt.yscale("log")
plt.show()


## Score Distribution of STRING Network


In [None]:
# Histogram (with KDE overlay) of the STRING edge weights; counts are shown on a
# logarithmic y-axis.
string_scores = string_df['weight']

fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(string_scores, bins=50, kde=True, ax=ax)
ax.set_xlabel("Score")
ax.set_ylabel("Frequency")
ax.set_title("Score Distribution in STRING PPI Network")
ax.set_yscale("log")
plt.show()
