In [None]:
import pandas as pd
import sys
import pickle
import networkx as nx
import itertools as it
import os

## Initial data preprocessing (rename columns, keep human-human interactions only)

In [None]:
# Load the truncated BioGRID dump and give its eight columns stable names.
biogrid_columns = [
    "Symbol A", "Symbol B",
    "Synonym A", "Synonym B",
    "Uniprot A", "Uniprot B",
    "Organism A", "Organism B",
]
df = pd.read_csv("./BIOGRID-ALL-4.4.222.trunc.tsv", sep="\t")
df.columns = biogrid_columns

# Keep only interactions where both partners are human; the organism columns
# carry no further information after that and are dropped.
both_human = df["Organism A"].eq("Homo sapiens") & df["Organism B"].eq("Homo sapiens")
df = df.loc[both_human].drop(columns=["Organism A", "Organism B"]).reset_index(drop=True)

# Namespace the accessions so they read as "uniprot:<accession>".
for uniprot_col in ("Uniprot A", "Uniprot B"):
    df[uniprot_col] = "uniprot:" + df[uniprot_col].astype(str)

df = df.drop_duplicates()

### Handle unmapped UniProt IDs
* Drop the ones that couldn't be recovered
* Recovery relies on UniProt's ID mapping web tool (a manual step); it could be automated with the UniProt REST API.

In [None]:
# Collect the symbols of every interactor whose UniProt ID is the "-" placeholder
# (either side of the interaction) and export them, one per line, so they can be
# submitted to UniProt's ID mapping web tool.
missing_symbols = pd.concat([
    df.loc[df["Uniprot A"] == "uniprot:-", "Symbol A"],
    df.loc[df["Uniprot B"] == "uniprot:-", "Symbol B"],
])

# De-duplicate and sort so the exported file is identical on every run
# (iterating a Python set of strings yields a run-dependent order).
missing_symbols = missing_symbols.drop_duplicates().sort_values()

missing_symbols.to_frame().to_csv("./no-uniprot.csv", header=False, index=False)

In [None]:
# Load the table downloaded from UniProt's mapping tool for the exported symbols.
# Required columns: ["From", "Entry", "Organism"]
up_map = pd.read_csv("./no-uniprot_mapping.tsv", sep="\t")

# Keep human entries only, then a single entry (the first listed) per symbol.
is_human_entry = up_map["Organism"] == "Homo sapiens (Human)"
up_map = (
    up_map.loc[is_human_entry, ["From", "Entry"]]
    .drop_duplicates(subset=["From"])
    .rename(columns={"From": "Symbol", "Entry": "Uniprot"})
)

# Same "uniprot:<accession>" form as the IDs in the interaction table.
up_map["Uniprot"] = "uniprot:" + up_map["Uniprot"].astype(str)

In [None]:
def _recover_uniprot(frame, side, mapping):
    """Replace placeholder UniProt IDs on one side of the interaction table.

    Parameters
    ----------
    frame : pandas.DataFrame
        Interaction table with "Symbol <side>" and "Uniprot <side>" columns.
    side : str
        Which interactor to process: "A" or "B".
    mapping : pandas.DataFrame
        Lookup table with columns "Symbol" and "Uniprot"; "Symbol" values
        must be unique.

    Returns
    -------
    pandas.DataFrame
        Rows that already had an ID, followed by the previously unmapped rows
        with their ID looked up by symbol. Symbols absent from ``mapping`` get
        NaN. Rows keep their original index labels.
    """
    symbol_col = "Symbol " + side
    uniprot_col = "Uniprot " + side

    is_unmapped = frame[uniprot_col] == "uniprot:-"
    recovered = frame[is_unmapped].copy()
    # Label-aligned lookup: unlike assigning the column of a merge result, this
    # does not rely on the merge preserving row count and order, and it leaves
    # the index untouched so the concatenation below has no duplicate labels.
    recovered[uniprot_col] = recovered[symbol_col].map(mapping.set_index("Symbol")["Uniprot"])

    return pd.concat([frame[~is_unmapped], recovered])


# Recover unmapped UniProt IDs, first for interactor A and then for interactor B.
df = _recover_uniprot(df, "A", up_map)
df = _recover_uniprot(df, "B", up_map)

# Symbols that could not be mapped are NaN at this point; drop those rows
# (dropna also removes rows with a missing value in any other column).
df = df.dropna()
df

## Export edges

In [None]:
# Edge list: one row per interaction, endpoints identified by UniProt ID.
edge_columns = {"Uniprot A": "source", "Uniprot B": "target"}
edges_df = df[list(edge_columns)].rename(columns=edge_columns)

# Undirected graph built from the edge list.
G = nx.from_pandas_edgelist(edges_df, source="source", target="target", create_using=nx.Graph())

# Persist both representations as pickles.
for payload, out_path in ((edges_df, "BIOGRID_edges.pkl"), (G, "BIOGRID_graph.pkl")):
    with open(out_path, "wb") as fh:
        pickle.dump(payload, fh)

## Synonyms for each listed entity

In [None]:
# Build a long (Id, name) table listing the symbol and every synonym of each
# protein mentioned on either side of an interaction.

# Stack the A-side and B-side columns under common names. ignore_index gives the
# stacked frame a unique index, so the explode below never sees duplicate labels.
syn_df = pd.concat(
    [
        df[["Symbol " + side, "Synonym " + side, "Uniprot " + side]].rename(
            columns={
                "Symbol " + side: "Symbol",
                "Synonym " + side: "name",
                "Uniprot " + side: "Id",
            }
        )
        for side in ("A", "B")
    ],
    axis=0,
    ignore_index=True,
).drop_duplicates()

# Prepend the symbol to its pipe-separated synonyms, then split into a list so
# each name can get its own row. Bracket access is used for the "name" column
# to avoid relying on attribute-style column lookup.
syn_df["name"] = syn_df["Symbol"].str.cat(syn_df["name"], sep="|").str.split("|")
syn_df = syn_df.explode("name")

# "-" is the placeholder for "no synonym"; it only shows up as an individual
# item after the split, so it is filtered here rather than before.
syn_df = syn_df[syn_df["name"] != "-"]
syn_df = syn_df.drop(columns="Symbol").drop_duplicates().reset_index(drop=True)
syn_df

In [None]:
# Persist the (Id, name) synonym table as a pickle.
with open("BIOGRID_nodes.pkl", "wb") as nodes_file:
    pickle.dump(syn_df, nodes_file)