In [5]:
# Note: it may be necessary to use pandas 1.1.5 as pickling is incompatible between this version and later versions.

In [2]:
import pandas as pd
import sys
import pickle
import networkx as nx
import itertools as it
import math
import numpy as np
import os

from Visualization import layeredConcentric
import seaborn as sns

## Initial data preprocessing (renaming, only humans)

In [3]:
# Read the truncated BIOGRID export and replace its headers with short names.
df = pd.read_csv("./BIOGRID-ALL-4.4.222.trunc.tsv", sep="\t")
df.columns = (
    ["Symbol A", "Symbol B", "Synonym A", "Synonym B"]
    + ["System", "SysType", "Author", "Publication"]
    + ["Uniprot A", "Uniprot B", "Organism A", "Organism B"]
)

# Keep only rows where both interactors are human; afterwards the organism
# columns carry no information and are removed.
both_human = df["Organism A"].eq("Homo sapiens") & df["Organism B"].eq("Homo sapiens")
df = df.loc[both_human].drop(columns=["Organism A", "Organism B"]).reset_index(drop=True)

# Prefix both accession columns with the "uniprot:" namespace
# (a missing accession "-" therefore becomes "uniprot:-").
for id_col in ("Uniprot A", "Uniprot B"):
    df[id_col] = "uniprot:" + df[id_col].astype(str)

In [4]:
# Sanity check. Expected output: (1082654, 10) for all rows,
# then (846133, 10) after collapsing repeated (Uniprot A, Uniprot B) pairs.
unique_pairs = df.drop_duplicates(subset=["Uniprot A", "Uniprot B"])
for frame in (df, unique_pairs):
    print(frame.shape)

(1082654, 10)
(846133, 10)


### Handle unmapped UniProt IDs
* Drop the ones that couldn't be recovered
* Requires using the UniProt mapping web tool. An API-based implementation was attempted, but it took too long to run and its output structure was too complicated to parse.

In [40]:
# Get unmapped uniprot IDs and export them to map them using UniProt's web tool
pd.DataFrame(list(set(pd.concat([
    df[df["Uniprot A"] == "uniprot:-"]["Symbol A"],
    df[df["Uniprot B"] == "uniprot:-"]["Symbol B"]
])))).to_csv("./no-uniprot.csv", header=False, index=False)

#### Stop here. Use UniProt's mapping tool at https://www.uniprot.org/id-mapping on the `no-uniprot.csv` file. Then, retrieve and name the output `no-uniprot_mapping.tsv` and ensure the file has the "Organism", "From", and "Entry" fields.

In [60]:
# Load the table exported from UniProt's ID-mapping tool.
# Required columns: ["From", "Entry", "Organism"]
up_map = pd.read_csv("./no-uniprot_mapping.tsv", sep="\t")

# Human entries only, reduced to the two columns needed for the lookup.
human_only = up_map.loc[up_map["Organism"] == "Homo sapiens (Human)", ["From", "Entry"]]

# One accession per symbol (first occurrence wins), namespaced like df's IDs.
up_map = (
    human_only
    .drop_duplicates(subset=["From"])
    .assign(Entry=lambda t: "uniprot:" + t["Entry"].astype(str))
    .rename(columns={"From": "Symbol", "Entry": "Uniprot"})
)

In [61]:
# Recover unmapped UniProt IDs. Separated by A and B
noup_a = df[df["Uniprot A"] == "uniprot:-"].copy().reset_index(drop=True)
noup_a["Uniprot A"] = noup_a.merge(up_map, left_on=["Symbol A"], right_on="Symbol", how="left")["Uniprot"]
df = df[df["Uniprot A"] != "uniprot:-"]
df = pd.concat([df, noup_a])

noup_b = df[df["Uniprot B"] == "uniprot:-"].copy().reset_index(drop=True)
noup_b["Uniprot B"] = noup_b.merge(up_map, left_on=["Symbol B"], right_on="Symbol", how="left")["Uniprot"]
df = df[df["Uniprot B"] != "uniprot:-"]
df = pd.concat([df, noup_b])

df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,Symbol A,Symbol B,Synonym A,Synonym B,System,SysType,Author,Publication,Uniprot A,Uniprot B
0,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,ABP-280|ABP280A|ABPA|ABPL|FLN2|MFM5|MPD4,Two-hybrid,physical,Marti A (1997),PUBMED:9006895,uniprot:P45985,uniprot:Q14315
1,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,CMD1AA,Two-hybrid,physical,Bang ML (2001),PUBMED:11309420,uniprot:Q86TC9,uniprot:P35609
2,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,FPTA|PGGT1A|PTAR2,Two-hybrid,physical,Wang T (1996),PUBMED:8599089,uniprot:Q04771,uniprot:P49354
3,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,MYL|PP8675|RNF71|TRIM19,Two-hybrid,physical,Tsuzuki S (2000),PUBMED:10938104,uniprot:P23769,uniprot:P29590
4,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,ADMIO|APRF|HIES,Two-hybrid,physical,Kim J (2000),PUBMED:10875894,uniprot:P15927,uniprot:P40763
...,...,...,...,...,...,...,...,...,...,...
1063774,NSUN5P1,NSUN5P2,NSUN5B|WBSCR20B,NOL1R2|NSUN5C|WBSCR20B|WBSCR20C,Affinity Capture-MS,physical,Huttlin EL (2021),PUBMED:33961781,uniprot:Q3KNT7,uniprot:Q63ZY6
1063775,DHRS4L2,DHRS4L1,SDR25C3,SDR25C4,Affinity Capture-MS,physical,Huttlin EL (2021),PUBMED:33961781,uniprot:Q6PKH6,uniprot:P0CG22
1063776,TRGV3,CNTNAP3B,TCRGV3|V1S3,-,Affinity Capture-MS,physical,Huttlin EL (2021),PUBMED:33961781,uniprot:P03979,uniprot:Q96NU0
1063777,NSUN5P1,HELZ,NSUN5B|WBSCR20B,DHRC|DRHC|HUMORF5,Affinity Capture-MS,physical,Huttlin EL (2021),PUBMED:33961781,uniprot:Q3KNT7,uniprot:P42694


### BIOGRID does not care about the order of A or B. So, switch things so A is always alphabetized earlier than B
I checked this by pulling out Uniprot A and Uniprot B, then ordering all A to be earlier in alphabet than all B. Then, I dropped duplicates. If BIOGRID did this beforehand, then dropping duplicates of original BIOGRID and dropping duplicates of the ordered should result in the same number of rows. This was not true.

Approach: Create a copy of df in which all A and B columns are swapped (`flipped_df`). Then build a boolean vector marking the rows whose Uniprot A does *not* already sort before Uniprot B. Every marked row (the 1-valued entries, i.e. `np.where(vec == 1)`) is replaced by its swapped counterpart, so that A and B trade places in exactly those rows.

In [5]:
# Mirror image of df: every A-side field (Symbol, Synonym, Uniprot) holds the
# B-side value and vice versa; all other columns are unchanged.
# Swapping the column labels and restoring the original column order does this
# in one pass instead of taking a full copy of df for every swapped column.
a_to_b = {"Symbol A": "Symbol B", "Synonym A": "Synonym B", "Uniprot A": "Uniprot B"}
swapped_labels = {**a_to_b, **{b_col: a_col for a_col, b_col in a_to_b.items()}}

# rename() returns a new frame, so flipped_df is independent of df.
flipped_df = df.rename(columns=swapped_labels)[list(df.columns)]

In [6]:
# Which rows in original were not already sorted: a row needs flipping exactly
# when its "Uniprot A" sorts after its "Uniprot B". Comparing the two columns
# directly is vectorised, avoiding a transpose of the whole frame and a
# per-row sort.
# Assumes both ID columns contain only strings (no NaN) at this point.
unsorted_vec = (df["Uniprot A"] > df["Uniprot B"]).to_numpy()

# Replace all unsorted rows with the flipped one.
# Relies on df and flipped_df having the same rows in the same order.
df.iloc[unsorted_vec] = flipped_df.iloc[unsorted_vec]

# With every pair oriented the same way, records that differ only in A/B
# order are now exact duplicates on these keys and collapse to one row.
df = df.drop_duplicates(subset=["Uniprot A", "Uniprot B",
                                "Publication", "System", "Author"]).reset_index(drop=True)

# Export evidence

In [22]:
# One row per interaction record, keyed by the ordered UniProt pair.
ev_df = df[["Uniprot A", "Uniprot B"]].rename(
    columns={"Uniprot A": "source", "Uniprot B": "target"}
)

# A single record is encoded as Author%%%System%%%Systype%%%Publication.
ev_df["evidence"] = (
    df["Author"]
    + "%%%" + df["System"]
    + "%%%" + df["SysType"].str.capitalize()
    + "%%%" + df["Publication"]
)

# All records of the same pair are joined with "&&&" into one string.
ev_df = ev_df.groupby(["source", "target"]).agg({"evidence": lambda records: "&&&".join(records)})
ev_df = ev_df.reset_index()

print(ev_df.shape)

# Create the output directory if needed so the export does not fail on a
# fresh checkout after all the processing above has already run.
os.makedirs("./PMC_OA_pickles", exist_ok=True)
with open("./PMC_OA_pickles/BIOGRID_evidence.pkl", "wb") as p:
    pickle.dump(ev_df, p)

(791186, 3)


## Export edges

In [25]:
# One edge per ordered UniProt pair; "thickness" is the number of rows in df
# that share that pair.
edges_df = (
    df.groupby(["Uniprot A", "Uniprot B"])
    .size()
    .reset_index(name="thickness")
    .rename(columns={"Uniprot A": "source", "Uniprot B": "target"})
)

# Undirected graph; edge_attr=True copies every remaining column
# (here only "thickness") onto the edges.
G = nx.from_pandas_edgelist(
    edges_df,
    source="source",
    target="target",
    edge_attr=True,
    create_using=nx.Graph(),
)

print(edges_df.shape)

# Create the output directory if needed so the export does not fail on a
# fresh checkout.
os.makedirs("./PMC_OA_pickles", exist_ok=True)

with open("./PMC_OA_pickles/BIOGRID_edges.pkl", "wb") as p:
    pickle.dump(edges_df, p)

with open("./PMC_OA_pickles/BIOGRID_graph.pkl", "wb") as p:
    pickle.dump(G, p)

(791186, 3)


## Synonyms for each listed entity

In [27]:
# Collect every name (primary symbol plus synonyms) of every entity that
# appears on either side of an interaction, as (name, Id) rows.
syn_df = pd.concat(
    [
        df[[f"Symbol {side}", f"Synonym {side}", f"Uniprot {side}"]].rename(
            columns={
                f"Symbol {side}": "Symbol",
                f"Synonym {side}": "name",
                f"Uniprot {side}": "Id",
            }
        )
        for side in ("A", "B")
    ],
    axis=0,
).drop_duplicates()

# Prepend the primary symbol to the "|"-separated synonym list and split it.
syn_df["name"] = syn_df["Symbol"].str.cat(syn_df["name"], sep="|").str.split("|")

# One row per individual name. The index is reset first because the
# concatenation above leaves repeated index labels.
syn_df = syn_df.reset_index(drop=True).explode("name").reset_index(drop=True)

# "-" is the placeholder for "no synonym" (or no symbol); it is not a real name.
syn_df = syn_df[syn_df["name"] != "-"]
syn_df = syn_df.drop(columns="Symbol").drop_duplicates().reset_index(drop=True)

# Create the output directory if needed so the export does not fail on a
# fresh checkout.
os.makedirs("./PMC_OA_pickles", exist_ok=True)
with open("./PMC_OA_pickles/BIOGRID_nodes.pkl", "wb") as p:
    pickle.dump(syn_df, p)