In [133]:
import pandas as pd
import sys
import pickle
import networkx as nx
import itertools as it
import os

## Initial data preprocessing (rename columns, keep human–human interactions only)

In [492]:
# Read the truncated BioGRID export and replace its header with eight short,
# positional column names (the file must therefore have exactly eight columns).
df = pd.read_csv("./BIOGRID-ALL-4.4.222.trunc.tsv", sep="\t")
df.columns = [
    "Symbol A", "Symbol B",
    "Synonym A", "Synonym B",
    "Uniprot A", "Uniprot B",
    "Organism A", "Organism B",
]

df = (
    df
    # Keep interactions where both partners are human; the organism columns
    # carry no further information afterwards, so they are removed.
    .loc[lambda d: d["Organism A"].eq("Homo sapiens") & d["Organism B"].eq("Homo sapiens")]
    .reset_index(drop=True)
    .drop(columns=["Organism A", "Organism B"])
    # Prefix both accession columns with "uniprot:". A missing accession given
    # as "-" becomes "uniprot:-", which later cells use as the "unmapped" marker.
    .assign(**{
        "Uniprot A": lambda d: "uniprot:" + d["Uniprot A"].astype(str),
        "Uniprot B": lambda d: "uniprot:" + d["Uniprot B"].astype(str),
    })
    .drop_duplicates()
)

### Handle unmapped UniProt IDs
* Interactions whose UniProt IDs cannot be recovered are dropped.
* Recovery currently requires a manual round trip through the UniProt ID-mapping web tool; this could be automated with the UniProt API.

In [228]:
# Collect every gene symbol whose UniProt accession is missing ("uniprot:-") on
# either side of an interaction and export them, one symbol per line, so they
# can be mapped with UniProt's web tool.
# The symbols are de-duplicated and sorted so the exported file is identical on
# every run; iterating a set of strings has no stable order between sessions.
pd.concat([
    df.loc[df["Uniprot A"] == "uniprot:-", "Symbol A"],
    df.loc[df["Uniprot B"] == "uniprot:-", "Symbol B"],
]).drop_duplicates().sort_values().to_frame().to_csv("./no-uniprot.csv", header=False, index=False)

In [493]:
# Load the table produced by UniProt's mapping tool for the exported symbols.
# Required columns in the file: ["From", "Entry", "Organism"]
up_map = (
    pd.read_csv("./no-uniprot_mapping.tsv", sep="\t")
    # Human entries only; keep just the queried symbol and the accession.
    .loc[lambda m: m["Organism"] == "Homo sapiens (Human)", ["From", "Entry"]]
    # One accession per symbol: the first hit in file order wins.
    .drop_duplicates(subset=["From"])
    # Same "uniprot:" prefix as the accession columns of the interaction table.
    .assign(Entry=lambda m: "uniprot:" + m["Entry"].astype(str))
    .rename(columns={"From": "Symbol", "Entry": "Uniprot"})
)

In [494]:
# Fill in missing UniProt IDs from `up_map`, first for partner A, then for B.
# For each side: pull out the rows marked "uniprot:-", look their symbol up in
# `up_map`, and append them back after the rows that were already mapped.

# --- Partner A ---
missing_a = df["Uniprot A"] == "uniprot:-"
noup_a = df[missing_a].copy().reset_index(drop=True)
# The left merge keeps one output row per input row (symbols in `up_map` are
# unique), so its 0..n-1 index lines up with the freshly reset index of noup_a.
noup_a["Uniprot A"] = noup_a.merge(
    up_map, how="left", left_on=["Symbol A"], right_on="Symbol"
)["Uniprot"]
df = pd.concat([df[~missing_a], noup_a])

# --- Partner B ---
missing_b = df["Uniprot B"] == "uniprot:-"
noup_b = df[missing_b].copy().reset_index(drop=True)
noup_b["Uniprot B"] = noup_b.merge(
    up_map, how="left", left_on=["Symbol B"], right_on="Symbol"
)["Uniprot"]
df = pd.concat([df[~missing_b], noup_b])

# Symbols absent from `up_map` come out of the merge as NaN; drop those rows
# (note that this drops a row with NaN in any column, not only the ID columns).
df = df.dropna()
df

Unnamed: 0,Symbol A,Symbol B,Synonym A,Synonym B,Uniprot A,Uniprot B
0,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,ABP-280|ABP280A|ABPA|ABPL|FLN2|MFM5|MPD4,uniprot:P45985,uniprot:Q14315
1,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,CMD1AA,uniprot:Q86TC9,uniprot:P35609
2,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,FPTA|PGGT1A|PTAR2,uniprot:Q04771,uniprot:P49354
3,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,MYL|PP8675|RNF71|TRIM19,uniprot:P23769,uniprot:P29590
4,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,ADMIO|APRF|HIES,uniprot:P15927,uniprot:P40763
...,...,...,...,...,...,...
8693,NSUN5P1,NSUN5P2,NSUN5B|WBSCR20B,NOL1R2|NSUN5C|WBSCR20B|WBSCR20C,uniprot:Q3KNT7,uniprot:Q63ZY6
8694,DHRS4L2,DHRS4L1,SDR25C3,SDR25C4,uniprot:Q6PKH6,uniprot:P0CG22
8695,TRGV3,CNTNAP3B,TCRGV3|V1S3,-,uniprot:P03979,uniprot:Q96NU0
8698,NSUN5P1,HELZ,NSUN5B|WBSCR20B,DHRC|DRHC|HUMORF5,uniprot:Q3KNT7,uniprot:P42694


## Export edges

In [495]:
# Edge list: one row per interaction, endpoints identified by UniProt ID.
edges_df = df[["Uniprot A", "Uniprot B"]].copy()
edges_df.columns = ["source", "target"]

# Undirected simple graph built from the edge list.
G = nx.from_pandas_edgelist(edges_df, "source", "target", create_using=nx.Graph())

# Persist both representations as pickles in the working directory.
with open("BIOGRID_edges.pkl", "wb") as edges_file:
    pickle.dump(edges_df, edges_file)

with open("BIOGRID_graph.pkl", "wb") as graph_file:
    pickle.dump(G, graph_file)

## Synonyms for each listed entity

In [504]:
# Build a long (name, Id) table listing every known name of every protein that
# appears on either side of an interaction: its symbol plus all its synonyms.

# Stack the A-side and B-side columns under common names.
syn_df = pd.concat(
    [
        df[cols].rename(columns=dict(zip(cols, ["Symbol", "name", "Id"])))
        for cols in (
            ["Symbol A", "Synonym A", "Uniprot A"],
            ["Symbol B", "Synonym B", "Uniprot B"],
        )
    ],
    axis=0,
).drop_duplicates()

syn_df = (
    syn_df
    # "SYMBOL|syn1|syn2|..." -> ["SYMBOL", "syn1", "syn2", ...]
    .assign(name=lambda s: s["Symbol"].str.cat(s["name"], sep="|").str.split("|"))
    # One row per individual name.
    .explode("name")
    .reset_index(drop=True)
    # "-" is the placeholder for "no synonym"; it is not a real name.
    .loc[lambda s: s["name"] != "-"]
    .drop(columns="Symbol")
    .drop_duplicates()
    .reset_index(drop=True)
)
syn_df

Unnamed: 0,name,Id
0,MAP2K4,uniprot:P45985
1,JNKK,uniprot:P45985
2,JNKK1,uniprot:P45985
3,MAPKK4,uniprot:P45985
4,MEK4,uniprot:P45985
...,...,...
64979,IGLV4-60,uniprot:A0A075B6I1
64980,IGLV460,uniprot:A0A075B6I1
64981,V5-4,uniprot:A0A075B6I1
64982,TMEM78,uniprot:Q5T7P6


In [505]:
# Persist the (name, Id) synonym table as a pickle in the working directory.
with open("BIOGRID_nodes.pkl", mode="wb") as nodes_file:
    pickle.dump(syn_df, nodes_file)