In [3]:
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

In [4]:
# Root directory of the dataset; every input and output path in this notebook is built by
# appending to this string, so the trailing "/" matters for the paths that do not add their own.
dti_folder = "/biodata/nyanovsky/datasets/dti/"

In [5]:
# Run parameters, entered interactively.
# The two file names are given WITHOUT the ".csv" extension (it is appended when the files are read).
# Each prompt is labelled so the three values cannot be mixed up, and surrounding whitespace is
# stripped because the values are spliced directly into file paths.
ChCh_fname = input("ChCh edge-list file name (without .csv): ").strip()
GG_fname = input("GG edge-list file name (without .csv): ").strip()
dataset_version = input("Dataset version (v1, v2, ...): ").strip()

In [6]:
# Echo the three run parameters, one per line, so the saved notebook records what was entered.
print(ChCh_fname, GG_fname, dataset_version, sep="\n")

circ_tani_20k
pfam_w_proy_net
v2


In [7]:
# Load the three raw edge lists: chemical-gene, chemical-chemical and gene-gene.
# The ChCh and GG files are selected by the names entered above.
chg_path = dti_folder+"ChG/ChG_df.csv"
ChG_df = pd.read_csv(chg_path)

chch_path = dti_folder+f"ChCh/{ChCh_fname}.csv"
ChCh_df = pd.read_csv(chch_path)

# NOTE(review): the leading "/" yields ".../dti//GG/..." because dti_folder already ends in "/".
gg_path = dti_folder+f"/GG/{GG_fname}.csv"
GG_df = pd.read_csv(gg_path)

In [8]:
# Preview the first rows of the chemical-gene edge list.
ChG_df.head()

Unnamed: 0,DrugID (PubChem CID),GeneID (NCBI)
0,155831,5243
1,24762158,213
2,24762158,506
3,24762158,563
4,24762158,13884


In [9]:
# Preview the first rows of the chemical-chemical edge list.
ChCh_df.head()

Unnamed: 0,src,trgt
0,5353272,11976122
1,5313476,448653
2,6436079,6436082
3,11430588,5287617
4,5311211,23654841


In [10]:
# Some ChCh files carry an extra "Similarity" column; remove it when present so that only the
# two endpoint columns remain (errors="ignore" makes this a no-op when the column is absent).
ChCh_df = ChCh_df.drop(columns="Similarity", errors="ignore")

In [11]:
# Preview the first rows of the gene-gene edge list.
GG_df.head()

Unnamed: 0,src,trgt
0,5243,4363
1,5243,10257
2,5243,1080
3,5243,64240
4,5243,64241


In [12]:
# Turn every ID into a string and prefix it with its node type ("C" = chemical, "G" = gene), so
# chemical and gene IDs stay distinct once all edges are combined into a single graph.
# The 2-element list is broadcast across the columns, so each frame must have exactly two
# columns at this point.
# NOTE: not idempotent - re-running this cell prefixes the IDs a second time.
ChG_df = ["C","G"]+ChG_df.astype(str)
ChCh_df = ["C","C"]+ChCh_df.astype(str)
GG_df = ["G","G"]+GG_df.astype(str)


def _drop_undirected_duplicates(df):
    """Keep the first row of every unordered endpoint pair.

    (a, b) and (b, a) are treated as the same edge. Surviving rows keep their original
    orientation and index labels.

    The endpoints are sorted within each row with numpy and de-duplicated as two ordinary
    string columns. This avoids a helper column of Python lists: lists are unhashable, so
    ``drop_duplicates`` on such a column can raise ``TypeError``, and building it requires a
    slow row-wise ``apply``.
    """
    endpoints = np.sort(df.to_numpy(dtype=object), axis=1)
    is_repeat = pd.DataFrame(endpoints).duplicated().to_numpy()
    # .copy() so later column assignments on the result do not trigger SettingWithCopyWarning.
    return df.loc[~is_repeat].copy()


# ChG edges join two different node types, so only exact duplicate rows need removing.
ChG_df.drop_duplicates(inplace=True)
ChCh_df = _drop_undirected_duplicates(ChCh_df)
GG_df = _drop_undirected_duplicates(GG_df)

In [9]:
# Write the three edge lists, as they stand at this point in the notebook, to their
# "*_final_df.csv" files. to_csv's default index=True applies here, so the row index is
# written as an unnamed first column.
final_outputs = {
    "ChG/ChG_final_df.csv": ChG_df,
    "ChCh/ChCh_final_df.csv": ChCh_df,
    "GG/GG_final_df.csv": GG_df,
}
for relative_path, frame in final_outputs.items():
    frame.to_csv(dti_folder+relative_path)

In [13]:
# Give all three edge lists the same endpoint column names so they can be stacked later.
ChG_df = ChG_df.rename(columns={"DrugID (PubChem CID)":"src_id", "GeneID (NCBI)": "trgt_id" })

# ChCh and GG are renamed positionally, so each must have exactly two columns here.
for frame in (ChCh_df, GG_df):
    frame.columns = ["src_id", "trgt_id"]

In [14]:
# Tag chemical-gene edges with their edge type and the node type of each endpoint.
ChG_df = ChG_df.assign(edge_type="chg", src_node_type="chem", trgt_node_type="gene")

In [15]:
# Tag chemical-chemical edges with their edge type and the node type of each endpoint.
ChCh_df = ChCh_df.assign(edge_type="chch", src_node_type="chem", trgt_node_type="chem")

In [16]:
# Tag gene-gene edges with their edge type and the node type of each endpoint.
GG_df = GG_df.assign(edge_type="gg", src_node_type="gene", trgt_node_type="gene")

In [17]:
# Stack the three edge lists into one table. Row labels are not reset (no ignore_index), so
# the same label can occur once per source frame.
edge_df = pd.concat([ChG_df, ChCh_df, GG_df])

In [18]:
# Display the combined edge table.
edge_df

Unnamed: 0,src_id,trgt_id,edge_type,src_node_type,trgt_node_type
0,C155831,G5243,chg,chem,gene
1,C24762158,G213,chg,chem,gene
2,C24762158,G506,chg,chem,gene
3,C24762158,G563,chg,chem,gene
4,C24762158,G13884,chg,chem,gene
...,...,...,...,...,...
21825,G10239,G1175,gg,gene,gene
21826,G11154,G1175,gg,gene,gene
21827,G128240,G374887,gg,gene,gene
21828,G128240,G80153,gg,gene,gene


In [19]:
import networkx as nx

In [20]:
# Build one graph over all edges; each edge keeps its "edge_type" as an edge attribute.
# No create_using is passed, so networkx's default graph class (undirected nx.Graph) is used.
G = nx.from_pandas_edgelist(edge_df, source="src_id", target="trgt_id",edge_attr="edge_type")

In [21]:
def get_edgetype_subgraph(G: nx.Graph, edge_type: str) -> nx.Graph:
    """Return a copy of the subgraph of ``G`` formed by the edges of one type.

    An edge is selected when its ``"edge_type"`` attribute equals ``edge_type``. Every edge
    of ``G`` must carry that attribute; a missing one raises ``KeyError``. The result is a
    copy, not a view onto ``G``.
    """
    matching_edges = []
    for src, dst, attrs in G.edges.data():
        if attrs["edge_type"] == edge_type:
            matching_edges.append((src, dst))
    return G.edge_subgraph(matching_edges).copy()

In [22]:
# One subgraph per edge type, in the order chemical-gene, chemical-chemical, gene-gene.
ChG_subgraph, ChCh_subgraph, GG_subgraph = (
    get_edgetype_subgraph(G, edge_type) for edge_type in ("chg", "chch", "gg")
)

In [23]:
def node_deg_info(node):
    """Return the degrees of ``node`` in the three edge-type subgraphs.

    Reads the notebook-level ``ChG_subgraph``, ``ChCh_subgraph`` and ``GG_subgraph``.
    A degree is 0 when the node does not occur in the corresponding subgraph.

    Returns a ``pd.Series`` ordered as ``[ChG degree, ChCh degree, GG degree]``.
    """
    degrees = {"chg": 0, "chch": 0, "gg": 0}

    if node in ChG_subgraph.nodes():
        degrees["chg"] = ChG_subgraph.degree(node)

    # The GG subgraph is consulted only when the node is not a "C"-prefixed member of the
    # ChCh subgraph.
    is_chch_member = node.startswith("C") and node in ChCh_subgraph.nodes()
    if is_chch_member:
        degrees["chch"] = ChCh_subgraph.degree(node)
    elif node in GG_subgraph.nodes():
        degrees["gg"] = GG_subgraph.degree(node)

    return pd.Series([degrees["chg"], degrees["chch"], degrees["gg"]])

In [24]:
# One row per graph node: its degree in each edge-type subgraph, the sum of those degrees,
# and the node type read off the ID prefix ("C" -> chem, anything else -> gene).
degree_columns = ["ChG_deg","ChCh_deg", "GG_deg"]
node_df = pd.DataFrame({"node_id": list(G.nodes())})
node_df[degree_columns] = node_df["node_id"].apply(node_deg_info)
node_df["total_deg"] = node_df[degree_columns].sum(axis=1)
node_df["node_type"] = [
    "chem" if node_id.startswith("C") else "gene" for node_id in node_df["node_id"]
]

In [25]:
# Map every node ID to its row position in node_df, then store that position for both
# endpoints of every edge. An ID missing from node_df raises KeyError.
node_idxs = dict(zip(node_df["node_id"], range(len(node_df))))
edge_df["src_node_index"] = [node_idxs[node_id] for node_id in edge_df["src_id"]]
edge_df["trgt_node_index"] = [node_idxs[node_id] for node_id in edge_df["trgt_id"]]

In [26]:
# Display the edge table again, now with the src/trgt node index columns.
edge_df

Unnamed: 0,src_id,trgt_id,edge_type,src_node_type,trgt_node_type,src_node_index,trgt_node_index
0,C155831,G5243,chg,chem,gene,0,1
1,C24762158,G213,chg,chem,gene,2,3
2,C24762158,G506,chg,chem,gene,2,4
3,C24762158,G563,chg,chem,gene,2,5
4,C24762158,G13884,chg,chem,gene,2,6
...,...,...,...,...,...,...,...
21825,G10239,G1175,gg,gene,gene,11643,11645
21826,G11154,G1175,gg,gene,gene,11644,11645
21827,G128240,G374887,gg,gene,gene,11651,11652
21828,G128240,G80153,gg,gene,gene,11651,11653


In [27]:
from itertools import permutations
def make_undirected(df,column_pairs):
    """Return ``df`` followed by a mirrored copy with paired columns swapped.

    For every pair in ``column_pairs`` the two column names are exchanged in the mirrored
    copy, so each original row also appears with those columns' values swapped. Columns not
    named in any pair are repeated unchanged. Index labels are kept as they are, so every
    label occurs twice in the result. ``df`` itself is not modified.
    """
    swapped_names = {}
    for column_pair in column_pairs:
        # For a 2-tuple (a, b) this yields (a, b) and (b, a), i.e. a -> b and b -> a.
        for old_name, new_name in permutations(column_pair, 2):
            swapped_names[old_name] = new_name

    mirrored = df.rename(columns=swapped_names)
    return pd.concat([df, mirrored])

In [28]:
# Column pairs whose names are swapped to produce the reverse direction of every edge.
swap_pairs = [
    ("src_id", "trgt_id"),
    ("src_node_type", "trgt_node_type"),
    ("src_node_index", "trgt_node_index"),
]
# NOTE: re-running this cell doubles the rows of edge_df again.
edge_df = make_undirected(edge_df, swap_pairs)

In [31]:
# Write the processed edge and node tables for this dataset version, without the row index.
# dataset_version is typed in by the user, so its directory may not exist yet; create it
# first instead of failing at the very last step of the notebook.
processed_dir = dti_folder+f"/processed/{dataset_version}"
os.makedirs(processed_dir, exist_ok=True)

edge_df.to_csv(processed_dir+"/edge_df.csv", index=False)
node_df.to_csv(processed_dir+"/node_df.csv", index=False)

In [32]:
import pickle

In [33]:
# Serialize the graph G next to the processed tables.
# The file is opened in a with-block so the handle is flushed and closed even if pickling
# fails; the original passed a bare open() and never closed it.
with open(dti_folder+f'/processed/{dataset_version}/dti_graph.pickle', 'wb') as graph_file:
    pickle.dump(G, graph_file)