In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
# Base directory for every file this notebook reads or writes.
# Keep the trailing "/": paths are built by plain string concatenation (dti_folder + "...").
dti_folder = "/biodata/nyanovsky/datasets/dti/"

In [3]:
# Load the three raw edge lists, one CSV per relation.
ChG_df = pd.read_csv(dti_folder + "ChG/ChG_df.csv")
ChCh_df = pd.read_csv(dti_folder + "ChCh/tani_net_05.csv")
# NOTE: the leading "/" here doubles the separator after dti_folder's trailing "/";
# POSIX treats the doubled slash as a single one, so the same file is opened.
GG_df = pd.read_csv(dti_folder + "/GG/pfam_proy_net.csv")

In [4]:
# Preview the first five rows of the Chem-Gene table.
ChG_df.head(n=5)

Unnamed: 0,DrugID (PubChem CID),GeneID (NCBI)
0,155831,5243
1,24762158,213
2,24762158,506
3,24762158,563
4,24762158,13884


In [5]:
# Preview the first five rows of the Chem-Chem table.
ChCh_df.head(n=5)

Unnamed: 0,CID1,CID2,Similarity
0,155831,148124,0.844262
1,155831,36314,0.822581
2,155831,9854073,0.609929
3,155831,6918473,0.578616
4,54692492,54680692,0.550725


In [6]:
# Only the pair of IDs is needed from here on; discard the "Similarity" column.
ChCh_df = ChCh_df.drop(columns="Similarity")

In [7]:
# Preview the first five rows of the Gene-Gene table.
GG_df.head(n=5)

Unnamed: 0,src,trgt
0,1394,1395
1,1394,266977
2,1394,2692
3,1394,2696
4,1394,2740


In [8]:
# Cast every ID to str and prepend a one-letter tag per column ("C" or "G").
# list + DataFrame broadcasts column-wise: the i-th prefix is prepended to the i-th column,
# so each frame must have exactly two columns at this point.
ChG_df = ["C","G"]+ChG_df.astype(str)
ChCh_df = ["C","C"]+ChCh_df.astype(str)
GG_df = ["G","G"]+GG_df.astype(str)

# For the same-type relations, (a, b) and (b, a) are treated as the same edge:
# build an order-independent key per row and de-duplicate on it.
# The key must be a tuple, not a list: lists are unhashable, and
# drop_duplicates raises "TypeError: unhashable type: 'list'" on a column of lists.
ChCh_df["sorted_edge"] = ChCh_df.apply(lambda row: tuple(sorted(row)), axis=1)
GG_df["sorted_edge"] = GG_df.apply(lambda row: tuple(sorted(row)), axis=1)

# Chem-Gene rows are de-duplicated as-is (the two columns hold different node types).
ChG_df = ChG_df.drop_duplicates()
# Keep the first occurrence of each unordered pair, then remove the helper key column.
ChCh_df = ChCh_df.drop_duplicates(subset="sorted_edge").drop(columns="sorted_edge")
GG_df = GG_df.drop_duplicates(subset="sorted_edge").drop(columns="sorted_edge")

In [9]:
# Persist each relation's table next to its raw input.
# index=True is the to_csv default, spelled out here: the row index is written as the first column.
ChG_df.to_csv(dti_folder + "ChG/ChG_final_df.csv", index=True)
ChCh_df.to_csv(dti_folder + "ChCh/ChCh_final_df.csv", index=True)
GG_df.to_csv(dti_folder + "GG/GG_final_df.csv", index=True)

In [10]:
# Give all three tables the same endpoint column names (src_id / trgt_id)
# so they can be stacked into one edge table.
ChG_df = ChG_df.rename(
    columns={"DrugID (PubChem CID)": "src_id", "GeneID (NCBI)": "trgt_id"}
)
ChCh_df = ChCh_df.rename(columns={"CID1": "src_id", "CID2": "trgt_id"})
GG_df = GG_df.rename(columns={"src": "src_id", "trgt": "trgt_id"})

In [11]:
# Tag every Chem-Gene row with its relation and the node type of each endpoint.
ChG_df = ChG_df.assign(
    edge_type="Chem-Gene",
    src_node_type="Chem",
    trgt_node_type="Gene",
)

In [13]:
# Tag every Chem-Chem row with its relation and the node type of each endpoint.
ChCh_df = ChCh_df.assign(
    edge_type="Chem-Chem",
    src_node_type="Chem",
    trgt_node_type="Chem",
)



In [14]:
# Tag every Gene-Gene row with its relation and the node type of each endpoint.
GG_df = GG_df.assign(
    edge_type="Gene-Gene",
    src_node_type="Gene",
    trgt_node_type="Gene",
)

In [15]:
# Show the Gene-Gene table after renaming and annotation.
GG_df

Unnamed: 0,src_id,trgt_id,edge_type,src_node_type,trgt_node_type
0,G1394,G1395,Gene-Gene,Gene,Gene
1,G1394,G266977,Gene-Gene,Gene,Gene
2,G1394,G2692,Gene-Gene,Gene,Gene
3,G1394,G2696,Gene-Gene,Gene,Gene
4,G1394,G2740,Gene-Gene,Gene,Gene
...,...,...,...,...,...
12644,G5649,G7143,Gene-Gene,Gene,Gene
12645,G10417,G84870,Gene-Gene,Gene,Gene
12646,G270,G271,Gene-Gene,Gene,Gene
12647,G270,G272,Gene-Gene,Gene,Gene


In [16]:
# Stack the three relation tables row-wise into a single edge table.
# ignore_index=False is the concat default, spelled out here: each frame keeps
# its own row labels, so labels repeat across the three parts of edge_df.
edge_df = pd.concat([ChG_df, ChCh_df, GG_df], axis=0, ignore_index=False)

In [17]:
import networkx as nx

In [18]:
# Build the graph from the combined edge table: src_id/trgt_id become the nodes,
# and each edge keeps its "edge_type" as an attribute.
G = nx.from_pandas_edgelist(
    edge_df,
    source="src_id",
    target="trgt_id",
    edge_attr="edge_type",
)

In [19]:
def get_edgetype_subgraph(G: nx.Graph, edge_type: str) -> nx.Graph:
    """Return a copy of the subgraph formed by the edges of one type.

    Args:
        G: Graph whose edges each carry an "edge_type" attribute; an edge
            without that attribute raises KeyError.
        edge_type: Value compared against each edge's "edge_type".

    Returns:
        A copy of the edge-induced subgraph containing only the matching edges.
    """
    matching_edges = []
    for src, dst, attrs in G.edges(data=True):
        if attrs["edge_type"] == edge_type:
            matching_edges.append((src, dst))
    # .copy() turns the subgraph view into a standalone graph.
    return G.edge_subgraph(matching_edges).copy()

In [20]:
# One standalone subgraph per relation, extracted in the order the labels are listed.
ChG_subgraph, ChCh_subgraph, GG_subgraph = (
    get_edgetype_subgraph(G, relation)
    for relation in ("Chem-Gene", "Chem-Chem", "Gene-Gene")
)

In [21]:
def node_deg_info(node):
    """Return a node's degree within each relation-specific subgraph.

    Reads the module-level ChG_subgraph, ChCh_subgraph and GG_subgraph.

    Args:
        node: Node identifier (a string; its "C" prefix is checked).

    Returns:
        pd.Series of three values in the order [ChG, ChCh, GG]; a degree is 0
        when the node is absent from that subgraph.
    """
    chem_gene = ChG_subgraph.degree(node) if node in ChG_subgraph.nodes() else 0

    chem_chem = 0
    gene_gene = 0
    # The same-type degrees are mutually exclusive: the Gene-Gene lookup only
    # happens when the Chem-Chem branch did not match.
    if node.startswith("C") and node in ChCh_subgraph.nodes():
        chem_chem = ChCh_subgraph.degree(node)
    elif node in GG_subgraph.nodes():
        gene_gene = GG_subgraph.degree(node)

    return pd.Series([chem_gene, chem_chem, gene_gene])

In [22]:
# One row per graph node, with its degree in each relation and the overall total.
node_df = pd.DataFrame({"node_id": list(G.nodes())})
# node_deg_info returns a 3-element Series per node; apply expands these into three columns.
node_df[["ChG_deg", "ChCh_deg", "GG_deg"]] = node_df["node_id"].apply(node_deg_info)
node_df["total_deg"] = node_df.loc[:, ["ChG_deg", "ChCh_deg", "GG_deg"]].sum(axis=1)

In [23]:
# Write the combined edge table and the per-node degree table to the base directory.
# index=True is the to_csv default, spelled out here: the row index is written as the first column.
edge_df.to_csv(dti_folder + "edge_df.csv", index=True)
node_df.to_csv(dti_folder + "node_df.csv", index=True)

In [24]:
import pickle

In [25]:
# Serialize the graph to disk. The with-block closes the file handle even if
# pickling fails; the bare open(...) used before was never closed.
with open(dti_folder+'dti_graph.pickle', 'wb') as graph_file:
    pickle.dump(G, graph_file)