In [1]:
import os
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)
print(os.popen("pwd").read())
import argparse
parser = argparse.ArgumentParser()
args, _ = parser.parse_known_args()
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from collections import defaultdict
import gseapy as gp


args.workdir = "/data/project/minwoo"
args.datadir = os.path.join(args.workdir, 'Data')

/data/project/minwoo/feature_selection/phase_5_selection_method_comparison_split_corrected



# Heterogeneous Network

In [2]:
# === Import PS profile === #
# drug - genes : netgp scores
args.ps_profile_fpath = os.path.join(args.datadir, 'drug_target_profile_original.tsv')
ps_profile = pd.read_csv(args.ps_profile_fpath, sep='\t')
drug_list = ps_profile['drug_name'].to_list()
ps_profile = ps_profile.set_index('drug_name')

# === Import PPI Template === #
args.ppi_template_fpath = os.path.join(args.datadir, '9606.protein.links.symbols.v11.5.txt')
ppi = pd.read_csv(args.ppi_template_fpath, sep='\t')
ppi_genes = list(set(ppi['source'].to_list() + ppi['target'].to_list()))
print('#PPI Genes:', len(ppi_genes))

# === Import Drug Target Info === #
# drugbank + GDSC target information
args.drug_target_fpath = os.path.join(args.datadir, 'dti_info_final_common_drugs_only.tsv')
drug_target_info = pd.read_csv(args.drug_target_fpath, sep='\t')
drug_target_info = drug_target_info[['drug_name', 'gene_name']]
drug_target_info = drug_target_info[drug_target_info['drug_name'].isin(ps_profile.index)]
drug_target_info = drug_target_info[drug_target_info['gene_name'].isin(ppi_genes)]

#PPI Genes: 19127


In [3]:
n_indirect_targets = 20
args.resultdir = os.path.join(args.datadir, f'drug_networks_{n_indirect_targets}_indirect_targets')
createFolder(args.resultdir)

### 1) Direct Target - Indirect Target

In [4]:
# ================================================ #
# ====== 1) Direct Target - Indirect Target ====== #
# ================================================ #
# === For each drug: get Top 100 ps genes (indirect targets) : Save edges === #
lvl1_edges = defaultdict(list)
for drug in drug_list:
    # === Direct Targets === #
    direct_targets = drug_target_info.query('drug_name == @drug')['gene_name'].to_list()

    # === Indirect Targets === #
    drug_ps_profile = ps_profile.loc[drug]
    indirect_targets = drug_ps_profile.sort_values(ascending=False).head(n_indirect_targets).index.to_list()

    for direct_target in direct_targets:
        for indirect_target in indirect_targets:
            lvl1_edges['direct'].append(direct_target)
            lvl1_edges['indirect'].append(indirect_target)

lvl1_edges_df = pd.DataFrame(lvl1_edges).drop_duplicates()
print(f"All direct/indirect gene pairs: {lvl1_edges_df.shape[0]:,}")

# === leave only the edges that are in the PPI network === #
ppi_edges = pd.concat([
    pd.DataFrame(ppi[['source', 'target']].values), 
    pd.DataFrame(ppi[['target', 'source']].values)
]).drop_duplicates()
ppi_edges.columns = ['direct', 'indirect']

lvl1_edges_df = pd.merge(lvl1_edges_df, ppi_edges, on=['direct', 'indirect'], how='inner')
print(f"PPI-included direct/indirect gene pairs: {lvl1_edges_df.shape[0]:,}")

lvl1_edges_df = pd.concat([
    pd.DataFrame(lvl1_edges_df[['direct', 'indirect']].values),
    pd.DataFrame(lvl1_edges_df[['indirect', 'direct']].values)
]).drop_duplicates()
print(f"Undirected PPI-included direct/indirect gene pairs: {lvl1_edges_df.shape[0]:,}")
lvl1_edges_df.columns = ['source', 'target']
lvl1_edges_df

All direct/indirect gene pairs: 14,966
PPI-included direct/indirect gene pairs: 11,597
Undirected PPI-included direct/indirect gene pairs: 20,380


Unnamed: 0,source,target
0,ABL1,KIT
1,ABL1,NTRK1
2,ABL1,PDGFRB
3,ABL1,CSF1R
4,ABL1,PDGFRA
...,...,...
11592,CHMP4A,PIKFYVE
11593,APP,PIKFYVE
11594,TLR4,PIKFYVE
11595,LRRK2,PIKFYVE


### 2) Indirect Target - Pathway

In [5]:
# ================================================= #
# ====== 2) Direct/Indirect Target - Pathway ====== #
# ================================================= #
# gp.get_library_name()
# ['GO_Biological_Process_2023', 'GO_Cellular_Component_2023' 'GO_Molecular_Function_2023', 'KEGG_2021_Human', 'Reactome_2022']
kegg_gmt = gp.parser.get_library('KEGG_2021_Human', 
								  organism='Human', 
								  min_size=3, 
								  max_size=2000, 
								  gene_list=None)
print('KEGG #Terms:\t', len(kegg_gmt.keys()))
kegg_genes = [gene for genes in kegg_gmt.values() for gene in genes]
print("KEGG #Genes:\t", len(set(kegg_genes)))

# === Filter KEGG genes by Lvl1 edges genes === #
total_genes = list(pd.concat([lvl1_edges_df['source'], lvl1_edges_df['target']]).unique())

kegg_df = defaultdict(list)
for term, genes in kegg_gmt.items():
    for gene in genes:
        if gene not in total_genes:
            continue
        kegg_df['term'].append(term)
        kegg_df['gene'].append(gene)
kegg_df = pd.DataFrame(kegg_df)
kegg_df.columns = ['source', 'target']
kegg_df

KEGG #Terms:	 320
KEGG #Genes:	 8078


Unnamed: 0,source,target
0,ABC transporters,ABCB6
1,AGE-RAGE signaling pathway in diabetic complic...,TGFB1
2,AGE-RAGE signaling pathway in diabetic complic...,MAPK13
3,AGE-RAGE signaling pathway in diabetic complic...,MAPK14
4,AGE-RAGE signaling pathway in diabetic complic...,BAX
...,...,...
5594,p53 signaling pathway,MDM4
5595,p53 signaling pathway,MDM2
5596,p53 signaling pathway,CDKN1A
5597,p53 signaling pathway,CCNB2


### 3) Merge Graph

In [6]:
# ============================ #
# ====== 3) Merge Graph ====== #
# ============================ #
trial1_nwk = pd.concat([lvl1_edges_df, kegg_df], axis=0)
print(lvl1_edges_df.shape, kegg_df.shape)
trial1_nwk

(20380, 2) (5599, 2)


Unnamed: 0,source,target
0,ABL1,KIT
1,ABL1,NTRK1
2,ABL1,PDGFRB
3,ABL1,CSF1R
4,ABL1,PDGFRA
...,...,...
5594,p53 signaling pathway,MDM4
5595,p53 signaling pathway,MDM2
5596,p53 signaling pathway,CDKN1A
5597,p53 signaling pathway,CCNB2


In [7]:
# ====== Construct Adjacency Matrix ====== #
nodes = pd.Series(pd.concat([trial1_nwk['source'], trial1_nwk['target']]).unique())

# create empty adjacency matrix filled with zeros with the number of nodes as dimension
adjacency_matrix = pd.DataFrame(0, index=nodes, columns=nodes)
# add self-loops
for node in nodes:
    adjacency_matrix.loc[node, node] = 1


adj_fpath = os.path.join(args.datadir, f'template_adjacency_matrix_{n_indirect_targets}_indirect_targets.tsv')
adjacency_matrix.to_csv(adj_fpath, sep='\t', index=True, header=True)
adjacency_matrix

Unnamed: 0,ABL1,PDGFRB,KIT,PDGFRA,CSF1R,NTRK1,RET,DDR1,BCR,PPAT,...,Viral myocarditis,Viral protein interaction with cytokine and cytokine receptor,Wnt signaling pathway,Yersinia infection,beta-Alanine metabolism,cAMP signaling pathway,cGMP-PKG signaling pathway,mRNA surveillance pathway,mTOR signaling pathway,p53 signaling pathway
ABL1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PDGFRB,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KIT,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PDGFRA,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CSF1R,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cAMP signaling pathway,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
cGMP-PKG signaling pathway,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
mRNA surveillance pathway,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
mTOR signaling pathway,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
# ========================================== #
# ====== Create Drug-Specific Network ====== #
# ========================================== #
for drug in drug_list:
    # =========================== #
    # ====== For Each Drug ====== #
    # =========================== #
    drug_lvl1_edges = defaultdict(list)

    # === Create empty adjacency matrix filled with zeros with the number of nodes as dimension === #
    adjacency_matrix = pd.DataFrame(0, index=nodes, columns=nodes)
    # add self-loops
    for node in nodes:
        adjacency_matrix.loc[node, node] = 1

    # ====== Level 1 Edges ====== #
    # === 1. Define Direct Targets === #
    direct_targets = drug_target_info.query('drug_name == @drug')['gene_name'].to_list()

    # === 2. Define Indirect Targets === #
    drug_ps_profile = ps_profile.loc[drug]
    indirect_targets = drug_ps_profile.sort_values(ascending=False).iloc[:n_indirect_targets].index.to_list()

    for direct_target in direct_targets:
        for indirect_target in indirect_targets:
            drug_lvl1_edges['direct'].append(direct_target)
            drug_lvl1_edges['indirect'].append(indirect_target)

    drug_lvl1_edges_df = pd.DataFrame(drug_lvl1_edges).drop_duplicates()
    print(f"All direct/indirect gene pairs: {drug_lvl1_edges_df.shape[0]:,}")

    # === 3. Define connections: Leave only the edges that are in the PPI network === #
    ppi_edges = pd.concat([
        pd.DataFrame(ppi[['source', 'target']].values), 
        pd.DataFrame(ppi[['target', 'source']].values)
    ]).drop_duplicates()

    ppi_edges.columns = ['direct', 'indirect']
    drug_lvl1_edges_df = pd.merge(drug_lvl1_edges_df, ppi_edges, on=['direct', 'indirect'], how='inner')
    print(f"PPI-included direct/indirect gene pairs: {drug_lvl1_edges_df.shape[0]:,}")

    drug_lvl1_edges_df = pd.concat([
        pd.DataFrame(drug_lvl1_edges_df[['direct', 'indirect']].values),
        pd.DataFrame(drug_lvl1_edges_df[['indirect', 'direct']].values)
    ]).drop_duplicates()
    print(f"Undirected PPI-included direct/indirect gene pairs: {drug_lvl1_edges_df.shape[0]:,}")
    drug_lvl1_edges_df.columns = ['source', 'target']

    # ====== Level 2 Edges ====== #
    # === 1. Pathways of the Indirect Targets === #
    drug_lvl2_edges_df = kegg_df    # .query('target in @indirect_targets')
    drug_lvl2_edges_df = pd.concat([
        pd.DataFrame(drug_lvl2_edges_df[['source', 'target']].values),
        pd.DataFrame(drug_lvl2_edges_df[['target', 'source']].values)
    ]).drop_duplicates()
    drug_lvl2_edges_df.columns = ['source', 'target']

    # === 2. Save Edges: Indirect Targets - Pathways === #
    drug_specific_edges = pd.concat([drug_lvl1_edges_df, drug_lvl2_edges_df], axis=0)

    # ====== Fill in the adjacency matrix with the edges from the drug-specific edges (undirected) ====== #
    for node1, node2 in drug_specific_edges.values:
        adjacency_matrix.loc[node1, node2] = 1

    drug_nwk_fpath = os.path.join(args.resultdir, f'{drug}_adjacency_matrix_{n_indirect_targets}_indirect_targets.tsv')
    adjacency_matrix.to_csv(drug_nwk_fpath, sep='\t', index=True, header=True)

All direct/indirect gene pairs: 450
PPI-included direct/indirect gene pairs: 418
Undirected PPI-included direct/indirect gene pairs: 766
All direct/indirect gene pairs: 1,150
PPI-included direct/indirect gene pairs: 875
Undirected PPI-included direct/indirect gene pairs: 1,432
All direct/indirect gene pairs: 100
PPI-included direct/indirect gene pairs: 97
Undirected PPI-included direct/indirect gene pairs: 192
All direct/indirect gene pairs: 550
PPI-included direct/indirect gene pairs: 463
Undirected PPI-included direct/indirect gene pairs: 840
All direct/indirect gene pairs: 750
PPI-included direct/indirect gene pairs: 701
Undirected PPI-included direct/indirect gene pairs: 1,204
All direct/indirect gene pairs: 500
PPI-included direct/indirect gene pairs: 480
Undirected PPI-included direct/indirect gene pairs: 870
All direct/indirect gene pairs: 450
PPI-included direct/indirect gene pairs: 430
Undirected PPI-included direct/indirect gene pairs: 790
All direct/indirect gene pairs: 300


In [9]:
drug_specific_edges

Unnamed: 0,source,target
0,AURKA,AURKB
1,AURKA,CDK1
2,AURKA,TP53
3,AURKA,CCNB1
4,AURKA,CCNA2
...,...,...
5594,MDM4,p53 signaling pathway
5595,MDM2,p53 signaling pathway
5596,CDKN1A,p53 signaling pathway
5597,CCNB2,p53 signaling pathway


In [10]:
adjacency_matrix

Unnamed: 0,ABL1,PDGFRB,KIT,PDGFRA,CSF1R,NTRK1,RET,DDR1,BCR,PPAT,...,Viral myocarditis,Viral protein interaction with cytokine and cytokine receptor,Wnt signaling pathway,Yersinia infection,beta-Alanine metabolism,cAMP signaling pathway,cGMP-PKG signaling pathway,mRNA surveillance pathway,mTOR signaling pathway,p53 signaling pathway
ABL1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
PDGFRB,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KIT,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PDGFRA,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CSF1R,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cAMP signaling pathway,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
cGMP-PKG signaling pathway,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
mRNA surveillance pathway,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
mTOR signaling pathway,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


---
---

# Heterogeneous Network (Indirect - Direct) - No Pathway

#### Smaller Size

In [38]:
# === Import PS profile === #
args.ps_profile_fpath = os.path.join(args.datadir, 'drug_target_profile_original.tsv')
ps_profile = pd.read_csv(args.ps_profile_fpath, sep='\t')
drug_list = ps_profile['drug_name'].to_list()
ps_profile = ps_profile.set_index('drug_name')

# === Import PPI Template === #
args.ppi_template_fpath = os.path.join(args.datadir, '9606.protein.links.symbols.v11.5.txt')
ppi = pd.read_csv(args.ppi_template_fpath, sep='\t')
ppi_genes = list(set(ppi['source'].to_list() + ppi['target'].to_list()))
print('#PPI Genes:', len(ppi_genes))

# === Import Drug Target Info === #
args.drug_target_fpath = os.path.join(args.datadir, 'dti_info_final_common_drugs_only.tsv')
drug_target_info = pd.read_csv(args.drug_target_fpath, sep='\t')
drug_target_info = drug_target_info[['drug_name', 'gene_name']]
drug_target_info = drug_target_info[drug_target_info['drug_name'].isin(ps_profile.index)]
drug_target_info = drug_target_info[drug_target_info['gene_name'].isin(ppi_genes)]

#PPI Genes: 19127


In [39]:
n_indirect_targets = 20
args.resultdir = os.path.join(args.datadir, f'drug_networks_{n_indirect_targets}_indirect_targets_no_pathway')
createFolder(args.resultdir)

### 1) Direct Target - Indirect Target

In [40]:
# ================================================ #
# ====== 1) Direct Target - Indirect Target ====== #
# ================================================ #
# === For each drug: get Top 100 ps genes (indirect targets) : Save edges === #
lvl1_edges = defaultdict(list)
for drug in drug_list:
    # === Direct Targets === #
    direct_targets = drug_target_info.query('drug_name == @drug')['gene_name'].to_list()

    # === Indirect Targets === #
    drug_ps_profile = ps_profile.loc[drug]
    indirect_targets = drug_ps_profile.sort_values(ascending=False).head(n_indirect_targets).index.to_list()

    for direct_target in direct_targets:
        for indirect_target in indirect_targets:
            lvl1_edges['direct'].append(direct_target)
            lvl1_edges['indirect'].append(indirect_target)

lvl1_edges_df = pd.DataFrame(lvl1_edges).drop_duplicates()
print(f"All direct/indirect gene pairs: {lvl1_edges_df.shape[0]:,}")

# === leave only the edges that are in the PPI network === #
ppi_edges = pd.concat([
    pd.DataFrame(ppi[['source', 'target']].values), 
    pd.DataFrame(ppi[['target', 'source']].values)
]).drop_duplicates()
ppi_edges.columns = ['direct', 'indirect']

lvl1_edges_df = pd.merge(lvl1_edges_df, ppi_edges, on=['direct', 'indirect'], how='inner')
print(f"PPI-included direct/indirect gene pairs: {lvl1_edges_df.shape[0]:,}")

lvl1_edges_df = pd.concat([
    pd.DataFrame(lvl1_edges_df[['direct', 'indirect']].values),
    pd.DataFrame(lvl1_edges_df[['indirect', 'direct']].values)
]).drop_duplicates()
print(f"Undirected PPI-included direct/indirect gene pairs: {lvl1_edges_df.shape[0]:,}")
print(lvl1_edges_df)
lvl1_edges_df.columns = ['source', 'target']
lvl1_edges_df

All direct/indirect gene pairs: 6,552
PPI-included direct/indirect gene pairs: 5,250
Undirected PPI-included direct/indirect gene pairs: 8,640
          0        1
0      ABL1      KIT
1      ABL1    NTRK1
2      ABL1   PDGFRB
3      ABL1    CSF1R
4      ABL1   PDGFRA
...     ...      ...
5243   TP53  PIKFYVE
5244  BECN1  PIKFYVE
5245  HSPA8  PIKFYVE
5246  MAPK3  PIKFYVE
5247  MAPK1  PIKFYVE

[8640 rows x 2 columns]


Unnamed: 0,source,target
0,ABL1,KIT
1,ABL1,NTRK1
2,ABL1,PDGFRB
3,ABL1,CSF1R
4,ABL1,PDGFRA
...,...,...
5243,TP53,PIKFYVE
5244,BECN1,PIKFYVE
5245,HSPA8,PIKFYVE
5246,MAPK3,PIKFYVE


In [41]:
# ====== Construct Adjacency Matrix ====== #
nodes = pd.Series(pd.concat([lvl1_edges_df['source'], lvl1_edges_df['target']]).unique())

# create empty adjacency matrix filled with zeros with the number of nodes as dimension
adjacency_matrix = pd.DataFrame(0, index=nodes, columns=nodes)
# add self-loops
for node in nodes:
    adjacency_matrix.loc[node, node] = 1


adj_fpath = os.path.join(args.datadir, f'template_adjacency_matrix_{n_indirect_targets}_indirect_targets_no_pathway.tsv')
adjacency_matrix.to_csv(adj_fpath, sep='\t', index=True, header=True)
adjacency_matrix

Unnamed: 0,ABL1,PDGFRB,KIT,PDGFRA,CSF1R,NTRK1,RET,DDR1,BCR,PPAT,...,STX4,COPB2,CLTB,TRAPPC3,KDELR2,KIF20A,SUZ12,EED,PIK3C3,BECN1
ABL1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PDGFRB,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KIT,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PDGFRA,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CSF1R,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KIF20A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
SUZ12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
EED,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
PIK3C3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [42]:
# ========================================== #
# ====== Create Drug-Specific Network ====== #
# ========================================== #
for drug in drug_list:
    # =========================== #
    # ====== For Each Drug ====== #
    # =========================== #
    drug_lvl1_edges = defaultdict(list)

    # === Create empty adjacency matrix filled with zeros with the number of nodes as dimension === #
    adjacency_matrix = pd.DataFrame(0, index=nodes, columns=nodes)
    # add self-loops
    for node in nodes:
        adjacency_matrix.loc[node, node] = 1

    # ====== Level 1 Edges ====== #
    # === 1. Define Direct Targets === #
    direct_targets = drug_target_info.query('drug_name == @drug')['gene_name'].to_list()

    # === 2. Define Indirect Targets === #
    drug_ps_profile = ps_profile.loc[drug]
    indirect_targets = drug_ps_profile.sort_values(ascending=False).iloc[:n_indirect_targets].index.to_list()

    for direct_target in direct_targets:
        for indirect_target in indirect_targets:
            drug_lvl1_edges['direct'].append(direct_target)
            drug_lvl1_edges['indirect'].append(indirect_target)

    drug_lvl1_edges_df = pd.DataFrame(drug_lvl1_edges).drop_duplicates()
    print(f"All direct/indirect gene pairs: {drug_lvl1_edges_df.shape[0]:,}")

    # === 3. Define connections: Leave only the edges that are in the PPI network === #
    ppi_edges = pd.concat([
        pd.DataFrame(ppi[['source', 'target']].values), 
        pd.DataFrame(ppi[['target', 'source']].values)
    ]).drop_duplicates()

    ppi_edges.columns = ['direct', 'indirect']
    drug_lvl1_edges_df = pd.merge(drug_lvl1_edges_df, ppi_edges, on=['direct', 'indirect'], how='inner')
    print(f"PPI-included direct/indirect gene pairs: {drug_lvl1_edges_df.shape[0]:,}")

    drug_lvl1_edges_df = pd.concat([
        pd.DataFrame(drug_lvl1_edges_df[['direct', 'indirect']].values),
        pd.DataFrame(drug_lvl1_edges_df[['indirect', 'direct']].values)
    ]).drop_duplicates()
    print(f"Undirected PPI-included direct/indirect gene pairs: {drug_lvl1_edges_df.shape[0]:,}")
    drug_lvl1_edges_df.columns = ['source', 'target']

    # # ====== Level 2 Edges ====== #
    # # === 1. Pathways of the Indirect Targets === #
    # drug_lvl2_edges_df = kegg_df.query('target in @indirect_targets')
    # drug_lvl2_edges_df = pd.concat([
    #     pd.DataFrame(drug_lvl2_edges_df[['source', 'target']].values),
    #     pd.DataFrame(drug_lvl2_edges_df[['target', 'source']].values)
    # ]).drop_duplicates()
    # drug_lvl2_edges_df.columns = ['source', 'target']

    # === 2. Save Edges: Indirect Targets - Pathways === #
    drug_specific_edges = drug_lvl1_edges_df    # pd.concat([drug_lvl1_edges_df, drug_lvl2_edges_df], axis=0)

    # ====== Fill in the adjacency matrix with the edges from the drug-specific edges (undirected) ====== #
    for node1, node2 in drug_specific_edges.values:
        adjacency_matrix.loc[node1, node2] = 1

    drug_nwk_fpath = os.path.join(args.resultdir, f'{drug}_adjacency_matrix_{n_indirect_targets}_indirect_targets.tsv')
    adjacency_matrix.to_csv(drug_nwk_fpath, sep='\t', index=True, header=True)

All direct/indirect gene pairs: 180
PPI-included direct/indirect gene pairs: 167
Undirected PPI-included direct/indirect gene pairs: 264
All direct/indirect gene pairs: 460
PPI-included direct/indirect gene pairs: 330
Undirected PPI-included direct/indirect gene pairs: 364
All direct/indirect gene pairs: 40
PPI-included direct/indirect gene pairs: 38
Undirected PPI-included direct/indirect gene pairs: 74
All direct/indirect gene pairs: 220
PPI-included direct/indirect gene pairs: 182
Undirected PPI-included direct/indirect gene pairs: 278
All direct/indirect gene pairs: 300
PPI-included direct/indirect gene pairs: 273
Undirected PPI-included direct/indirect gene pairs: 348
All direct/indirect gene pairs: 200
PPI-included direct/indirect gene pairs: 190
Undirected PPI-included direct/indirect gene pairs: 290
All direct/indirect gene pairs: 180
PPI-included direct/indirect gene pairs: 169
Undirected PPI-included direct/indirect gene pairs: 268
All direct/indirect gene pairs: 120
PPI-incl

In [43]:
drug_specific_edges

Unnamed: 0,source,target
0,AURKA,AURKB
1,AURKA,CDK1
2,AURKA,TP53
3,AURKA,CCNB1
4,AURKA,CCNA2
...,...,...
33,PCNA,AURKB
34,UBE2C,AURKB
35,RAD51,AURKB
36,MCM3,AURKB


In [44]:
adjacency_matrix

Unnamed: 0,ABL1,PDGFRB,KIT,PDGFRA,CSF1R,NTRK1,RET,DDR1,BCR,PPAT,...,STX4,COPB2,CLTB,TRAPPC3,KDELR2,KIF20A,SUZ12,EED,PIK3C3,BECN1
ABL1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PDGFRB,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KIT,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PDGFRA,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CSF1R,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KIF20A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
SUZ12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
EED,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
PIK3C3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# PPI Smaller Size Network

In [173]:
import networkx as nx

# === Import PPI Template === #
args.ppi_template_fpath = os.path.join(args.datadir, '9606.protein.links.symbols.v11.5.txt')
ppi = pd.read_csv(args.ppi_template_fpath, sep='\t')
ppi_genes = list(set(ppi['source'].to_list() + ppi['target'].to_list()))
# === Import Drug Target Info === #
args.drug_target_fpath = os.path.join(args.datadir, 'dti_info_final_common_drugs_only.tsv')
drug_target_info = pd.read_csv(args.drug_target_fpath, sep='\t')
drug_target_info = drug_target_info[['drug_name', 'gene_name']]
drug_target_genes = drug_target_info['gene_name'].to_list()
assert drug_target_info['gene_name'].isin(ppi_genes).all()

In [174]:
args.resultdir = os.path.join(args.workdir, 'Data', f'drug_networks_target_ppi_990')
createFolder(args.resultdir)

In [175]:
# === filter ppi to have combined_score higher than 990 or include drug target genes === #
ppi_filt = ppi.query('combined_score >= 990 and (source in @drug_target_genes or target in @drug_target_genes)')
ppi_filt_genes = list(set(ppi_filt['source'].to_list() + ppi_filt['target'].to_list()))
print('#PPI Genes:', len(ppi_filt_genes))

G = nx.from_pandas_edgelist(ppi_filt, 'source', 'target')
print(len(G.nodes()), len(G.edges()))

# ====== Construct Adjacency Matrix ====== #
nodes = pd.Series(list(G.nodes))

# === Create empty adjacency matrix filled with zeros with the number of nodes as dimension === #
adjacency_matrix = pd.DataFrame(0, index=nodes, columns=nodes)
# add self-loops
for node in nodes:
    adjacency_matrix.loc[node, node] = 1

adj_fpath = os.path.join(args.datadir, f'template_adjacency_matrix_target_ppi_990.tsv')
adjacency_matrix.to_csv(adj_fpath, sep='\t', index=True, header=True)
adjacency_matrix

#PPI Genes: 1272
1272 2187


Unnamed: 0,FKBP4,AR,HSP90AA1,CFTR,HSPA8,DVL2,CSNK1E,PLK1,RPAP3,CRY1,...,TNFAIP3,CCL5,PHB2,LAMP2,PPP2CA,VDR,NR1H4,BAG1,CD19,GTF2I
FKBP4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AR,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HSP90AA1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CFTR,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HSPA8,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VDR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
NR1H4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
BAG1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
CD19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [176]:
# ========================================== #
# ====== Create Drug-Specific Network ====== #
# ========================================== #
for drug in drug_list:

    # =========================== #
    # ====== For Each Drug ====== #
    # =========================== #
    drug_edges = defaultdict(list)

    # === Create empty adjacency matrix filled with zeros with the number of nodes as dimension === #
    adjacency_matrix = pd.DataFrame(0, index=nodes, columns=nodes)
    # add self-loops
    for node in nodes:
        adjacency_matrix.loc[node, node] = 1

    # === 1. Define Direct Targets === #
    direct_targets = drug_target_info.query('drug_name == @drug')['gene_name'].to_list()

    H = G.subgraph(drug_target_genes)
    #print('#Nodes:', len(H.nodes()), '#Edges:', len(H.edges()))

    # === 2. Define connections === #
    drug_specific_edges = nx.to_pandas_edgelist(H)

    # === 3. Save Edges: PPI Network > 990 score & include target genes === #
    drug_specific_edges = pd.concat([
        pd.DataFrame(drug_specific_edges[['source', 'target']].values),
        pd.DataFrame(drug_specific_edges[['target', 'source']].values)
    ]).drop_duplicates()
    drug_specific_edges.columns = ['source', 'target']

    # === Fill in the adjacency matrix with the edges from the drug-specific edges (undirected) === #
    for node1, node2 in drug_specific_edges.values:
        adjacency_matrix.loc[node1, node2] = 1

    drug_nwk_fpath = os.path.join(args.resultdir, f'{drug}_adjacency_matrix_target_ppi_990.tsv')
    adjacency_matrix.to_csv(drug_nwk_fpath, sep='\t', index=True, header=True)

In [177]:
drug_specific_edges

Unnamed: 0,source,target
0,KDR,HSP90AA1
1,KDR,CDH5
2,KDR,SRC
3,KDR,FLT1
4,HSP90AB1,HSP90AA1
...,...,...
193,CDK1,WEE1
194,ESR1,EGFR
195,STAT5B,EGFR
196,ERBB4,STAT5B


In [178]:
adjacency_matrix

Unnamed: 0,FKBP4,AR,HSP90AA1,CFTR,HSPA8,DVL2,CSNK1E,PLK1,RPAP3,CRY1,...,TNFAIP3,CCL5,PHB2,LAMP2,PPP2CA,VDR,NR1H4,BAG1,CD19,GTF2I
FKBP4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AR,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HSP90AA1,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CFTR,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HSPA8,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VDR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
NR1H4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
BAG1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
CD19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
