Run regression using identified NETPHIX modules to predict drug response

In [12]:
import pandas as pd
import os

import cv_utils 

# Base directory for every data/result path below: the working directory
# with a trailing "/" so relative paths can simply be appended.
netphix_dir = "".join([os.getcwd(), "/"])

In [9]:
# read X (alt)

# NETPHIX modules restricted to drugs that are also in CTRP
netphix_modules = pd.read_table(
    netphix_dir+"results/max_sig_combined_modules_ctrp_cv_0.05.tsv"
)
# all significant modules
all_netphix_modules = pd.read_table(
    netphix_dir+"results/max_sig_combined_modules_0.05.tsv"
)

# drug name -> drug id lookup
drug_id_df = pd.read_table(netphix_dir+"data/drug_target_id.txt")
drug_id_dic = {
    drug_name: drug_id
    for drug_name, drug_id in zip(drug_id_df["drug"], drug_id_df["id"])
}

# depmap
# per-drug response files are addressed as target_prefix + <drug id> + ".txt"
target_prefix = netphix_dir + "data/gdsc_auc/Target"
alt_file = netphix_dir + "data/AlterationsV2_final.txt.gz"
# alteration table; the first column of the file becomes the row index
alt_df = pd.read_table(alt_file, index_col=0)

# read CTRP data (AUC table; first column becomes the row index)
ctrp_auc_df = pd.read_table(netphix_dir +"data/ctrp_auc_processed.txt", index_col=0)

In [7]:
# CTRP alteration data using the same table as GDSC.
# Column names of alt_df are shortened to the text before the first "_" so
# they can be matched against the column names of the CTRP AUC table.
# NOTE(review): two alt_df columns sharing the same prefix would end up with
# the same name after the rename -- confirm this cannot happen in the data.
cell_dic = {cell: cell.split("_")[0] for cell in alt_df.columns}
ctrp_alt_df = alt_df.rename(cell_dic, axis="columns")

# Cell lines present in both tables.
common_cells = set(ctrp_auc_df.columns).intersection(ctrp_alt_df.columns)

# pandas no longer accepts a set as a column indexer (TypeError since 2.0),
# and a set has no defined order anyway: select with a sorted list so both
# frames get the same, reproducible column order.
common_cell_list = sorted(common_cells)
ctrp_alt_df = ctrp_alt_df[common_cell_list]
ctrp_auc_df = ctrp_auc_df[common_cell_list]

In [8]:
def merge_modules(netphix_modules):
    """
    Merge modules for the same drug and create a new netphix module df.

    Parameters
    ----------
    netphix_modules : pandas.DataFrame
        Must contain the columns "drug", "dec" and "inc". "dec"/"inc" hold
        comma-separated gene lists; missing values are allowed.

    Returns
    -------
    pandas.DataFrame
        One row per drug with columns ["drug", "dec", "inc"]. Each of
        "dec"/"inc" is the comma-joined, de-duplicated and alphabetically
        sorted union of the genes over all rows of that drug ("" when the
        drug has no gene in that column).
    """
    def _merge_genes(column):
        # Drop missing cells up front instead of relying on NaN being
        # stringified to "nan"; the "nan" check is kept for inputs that
        # already contain that literal text.
        genes = {
            gene
            for cell in column.dropna().astype(str).str.split(",")
            for gene in cell
            if gene and gene != "nan"
        }
        # Sorted so the output does not depend on set iteration order.
        return ",".join(sorted(genes))

    rows = [
        (drug, _merge_genes(group["dec"]), _merge_genes(group["inc"]))
        for drug, group in netphix_modules.groupby("drug")
    ]
    return pd.DataFrame(data=rows, columns=["drug", "dec", "inc"])

# One merged row per drug: first for the modules of drugs also in CTRP,
# then for the full set of significant modules.
merged_netphix_modules = merge_modules(netphix_modules=netphix_modules)
merged_all_netphix_modules = merge_modules(netphix_modules=all_netphix_modules)

# merged_netphix_modules.to_csv("results/merged_netphix_modules.tsv", sep="\t")

In [10]:
# Notebook-level names read by netphix_ctrp_cv:
#   target_prefix : prefix of the per-drug GDSC response files
#   drug_id_dic   : drug name -> drug id
#   alt_df        : GDSC alteration table (gene + mutation type)
#   ctrp_auc_df   : AUC table for CTRP
#   ctrp_alt_df   : alteration table for CTRP

# Learning hyperparameters with GDSC
def netphix_ctrp_cv(method, score, netphix_modules):
    """
    Fit one GDSC model per drug module and score it on CTRP.

    For every row of `netphix_modules` (columns "drug", "dec", "inc") the
    drug's GDSC response file is read, the "dec" and "inc" gene lists are
    combined into a single module, and the module is handed to
    `cv_utils.fit_gdsc_model` (cv=4) and `cv_utils.comp_ctrp_score`.

    Parameters
    ----------
    method : passed through to `cv_utils.fit_gdsc_model`
    score : passed to `cv_utils.assign_score_param`
    netphix_modules : pandas.DataFrame with one module per row

    Returns
    -------
    (best_model_dic, ctrp_score_dic) : two dicts keyed by drug name holding
        the value returned by `fit_gdsc_model` and by `comp_ctrp_score`.
        If a drug occurs in several rows, the last row wins.
    """
    best_model_dic, ctrp_score_dic = {}, {}
    score_param = cv_utils.assign_score_param(score)

    for module_idx in range(len(netphix_modules)):
        print(module_idx)
        # missing dec/inc become "" so the .split calls below are safe
        module_row = netphix_modules.iloc[module_idx].fillna("")

        drug = module_row["drug"]
        drug_id = drug_id_dic[drug]

        # read drug response
        target_df = pd.read_csv(target_prefix+str(drug_id)+".txt", sep="\t", index_col=0)

        # netphix module - dec genes followed by inc genes, empty strings dropped
        candidate_genes = module_row["dec"].split(",") + module_row["inc"].split(",")
        module = [gene for gene in candidate_genes if gene]
        print(drug+", "+str(drug_id), module)

        best_model = cv_utils.fit_gdsc_model(target_df, alt_df, module, method, score_param, cv=4)
        best_model_dic[drug] = best_model

        ctrp_score_dic[drug] = cv_utils.comp_ctrp_score(
            best_model, ctrp_auc_df, ctrp_alt_df, drug, module
        )

    return best_model_dic, ctrp_score_dic

In [None]:
# Run the GDSC-fit / CTRP-score pipeline with method "rfr" and score "pci"
rfr_model_dic, rfr_ctrp_score_dic = netphix_ctrp_cv(
    "rfr", "pci", merged_netphix_modules
)

# one (drug, ctrp_score) pair per drug
score_list = list(rfr_ctrp_score_dic.items())
cv_results = pd.DataFrame(score_list, columns=["drug", "ctrp_score"])
# cv_results.set_index("drug").to_csv("results/netphix_rfr_ctrp_test_scores.tsv", sep="\t")

In [None]:
# Run the GDSC-fit / CTRP-score pipeline with method "svr" and score "pci"
svr_model_dic, svr_ctrp_score_dic = netphix_ctrp_cv(
    "svr", "pci", merged_netphix_modules
)

# one (drug, ctrp_score) pair per drug, written out indexed by drug
score_list = list(svr_ctrp_score_dic.items())
cv_results = pd.DataFrame(score_list, columns=["drug", "ctrp_score"])
svr_ctrp_table = cv_results.set_index("drug")
svr_ctrp_table.to_csv("results/netphix_svr_ctrp_test_scores.tsv", sep="\t")

In [14]:
# Learning hyperparameters with GDSC and test nested
def netphix_nested_cv( method, score, netphix_modules):
    """
    Run `cv_utils.nested_gdsc_cv` on GDSC for every drug module.

    For each row of `netphix_modules` (columns "drug", "dec", "inc") the
    "dec" and "inc" gene lists are combined into one module, the drug's GDSC
    response file is read, and both are passed to `cv_utils.nested_gdsc_cv`
    with i_cv=3, o_cv=3, r_cv=2 and the drug id as `r_state`.
    NOTE(review): i_cv/o_cv/r_cv are presumably inner folds, outer folds and
    repeats -- confirm against cv_utils.

    Reads the notebook-level names `drug_id_dic`, `target_prefix` and
    `alt_df`.

    Parameters
    ----------
    method : passed through to `cv_utils.nested_gdsc_cv`
    score : passed to `cv_utils.assign_score_param`
    netphix_modules : pandas.DataFrame with one module per row

    Returns
    -------
    dict
        drug name -> result of `nested_gdsc_cv`; the result is indexed with
        'test_score' here to print its mean. If a drug occurs in several
        rows, the last row wins.
    """
    nested_scores = {}
    score_param = cv_utils.assign_score_param(score)

    for module_idx in range(netphix_modules.shape[0]):
        row = netphix_modules.iloc[module_idx].fillna("") # in case of nan
        drug = row.drug
        drug_id = drug_id_dic[drug]

        # netphix module - merge dec and inc, and remove empty string
        module = [gene for gene in row.dec.split(",") + row.inc.split(",") if gene]
        print(drug+", "+str(drug_id), module)

        # read drug response
        target_df = pd.read_csv(target_prefix+str(drug_id)+".txt", sep="\t", index_col=0)

        nested_scores[drug] = cv_utils.nested_gdsc_cv(target_df, alt_df, module,  method, score_param,
                                                      i_cv=3, o_cv=3, r_cv=2, r_state=drug_id)
        print(nested_scores[drug]['test_score'].mean())
    return nested_scores


In [None]:
# Nested CV on GDSC over all significant modules, method "rfr", score "pci"
rfr_nested_scores = netphix_nested_cv("rfr", "pci", merged_all_netphix_modules)

# per drug: mean test score and mean train score
score_list = [
    (drug, scores['test_score'].mean(), scores['train_score'].mean())
    for drug, scores in rfr_nested_scores.items()
]
cv_results = pd.DataFrame(score_list, columns=["drug", "test_score", "train_score"])
# cv_results.set_index("drug").to_csv("results/netphix_rfr_nested_gdsc_mean_test_scores.tsv", sep="\t")

In [None]:
# Nested CV on GDSC over all significant modules, method "svr", score "pci"
svr_nested_scores = netphix_nested_cv("svr", "pci", merged_all_netphix_modules)

# per drug: mean test score and mean train score, written out indexed by drug
score_list = [
    (drug, scores['test_score'].mean(), scores['train_score'].mean())
    for drug, scores in svr_nested_scores.items()
]
cv_results = pd.DataFrame(score_list, columns=["drug", "test_score", "train_score"])
svr_nested_table = cv_results.set_index("drug")
svr_nested_table.to_csv("results/netphix_svr_nested_gdsc_mean_test_scores.tsv", sep="\t")