# Methods Comparison
### Drug Split

In [15]:
import os
import argparse
import pandas as pd

parser = argparse.ArgumentParser()
args, _ = parser.parse_known_args()

args.rootdir = '/data/project/minwoo/'
args.resultdir = os.path.join(args.rootdir, 'drug_specific_gene_selection', 'Results', 'performance_logs')

split_type = 'drug'
n_cv = 5

comparison_results = []
for result in os.listdir(args.resultdir):
    if '_drug' in result:
        split = 'drug'
    elif '_cell' in result:
        split = 'cell'

    model_name = '_'.join(result.split('_')[:-1])
    
    if split == split_type:
        filename = os.path.join(args.resultdir, result)
        test_result = pd.read_csv(filename, sep='\t', header=None)
        test_result = test_result.drop(0, axis='columns')

        column_names = []
        for col in test_result:
            column_name = test_result[col].apply(lambda x: x.split(':')[0])[0]
            test_result[col] = test_result[col].apply(lambda x: x.split(': ')[1]).astype(float)
            column_names.append(column_name)
        test_result.columns = column_names
        test_result = test_result.iloc[:n_cv]
        test_result.insert(0, 'Model', model_name)
        test_result.insert(0, 'CV', [f'{cv:02d}' for cv in range(1, test_result.shape[0]+1)])

        comparison_results.append(test_result)
comparison_results = pd.concat(comparison_results).reset_index(drop=True)
comparison_results = comparison_results.rename(columns={'CORR': 'PCC', 'SPEARMAN': 'SCC'})

In [16]:
models = comparison_results['Model'].value_counts()[comparison_results['Model'].value_counts() == 5].index
models

Index(['MLP_High_NetGP_4567_v2_231111_drug_gpu2',
       'MLP_L1000_Landmark_1234_v2_231111_drug_gpu4',
       'MLP_L1000_Landmark_4567_v2_231111_drug_gpu4',
       'MLP_10K_Baseline_1234_v2_231111_drug_gpu4',
       'HeteroNet_Top1000_231111_4567_20_indirect_targets_no_pathway_drug',
       'MLP_L1000_Landmark_1024_v2_231111_drug_gpu4',
       'MLP_L1_Selected_1234_v2_231111_drug_gpu4',
       'MLP_High_NetGP_1024_v2_231111_drug_gpu2',
       'HeteroNet_Top1000_231111_4567_target_ppi_990_drug',
       'HeteroNet_Top1000_231111_1024_20_indirect_targets_drug',
       'HeteroNet_Top1000_231111_4567_20_indirect_targets_drug',
       'HeteroNet_Top1000_231111_1024_target_ppi_990_drug',
       'MLP_10K_Baseline_4567_v2_231111_drug_gpu4',
       'MLP_High_Variance_1024_v2_231111_drug_gpu4',
       'HeteroNet_Top1000_231111_1234_20_indirect_targets_no_pathway_drug',
       'MLP_10K_Baseline_1024_v2_231111_drug_gpu4',
       'HeteroNet_Top1000_231111_1024_20_indirect_targets_no_pathway_drug',


In [18]:
comparison_results = comparison_results.query('Model in @models')

def categorize_model_name(model_name):
    if 'no_pathway' in model_name:
        return 'HeteroNet - Pathway'
    elif '20_indirect_targets' in model_name:
        return 'HeteroNet'
    elif 'target_ppi' in model_name:
        return 'HeteroNet - Pathway - PPI'
    elif 'High_NetGP' in model_name:
        return 'High NetGP'
    elif 'High_Variance' in model_name:
        return 'High Variance'
    elif 'Landmark' in model_name:
        return 'Landmark Genes'
    elif 'L1_Selected' in model_name:
        return 'Data-Driven (L1)'
    elif '10K_Baseline' in model_name:
        return 'All Genes (10,000)'

comparison_results['Category'] = comparison_results['Model'].apply(categorize_model_name)

In [19]:
comparison_results['Category'].value_counts()

Category
High NetGP                   15
Landmark Genes               15
All Genes (10,000)           15
HeteroNet - Pathway          15
Data-Driven (L1)             15
HeteroNet - Pathway - PPI    15
HeteroNet                    15
High Variance                15
Name: count, dtype: int64

In [20]:
comparison_results

Unnamed: 0,CV,Model,Loss,RMSE,PCC,SCC,Category
0,01,MLP_High_NetGP_4567_v2_231111_drug_gpu2,3.801251,1.949632,0.515983,0.472794,High NetGP
1,02,MLP_High_NetGP_4567_v2_231111_drug_gpu2,4.329735,2.080836,0.424253,0.400104,High NetGP
2,03,MLP_High_NetGP_4567_v2_231111_drug_gpu2,3.546576,1.883253,0.483457,0.492730,High NetGP
3,04,MLP_High_NetGP_4567_v2_231111_drug_gpu2,3.992457,1.998124,0.433148,0.411037,High NetGP
4,05,MLP_High_NetGP_4567_v2_231111_drug_gpu2,5.658648,2.378768,0.226010,0.202182,High NetGP
...,...,...,...,...,...,...,...
115,01,HeteroNet_Top1000_231111_1234_target_ppi_990_drug,5.152550,2.269742,0.550355,0.448556,HeteroNet - Pathway - PPI
116,02,HeteroNet_Top1000_231111_1234_target_ppi_990_drug,6.531762,2.556039,0.440379,0.339186,HeteroNet - Pathway - PPI
117,03,HeteroNet_Top1000_231111_1234_target_ppi_990_drug,5.397410,2.323199,0.507223,0.446490,HeteroNet - Pathway - PPI
118,04,HeteroNet_Top1000_231111_1234_target_ppi_990_drug,4.969573,2.229246,0.579409,0.479470,HeteroNet - Pathway - PPI


In [23]:
ablation_result = comparison_results[comparison_results['Category'].str.contains("HeteroNet")]
ablation_result['Category'].value_counts()

Category
HeteroNet - Pathway          15
HeteroNet - Pathway - PPI    15
HeteroNet                    15
Name: count, dtype: int64

In [25]:
method_comparison_result = comparison_results[~comparison_results['Category'].str.contains("HeteroNet")]
method_comparison_result['Category'].value_counts()

Category
High NetGP            15
Landmark Genes        15
All Genes (10,000)    15
Data-Driven (L1)      15
High Variance         15
Name: count, dtype: int64

In [29]:
heteronet_result = comparison_results[comparison_results['Category'] == 'HeteroNet']
method_comparison_result = pd.concat([method_comparison_result, heteronet_result]).reset_index(drop=True)
method_comparison_result

Unnamed: 0,CV,Model,Loss,RMSE,PCC,SCC,Category
0,01,MLP_High_NetGP_4567_v2_231111_drug_gpu2,3.801251,1.949632,0.515983,0.472794,High NetGP
1,02,MLP_High_NetGP_4567_v2_231111_drug_gpu2,4.329735,2.080836,0.424253,0.400104,High NetGP
2,03,MLP_High_NetGP_4567_v2_231111_drug_gpu2,3.546576,1.883253,0.483457,0.492730,High NetGP
3,04,MLP_High_NetGP_4567_v2_231111_drug_gpu2,3.992457,1.998124,0.433148,0.411037,High NetGP
4,05,MLP_High_NetGP_4567_v2_231111_drug_gpu2,5.658648,2.378768,0.226010,0.202182,High NetGP
...,...,...,...,...,...,...,...
85,01,HeteroNet_Top1000_231111_1234_20_indirect_targ...,5.125114,2.263526,0.547069,0.432594,HeteroNet
86,02,HeteroNet_Top1000_231111_1234_20_indirect_targ...,5.894965,2.428082,0.478829,0.378965,HeteroNet
87,03,HeteroNet_Top1000_231111_1234_20_indirect_targ...,5.914532,2.432000,0.455501,0.403282,HeteroNet
88,04,HeteroNet_Top1000_231111_1234_20_indirect_targ...,4.790263,2.188471,0.574545,0.466130,HeteroNet


### Ablation Study

In [31]:
ablation_result.groupby('Category')[['PCC', 'RMSE', 'SCC']].mean().sort_values(by='PCC', ascending=False)

Unnamed: 0_level_0,PCC,RMSE,SCC
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HeteroNet,0.515483,2.318018,0.414087
HeteroNet - Pathway - PPI,0.50343,2.346977,0.405413
HeteroNet - Pathway,0.493964,2.365618,0.399404


In [30]:
ablation_result.groupby('Category')[['PCC', 'RMSE', 'SCC']].mean().sort_values(by='RMSE', ascending=True)

Unnamed: 0_level_0,PCC,RMSE,SCC
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HeteroNet,0.515483,2.318018,0.414087
HeteroNet - Pathway - PPI,0.50343,2.346977,0.405413
HeteroNet - Pathway,0.493964,2.365618,0.399404


In [32]:
ablation_result.groupby('Category')[['PCC', 'RMSE', 'SCC']].mean().sort_values(by='SCC', ascending=False)

Unnamed: 0_level_0,PCC,RMSE,SCC
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HeteroNet,0.515483,2.318018,0.414087
HeteroNet - Pathway - PPI,0.50343,2.346977,0.405413
HeteroNet - Pathway,0.493964,2.365618,0.399404


In [None]:
ablation_result.groupby('Category')[['PCC', 'RMSE', 'SCC']].std()

Unnamed: 0_level_0,PCC,RMSE,SCC
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HeteroNet,0.045581,0.083069,0.063259
HeteroNet - Pathway,0.063677,0.118783,0.069289
HeteroNet - Pathway - PPI,0.065508,0.118095,0.072911


### Method Comparison Result

In [33]:
method_comparison_result.groupby('Category')[['PCC', 'RMSE', 'SCC']].mean().sort_values(by='PCC', ascending=False)

Unnamed: 0_level_0,PCC,RMSE,SCC
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HeteroNet,0.515483,2.318018,0.414087
Data-Driven (L1),0.462157,2.402022,0.445496
High NetGP,0.447672,2.392676,0.375959
High Variance,0.384902,2.452223,0.301379
Landmark Genes,0.382087,2.440812,0.305116
"All Genes (10,000)",0.36557,2.504847,0.293133


In [38]:
method_comparison_result.groupby('Category')[['PCC', 'RMSE', 'SCC']].std().loc[['HeteroNet', 'Data-Driven (L1)', 'High NetGP', 'High Variance', 'Landmark Genes', 'All Genes (10,000)']]

Unnamed: 0_level_0,PCC,RMSE,SCC
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HeteroNet,0.045581,0.083069,0.063259
Data-Driven (L1),0.058828,0.22597,0.058506
High NetGP,0.077855,0.280712,0.073276
High Variance,0.075472,0.218667,0.056535
Landmark Genes,0.072091,0.254803,0.069
"All Genes (10,000)",0.076451,0.278098,0.060855


In [34]:
method_comparison_result.groupby('Category')[['PCC', 'RMSE', 'SCC']].mean().sort_values(by='RMSE', ascending=True)

Unnamed: 0_level_0,PCC,RMSE,SCC
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HeteroNet,0.515483,2.318018,0.414087
High NetGP,0.447672,2.392676,0.375959
Data-Driven (L1),0.462157,2.402022,0.445496
Landmark Genes,0.382087,2.440812,0.305116
High Variance,0.384902,2.452223,0.301379
"All Genes (10,000)",0.36557,2.504847,0.293133


In [39]:
method_comparison_result.groupby('Category')[['PCC', 'RMSE', 'SCC']].std().loc[['HeteroNet', 'High NetGP', 'Data-Driven (L1)', 'Landmark Genes', 'High Variance', 'All Genes (10,000)']]

Unnamed: 0_level_0,PCC,RMSE,SCC
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HeteroNet,0.045581,0.083069,0.063259
High NetGP,0.077855,0.280712,0.073276
Data-Driven (L1),0.058828,0.22597,0.058506
Landmark Genes,0.072091,0.254803,0.069
High Variance,0.075472,0.218667,0.056535
"All Genes (10,000)",0.076451,0.278098,0.060855


In [35]:
method_comparison_result.groupby('Category')[['PCC', 'RMSE', 'SCC']].mean().sort_values(by='SCC', ascending=False)

Unnamed: 0_level_0,PCC,RMSE,SCC
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Data-Driven (L1),0.462157,2.402022,0.445496
HeteroNet,0.515483,2.318018,0.414087
High NetGP,0.447672,2.392676,0.375959
Landmark Genes,0.382087,2.440812,0.305116
High Variance,0.384902,2.452223,0.301379
"All Genes (10,000)",0.36557,2.504847,0.293133


In [40]:
method_comparison_result.groupby('Category')[['PCC', 'RMSE', 'SCC']].std().loc[['Data-Driven (L1)', 'HeteroNet', 'High NetGP', 'Landmark Genes', 'High Variance', 'All Genes (10,000)']]

Unnamed: 0_level_0,PCC,RMSE,SCC
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Data-Driven (L1),0.058828,0.22597,0.058506
HeteroNet,0.045581,0.083069,0.063259
High NetGP,0.077855,0.280712,0.073276
Landmark Genes,0.072091,0.254803,0.069
High Variance,0.075472,0.218667,0.056535
"All Genes (10,000)",0.076451,0.278098,0.060855
