In [1]:
import os
import numpy as np
import pandas as pd

from itertools import combinations
from scipy.stats import ttest_ind as TTEST

In [2]:
resDir = '../../data/results'

In [3]:
clTests = [f for f in os.listdir(resDir) if 'cellLineTest' in f]
clNews = [f for f in os.listdir(resDir) if 'cellLineNew' in f]
ctTests = [f for f in os.listdir(resDir) if 'cancerTest' in f]
ctNews = [f for f in os.listdir(resDir) if 'cancerNew' in f]

In [13]:
cols = ['p1', 'p1Which', 'p2', 'p2Which', 'p3', 'p3Which',
        'p4', 'p4Which', 'p5', 'p5Which', 'p0', 'p0Which',
        'wrg', 'wrgWhich', 'unq', 'unqWhich', 'var', 'varWhich']
idx = ['RF_DSC', 'LM_DSC', 'DNN_DSC', 'RF_LM', 'RF_DNN', 'LM_DNN']
pvalDF = pd.DataFrame(index=idx, columns=cols)

In [10]:
for i, j in combinations(clTests, 2):
    print(i, j)

embedDrugEmbedCellConcatRF-cellLineTest.csv rawDrugEmbedCellConcatLogistic-cellLineTest.csv
embedDrugEmbedCellConcatRF-cellLineTest.csv rawDrugEmbedCellConcatDNN-cellLineTest.csv
embedDrugEmbedCellConcatRF-cellLineTest.csv DeepDSC-cellLineTest.csv
rawDrugEmbedCellConcatLogistic-cellLineTest.csv rawDrugEmbedCellConcatDNN-cellLineTest.csv
rawDrugEmbedCellConcatLogistic-cellLineTest.csv DeepDSC-cellLineTest.csv
rawDrugEmbedCellConcatDNN-cellLineTest.csv DeepDSC-cellLineTest.csv


In [43]:
def getPvals(filePattern, resDir='../../data/results', alpha=0.05, nfolds=5, correct=False):
    files = [f for f in os.listdir(resDir) if filePattern in f]
    if 'cancer' in filePattern:
        pvalDF = pd.DataFrame(index=idx, columns=cols[:10])
    else:
        pvalDF = pd.DataFrame(index=idx, columns=cols)
    
    for i, j in combinations(files, 2):
        if 'RF' in i:
            if 'DSC' in j:
                row = 'RF_DSC'
            elif 'Log' in j:
                row = 'RF_LM'
            else:
                row = 'RF_DNN'
        elif 'Log' in i:
            if 'DSC' in j:
                row = 'LM_DSC'
            else:
                row='LM_DNN'
        else:
            row = 'DNN_DSC'
        
        models = row.split('_')
        df1 = pd.read_csv(os.path.join(resDir, i))
        df2 = pd.read_csv(os.path.join(resDir, j))
        pvals = []
        for col in df1.columns:
            if 'Wrong' in col:
                models.reverse()
                
            p = TTEST(df1[col][:nfolds], df2[col][:nfolds]).pvalue
            if str(p) == 'nan':
                p = 1.
            pvals.append(p)
            if p <= alpha:
                if df1[col].mean() > df2[col].mean():
                    pvals.append(models[0])
                else:
                    pvals.append(models[1])
            else:
                pvals.append('')
                
            if 'Wrong' in col:
                models.reverse()

        pvalDF.loc[row, :] = pvals
    
    if correct:
        for i in [c for c in pvalDF.columns if 'Which' not in c]:
            corrected = round(pvalDF.loc[:, i].astype(float) * len(files),4)
            pvalDF.loc[:, i] = np.where(corrected > 1., 1.0000, corrected)
    
    return pvalDF
            

In [44]:
clTest = getPvals('cellLineTest', correct=True)
clTest

  pvalDF.loc[:, i] = np.where(corrected > 1., 1.0000, corrected)


Unnamed: 0,p1,p1Which,p2,p2Which,p3,p3Which,p4,p4Which,p5,p5Which,p0,p0Which,wrg,wrgWhich,unq,unqWhich,var,varWhich
RF_DSC,0.1064,RF,0.061,RF,0.0506,RF,0.0465,RF,0.033,RF,0.0013,RF,0.2637,,0.7808,,0.0,RF
LM_DSC,0.1225,LM,0.0609,LM,0.0574,LM,0.0456,LM,0.0358,LM,0.0013,LM,0.2637,,0.3757,,0.0259,DSC
DNN_DSC,0.1229,DNN,0.0686,DNN,0.0645,DNN,0.0519,DNN,0.0374,DNN,0.0013,DNN,0.2637,,0.5179,,0.0259,DSC
RF_LM,0.4364,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,0.1663,RF,0.0,RF
RF_DNN,0.9021,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,0.8594,,0.0,RF
LM_DNN,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,


In [45]:
ctTest = getPvals('cancerTest', correct=True)
ctTest

  pvalDF.loc[:, i] = np.where(corrected > 1., 1.0000, corrected)


Unnamed: 0,p1,p1Which,p2,p2Which,p3,p3Which,p4,p4Which,p5,p5Which
RF_DSC,0.0991,RF,0.0618,RF,0.0507,RF,0.0484,RF,0.033,RF
LM_DSC,0.1053,LM,0.0579,LM,0.0582,LM,0.0454,LM,0.0384,LM
DNN_DSC,0.1087,DNN,0.0677,DNN,0.0609,DNN,0.0523,DNN,0.0372,DNN
RF_LM,1.0,,1.0,,1.0,,1.0,,1.0,
RF_DNN,1.0,,1.0,,1.0,,1.0,,1.0,
LM_DNN,1.0,,1.0,,1.0,,1.0,,1.0,


In [46]:
clNew = getPvals('cellLineNew', correct=True)
clNew

  pvalDF.loc[:, i] = np.where(corrected > 1., 1.0000, corrected)


Unnamed: 0,p1,p1Which,p2,p2Which,p3,p3Which,p4,p4Which,p5,p5Which,p0,p0Which,wrg,wrgWhich,unq,unqWhich,var,varWhich
RF_DSC,0.0436,RF,0.0777,RF,0.1111,RF,0.0888,RF,0.0554,RF,0.0019,RF,1.0,,1.0,,0.0,RF
LM_DSC,0.066,LM,0.1052,LM,0.0874,LM,0.0619,LM,0.0265,LM,0.0016,LM,0.733,,0.4053,,0.0864,DSC
DNN_DSC,0.0612,DNN,0.0906,DNN,0.1098,DNN,0.075,DNN,0.0355,DNN,0.0012,DNN,0.7979,,0.5379,,0.0377,DSC
RF_LM,0.0019,RF,0.0061,RF,0.0282,LM,0.0003,LM,0.0,LM,0.7197,,0.0158,LM,0.0,RF,0.0,RF
RF_DNN,0.5117,,0.5645,,1.0,,0.9113,,0.0325,DNN,0.0413,DNN,0.2668,,0.0001,RF,0.0,RF
LM_DNN,1.0,,0.3774,,0.9685,,1.0,,0.1789,LM,0.169,DNN,1.0,,0.1411,DNN,0.966,


In [47]:
ctNew = getPvals('cancerNew', correct=True)
ctNew

  pvalDF.loc[:, i] = np.where(corrected > 1., 1.0000, corrected)


Unnamed: 0,p1,p1Which,p2,p2Which,p3,p3Which,p4,p4Which,p5,p5Which
RF_DSC,0.0548,RF,0.0816,RF,0.1243,RF,0.1144,RF,0.0694,RF
LM_DSC,0.0722,LM,0.113,LM,0.0576,LM,0.0732,LM,0.0359,LM
DNN_DSC,0.0658,DNN,0.0918,DNN,0.1054,DNN,0.0981,DNN,0.0393,DNN
RF_LM,0.2384,,0.0175,RF,0.0,LM,0.0049,LM,0.0007,LM
RF_DNN,1.0,,1.0,,1.0,,1.0,,0.0194,DNN
LM_DNN,1.0,,0.2196,,0.0691,LM,0.8773,,1.0,
