# Assess the marginal performance of every feature for primary observations

In [1]:
import math

import pandas
import sklearn.metrics
import scipy.stats
from statsmodels.sandbox.stats.multicomp import multipletests

In [2]:
# Feature catalogue: only the feature name and its type are needed downstream.
feature_type_df = pandas.read_table('data/matrix/feature-type.tsv')[['feature', 'feature_type']]
# Names of the feature columns to evaluate, in catalogue order.
features = list(feature_type_df['feature'])
# Feature matrix (bz2-compressed TSV) holding the columns named in `features`.
feature_df = pandas.read_table('data/matrix/features.tsv.bz2')

In [3]:
# Peek at the first two rows of the feature matrix.
feature_df.head(n=2)

Unnamed: 0,hetnet,compound_id,disease_id,status,primary,disease_name,compound_name,prior_prob,CbG,CtD,...,DtC,CbGaD,CbGaDaGaD,CbGaDtCtD,CbGbCbGaD,CbGbCtD,CtDaGaD,CtDaGbCtD,CtDtCbGaD,CtDtCtD
0,rephetio-v2.0_perm-2,DB00014,DOID:1024,0,1,leprosy,Goserelin,0.006101,2,2,...,2,0.0,0.000545,0.000299,0.001267,0.0,0.001198,0.003134,0.0,0.014391
1,rephetio-v2.0_perm-1,DB00136,DOID:1024,0,1,leprosy,Calcitriol,0.002948,4,1,...,2,0.0,0.00251,8.1e-05,0.000907,0.0,0.004485,0.002194,0.000736,0.0


In [4]:
def compute_metrics(y_true, y_score):
    """
    Summarize how well a single score vector ranks the true labels.

    Parameters
    ----------
    y_true : array-like
        Labels, passed unchanged to the scikit-learn metric functions.
    y_score : pandas.Series or numpy.ndarray
        Scores to evaluate; must support elementwise ``> 0`` and ``.mean()``.

    Returns
    -------
    pandas.Series
        Indexed by, in order:
        'nonzero' -- fraction of scores strictly greater than zero,
        'auroc'   -- ``sklearn.metrics.roc_auc_score(y_true, y_score)``,
        'auprc'   -- ``sklearn.metrics.average_precision_score(y_true, y_score)``.
    """
    # Build the Series in one step instead of growing a dtype-less
    # `pandas.Series()`: the empty constructor without a dtype is deprecated
    # usage (it warns on pandas 1.x) and leaves the dtype to be inferred later.
    # Dict insertion order fixes the order of the resulting index.
    return pandas.Series({
        'nonzero': (y_score > 0).mean(),
        'auroc': sklearn.metrics.roc_auc_score(y_true, y_score),
        'auprc': sklearn.metrics.average_precision_score(y_true, y_score),
    })

def columnar_performance(df):
    """
    Evaluate every feature column of `df` against its 'status' column.

    `df` must contain a 'status' column and every column listed in the
    module-level `features`. Each feature column is scored with
    `compute_metrics`, and the result is returned with one row per feature:
    a 'feature' column followed by the metrics `compute_metrics` produces.
    """
    labels = df['status']

    def score_feature(values):
        # Score one feature column against the shared label vector.
        return compute_metrics(labels, values)

    # `apply` yields metrics as rows and features as columns; transpose so
    # that each feature becomes a row.
    by_feature = df[features].apply(score_feature, axis='index').T
    by_feature.index.name = 'feature'
    return by_feature.reset_index()

# Score every feature separately within each hetnet, then attach feature types.
perf_df = (
    feature_df
    .groupby('hetnet')
    .apply(columnar_performance)
    .reset_index(level='hetnet')
    .merge(feature_type_df)
)
# Flag hetnets whose name contains '_perm' with 1, all others with 0.
perf_df['permuted'] = perf_df['hetnet'].str.contains('_perm').astype(int)

In [5]:
# perf_df already carries feature_type (merged when it was built), so preview
# it directly; merging with feature_type_df again here is redundant.
perf_df.head()

Unnamed: 0,hetnet,feature,nonzero,auroc,auprc,feature_type,permuted
0,rephetio-v2.0,prior_prob,1.0,0.841798,0.57895,prior,0
1,rephetio-v2.0_perm-1,prior_prob,1.0,0.798387,0.369178,prior,1
2,rephetio-v2.0_perm-2,prior_prob,1.0,0.800009,0.36282,prior,1
3,rephetio-v2.0_perm-3,prior_prob,1.0,0.797623,0.359071,prior,1
4,rephetio-v2.0_perm-4,prior_prob,1.0,0.800589,0.351372,prior,1


In [6]:
def compare_permutation(df):
    """
    Compare a feature's unpermuted AUROC against its permuted AUROCs.

    Parameters
    ----------
    df : pandas.DataFrame
        Performance rows with columns 'permuted' (0 or 1), 'nonzero' and
        'auroc'. At least one row with ``permuted == 0`` is required; if there
        are several, the first is used.

    Returns
    -------
    pandas.Series
        Indexed by, in order:
        'nonzero'        -- 'nonzero' value of the unpermuted row,
        'auroc'          -- AUROC of the unpermuted row,
        'auroc_permuted' -- mean AUROC over the permuted rows,
        'delta_auroc'    -- auroc minus auroc_permuted,
        'pval_auroc'     -- p-value of a one-sample t-test of the permuted
                            AUROCs against the unpermuted AUROC.

    Raises
    ------
    IndexError
        If `df` has no row with ``permuted == 0``.
    """
    unperm = df.query("permuted == 0").iloc[0]
    permuted_auroc = df.query("permuted == 1")['auroc']

    auroc = unperm['auroc']
    auroc_permuted = permuted_auroc.mean()
    # Tests whether the mean of the permuted AUROCs differs from the single
    # unpermuted AUROC, which is treated as the hypothesized population mean.
    ttest = scipy.stats.ttest_1samp(permuted_auroc, auroc)

    # Build the result in one step: growing a dtype-less `pandas.Series()`
    # item by item is deprecated usage (it warns on pandas 1.x). Dict
    # insertion order fixes the order of the resulting index.
    return pandas.Series({
        'nonzero': unperm['nonzero'],
        'auroc': auroc,
        'auroc_permuted': auroc_permuted,
        'delta_auroc': auroc - auroc_permuted,
        'pval_auroc': ttest.pvalue,
    })

# One comparison row per (feature_type, feature) group.
compare_df = (
    perf_df
    .groupby(['feature_type', 'feature'])
    .apply(compare_permutation)
    .reset_index()
)
# Adjust the AUROC p-values with method 'fdr_bh'; only the corrected p-values
# (second element of the returned tuple) are stored on the frame.
reject, fdr_pvals, alphacSidak, alphacBonf = multipletests(
    pvals=compare_df['pval_auroc'],
    method='fdr_bh',
)
compare_df['fdr_pval_auroc'] = fdr_pvals
# Merge from the feature_type_df side so its rows and columns lead the result.
compare_df = feature_type_df.merge(compare_df)

In [7]:
# Peek at the first three comparison rows.
compare_df.head(n=3)

Unnamed: 0,feature,feature_type,nonzero,auroc,auroc_permuted,delta_auroc,pval_auroc,fdr_pval_auroc
0,prior_prob,prior,1.0,0.841798,0.798573,0.043224,6.611432e-07,4e-06
1,CbG,degree,0.987642,0.537471,0.536827,0.000645,0.7302797,0.73028
2,CtD,degree,1.0,0.690826,0.677169,0.013657,3.990638e-05,6.2e-05


In [8]:
# Save datasets as tab-separated files, without the index and with floats
# written using the '%.5g' format.
tsv_options = dict(sep='\t', index=False, float_format='%.5g')
perf_df.to_csv('data/feature-performance/primary-aucs.tsv', **tsv_options)
compare_df.to_csv('data/feature-performance/primary-aurocs.tsv', **tsv_options)