# Assess the marginal performance of every feature for primary observations

In [1]:
import math

import pandas
import sklearn.metrics
import scipy.stats
from statsmodels.sandbox.stats.multicomp import multipletests

In [2]:
feature_type_df = pandas.read_table('data/matrix/feature-type.tsv')
feature_type_df = feature_type_df[['feature', 'feature_type']]
feature_df = pandas.read_table('data/matrix/features.tsv.bz2')
features = list(feature_type_df['feature'])

In [3]:
feature_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,status,primary,disease_name,compound_name,prior_prob,CduftD,CpiwP,...,CsdiCpiwPeGgaD,CsdiCpiwPpiwCduftD,CsdiCsdiC<hpCduftD,CsdiCsdiC<ioCduftD,CsdiCsdiCduftD,CsdiCsdiCduftD<soD,CsdiCsdiCduftDso>D,CsdiCsdiChp>CduftD,CsdiCsdiCio>CduftD,CsdiCsdiCsdiCduftD
0,wikidata-v0.1_perm-1,Q10354103,Q1048084,1,1,opisthorchiasis,probucol,0.001998,3,1,...,0,0,0,0,0.010103,0,0,0,0,0.014973
1,wikidata-v0.1_perm-5,Q118551,Q1048084,1,1,opisthorchiasis,clarithromycin,0.006174,9,0,...,0,0,0,0,0.0,0,0,0,0,0.0


In [4]:
def compute_metrics(y_true, y_score):
    series = pandas.Series()
    series['nonzero'] = (y_score > 0).mean()
    series['auroc'] = sklearn.metrics.roc_auc_score(y_true, y_score)
    series['auprc'] = sklearn.metrics.average_precision_score(y_true, y_score)
    return series

def columnar_performance(df):
    y_true = df['status']
    perf_df = df[features].apply(lambda x: compute_metrics(y_true, x), axis='index')
    perf_df = perf_df.T
    perf_df.index.name = 'feature'
    return perf_df.reset_index()

perf_df = feature_df.groupby('hetnet').apply(columnar_performance).reset_index(level='hetnet')
perf_df = perf_df.merge(feature_type_df)
perf_df['permuted'] = perf_df.hetnet.str.contains('_perm').astype(int)

In [5]:
perf_df.merge(feature_type_df).head(3)

Unnamed: 0,hetnet,feature,nonzero,auroc,auprc,feature_type,permuted
0,wikidata-v0.1,prior_prob,1,0.837134,0.601458,prior,0
1,wikidata-v0.1_perm-1,prior_prob,1,0.801189,0.322215,prior,1
2,wikidata-v0.1_perm-2,prior_prob,1,0.801228,0.328423,prior,1


In [6]:
def compare_permutation(df):
    unperm = df.query("permuted == 0").iloc[0, :]
    perm_df = df.query("permuted == 1")
    series = pandas.Series()
    series['nonzero'] = unperm['nonzero']
    series['auroc'] = unperm.auroc
    series['auroc_permuted'] = perm_df.auroc.mean()
    series['delta_auroc'] = series['auroc'] - series['auroc_permuted']
    ttest = scipy.stats.ttest_1samp(perm_df.auroc, unperm.auroc)
    pvalue = ttest.pvalue
    series['pval_auroc'] = pvalue
    #series['nlog10_pval_auroc'] = -math.log10(pvalue)
    return(series)

compare_df = perf_df.groupby(['feature_type', 'feature']).apply(compare_permutation).reset_index()
reject, compare_df['fdr_pval_auroc'], alphacSidak, alphacBonf = multipletests(
    pvals=compare_df.pval_auroc, method='fdr_bh')
compare_df = feature_type_df.merge(compare_df)

In [7]:
compare_df.head(3)

Unnamed: 0,feature,feature_type,nonzero,auroc,auroc_permuted,delta_auroc,pval_auroc,fdr_pval_auroc
0,prior_prob,prior,1.0,0.837134,0.801685,0.035449,6.540402e-08,
1,CduftD,degree,1.0,0.707652,0.685922,0.02173,6.888891e-08,
2,CpiwP,degree,0.363153,0.497974,0.498203,-0.00023,0.5651983,


In [8]:
# Save datasets
perf_df.to_csv('data/feature-performance/primary-aucs.tsv', sep='\t', index=False, float_format='%.5g')
compare_df.to_csv('data/feature-performance/primary-aurocs.tsv', sep='\t', index=False, float_format='%.5g')