In [1]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import average_precision_score
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face dataset repository id; it is also interpolated into the
# features URL in the feature-loading cell further down.
dataset_path = "gonzalobenegas/siraj_gwas_highpip"
# Load only the "test" split and convert it to a pandas DataFrame.
V = load_dataset(dataset_path, split="test").to_pandas()
# Display the frame (the rendered output shows columns chrom, pos, ref, alt, label).
V

Unnamed: 0,chrom,pos,ref,alt,label
0,1,3080038,T,C,True
1,1,3774964,A,G,True
2,1,6616131,C,T,False
3,1,7665224,C,A,False
4,1,8407293,G,A,False
...,...,...,...,...,...
1778,22,47019717,G,T,False
1779,22,47990921,C,T,True
1780,22,50007172,T,C,False
1781,22,50190508,G,A,True


In [3]:
# Name of the feature table to evaluate; it selects the parquet file fetched below.
features = "CADD_Annot"
features_url = (
    f"https://huggingface.co/datasets/{dataset_path}/resolve/main/features/{features}.parquet"
)
df = pd.read_parquet(features_url)
# Impute missing values with the per-column mean.
df = df.fillna(df.mean())
feature_cols = df.columns

# pd.concat(axis=1) aligns rows on the index, so a feature table that does not
# line up with V would silently produce NaN-padded rows instead of failing.
if not df.index.equals(V.index):
    raise ValueError(
        f"Feature table '{features}' index does not match the variant table "
        f"({len(df)} vs {len(V)} rows); refusing to join misaligned rows."
    )

# Drop any feature columns left over from a previous run of this cell before
# joining. Without this, re-running the cell appends a second copy of every
# feature column and V[c] would return a DataFrame instead of a Series.
V = pd.concat([V.drop(columns=feature_cols, errors="ignore"), df], axis=1)
V

Unnamed: 0,chrom,pos,ref,alt,label,GC,CpG,minDistTSS,minDistTSE,priPhCons,...,RegSeq3,RegSeq4,RegSeq5,RegSeq6,RegSeq7,ZooPriPhyloP,ZooVerPhyloP,Roulette-MR,RawScore,PHRED
0,1,3080038,T,C,True,0.570,0.093,10827,106554,0.001,...,0.009910,0.015640,0.106421,0.151314,-0.066309,0.000,-2.697,0.105,-1.010255,0.024
1,1,3774964,A,G,True,0.616,0.053,2139,425,0.003,...,-0.001023,-0.000963,-0.003149,-0.004854,0.003128,0.050,-0.078,0.117,0.027796,1.192
2,1,6616131,C,T,False,0.576,0.133,2369,6515,0.001,...,0.001668,0.000788,0.002209,-0.001845,-0.005821,0.003,-0.943,0.211,-0.080996,0.782
3,1,7665224,C,A,False,0.662,0.080,403,842,0.029,...,-0.002230,-0.000337,-0.001056,-0.001997,-0.004303,0.006,-0.031,0.020,0.389293,4.270
4,1,8407293,G,A,False,0.497,0.013,16127,41417,0.075,...,0.001087,0.005582,-0.053076,-0.053857,0.043259,0.053,1.230,0.198,0.527651,5.745
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1778,22,47019717,G,T,False,0.556,0.053,245790,154367,0.010,...,-0.006515,0.002971,-0.000897,-0.009989,0.018943,0.000,-2.979,0.073,-0.661621,0.083
1779,22,47990921,C,T,True,0.417,0.027,304318,22898,0.991,...,0.018466,0.073097,-0.008445,-0.028927,-0.002782,0.998,-0.561,1.661,1.543736,14.440
1780,22,50007172,T,C,False,0.397,0.000,5488,15361,0.014,...,0.003384,0.043081,0.012929,0.022809,-0.030787,0.000,-0.431,0.128,0.123385,1.715
1781,22,50190508,G,A,True,0.728,0.120,91,8968,0.036,...,-0.076514,-0.105828,-0.132242,-0.097705,0.024942,0.320,0.752,0.186,0.855265,8.817


In [4]:
# Score every feature column on its own against the binary label using
# average precision (AUPRC). Each feature is evaluated in both orientations
# (raw and negated) and the better of the two is kept, so a feature that is
# inversely associated with the label is still credited.
labels = V["label"]
res = pd.DataFrame(
    [
        (
            name,
            max(
                average_precision_score(labels, V[name]),
                average_precision_score(labels, -V[name]),
            ),
        )
        for name in tqdm(feature_cols)
    ],
    columns=["feature", "AUPRC"],
).sort_values("AUPRC", ascending=False)
# Display the ranking, best-scoring feature first.
res

100%|████████████████████████████████████████████████████████████| 85/85 [00:00<00:00, 325.21it/s]


Unnamed: 0,feature,AUPRC
60,EncodeDNase-sum,0.693898
61,EncodeDNase-max,0.692225
83,RawScore,0.665985
84,PHRED,0.665961
40,EncodeH3K4me2-sum,0.660635
...,...,...
11,cHmm_E1,0.505633
12,cHmm_E2,0.501473
15,cHmm_E5,0.500938
18,cHmm_E8,0.499230


In [7]:
# Show the first ten rows of `res`; since `res` was sorted by AUPRC in
# descending order, these are the ten best-scoring features.
res.head(10)

Unnamed: 0,feature,AUPRC
60,EncodeDNase-sum,0.693898
61,EncodeDNase-max,0.692225
83,RawScore,0.665985
84,PHRED,0.665961
40,EncodeH3K4me2-sum,0.660635
41,EncodeH3K4me2-max,0.659535
43,EncodeH3K4me3-max,0.651904
58,EncodeH2AFZ-sum,0.651254
80,ZooPriPhyloP,0.649233
59,EncodeH2AFZ-max,0.646902


In [6]:
# Keep only the rows whose feature name begins with "RegSeq"; the row order
# of `res` is preserved.
res.loc[res["feature"].str.startswith("RegSeq")]

Unnamed: 0,feature,AUPRC
74,RegSeq2,0.571245
72,RegSeq0,0.565304
75,RegSeq3,0.559193
76,RegSeq4,0.555357
78,RegSeq6,0.553714
79,RegSeq7,0.550014
73,RegSeq1,0.548216
77,RegSeq5,0.540976
