In [1]:
import numpy as np
import pandas as pd

In [20]:
# Dataset names; each one is used as a directory name when building metric paths.
datasets = ["mendelian_matched_9", "gwas_matched_9"]

# Variant subsets to tabulate.
# Currently disabled: "missense_variant", "non_missense".
subsets = ["all"]

# Evaluation modalities, in the order they appear as row groups.
modalities = ["Zero-shot", "Linear probing"]

# Models to tabulate, in row order.
# Currently disabled: "Enformer", "Borzoi".
models = [
    "Ensemble", "CADD", "GPN-MSA",
    "NucleotideTransformer", "HyenaDNA", "Caduceus", "GPN",
]

def get_model_path(model, modality, dataset, subset):
    """Build the relative path of the metrics CSV for one table entry.

    Parameters
    ----------
    model : str
        Model name. "CADD", "Enformer", "Borzoi" and "Ensemble" have dedicated
        predictor names; any other model is treated as an LLR-based model.
    modality : str
        "Zero-shot" selects the zero-shot predictor; any other value selects
        the supervised ("LogisticRegression.chrom") predictor.
    dataset : str
        Dataset directory name. For "Ensemble" and LLR-based models the
        predictor depends on whether the name contains "mendelian" or "gwas".
    subset : str
        Subset directory name under ``metrics/``.

    Returns
    -------
    str
        ``../../results/dataset/{dataset}/metrics/{subset}/{predictor}.csv``

    Raises
    ------
    ValueError
        If an LLR-based model is requested for a dataset whose name contains
        neither "mendelian" nor "gwas".
    """
    supervised_suffix = "LogisticRegression.chrom"
    zero_shot = modality == "Zero-shot"

    if model == "CADD":
        predictor = "CADD.plus.RawScore" if zero_shot else f"CADD.{supervised_suffix}"
    elif model in ("Enformer", "Borzoi"):
        predictor = f"{model}_L2_L2.plus.all" if zero_shot else f"{model}.{supervised_suffix}"
    elif model == "Ensemble":
        # The ensemble only has a supervised predictor, so `modality` is ignored here.
        prefix = "OMIM_Ensemble_v2" if "mendelian" in dataset else "Enformer+GPN-MSA+CADD"
        predictor = f"{prefix}.{supervised_suffix}"
    else:
        if "mendelian" in dataset:
            llr_version, sign = "LLR", "minus"
        elif "gwas" in dataset:
            llr_version, sign = "absLLR", "plus"
        else:
            # Previously this fell through and failed with UnboundLocalError.
            raise ValueError(
                f"cannot choose an LLR variant for dataset {dataset!r}: "
                "expected 'mendelian' or 'gwas' in its name"
            )
        if zero_shot:
            predictor = f"{model}_{llr_version}.{sign}.score"
        else:
            predictor = f"{model}_{llr_version}+InnerProducts.{supervised_suffix}"
    return f"../../results/dataset/{dataset}/metrics/{subset}/{predictor}.csv"

In [21]:
# Gather one [dataset, subset, modality, model, score, se] record per combination,
# taking "score" and "se" from the first row of each metrics CSV.
result_columns = ["dataset", "subset", "modality", "model", "score", "se"]
rows = []
for dataset in datasets:
    for subset in subsets:
        for modality in modalities:
            for model in models:
                # get_model_path only knows a supervised predictor for the
                # ensemble, so its zero-shot slot is skipped.
                zero_shot_ensemble = (model == "Ensemble") and (modality == "Zero-shot")
                if not zero_shot_ensemble:
                    path = get_model_path(model, modality, dataset, subset)
                    first_row = pd.read_csv(path).iloc[0]
                    rows.append(
                        [dataset, subset, modality, model, first_row["score"], first_row["se"]]
                    )
df = pd.DataFrame(rows, columns=result_columns)
df

Unnamed: 0,dataset,subset,modality,model,score,se
0,mendelian_matched_9,all,Zero-shot,CADD,0.696187,0.031779
1,mendelian_matched_9,all,Zero-shot,GPN-MSA,0.728062,0.025896
2,mendelian_matched_9,all,Zero-shot,NucleotideTransformer,0.110495,0.011018
3,mendelian_matched_9,all,Zero-shot,HyenaDNA,0.095783,0.001784
4,mendelian_matched_9,all,Zero-shot,Caduceus,0.091912,0.00199
5,mendelian_matched_9,all,Zero-shot,GPN,0.194784,0.029165
6,mendelian_matched_9,all,Linear probing,Ensemble,0.922642,0.013339
7,mendelian_matched_9,all,Linear probing,CADD,0.909995,0.015521
8,mendelian_matched_9,all,Linear probing,GPN-MSA,0.735767,0.031769
9,mendelian_matched_9,all,Linear probing,NucleotideTransformer,0.149904,0.008942


In [22]:
def format_score(x):
    """Format a numeric Series as integer percent strings, zero-padded to two digits.

    Each value is multiplied by 100, rounded, cast to int and rendered with
    the ``02d`` format (e.g. 0.0919 -> "09", 0.696 -> "70").
    """
    def _two_digits(value):
        return "{:02d}".format(value)

    percent = (x * 100).round().astype(int)
    return percent.apply(_two_digits)

def format_se(x):
    """Format a numeric Series of standard errors as single-digit percent strings.

    Each value is multiplied by 100 and rounded to the nearest integer.

    Raises
    ------
    AssertionError
        If any rounded value needs more than one digit. The check is done
        after rounding, because e.g. 0.097 is below 10 before rounding but
        would be printed as "10".
    """
    rounded = (x * 100).round()
    assert rounded.max() < 10, "standard error does not fit in one digit after rounding"
    return rounded.astype(int).apply(lambda y: f"{y:01d}")

# Table cell text: "<score>$\pm$<se>", both as integer percents.
# The separator is a raw string: "\p" is not a valid escape sequence, so the
# non-raw form triggers an invalid-escape warning on recent Python versions.
df["value"] = format_score(df.score) + r"$\pm$" + format_se(df.se)
#df["value"] = df.score.apply(lambda x: f"{x:.2f}") + r"$\pm$" + df.se.apply(lambda x: f"{x:.2f}")

In [23]:
#df.loc[(df.model.isin(["Enformer", "Borzoi", "GPN"])) & (df.subset!="non_missense"), "value"] = "-"

In [24]:
# (dataset, subset, modality, model) combinations whose table entry is set in bold.
# Entries that match no row of `df` (e.g. subsets or models that are currently
# disabled) simply have no effect.
bold_values = [
    ("mendelian_matched_9", "all", "Zero-shot", "CADD"),
    ("mendelian_matched_9", "all", "Zero-shot", "GPN-MSA"),
    ("mendelian_matched_9", "missense_variant", "Zero-shot", "CADD"),
    ("mendelian_matched_9", "missense_variant", "Zero-shot", "GPN-MSA"),
    ("mendelian_matched_9", "non_missense", "Zero-shot", "CADD"),
    ("mendelian_matched_9", "non_missense", "Zero-shot", "GPN-MSA"),

    ("mendelian_matched_9", "all", "Linear probing", "CADD"),
    ("mendelian_matched_9", "all", "Linear probing", "Ensemble"),
    ("mendelian_matched_9", "missense_variant", "Linear probing", "CADD"),
    ("mendelian_matched_9", "missense_variant", "Linear probing", "Ensemble"),
    ("mendelian_matched_9", "non_missense", "Linear probing", "CADD"),
    ("mendelian_matched_9", "non_missense", "Linear probing", "Ensemble"),

    ("gwas_matched_9", "all", "Zero-shot", "CADD"),
    ("gwas_matched_9", "all", "Zero-shot", "GPN-MSA"),
    ("gwas_matched_9", "missense_variant", "Zero-shot", "CADD"),
    #("gwas_matched_9", "missense_variant", "Zero-shot", "GPN-MSA"),
    ("gwas_matched_9", "non_missense", "Zero-shot", "Enformer"),
    ("gwas_matched_9", "non_missense", "Zero-shot", "Borzoi"),

    ("gwas_matched_9", "all", "Linear probing", "Ensemble"),
    ("gwas_matched_9", "missense_variant", "Linear probing", "Ensemble"),
    ("gwas_matched_9", "non_missense", "Linear probing", "Ensemble"),
]

# Wrap the formatted value of every listed combination in \textbf{...}.
for dataset, subset, modality, model in bold_values:
    mask = (
        df["dataset"].eq(dataset)
        & df["subset"].eq(subset)
        & df["modality"].eq(modality)
        & df["model"].eq(model)
    )
    plain_values = df.loc[mask, "value"]
    df.loc[mask, "value"] = r"\textbf{" + plain_values + "}"

In [25]:
# Replace internal identifiers with the labels shown in the table.
# Series.map turns any value missing from a mapping into NaN.
dataset_labels = {
    "mendelian_matched_9": r"\textbf{Mendelian traits}",
    "gwas_matched_9": r"\textbf{Complex traits}",
}
subset_labels = {
    "all": "All",
    "missense_variant": "Coding",
    "non_missense": "Non-coding",
}
modality_labels = {
    "Zero-shot": r"\textbf{Zero-shot}",
    "Linear probing": r"\textbf{Linear probing}",
}

df["dataset"] = df["dataset"].map(dataset_labels)
df["subset"] = df["subset"].map(subset_labels)
df["modality"] = df["modality"].map(modality_labels)

In [26]:
# Reshape to one row per (modality, model) and one column per dataset.
# The cells are preformatted strings, so "first" is used to pick the entry
# rather than to aggregate numerically; sort=False preserves the existing order.
df = pd.pivot_table(
    df,
    values="value",
    index=["modality", "model"],
    columns=[
        "dataset",
        #"subset",
    ],
    aggfunc="first",
    sort=False,
)
df

Unnamed: 0_level_0,dataset,\textbf{Mendelian traits},\textbf{Complex traits}
modality,model,Unnamed: 2_level_1,Unnamed: 3_level_1
\textbf{Zero-shot},CADD,\textbf{70$\pm$3},\textbf{16$\pm$0}
\textbf{Zero-shot},GPN-MSA,\textbf{73$\pm$3},\textbf{16$\pm$1}
\textbf{Zero-shot},NucleotideTransformer,11$\pm$1,09$\pm$0
\textbf{Zero-shot},HyenaDNA,10$\pm$0,10$\pm$0
\textbf{Zero-shot},Caduceus,09$\pm$0,09$\pm$0
\textbf{Zero-shot},GPN,19$\pm$3,10$\pm$0
\textbf{Linear probing},CADD,\textbf{91$\pm$2},22$\pm$1
\textbf{Linear probing},GPN-MSA,74$\pm$3,23$\pm$1
\textbf{Linear probing},NucleotideTransformer,15$\pm$1,15$\pm$1
\textbf{Linear probing},HyenaDNA,13$\pm$1,11$\pm$0


In [27]:
# Clear the names of both row-index levels and of the column axis
# (presumably so they do not show up in the exported table).
df.index = df.index.set_names([None, None])
#df.columns.names = [None, None]
df.columns = df.columns.rename(None)

In [28]:
# escape=False leaves the LaTeX markup already embedded in the cells
# (\textbf{...}, $\pm$) untouched.
latex_table = df.to_latex(escape=False, multicolumn_format="c")
print(latex_table)

\begin{tabular}{llll}
\toprule
 &  & \textbf{Mendelian traits} & \textbf{Complex traits} \\
\midrule
\multirow[t]{6}{*}{\textbf{Zero-shot}} & CADD & \textbf{70$\pm$3} & \textbf{16$\pm$0} \\
 & GPN-MSA & \textbf{73$\pm$3} & \textbf{16$\pm$1} \\
 & NucleotideTransformer & 11$\pm$1 & 09$\pm$0 \\
 & HyenaDNA & 10$\pm$0 & 10$\pm$0 \\
 & Caduceus & 09$\pm$0 & 09$\pm$0 \\
 & GPN & 19$\pm$3 & 10$\pm$0 \\
\cline{1-4}
\multirow[t]{7}{*}{\textbf{Linear probing}} & CADD & \textbf{91$\pm$2} & 22$\pm$1 \\
 & GPN-MSA & 74$\pm$3 & 23$\pm$1 \\
 & NucleotideTransformer & 15$\pm$1 & 15$\pm$1 \\
 & HyenaDNA & 13$\pm$1 & 11$\pm$0 \\
 & Caduceus & 11$\pm$1 & 13$\pm$1 \\
 & GPN & 26$\pm$3 & 16$\pm$1 \\
 & Ensemble & \textbf{92$\pm$1} & \textbf{33$\pm$1} \\
\cline{1-4}
\bottomrule
\end{tabular}

