In [1]:
import numpy as np
import pandas as pd

In [2]:
# Per-model display metadata, read from the shared config table.
model_metadata = pd.read_csv("../../config/models.csv")

# Lookups derived from the table: description -> color, and name -> description.
palette = dict(zip(model_metadata["description"], model_metadata["color"]))
model_renaming = dict(zip(model_metadata["name"], model_metadata["description"]))
model_metadata

Unnamed: 0,name,description,color
0,CADD,CADD,C0
1,GPN-MSA,GPN-MSA,C1
2,Borzoi,Borzoi,C2
3,Enformer,Enformer,C3
4,GPN_final,GPN-Promoter,C4
5,NucleotideTransformer,NT,C5
6,HyenaDNA,HyenaDNA,C6
7,Caduceus,Caduceus,C7
8,CADD+GPN-MSA+Borzoi,Ensemble,C8
9,Sei,Sei,C9


In [3]:
# Display labels for the results table. Keys that are missing from a mapping
# are shown under their raw identifier, because lookups use `.get(key, key)`.
dataset_renaming = dict(
    mendelian_traits_matched_9="Mendelian traits",
    complex_traits_matched_9="Complex traits",
)

# No subset currently has a dedicated display label.
subset_renaming = dict()

# Score identifier -> short column header.
score_renaming = dict([
    ("LLR.minus.score", "LLR"),
    ("absLLR.plus.score", "abs(LLR)"),
    ("Embeddings.plus.euclidean_distance", "L2 dist."),
    ("Embeddings.plus.cosine_distance", "Cosine dist."),
    ("Embeddings.minus.inner_product", "Inner prod."),
])

In [4]:
# Axes of the results grid: every dataset x subset x model x score
# combination is loaded from its own CSV file.
datasets = ["mendelian_traits_matched_9", "complex_traits_matched_9"]

subsets = ["all"]

# Model identifiers as they appear in the result file names.
models = [
    "GPN-MSA", "GPN_final", "NucleotideTransformer",
    "HyenaDNA", "Caduceus", "SpeciesLM",
]

# Score identifiers as they appear in the result file names.
scores = [
    "LLR.minus.score", "absLLR.plus.score",
    "Embeddings.plus.euclidean_distance",
    "Embeddings.plus.cosine_distance",
    "Embeddings.minus.inner_product",
]

def get_model_path(model, score, dataset, subset):
    """Return the relative path of the CSV holding one model/score result.

    The file sits under the dataset's ``AUPRC_by_chrom_weighted_average``
    directory, inside the ``subset`` folder, and is named
    ``{model}_{score}.csv``.
    """
    components = [
        "../../results/dataset",
        f"{dataset}",
        "AUPRC_by_chrom_weighted_average",
        f"{subset}",
        f"{model}_{score}.csv",
    ]
    return "/".join(components)

In [5]:
# Enumerate every dataset/subset/model/score combination in the same order as
# a four-level nested loop (dataset outermost, score innermost).
combinations = [
    (dataset, subset, model, score)
    for dataset in datasets
    for subset in subsets
    for model in models
    for score in scores
]

rows = []
for dataset, subset, model, score in combinations:
    path = get_model_path(model, score, dataset, subset)
    # Only the first row of each result file is used.
    first_row = pd.read_csv(path).iloc[0]
    rows.append([
        dataset_renaming.get(dataset, dataset),
        subset_renaming.get(subset, subset),
        model_renaming.get(model, model),
        score_renaming.get(score, score),
        # The file's "score" column becomes the "AUPRC" column of the table.
        first_row["score"],
        first_row["se"],
    ])

result_columns = ["dataset", "subset", "model", "score", "AUPRC", "se"]
df = pd.DataFrame(rows, columns=result_columns)
df

Unnamed: 0,dataset,subset,model,score,AUPRC,se
0,Mendelian traits,all,GPN-MSA,LLR,0.694475,0.041907
1,Mendelian traits,all,GPN-MSA,abs(LLR),0.654223,0.044973
2,Mendelian traits,all,GPN-MSA,L2 dist.,0.206924,0.020884
3,Mendelian traits,all,GPN-MSA,Cosine dist.,0.208002,0.020988
4,Mendelian traits,all,GPN-MSA,Inner prod.,0.301125,0.03169
5,Mendelian traits,all,GPN-Promoter,LLR,0.421676,0.069664
6,Mendelian traits,all,GPN-Promoter,abs(LLR),0.378816,0.068633
7,Mendelian traits,all,GPN-Promoter,L2 dist.,0.344996,0.0563
8,Mendelian traits,all,GPN-Promoter,Cosine dist.,0.263391,0.038219
9,Mendelian traits,all,GPN-Promoter,Inner prod.,0.169318,0.047518


In [None]:
def format_score(x):
    """Format a Series of fractional values as zero-padded integer percentages.

    Each value is multiplied by 100, rounded, cast to ``int`` and rendered
    with at least two digits (e.g. ``0.05`` becomes ``"05"``).
    """
    percent = (x * 100).round().astype(int)
    return percent.apply(lambda pct: f"{pct:02d}")

def format_se(x):
    """Format a Series of fractional values as two-digit integer percentages.

    Asserts that every value, once scaled by 100, stays below 100, then
    multiplies by 100, rounds, casts to ``int`` and zero-pads to two digits.
    """
    scaled = x * 100
    assert scaled.max() < 100
    percent = scaled.round().astype(int)
    return percent.apply(lambda pct: f"{pct:02d}")

# Render each AUPRC as a fixed three-decimal string for the table.
# Earlier variants (integer percentages via format_score/format_se, or values
# with a "$\pm$" se suffix) were dropped in favour of this plain format.
df["value"] = df["AUPRC"].apply(lambda auprc: format(auprc, ".3f"))
df

Unnamed: 0,dataset,subset,model,score,AUPRC,se,value
0,Mendelian traits,all,GPN-MSA,LLR,0.694475,0.041907,0.694
1,Mendelian traits,all,GPN-MSA,abs(LLR),0.654223,0.044973,0.654
2,Mendelian traits,all,GPN-MSA,L2 dist.,0.206924,0.020884,0.207
3,Mendelian traits,all,GPN-MSA,Cosine dist.,0.208002,0.020988,0.208
4,Mendelian traits,all,GPN-MSA,Inner prod.,0.301125,0.03169,0.301
5,Mendelian traits,all,GPN-Promoter,LLR,0.421676,0.069664,0.422
6,Mendelian traits,all,GPN-Promoter,abs(LLR),0.378816,0.068633,0.379
7,Mendelian traits,all,GPN-Promoter,L2 dist.,0.344996,0.0563,0.345
8,Mendelian traits,all,GPN-Promoter,Cosine dist.,0.263391,0.038219,0.263
9,Mendelian traits,all,GPN-Promoter,Inner prod.,0.169318,0.047518,0.169


In [7]:
# Reshape to one row per (dataset, model) and one column per score type.
# "subset" is currently left out of the row index. sort=False keeps rows and
# columns in their order of appearance, and aggfunc="first" keeps the first
# value when several entries fall into the same cell.
row_keys = ["dataset", "model"]
column_keys = ["score"]
df = df.pivot_table(
    values="value",
    index=row_keys,
    columns=column_keys,
    aggfunc="first",
    sort=False,
)
df

Unnamed: 0_level_0,score,LLR,abs(LLR),L2 dist.,Cosine dist.,Inner prod.
dataset,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Mendelian traits,GPN-MSA,0.694,0.654,0.207,0.208,0.301
Mendelian traits,GPN-Promoter,0.422,0.379,0.345,0.263,0.169
Mendelian traits,NT,0.12,0.098,0.188,0.186,0.185
Mendelian traits,HyenaDNA,0.115,0.106,0.117,0.116,0.165
Mendelian traits,Caduceus,0.108,0.088,0.135,0.135,0.131
Mendelian traits,SpeciesLM,0.201,0.161,0.327,0.325,0.095
Complex traits,GPN-MSA,0.212,0.224,0.15,0.15,0.177
Complex traits,GPN-Promoter,0.112,0.11,0.126,0.126,0.125
Complex traits,NT,0.101,0.1,0.118,0.119,0.136
Complex traits,HyenaDNA,0.11,0.111,0.102,0.102,0.118


In [8]:
def boldface_best_model(x, threshold=0.01):
    r"""Wrap the top-scoring entries of ``x`` in LaTeX ``\textbf{...}``.

    Parameters
    ----------
    x : pandas.Series of str
        Formatted numeric strings (e.g. ``"0.694"``). The function works on
        whatever Series it is given; with ``DataFrame.apply(..., axis=1)``
        that is one table row at a time.
    threshold : float, default 0.01
        An entry is bolded when its gap to the maximum is strictly smaller
        than this value, so near-ties with the best entry are bolded too.

    Returns
    -------
    pandas.Series
        A copy of ``x`` with the selected entries wrapped in ``\textbf{}``;
        ``x`` itself is not modified.

    Raises
    ------
    ValueError
        If an entry cannot be converted to ``float`` (for example when the
        function is applied a second time to already-bolded strings).
    """
    y = x.astype(float)
    best_score = y.max()
    # Round the gap before comparing: raw float subtraction makes a gap of
    # exactly `threshold` land on either side of the strict "<"
    # (0.12 - 0.11 -> 0.00999..., 0.135 - 0.125 -> 0.01000...09).
    gap = (best_score - y).round(9)
    # Missing values compare as False and are therefore never bolded.
    is_best = gap < threshold
    res = x.copy()
    res.loc[is_best] = r"\textbf{" + res.loc[is_best] + r"}"
    return res

# Applied row by row: within each (dataset, model) row, the score columns
# that are close to that row's best value get bolded.
df = df.apply(boldface_best_model, axis="columns")
df

Unnamed: 0_level_0,score,LLR,abs(LLR),L2 dist.,Cosine dist.,Inner prod.
dataset,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Mendelian traits,GPN-MSA,\textbf{0.694},0.654,0.207,0.208,0.301
Mendelian traits,GPN-Promoter,\textbf{0.422},0.379,0.345,0.263,0.169
Mendelian traits,NT,0.120,0.098,\textbf{0.188},\textbf{0.186},\textbf{0.185}
Mendelian traits,HyenaDNA,0.115,0.106,0.117,0.116,\textbf{0.165}
Mendelian traits,Caduceus,0.108,0.088,\textbf{0.135},\textbf{0.135},\textbf{0.131}
Mendelian traits,SpeciesLM,0.201,0.161,\textbf{0.327},\textbf{0.325},0.095
Complex traits,GPN-MSA,0.212,\textbf{0.224},0.150,0.150,0.177
Complex traits,GPN-Promoter,0.112,0.110,\textbf{0.126},\textbf{0.126},\textbf{0.125}
Complex traits,NT,0.101,0.100,0.118,0.119,\textbf{0.136}
Complex traits,HyenaDNA,\textbf{0.110},\textbf{0.111},0.102,0.102,\textbf{0.118}


In [9]:
# Drop axis labels before exporting the table.
# NOTE(review): the row index has two levels (dataset, model). The exported
# LaTeX header still shows "dataset & model", so `.index.name = None` does not
# appear to clear the level names. If they should be removed, the commented
# `.names = [None, None]` form is probably the one needed -- confirm intent.
#df.index.names = [None, None]
df.index.name = None
#df.columns.names = [None, None]
# Clears the "score" label of the single-level column index.
df.columns.name = None

In [10]:
# Emit the table as LaTeX; escape=False keeps the \textbf{...} markup intact.
latex_table = df.to_latex(escape=False, multicolumn_format="c")
print(latex_table)

\begin{tabular}{lllllll}
\toprule
 &  & LLR & abs(LLR) & L2 dist. & Cosine dist. & Inner prod. \\
dataset & model &  &  &  &  &  \\
\midrule
\multirow[t]{6}{*}{Mendelian traits} & GPN-MSA & \textbf{0.694} & 0.654 & 0.207 & 0.208 & 0.301 \\
 & GPN-Promoter & \textbf{0.422} & 0.379 & 0.345 & 0.263 & 0.169 \\
 & NT & 0.120 & 0.098 & \textbf{0.188} & \textbf{0.186} & \textbf{0.185} \\
 & HyenaDNA & 0.115 & 0.106 & 0.117 & 0.116 & \textbf{0.165} \\
 & Caduceus & 0.108 & 0.088 & \textbf{0.135} & \textbf{0.135} & \textbf{0.131} \\
 & SpeciesLM & 0.201 & 0.161 & \textbf{0.327} & \textbf{0.325} & 0.095 \\
\cline{1-7}
\multirow[t]{6}{*}{Complex traits} & GPN-MSA & 0.212 & \textbf{0.224} & 0.150 & 0.150 & 0.177 \\
 & GPN-Promoter & 0.112 & 0.110 & \textbf{0.126} & \textbf{0.126} & \textbf{0.125} \\
 & NT & 0.101 & 0.100 & 0.118 & 0.119 & \textbf{0.136} \\
 & HyenaDNA & \textbf{0.110} & \textbf{0.111} & 0.102 & 0.102 & \textbf{0.118} \\
 & Caduceus & 0.098 & 0.097 & \textbf{0.115} & \textbf{0.115}