In [72]:
import pandas as pd
from pathlib import Path
import numpy as np

# Tab-separated results file, resolved relative to the notebook's working directory.
PERFORMANCE_TSV = "quantitative_analysis.tsv"

# The first TSV column is read as the index and then immediately moved back
# into an ordinary column, leaving a fresh 0..n-1 RangeIndex on the frame.
df = pd.read_csv(PERFORMANCE_TSV, sep="\t", index_col=0)
df = df.reset_index()
df

Unnamed: 0,src_enc,tgt_enc,topk_jaccard,mrr,linear_cka,rbf_kernel_cka,mse,cosine_sim,enc1_type,enc2_type,anchor_choice,seed
0,local_fasttext,local_fasttext,1.000000,1.000000,1.000000,0.0,0.000000,1.000000,absolute,absolute,uniform,0
1,local_fasttext,local_fasttext,1.000000,1.000000,1.000000,0.0,0.000000,1.000000,relative,relative,uniform,0
2,local_fasttext,word2vec-google-news-300,0.000135,0.000177,0.758747,0.0,9938.420898,0.006158,absolute,absolute,uniform,0
3,local_fasttext,word2vec-google-news-300,0.376309,0.935341,0.782489,0.0,1416.778198,0.858322,relative,relative,uniform,0
4,word2vec-google-news-300,local_fasttext,0.000101,0.000085,0.758746,0.0,9938.420898,0.006158,absolute,absolute,uniform,0
...,...,...,...,...,...,...,...,...,...,...,...,...
475,local_fasttext,word2vec-google-news-300,0.389122,0.936201,0.800535,0.0,1255.892456,0.874411,relative,relative,kmeans,9
476,word2vec-google-news-300,local_fasttext,0.000101,0.000085,0.758746,0.0,9938.420898,0.006158,absolute,absolute,kmeans,9
477,word2vec-google-news-300,local_fasttext,0.432258,0.970375,0.801329,0.0,1253.104126,0.874690,relative,relative,kmeans,9
478,word2vec-google-news-300,word2vec-google-news-300,1.000000,1.000000,1.000000,0.0,0.000000,1.000000,absolute,absolute,kmeans,9


In [93]:
def latex_float(f, fmt="{0:.2f}"):
    r"""Format a number as a LaTeX-ready string.

    The value is rendered with ``fmt``. If the rendered text contains an
    exponent marker (``e``), it is rewritten as ``<base> \times 10^{<exp>}``;
    otherwise the rendered text is returned unchanged.

    Args:
        f: The number to format.
        fmt: ``str.format`` template applied to ``f``. The default
            (``"{0:.2f}"``) is fixed-point with two decimals and therefore
            never produces an exponent; pass e.g. ``"{0:.2e}"`` or
            ``"{0:.2g}"`` to get scientific notation for very small or very
            large values.

    Returns:
        The formatted string, without surrounding ``$`` math delimiters.
    """
    float_str = fmt.format(f)
    if "e" in float_str:
        base, exponent = float_str.split("e")
        # int() strips the sign padding and leading zeros ("-07" -> -7).
        return r"{0} \times 10^{{{1}}}".format(base, int(exponent))
    return float_str

def extract_mean_std(
    df: pd.DataFrame, anchors_choice, emb_type, src_enc, tgt_enc
) -> "Union[Tuple[float, float], Exception]":
    """Look up the ``mean`` and ``std`` entries for one row of ``df``.

    The row is selected with
    ``df.loc[anchors_choice, emb_type, src_enc, tgt_enc]`` and its ``"mean"``
    and ``"std"`` entries are read.

    Args:
        df: Frame to look the row up in.
            NOTE(review): presumably one metric's slice of the aggregated
            table, indexed by (anchor_choice, enc1_type, src_enc, tgt_enc)
            with "mean"/"std" columns -- confirm against the caller.
        anchors_choice: First component of the row key.
        emb_type: Second component of the row key.
        src_enc: Third component of the row key.
        tgt_enc: Fourth component of the row key.

    Returns:
        ``(mean, std)`` when the lookup succeeds. If the lookup raises
        ``KeyError`` or ``AttributeError``, the exception instance itself is
        *returned* (not raised), so callers must check the result before
        unpacking it.
    """
    try:
        mean_std = df.loc[anchors_choice, emb_type, src_enc, tgt_enc]
        mean = mean_std["mean"]
        std = mean_std["std"]
        return mean, std
    except (AttributeError, KeyError) as exc:
        # Best-effort lookup: hand the error back to the caller instead of raising.
        return exc

# Display order for the anchor-selection strategies in the final table.
ANCHORS_CHOICES = ["uniform", "fps", "kmeans", "top_1000", "top_5000", "top_10000"]
EMBEDDING_TYPES = ['absolute', 'relative']
EMBEDDINGS_SPACES = ["local_fasttext", "word2vec-google-news-300"]

# Metrics reported in the final table.
METRICS = ['topk_jaccard', 'mrr', 'cosine_sim']
# Columns left out of the report. Not needed below any more (only METRICS are
# aggregated, so there is nothing to drop afterwards); the name is kept so that
# any other cell referring to it keeps working.
TO_DROP = ['linear_cka', 'rbf_kernel_cka', 'mse', 'seed']

# Aggregate each metric over the rows of every
# (anchor_choice, enc1_type, src_enc, tgt_enc) group.
# Selecting METRICS *before* aggregating keeps non-numeric columns such as
# `enc2_type` out of the mean/std computation (older pandas silently drops
# them with a FutureWarning, newer pandas raises). String aggregator names
# replace the deprecated np.mean / np.std callables and yield the same
# "mean" / "std" / "count" sub-columns.
outdf = (
    df.groupby(["anchor_choice", "enc1_type", "src_enc", "tgt_enc"])[METRICS]
    .agg(["mean", "std", "count"])
    .round(6)
    .reindex(ANCHORS_CHOICES, level="anchor_choice")
)

# Collapse each metric's (mean, std, count) sub-columns into a single
# "$mean \pm std$" LaTeX string column named "<metric>_new".
for metric in METRICS:
    mean, std = outdf[metric]['mean'], outdf[metric]['std']
    new_col = [rf"${latex_float(m)} \pm {latex_float(s)}$" for m, s in zip(mean, std)]
    outdf[f'{metric}_new'] = new_col
    del outdf[metric]

outdf

  df.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,topk_jaccard_new,mrr_new,cosine_sim_new
anchor_choice,enc1_type,src_enc,tgt_enc,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
uniform,absolute,local_fasttext,local_fasttext,$1.00 \pm 0.00$,$1.00 \pm 0.00$,$1.00 \pm 0.00$
uniform,absolute,local_fasttext,word2vec-google-news-300,$0.00 \pm 0.00$,$0.00 \pm 0.00$,$0.01 \pm 0.00$
uniform,absolute,word2vec-google-news-300,local_fasttext,$0.00 \pm 0.00$,$0.00 \pm 0.00$,$0.01 \pm 0.00$
uniform,absolute,word2vec-google-news-300,word2vec-google-news-300,$1.00 \pm 0.00$,$1.00 \pm 0.00$,$1.00 \pm 0.00$
uniform,relative,local_fasttext,local_fasttext,$1.00 \pm 0.00$,$1.00 \pm 0.00$,$1.00 \pm 0.00$
uniform,relative,local_fasttext,word2vec-google-news-300,$0.38 \pm 0.01$,$0.93 \pm 0.00$,$0.86 \pm 0.00$
uniform,relative,word2vec-google-news-300,local_fasttext,$0.44 \pm 0.00$,$0.97 \pm 0.00$,$0.86 \pm 0.00$
uniform,relative,word2vec-google-news-300,word2vec-google-news-300,$1.00 \pm 0.00$,$1.00 \pm 0.00$,$1.00 \pm 0.00$
fps,absolute,local_fasttext,local_fasttext,$1.00 \pm 0.00$,$1.00 \pm 0.00$,$1.00 \pm 0.00$
fps,absolute,local_fasttext,word2vec-google-news-300,$0.00 \pm 0.00$,$0.00 \pm 0.00$,$0.01 \pm 0.00$


In [96]:
def to_latex(df, label):
    return df.to_latex(
        escape=False,
        caption=f"quantitative-analysis-word-embeddings {label}",
        label=f"tab:quantitative-analysis-word-embeddings-{label}",
        multirow=True,
        sparsify=True,
        multicolumn_format="c",
    )

# print() rather than rich display: the raw LaTeX source is what gets copied out.
print(to_latex(df=outdf, label="all"))

\begin{table}
\centering
\caption{quantitative-analysis-word-embeddings all}
\label{tab:quantitative-analysis-word-embeddings-all}
\begin{tabular}{lllllll}
\toprule
          &          &                          &                          & topk_jaccard_new &          mrr_new &   cosine_sim_new \\
          &          &                          &                          \\
anchor_choice & enc1_type & src_enc & tgt_enc &                  &                  &                  \\
\midrule
\multirow{8}{*}{uniform} & \multirow{4}{*}{absolute} & \multirow{2}{*}{local_fasttext} & local_fasttext &  $1.00 \pm 0.00$ &  $1.00 \pm 0.00$ &  $1.00 \pm 0.00$ \\
          &          &                          & word2vec-google-news-300 &  $0.00 \pm 0.00$ &  $0.00 \pm 0.00$ &  $0.01 \pm 0.00$ \\
\cline{3-7}
          &          & \multirow{2}{*}{word2vec-google-news-300} & local_fasttext &  $0.00 \pm 0.00$ &  $0.00 \pm 0.00$ &  $0.01 \pm 0.00$ \\
          &          &                          & word

  return df.to_latex(
