In [None]:
import pandas as pd
import numpy as np

# Selects which results files are read: read_df puts "fine_grained" in the
# file name when this is True and "coarse_grained" otherwise.
fine_grained: bool = True
# The anchor dataset name is not configured here; it is passed explicitly to
# read_df for each table ("amazon_translated" and "wikimatrix").
# Interpolated verbatim into the results file name by read_df.
train_perc: float = 0.25
# Metric columns removed from each loaded results table before reshaping.
COLUMNS_TO_DROP = ["precision", "recall"]


def read_df(fine_grained, anchor_dataset_name, train_perc):
    """Load one results table from a tab-separated file in the working directory.

    The file name is assembled from the arguments as
    ``nlp_multilingual-stitching-amazon-<granularity>-<anchor_dataset_name>-<train_perc>.tsv``,
    where ``<granularity>`` is ``fine_grained`` when ``fine_grained`` is truthy
    and ``coarse_grained`` otherwise.

    Returns:
        The file content as a DataFrame, with the file's first column as index.
    """
    granularity = "fine_grained" if fine_grained else "coarse_grained"
    file_name = f"nlp_multilingual-stitching-amazon-{granularity}-{anchor_dataset_name}-{train_perc}.tsv"
    return pd.read_csv(file_name, sep="\t", index_col=0)


def _rows_for_embed_type(mydf, embed_type, label, domain):
    """Return the rows of ``mydf`` for one embed type, relabelled with three column levels."""
    subset = mydf[mydf[("embed_type", "")] == embed_type].copy()
    # Labels are assigned by position, so the input must have exactly these
    # seven columns in this order.
    new_labels = [
        ("seed", "", ""),
        ("embed_type", "", ""),
        ("train_lang", "", ""),
        ("test_lang", "", ""),
        (label, domain, "fscore"),
        (label, domain, "mae"),
        ("stitched", "", ""),
    ]
    if subset.shape[1] != len(new_labels):
        raise ValueError(f"expected {len(new_labels)} columns, got {subset.shape[1]}")
    subset.columns = pd.MultiIndex.from_tuples(new_labels)
    # The embed type is now encoded in the top column level, so the column
    # itself is no longer needed.
    return subset.drop(columns=["embed_type"])


def rearrange_embedtype_as_column(mydf, domain):
    """Turn the ``embed_type`` row attribute into a top-level column group.

    ``mydf`` must have two-level columns, in this order: seed, embed_type,
    train_lang, test_lang, two metric columns (relabelled as fscore and mae),
    stitched. Rows whose ``("embed_type", "")`` value is ``"relative"`` and
    rows whose value is ``"absolute"`` are separated, their metric columns
    are relabelled ``("Relative", domain, <metric>)`` and
    ``("Absolute", domain, <metric>)``, and the two parts are joined side by
    side.

    Args:
        mydf: Results table with the seven two-level columns described above.
        domain: Label used as the middle column level of the metric columns.

    Returns:
        A DataFrame with three-level columns holding the key columns followed
        by the Relative and then the Absolute metrics. Rows are paired with an
        inner join on (train_lang, test_lang, seed, stitched).

    Raises:
        ValueError: If ``mydf`` does not have exactly seven columns.
    """
    relative_out = _rows_for_embed_type(mydf, "relative", "Relative", domain)
    absolute_out = _rows_for_embed_type(mydf, "absolute", "Absolute", domain)
    return pd.merge(
        relative_out,
        absolute_out,
        on=[
            ("train_lang", "", ""),
            ("test_lang", "", ""),
            ("seed", "", ""),
            ("stitched", "", ""),
        ],
    )


# Key columns shared by the in-domain and out-of-domain tables once
# rearrange_embedtype_as_column has given them three column levels.
_MERGE_KEYS = [
    ("seed", "", ""),
    ("train_lang", "", ""),
    ("test_lang", "", ""),
    ("stitched", "", ""),
]


def _load_domain_results(domain, anchor_dataset_name):
    """Load one results file and reshape it for the given domain label.

    Reads the file selected by the module-level ``fine_grained`` and
    ``train_perc`` settings plus ``anchor_dataset_name``, drops
    ``COLUMNS_TO_DROP``, multiplies ``fscore`` by 100, labels the metric
    columns with ``domain`` and moves the embed type into the column index.
    """
    results = read_df(fine_grained=fine_grained, anchor_dataset_name=anchor_dataset_name, train_perc=train_perc)
    results = results.drop(columns=COLUMNS_TO_DROP)
    # Presumably converts a 0-1 fraction to a percentage; verify against the data.
    results["fscore"] = results["fscore"] * 100
    # Labels are assigned by position: the remaining columns must come in this order.
    results.columns = pd.MultiIndex.from_tuples(
        [
            ("seed", ""),
            ("embed_type", ""),
            ("train_lang", ""),
            ("test_lang", ""),
            (domain, "fscore"),
            (domain, "mae"),
            ("stitched", ""),
        ],
    )
    return rearrange_embedtype_as_column(results, domain=domain)


domain = "In Domain"
full_in_domain = _load_domain_results(domain, anchor_dataset_name="amazon_translated")

domain = "Out Domain"
full_out_domain = _load_domain_results(domain, anchor_dataset_name="wikimatrix")

# Pair in-domain and out-of-domain results of the same run (inner join on the key columns).
df = pd.merge(full_in_domain, full_out_domain, on=_MERGE_KEYS)
# seed and stitched are only needed for the join.
full_df = df.drop(columns=[("seed", "", ""), ("stitched", "", "")])


train_lang = "Train Lang"
test_lang = "Test Lang"
full_df = full_df.rename(columns={"train_lang": train_lang, "test_lang": test_lang})
# Fix the column order. The ("Absolute", "Out Domain") metrics are
# deliberately left out of the final table.
full_df = full_df[
    [
        (train_lang, "", ""),
        (test_lang, "", ""),
        ("Absolute", "In Domain", "fscore"),
        ("Absolute", "In Domain", "mae"),
        ("Relative", "In Domain", "fscore"),
        ("Relative", "In Domain", "mae"),
        ("Relative", "Out Domain", "fscore"),
        ("Relative", "Out Domain", "mae"),
    ]
]
full_df

In [None]:
def to_latex(df, label):
    """Render ``df`` as a LaTeX table and return the source as a string.

    The caption reports the module-level ``fine_grained`` and ``train_perc``
    settings. The table label is ``tab:multilingual-<label>-fine-grained`` or
    ``tab:multilingual-<label>-coarse-grained`` depending on ``fine_grained``.
    Escaping is disabled, so LaTeX markup inside cells is emitted verbatim.
    """
    granularity = "fine" if fine_grained else "coarse"
    latex_options = dict(
        escape=False,
        caption=f"Fine-grained: {fine_grained}, Train perc: {train_perc}",
        label=f"tab:multilingual-{label}-{granularity}-grained",
        multirow=True,
        sparsify=True,
        multicolumn_format="c",
    )
    return df.to_latex(**latex_options)

In [None]:
# Show every row when a DataFrame is displayed instead of truncating long tables.
pd.set_option("display.max_rows", None)
# LaTeX math template with two numeric slots, each printed with two decimals;
# the table cells below fill it with a mean and a standard deviation.
MEAN_STD_FORMAT = r"${:.2f} \pm {:.2f}$"

# SupMat (supplementary material): full table over all train/test language pairs

In [None]:
from IPython.display import Latex
from IPython.display import display

# Mean and standard deviation of every metric for each
# (train language, test language) pair, rounded to two decimals.
df = (
    full_df.groupby(
        [(train_lang, "", ""), (test_lang, "", "")],
    )
    # String aliases rather than np.mean / np.std: passing NumPy callables to
    # agg is deprecated in pandas, and calling np.std directly would switch
    # from the sample std (ddof=1) to the population std (ddof=0).
    .agg(["mean", "std"])
    .round(2)
)
# Numeric version of the table, displayed at the end of the cell.
o = df.copy()

# The (Absolute, Out Domain) combination is not part of full_df, so it is not listed.
for embed, domain in (
    ("Absolute", "In Domain"),
    ("Relative", "In Domain"),
    ("Relative", "Out Domain"),
):
    for metric, new_name in (("fscore", "FScore"), ("mae", "MAE")):
        mean_col = (embed, domain, metric, "mean")
        std_col = (embed, domain, metric, "std")
        # Replace the numeric mean/std pair with one "$mean \pm std$" text column.
        df[(embed, domain, new_name, "")] = [
            MEAN_STD_FORMAT.format(mean, std) for mean, std in zip(df[mean_col], df[std_col])
        ]
        df = df.drop(columns=[mean_col, std_col])

print(to_latex(df, "full"))
o

# Main: table restricted to models trained on English (`en`)

In [None]:
from IPython.display import Latex
from IPython.display import display

# Keep only the rows whose train language is "en".
df = full_df[full_df[(train_lang, "", "")] == "en"]
# Mean and standard deviation of every metric for each
# (train language, test language) pair, rounded to two decimals.
df = (
    df.groupby(
        [(train_lang, "", ""), (test_lang, "", "")],
    )
    # String aliases rather than np.mean / np.std: passing NumPy callables to
    # agg is deprecated in pandas, and calling np.std directly would switch
    # from the sample std (ddof=1) to the population std (ddof=0).
    .agg(["mean", "std"])
    .round(2)
)

# Numeric version of the table, displayed at the end of the cell.
o = df.copy()

# The (Absolute, Out Domain) combination is not part of full_df, so it is not listed.
for embed, domain in (
    ("Absolute", "In Domain"),
    ("Relative", "In Domain"),
    ("Relative", "Out Domain"),
):
    for metric, new_name in (("fscore", "FScore"), ("mae", "MAE")):
        mean_col = (embed, domain, metric, "mean")
        std_col = (embed, domain, metric, "std")
        # Replace the numeric mean/std pair with one "$mean \pm std$" text column.
        df[(embed, domain, new_name, "")] = [
            MEAN_STD_FORMAT.format(mean, std) for mean, std in zip(df[mean_col], df[std_col])
        ]
        df = df.drop(columns=[mean_col, std_col])

print(to_latex(df, "en"))
o