!mamba install -c conda-forge "vegafusion-python-embed>=1.4.0" "vegafusion>=1.4.0" -y

In [2]:
import glob
import os

import altair as alt
import pandas as pd

from pathlib import Path
from theme import theme

In [3]:
# Switch Altair's data transformer to the "vegafusion" backend (installed in the first cell).
alt.data_transformers.enable("vegafusion")
# Register the theme imported from the local `theme` module under the name "latex",
# then make it the active theme. Registration has to happen before enabling.
alt.themes.register("latex", theme)
alt.themes.enable("latex")

ThemeRegistry.enable('latex')

## Load results

In [4]:
# Root folder of all results; load_data() reads runs from `directory / <dataset>`.
directory = Path("outputs")
# Dataset keys to load; each one is used as a sub-folder name under `directory`.
data = ["baidu", "ltr", "uva"]

In [5]:
def run_complete(file: Path):
    """Return True when `file` is a directory holding all three result files.

    A run counts as complete only if it is a directory that contains
    `val.parquet`, `test_click.parquet` and `test_rel.parquet`.
    """
    required_files = ("val.parquet", "test_click.parquet", "test_rel.parquet")

    if not file.is_dir():
        return False

    return all((file / name).exists() for name in required_files)

def parse_model_name(path: Path):
    """Parse a run directory name of the form "k1=v1,k2=v2,..." into a dict.

    Only the last path component (`path.name`) is parsed. Keys and values
    are returned as strings; no type conversion is applied.

    Raises:
        ValueError: if a comma-separated token contains no "=".
    """
    options = {}

    for option in path.name.split(","):
        # Split on the first "=" only, so a value that itself contains "="
        # (e.g. "note=lr=0.1") is kept intact instead of raising.
        key, value = option.split("=", 1)
        options[key] = value

    return options

def parse_result_file(run: Path, file: str):
    """Read one parquet result file of a run and tag its rows with run metadata.

    Args:
        run: Run directory; its name is parsed with parse_model_name().
        file: Name of the parquet file inside `run`.

    Returns:
        The file's DataFrame with four extra columns: "run" (the directory
        name) plus "model", "data" and "random_state" taken from the parsed
        directory name.

    Raises:
        KeyError: if the directory name lacks "model", "data" or "random_state".
    """
    run_options = parse_model_name(run)
    results = pd.read_parquet(run / file)

    # The option values come straight from the directory name, so they are
    # strings (random_state included).
    return results.assign(
        run=run.name,
        model=run_options["model"],
        data=run_options["data"],
        random_state=run_options["random_state"],
    )

def load_data(data, file: str):
    """Load `file` from every complete run of one dataset and stack the rows.

    Args:
        data: Dataset key; runs are read from `directory / data`.
        file: Name of the parquet file to read from each run directory.

    Returns:
        One DataFrame with the rows of all complete runs, in run-name order.

    Raises:
        ValueError: if the dataset folder contains no complete run.
    """
    data_path = directory / data
    # iterdir() yields entries in arbitrary order. Sort the runs so the row
    # order of the result is reproducible; t_test() compares rows by position.
    runs = sorted(f for f in data_path.iterdir() if run_complete(f))
    print(f"Loaded {len(runs)} run(s) for {data}")

    if not runs:
        # pd.concat([]) would raise a ValueError that does not name the folder.
        raise ValueError(f"No complete runs found in {data_path}")

    return pd.concat([parse_result_file(run, file) for run in runs])

In [6]:
# Stack the ranking metrics (test_rel.parquet) of every complete run across all datasets.
rel_df = pd.concat([load_data(d, "test_rel.parquet") for d in data])
rel_df.head()

Loaded 55 run(s) for baidu
Loaded 55 run(s) for ltr
Loaded 55 run(s) for uva


Unnamed: 0,dcg@01,dcg@03,dcg@05,dcg@10,frequency_bucket,mrr@10,ndcg@10,query_id,run,model,data,random_state
0,3.0,4.5,8.675295,13.93903,8,1.0,0.4734,1,"data=baidu,es_patience=5,logging=True,max_epoc...",ips-listwise,baidu,1906
1,0.0,0.0,0.0,0.0,9,0.0,0.0,2,"data=baidu,es_patience=5,logging=True,max_epoc...",ips-listwise,baidu,1906
2,1.0,6.916508,6.916508,9.765993,3,1.0,0.264485,3,"data=baidu,es_patience=5,logging=True,max_epoc...",ips-listwise,baidu,1906
3,0.0,0.5,0.5,1.718555,8,0.333333,0.145399,4,"data=baidu,es_patience=5,logging=True,max_epoc...",ips-listwise,baidu,1906
4,0.0,3.5,3.5,5.708254,6,0.333333,0.324325,5,"data=baidu,es_patience=5,logging=True,max_epoc...",ips-listwise,baidu,1906


# Plot Ranking Results

In [7]:
# Display names for the models. Key order matters: model2position is built from it
# and is used to sort and filter the result tables.
model2name = {
    "naive-pointwise": "Point. Naive",
    "pbm-pointwise": "Point. PBM",
    "regression-em": "RegressionEM",
    "ips-pointwise": "Point. IPS",
    "naive-listwise": "List. Naive",
    "ips-listwise": "List. IPS",
    "dla": "Dual Learning Algorithm",
#    "pbm-listwise": "Listwise PBM",
    "naive-lambda-rank": "Naive LambdaRank",
    "pairwise-debias": "Pairwise Debias",
}

# Display names for the datasets.
# NOTE(review): data2name is not referenced in any visible cell; confirm it is still needed.
data2name = {
    "baidu": "Baidu BERT Embeddings",
    "uva": "Our BERT Embeddings",
    "ltr": "LTR Features"
}

# Plot Click Prediction

In [None]:
# Models whose click-prediction results are kept.
# NOTE(review): same entries as click_models and pointwise_models defined in later cells.
models = ["naive-pointwise", "pbm-pointwise", "regression-em", "ips-pointwise",]

# Load test_click.parquet for every complete run, then keep only rows of the models above.
click_df = pd.concat([load_data(d, "test_click.parquet") for d in data])
click_df = click_df[click_df.model.isin(models)]
click_df.head()

Loaded 55 run(s) for baidu
Loaded 55 run(s) for ltr
Loaded 55 run(s) for uva


## Significance Testing

In [None]:
# Columns compared by t_test() and aggregated in the final table.
# NOTE(review): "nll" is presumably supplied by the click results (it is only present after merging click_df) — confirm.
metrics = ['dcg@01', 'dcg@03', 'dcg@05', 'dcg@10', 'mrr@10', "nll"]
# Models whose click results are kept when the table is built.
click_models = ["naive-pointwise", "pbm-pointwise", "regression-em", "ips-pointwise",]

# Position of each model in model2name's key order; used as the sort key for the tables.
model2position = {model:i for i, model in enumerate(model2name.keys())}

In [None]:
from scipy.stats import ttest_rel

def t_test(source, data, base, models, alpha = 0.01):
    """Run paired two-sided t-tests of each model against a baseline on one dataset.

    For every column in the module-level `metrics` list and every entry of
    `models`, the baseline's values are compared with the candidate's values
    using scipy's `ttest_rel`. The two samples are paired by row position, so
    both models need the same number of rows in the same order.

    Args:
        source: DataFrame with "data" and "model" columns plus one column per metric.
        data: Dataset key; only rows with this "data" value are used.
        base: Model key of the baseline.
        models: Model keys to compare against the baseline.
        alpha: Significance level before correction.

    Returns:
        DataFrame with one row per (metric, model) and the columns "data",
        "metric", "base", "model", "improve", "alpha" (the corrected level),
        "significant" and "p_value".

    NOTE(review): callers include `base` in `models`; that comparison is the
    baseline against itself and still counts towards the correction factor.
    NOTE(review): "improve" means the candidate's mean is higher than the
    baseline's. If lower is better for a metric (possibly "nll"), the flag
    reads inverted — confirm.
    """
    # Bonferroni correction: divide the significance level by the number of comparisons.
    corrected_alpha = alpha / len(models)
    subset = source[source["data"] == data]
    rows = []

    for metric in metrics:
        # The baseline sample is the same for every candidate of this metric.
        base_scores = subset[subset["model"] == base][metric]

        for model in models:
            candidate_scores = subset[subset["model"] == model][metric]
            outcome = ttest_rel(base_scores, candidate_scores, alternative="two-sided")

            rows.append({
                "data": data,
                "metric": metric,
                "base": base,
                "model": model,
                # Negative statistic: baseline mean is below the candidate mean.
                "improve": outcome.statistic < 0,
                "alpha": corrected_alpha,
                "significant": outcome.pvalue <= corrected_alpha,
                "p_value": outcome.pvalue,
            })

    return pd.DataFrame(rows)

In [None]:
# Attach the click results to the ranking results. The left join keeps every ranking row,
# so rows without a matching click row get a missing "nll".
df = rel_df.merge(click_df, on=["data", "model", "run", "random_state", "query_id"], how="left")
# Replace missing "nll" values with 0 so the column contains no NaN.
# NOTE(review): 0 is a placeholder, not a measured value — confirm this is intended for the t-tests.
df["nll"] = df["nll"].fillna(0)
df.head()

In [None]:
# Model groups for significance testing. The first entry of each list is the
# baseline that the following cell passes to t_test() for that group.
pointwise_models = ["naive-pointwise", "pbm-pointwise", "regression-em", "ips-pointwise"]
listwise_models = ["naive-listwise", "ips-listwise", "dla"]
lambdarank_models = ["naive-lambda-rank", "pairwise-debias"]

In [None]:
# Test every model group against its naive baseline on every dataset.
# Datasets are visited in the order baidu, uva, ltr; within a dataset the
# groups run pointwise, listwise, then LambdaRank.
test_df = pd.concat([
    t_test(df, dataset, baseline, candidates)
    for dataset in ("baidu", "uva", "ltr")
    for baseline, candidates in (
        ("naive-pointwise", pointwise_models),
        ("naive-listwise", listwise_models),
        ("naive-lambda-rank", lambdarank_models),
    )
])

test_df.head()

In [None]:
def map_change(row):
    """Encode one test result as -1, 0 or 1.

    Returns 0 when `row.significant` is falsy. Otherwise returns 1 if
    `row.improve` is truthy and -1 if it is not.
    """
    if not row.significant:
        return 0

    return 1 if row.improve else -1

# Encode every test result as -1 / 0 / 1 (see map_change).
# Note: this cell rebinds test_df to the pivoted frame, which has no "significant"
# column, so it cannot be re-run without re-running the cell that builds test_df.
test_df["change"] = test_df.apply(map_change, axis=1)
# Reshape to one row per (data, model) with one column per metric.
test_df = test_df.pivot_table(index=["data", "model"], columns=["metric"], values=["change"], aggfunc="max", fill_value=False)
# values=["change"] adds an outer column level; drop it so the columns are the metric names.
test_df.columns = test_df.columns.droplevel(level=0)
test_df = test_df.reset_index()
test_df

In [None]:
# Order rows by dataset (baidu, uva, ltr) and then by the model's position in
# model2name. The two helper columns exist only for sorting and are dropped again.
test_df = (
    test_df
    .assign(
        data_idx=test_df["data"].map({"baidu": 0, "uva": 1, "ltr": 2}),
        model_idx=test_df["model"].map(model2position),
    )
    .sort_values(["data_idx", "model_idx"], ascending=True)
    .drop(columns=["data_idx", "model_idx"])
)
test_df

# Render results as a LaTeX table

In [None]:
def plot_change(diff):
    """Return the LaTeX marker for a change code.

    1 maps to a superscript up-triangle, -1 to a superscript down-triangle,
    and any other value to an empty string.
    """
    up_marker = "$^\\blacktriangle$"
    down_marker = "$^\\blacktriangledown$"

    return up_marker if diff == 1 else down_marker if diff == -1 else ""

    

In [None]:
# Keep click results only for the models listed in click_models.
click_df = click_df[click_df["model"].isin(click_models)]

# Join ranking and click results again. Unlike the merge used for the t-tests,
# missing "nll" values are not filled in this cell.
df = rel_df.merge(click_df, on=["data", "model", "run", "random_state", "query_id"], how="left")
# Average each metric per (data, model, random_state) first, then take the mean
# and std of those averages per (data, model).
table_df = df.groupby(["data", "model", "random_state"])[metrics].mean().reset_index()
table_df = table_df.groupby(["data", "model"])[metrics].agg(["mean", "std"]).reset_index()

# Keep only models that have an entry in model2position (i.e. a display name).
table_df = table_df[table_df.model.isin(model2position)]

# Sort by dataset and then by model position, using the same keys as for test_df.
table_df["data_idx"] = table_df.data.map({"baidu": 0, "uva": 1, "ltr": 2})
table_df["model_idx"] = table_df.model.map(model2position)
table_df = table_df.sort_values(["data_idx", "model_idx"], ascending=True)
# The columns are two-level after agg(), so the helper columns are addressed as (name, "").
table_df = table_df.drop(columns=[("data_idx", ""), ("model_idx", "")])

# Replace model keys with their display names.
table_df.model = table_df.model.map(model2name)

In [None]:
# Flat copy of the (data, model) columns.
# NOTE(review): idx_df is not used in any visible cell; confirm it is still needed.
idx_df = table_df[["data", "model"]].copy()
idx_df.columns = idx_df.columns.droplevel(level=1)

# Turn each metric into one LaTeX cell: "$mean$ \scriptsize{$(std)$}" plus a significance marker.
for metric in metrics:
    mean = table_df[(metric, "mean")].map(lambda x: f"{x:.3f}" if isinstance(x, float) else x)
    std = table_df[(metric, "std")].map(lambda x: f"{x:.3f}" if isinstance(x, float) else x)
    # Markers are matched to table rows by position, so test_df and table_df
    # must have the same number of rows in the same (data, model) order.
    diff = list(test_df[metric].map(plot_change))

    # "\\scriptsize" instead of "\scriptsize": "\s" is an invalid escape sequence
    # in a normal string literal. The resulting text is unchanged.
    table_df[(metric, "mean")] = "$" + mean + "$ \\scriptsize{$(" + std + ")$}" + diff
    table_df = table_df.drop(columns=[(metric, "std")])

# Only the "mean" sub-columns remain; drop the second column level and upper-case the headers.
table_df.columns = table_df.columns.droplevel(level=1)
table_df.columns = table_df.columns.str.upper()

In [None]:
# Write the formatted table to table.tex without the row index.
table_df.to_latex("table.tex", index=False)