In [None]:
# Move the working directory one level up before anything else runs.
# NOTE(review): presumably so that `util` and the relative `figures/` path
# resolve from the parent directory — confirm. This cell is not idempotent:
# re-running it without restarting the kernel climbs another directory.
%cd ..

In [None]:
import altair as alt
from altair_saver import save
import pandas as pd
from util import load_experiment

In [None]:
# Load the three result frames of the "dataset_size" experiment.
# NOTE(review): `val_df` is not referenced by any cell visible in this notebook.
baseline_df, val_df, test_df = load_experiment("dataset_size")

In [None]:
# Give every baseline row a single model label so that it can be concatenated
# with the per-model test results and shown as one series in the plots.
baseline_df["model"] = "Production Ranker"

In [None]:
# Display names used in the figures/tables, keyed by the raw model names in `test_df`.
model2name = {
    "Neural PBM - Unbiased": "PBM - True Bias",
    "Neural PBM - Biased": "PBM - Naive",
    "Neural PBM - Estimated bias": "PBM - Estimated Bias",
    "Pointwise IPS - Unbiased": "Point. IPS - True Bias",
    "Pointwise IPS - Biased": "Point. IPS / PBM - Naive",
}

# Fall back to the current name when it is not a key of `model2name`: this keeps
# the cell idempotent (a second run used to turn every already-renamed model
# into NaN) and avoids silently blanking models missing from the mapping.
# `assign` builds a new frame instead of mutating the one returned by the loader.
test_df = test_df.assign(model=test_df.model.map(lambda name: model2name.get(name, name)))

# "PBM - Naive" is dropped; the naive pointwise IPS entry is labelled
# "Point. IPS / PBM - Naive" and stands in for both.
test_df = test_df[test_df.model != "PBM - Naive"]

In [None]:
def plot(dataset_df, legend=True, width=320, height=125, metric="average_relevant_position", title="", y=[0, 1.0], clip=False, label_y=True, label_x=True):
    """Build one layered panel: mean-metric line, point markers and an error band.

    Args:
        dataset_df: Data for the panel; the encodings read the columns
            ``n_sessions``, ``model`` and the one named by ``metric``.
        legend: If True, the point layer keeps its colour legend; otherwise it
            is suppressed. The shape encoding is left at its default.
        width: Panel width passed to ``alt.Chart``.
        height: Panel height passed to ``alt.Chart``.
        metric: Column plotted on the y axis (its mean for lines and points,
            the raw values for the error band).
        title: Panel title.
        y: ``[min, max]`` domain of the y scale of the line layer. The default
            list is shared between calls and is never mutated here.
        clip: Forwarded to every mark so that geometry outside the scale
            domain is clipped.
        label_y: If False, the y-axis title is set to None on the line layer.
        label_x: If False, the x-axis title is set to None on every layer.

    Returns:
        The layered chart with independent colour and shape scales.
    """
    x_title = "Number of Train Queries" if label_x else None

    def x_encoding():
        # All three layers share one x definition. Previously only the line
        # layer honoured `label_x`, while the other two layers still carried
        # the title, so the shared axis of the layered chart could keep it.
        return alt.X("n_sessions", scale=alt.Scale(type="log"), title=x_title, axis=alt.Axis(format="~s"))

    lines = alt.Chart(dataset_df, width=width, height=height, title=title).mark_line(clip=clip).encode(
        x=x_encoding(),
        y=alt.Y(f"mean({metric})", scale=alt.Scale(zero=False, domain=y), title=metric if label_y else None),
        color=alt.Color("model", legend=None),
        tooltip=[f"count({metric})", "n_sessions", f"mean({metric})"]
    )

    marks = alt.Chart(dataset_df).mark_point(clip=clip, size=50).encode(
        x=x_encoding(),
        y=alt.Y(f"mean({metric})", scale=alt.Scale(zero=False)),
        shape=alt.Shape("model"),
        color=alt.Color("model") if legend else alt.Color("model", legend=None),
        tooltip=[f"count({metric})", "n_sessions", f"mean({metric})"]
    )

    # Error band over the raw metric values (mark_errorband's default extent).
    ci = alt.Chart(dataset_df).mark_errorband(opacity=0.5, clip=clip).encode(
        x=x_encoding(),
        y=alt.Y(metric, scale=alt.Scale(zero=False)),
        color=alt.Color("model", legend=None),
    )

    return alt.layer(
        lines,
        marks,
        ci
    ).resolve_scale(color="independent", shape="independent")

# One frame per dataset: the model results followed by the matching baseline rows.
yahoo_df, istella_df, mslr_df, synthetic_df = (
    pd.concat([test_df[test_df.dataset == name], baseline_df[baseline_df.dataset == name]])
    for name in ("Yahoo", "Istella-S", "MSLR-Web30K", "Synthetic")
)

# Arguments shared by all four panels.
panel_kwargs = dict(metric="nDCG@10", clip=True)

# Top row: both panels ask `plot` to omit the x-axis title; only the Istella
# panel requests a legend, which serves the whole figure.
top_row = (
    plot(mslr_df, legend=False, title="MSLR-WEB30K", y=[0.25, 0.5], label_y=True, label_x=False, **panel_kwargs)
    | plot(istella_df, legend=True, title="Istella", y=[0.60, 0.75], label_y=False, label_x=False, **panel_kwargs)
)

# Bottom row: x-axis titles are left at the default (shown).
bottom_row = (
    plot(yahoo_df, legend=False, title="Yahoo", y=[0.6, 0.75], label_y=True, **panel_kwargs)
    | plot(synthetic_df, legend=False, title="Synthetic", y=[0, 1.0], label_y=False, **panel_kwargs)
)

# Stack the rows and apply one serif theme to legend, titles and axes.
chart = (
    (top_row & bottom_row)
    .configure_legend(
        orient="right",
        title=None,
        labelFont="serif",
        labelFontSize=14,
        columnPadding=20,
    )
    .configure_title(
        fontSize=14,
        fontWeight="normal",
        font="serif",
    )
    .configure_axis(
        titlePadding=10,
        titleFontSize=14,
        titleFontWeight="normal",
        titleFont="serif",
        labelFontSize=10,
        labelFontWeight="normal",
        labelFont="serif",
        tickCount=6,
    )
)

# Write the figure to disk, then display it as the cell output.
save(chart, "figures/results.pdf")
chart

# Statistical Significance

In [None]:
!pip install -q statsmodels

In [None]:
# Datasets present in the test results; the significance loop iterates over these.
test_df["dataset"].unique()

In [None]:
# Mean and standard deviation of each metric per (model, dataset), restricted
# to the runs with 100M sessions and rounded to three decimals.
runs_100m = test_df[test_df.n_sessions == 100_000_000]
source = (
    runs_100m
    .groupby(["model", "dataset"])[["nDCG@5", "nDCG@10", "ARP"]]
    .agg(["mean", "std"])
    .round(3)
)
source

In [None]:
# Imported once at the top of the cell rather than on every loop iteration.
# They stay in this cell (not the notebook's first import cell) so that they
# still run after the statsmodels install cell.
from scipy import stats
import statsmodels.stats.multicomp as mc

metric = "nDCG@10"
n_sessions = 100_000_000
columns = ["model", "random_state", metric]

# For every dataset, compare all pairs of models with independent t-tests on
# `metric` at `n_sessions`, Bonferroni-corrected, and print the result table.
for dataset in test_df.dataset.unique():
    source = test_df[(test_df.dataset == dataset) & (test_df.n_sessions == n_sessions)].sort_values(columns)[columns]

    comparison = mc.MultiComparison(source[metric], source["model"])
    # allpairtest returns a 3-tuple; only the first element (the table) is used here.
    tbl = comparison.allpairtest(stats.ttest_ind, method="bonf", alpha=0.0001)[0]

    print("\n", dataset)
    print(tbl)