In [1]:
# Step up to the parent directory so that relative paths used further down
# (e.g. "figures/results.pdf") are resolved from there.
# NOTE: re-running this cell moves up one more level each time.
%cd ..

/Users/philipphager/Developer/ultr-cm-vs-ips


In [2]:
import altair as alt
from altair_saver import save
import pandas as pd
from util import load_experiment

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the "dataset_size" experiment. The helper returns three objects that are
# unpacked into baseline_df, val_df and test_df (presumably baseline, validation
# and test results -- confirm against util.load_experiment).
baseline_df, val_df, test_df = load_experiment("dataset_size")

In [4]:
# Give every baseline row the same "model" label so it shows up as its own
# series when the baseline is concatenated with test_df and grouped by model.
baseline_df["model"] = "Production Ranker"

In [5]:
# Display names for the figure/tables, keyed by the raw model identifiers.
model2name = dict([
    ("Neural PBM - Unbiased", "PBM - True Bias"),
    ("Neural PBM - Biased", "PBM - Naive"),
    ("Neural PBM - Estimated bias", "PBM - Estimated Bias"),
    ("Pointwise IPS - Unbiased", "Point. IPS - True Bias"),
    ("Pointwise IPS - Biased", "Point. IPS / PBM - Naive"),
])

# Series.map yields NaN for any label that is not a key of the dict, so running
# this cell a second time on already-renamed labels would blank the column.
test_df["model"] = test_df["model"].map(model2name)

# Drop the rows labelled "PBM - Naive"; the "Point. IPS / PBM - Naive" series is kept.
keep_rows = test_df["model"].ne("PBM - Naive")
test_df = test_df.loc[keep_rows]

In [14]:
def plot(dataset_df, legend=True, width=320, height=125, metric="average_relevant_position", title="", y=[0, 1.0], clip=False, label_y=True, label_x=True):
    """Build a layered Altair chart of `metric` against `n_sessions`, one series per model.

    Three layers are stacked in this order: a line through the mean of `metric`,
    point markers at the same means (marker shape varies by model), and an
    error band (opacity 0.5) that Altair derives from the raw `metric` column.

    Args:
        dataset_df: Data with `n_sessions`, `model` and `metric` columns.
        legend: Accepted for call-site compatibility but never read in the
            body; the colour legend is always disabled and the shape legend is
            left at Altair's default.
        width: Width given to the line layer's chart.
        height: Height given to the line layer's chart.
        metric: Name of the column to aggregate and plot.
        title: Chart title, set on the line layer.
        y: Explicit y-scale domain; applied to the line layer only.
        clip: Forwarded unchanged to all three marks.
        label_y: When falsy, the line layer's y-axis title is set to None.
        label_x: When falsy, the line layer's x-axis title is set to None. The
            marker and band layers always use "Number of Train Queries".

    Returns:
        The result of `alt.layer(...)` over the line, marker and band layers.
    """
    x_title = "Number of Train Queries"
    mean_of_metric = f"mean({metric})"

    def sessions_axis(axis_title):
        # Log-scaled x channel; tick labels use the "~s" number format.
        return alt.X(
            "n_sessions",
            scale=alt.Scale(type="log"),
            title=axis_title,
            axis=alt.Axis(format="~s"),
        )

    def model_colour():
        # Colour by model with the colour legend switched off.
        return alt.Color("model", legend=None)

    def hover_fields():
        # Fresh list per layer so the two layers never share one object.
        return [f"count({metric})", "n_sessions", mean_of_metric]

    line_layer = (
        alt.Chart(dataset_df, width=width, height=height, title=title)
        .mark_line(clip=clip)
        .encode(
            x=sessions_axis(x_title if label_x else None),
            y=alt.Y(
                mean_of_metric,
                scale=alt.Scale(zero=False, domain=y),
                title=metric if label_y else None,
            ),
            color=model_colour(),
            tooltip=hover_fields(),
        )
    )

    point_layer = (
        alt.Chart(dataset_df)
        .mark_point(clip=clip, size=50)
        .encode(
            x=sessions_axis(x_title),
            y=alt.Y(mean_of_metric, scale=alt.Scale(zero=False)),
            shape=alt.Shape("model"),
            color=model_colour(),
            tooltip=hover_fields(),
        )
    )

    band_layer = (
        alt.Chart(dataset_df)
        .mark_errorband(opacity=0.5, clip=clip)
        .encode(
            x=sessions_axis(x_title),
            y=alt.Y(metric, scale=alt.Scale(zero=False)),
            color=model_colour(),
        )
    )

    return alt.layer(line_layer, point_layer, band_layer)

def _with_baseline(dataset_name):
    """Stack the test rows and the baseline rows belonging to one dataset."""
    return pd.concat([
        test_df[test_df.dataset == dataset_name],
        baseline_df[baseline_df.dataset == dataset_name],
    ])


yahoo_df = _with_baseline("Yahoo")
istella_df = _with_baseline("Istella-S")
mslr_df = _with_baseline("MSLR-Web30K")
synthetic_df = _with_baseline("Synthetic")

# One nDCG@10 panel per dataset, each with its own y-range.
mslr_panel = plot(mslr_df, legend=False, metric="nDCG@10", title="MSLR-WEB30K", y=[0.25, .5], clip=True, label_y=True, label_x=False)
istella_panel = plot(istella_df, legend=False, metric="nDCG@10", title="Istella", y=[0.60, 0.75], clip=True, label_y=False, label_x=False)
yahoo_panel = plot(yahoo_df, legend=False, metric="nDCG@10", title="Yahoo", y=[0.6, 0.75], clip=True, label_y=True)
synthetic_panel = plot(synthetic_df, legend=True, metric="nDCG@10", title="Synthetic", y=[0, 1.0], clip=True, label_y=False)

# 2x2 grid: `|` places panels side by side, `&` stacks the two rows.
top_row = mslr_panel | istella_panel
bottom_row = yahoo_panel | synthetic_panel
grid = top_row & bottom_row

# Shared typography for legend, titles and axes.
chart = (
    grid
    .configure_legend(
        orient="right",
        title=None,
        labelFont="serif",
        labelFontSize=14,
        columnPadding=20,
    )
    .configure_title(
        fontSize=14,
        fontWeight="normal",
        font="serif",
    )
    .configure_axis(
        titlePadding=10,
        titleFontSize=14,
        titleFontWeight="normal",
        titleFont="serif",
        labelFontSize=10,
        labelFontWeight="normal",
        labelFont="serif",
        tickCount=6,
    )
)

# Write the figure to disk, then show it as the cell output.
save(chart, "figures/results.pdf")
chart

# Statistical Significance

In [7]:
!pip install -q statsmodels

In [8]:
# List the distinct dataset names present in the test results.
test_df.dataset.unique()

array(['MSLR-Web30K', 'Istella-S', 'Synthetic', 'Yahoo'], dtype=object)

In [9]:
# Mean and standard deviation of each ranking metric per (model, dataset) for
# the test runs with n_sessions == 100,000,000, rounded to three decimals.
source = (
    test_df.loc[test_df["n_sessions"].eq(100_000_000)]
    .groupby(["model", "dataset"])[["nDCG@5", "nDCG@10", "ARP"]]
    .agg(["mean", "std"])
    .round(3)
)
source

Unnamed: 0_level_0,Unnamed: 1_level_0,nDCG@5,nDCG@5,nDCG@10,nDCG@10,ARP,ARP
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std
model,dataset,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
PBM - Estimated Bias,Istella-S,0.629,0.008,0.692,0.007,10.605,1.193
PBM - Estimated Bias,MSLR-Web30K,0.429,0.01,0.449,0.008,44.835,0.274
PBM - Estimated Bias,Synthetic,0.772,0.022,0.833,0.019,9.335,0.143
PBM - Estimated Bias,Yahoo,0.673,0.005,0.722,0.003,9.848,0.055
PBM - True Bias,Istella-S,0.638,0.003,0.703,0.004,8.911,0.212
PBM - True Bias,MSLR-Web30K,0.428,0.006,0.447,0.006,44.965,0.23
PBM - True Bias,Synthetic,1.0,0.0,1.0,0.0,8.14,0.004
PBM - True Bias,Yahoo,0.68,0.004,0.728,0.003,9.812,0.035
Point. IPS - True Bias,Istella-S,0.656,0.005,0.724,0.004,8.274,0.141
Point. IPS - True Bias,MSLR-Web30K,0.432,0.011,0.454,0.01,44.418,0.227


In [10]:
# Same summary as for the test runs, computed on the baseline rows with
# n_sessions == 100,000,000: mean/std per (model, dataset), three decimals.
source = (
    baseline_df.loc[baseline_df["n_sessions"].eq(100_000_000)]
    .groupby(["model", "dataset"])[["nDCG@5", "nDCG@10", "ARP"]]
    .agg(["mean", "std"])
    .round(3)
)
source

Unnamed: 0_level_0,Unnamed: 1_level_0,nDCG@5,nDCG@5,nDCG@10,nDCG@10,ARP,ARP
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std
model,dataset,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Production Ranker,Istella-S,0.566,0.012,0.632,0.01,10.659,0.199
Production Ranker,MSLR-Web30K,0.301,0.025,0.33,0.023,49.223,0.664
Production Ranker,Synthetic,0.369,0.005,0.439,0.005,12.994,0.038
Production Ranker,Yahoo,0.613,0.012,0.671,0.009,10.439,0.091


In [11]:
# Imports sit at the top of the cell so they run once, not on every loop pass.
from scipy import stats
import statsmodels.stats.multicomp as mc

metric = "nDCG@10"
n_sessions = 100000000
alpha = 0.0001  # significance level handed to the pairwise comparison
columns = ["model", "random_state", metric]  # loop-invariant column selection

# For each dataset, compare every pair of models on `metric` at the chosen
# n_sessions with independent two-sample t-tests and Bonferroni correction.
for dataset in test_df.dataset.unique():
    source = test_df[(test_df.dataset == dataset) & (test_df.n_sessions == n_sessions)].sort_values(columns)[columns]

    comparison = mc.MultiComparison(source[metric], source["model"])
    # allpairtest returns three values; only the summary table is reported here.
    tbl, a1, a2 = comparison.allpairtest(stats.ttest_ind, method="bonf", alpha=alpha)

    print("\n", dataset)
    print(tbl)


 MSLR-Web30K
Test Multiple Comparison ttest_ind 
FWER=0.00 method=bonf
alphacSidak=0.00, alphacBonf=0.000
        group1                  group2            stat   pval  pval_corr reject
-------------------------------------------------------------------------------
  PBM - Estimated Bias          PBM - True Bias  0.6504 0.5236       1.0  False
  PBM - Estimated Bias   Point. IPS - True Bias -1.2453  0.229       1.0  False
  PBM - Estimated Bias Point. IPS / PBM - Naive 11.7117    0.0       0.0   True
       PBM - True Bias   Point. IPS - True Bias -1.9903  0.062    0.3718  False
       PBM - True Bias Point. IPS / PBM - Naive 11.8548    0.0       0.0   True
Point. IPS - True Bias Point. IPS / PBM - Naive 12.1379    0.0       0.0   True
-------------------------------------------------------------------------------

 Istella-S
Test Multiple Comparison ttest_ind 
FWER=0.00 method=bonf
alphacSidak=0.00, alphacBonf=0.000
        group1                  group2            stat    pval  pval