In [1]:
# Move the working directory one level up, so that relative paths used below
# (e.g. the default "results" directory of load_experiment) resolve from there.
# NOTE(review): this cell is not idempotent - running it again moves up one
# more level each time. Restart the kernel before re-running the notebook.
%cd ..

/Users/philipphager/Developer/ultr-cm-vs-ips


In [2]:
import os
from collections import defaultdict
from pathlib import Path
from typing import List

import altair as alt
import pandas as pd
import torch


def _parse_hydra_config(path: Path):
    import yaml

    # Load yaml file ignoring custom hydra attributes
    file = open(path, "r")
    raw_yaml = yaml.full_load(file)
    return pd.json_normalize(raw_yaml, sep="_")


def _subset(frame, columns: List[str], metric_columns: List[str]):
    return (
        frame[columns + metric_columns]
        .dropna(axis=0, how="all", subset=metric_columns)
        .copy()
    )


def _cross_join(df1, df2):
    df1["key"] = 1
    df2["key"] = 1
    return df1.merge(df2, on="key").drop(columns=["key"])


def _rename(frame, stage):
    return frame.rename(
        columns={
            "model_name": "model",
            "data_name": "dataset",
            "simulation_n_sessions": "n_sessions",
            f"{stage}_ndcg": "nDCG",
            f"{stage}_ndcg@5": "nDCG@5",
            f"{stage}_ndcg@10": "nDCG@10",
            f"{stage}_arp": "ARP",
        }
    )


def load_experiment(experiment: str, directory: Path = Path("results")):
    """Collect the validation and test results of every run of an experiment.

    Each sub-directory of ``directory / experiment`` that contains both
    ``val.parquet`` and ``test.parquet`` is treated as one run. The run's
    ``config.yaml`` is flattened and cross-joined onto its result frames, and
    the test frame additionally records the run directory in a ``dir`` column.

    Args:
        experiment: Name of the experiment folder inside ``directory``.
        directory: Base folder holding all experiments.

    Returns:
        A ``(val_df, test_df)`` tuple of concatenated, renamed DataFrames.
        Both are ``None`` when the experiment folder does not exist or no run
        contributed any frames.
    """
    experiment_path = directory / experiment

    if not experiment_path.exists():
        return None, None

    val_frames = []
    test_frames = []

    for run_dir in experiment_path.iterdir():
        val_file = run_dir / "val.parquet"
        test_file = run_dir / "test.parquet"

        # Skip stray files and runs that did not write both result files.
        if not (run_dir.is_dir() and val_file.exists() and test_file.exists()):
            continue

        params = _parse_hydra_config(run_dir / "config.yaml")
        val_run = pd.read_parquet(val_file)
        test_run = pd.read_parquet(test_file)
        test_run["dir"] = str(run_dir)

        # Attach the run's parameters to every result row.
        val_frames.append(_cross_join(val_run, params))
        test_frames.append(_cross_join(test_run, params))

    val_df = _rename(pd.concat(val_frames), "val") if val_frames else None
    test_df = _rename(pd.concat(test_frames), "test") if test_frames else None

    return val_df, test_df


# Load the validation and test frames of the "dataset_size" experiment from the
# default "results" directory. Both names are None when that experiment folder
# does not exist or holds no complete run.
val_df, test_df = load_experiment("dataset_size")

In [3]:
def plot(dataset_df, legend=True, width=600, height=200, metric="average_relevant_position", title="", y=[0, 1.0]):
    """Build a layered Altair chart of mean ``metric`` over ``n_sessions`` per model.

    Three layers are drawn on a log-scaled x axis: a line through the mean of
    ``metric``, point markers at the same means, and an error band computed
    from the raw ``metric`` values.

    Args:
        dataset_df: DataFrame with ``n_sessions``, ``model`` and ``metric`` columns.
        legend: Whether the point layer shows a legend below the chart.
        width: Chart width passed to ``alt.Chart``.
        height: Chart height passed to ``alt.Chart``.
        metric: Name of the column to plot.
        title: Chart title.
        y: Domain of the y scale on the line layer. The default list is only
            read here, never modified.

    Returns:
        The layered chart with independent color and shape scales.
    """

    def sessions_axis():
        # Every layer gets its own, identically configured x encoding.
        return alt.X(
            "n_sessions",
            scale=alt.Scale(type="log"),
            title="Number of Train Queries",
            axis=alt.Axis(format="e"),
        )

    mean_metric = f"mean({metric})"

    line_layer = (
        alt.Chart(dataset_df, width=width, height=height, title=title)
        .mark_line()
        .encode(
            x=sessions_axis(),
            y=alt.Y(mean_metric, scale=alt.Scale(zero=False, domain=y)),
            color=alt.Color("model", legend=None),
        )
    )

    # Only the point layer carries a legend, and only when requested.
    if legend:
        point_shape = alt.Shape("model", legend=alt.Legend(orient="bottom"), title="")
        point_color = alt.Color("model", legend=alt.Legend(orient="bottom"), title="")
    else:
        point_shape = alt.Shape("model", legend=None)
        point_color = alt.Color("model", legend=None)

    point_layer = (
        alt.Chart(dataset_df)
        .mark_point()
        .encode(
            x=sessions_axis(),
            y=alt.Y(mean_metric, scale=alt.Scale(zero=False)),
            shape=point_shape,
            color=point_color,
        )
    )

    # The band is fed the raw metric values; Altair aggregates them itself.
    band_layer = (
        alt.Chart(dataset_df)
        .mark_errorband(opacity=0.5)
        .encode(
            x=sessions_axis(),
            y=alt.Y(metric, scale=alt.Scale(zero=False)),
            color=alt.Color("model", legend=None),
        )
    )

    layered = alt.layer(line_layer, point_layer, band_layer)
    return layered.resolve_scale(color="independent", shape="independent")

# Split the test results by dataset so that each dataset gets its own panel.
# NOTE(review): test_df is None when load_experiment found no results, in which
# case the filtering below fails - make sure the results directory is present.
yahoo_df = test_df[test_df.dataset == "Yahoo"]
mslr_df = test_df[test_df.dataset == "MSLR-Web30K"]

# Combine the MSLR and Yahoo charts with `&`; as the last expression of the
# cell, the combined chart is what the notebook displays. Each `y` pair is
# handed to `plot` as the y-scale domain of that panel.
(
    plot(mslr_df, legend=True, metric="nDCG@10", title="MSLR-WEB30K", y=[0.25, 0.45]) &
    plot(yahoo_df, legend=True, metric="nDCG@10", title="Yahoo", y=[0.59, 0.74])
)

In [7]:
# Summary statistics of the `epoch` column per model, restricted to the Yahoo
# rows with n_sessions == 100_000_000.
# NOTE(review): this cell was executed as In [7] but sits before In [5], so the
# saved outputs come from out-of-order execution - re-run top to bottom.
# NOTE(review): the saved output lists identical statistics for
# "Neural PBM - Biased" and "Pointwise IPS - Biased" - verify that these are
# really separate runs.
yahoo_df[yahoo_df.n_sessions == 100_000_000].groupby("model").epoch.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Neural PBM - Biased,10.0,57.9,13.795732,36.0,45.5,62.0,70.25,73.0
Neural PBM - Estimated bias,10.0,99.0,0.0,99.0,99.0,99.0,99.0,99.0
Neural PBM - Unbiased,10.0,42.9,19.098866,13.0,28.75,47.0,50.75,71.0
Pointwise IPS - Biased,10.0,57.9,13.795732,36.0,45.5,62.0,70.25,73.0
Pointwise IPS - Unbiased,10.0,54.7,15.239204,33.0,47.0,52.5,62.5,79.0


In [5]:
# Summary statistics of the `nDCG` column per model, restricted to the Yahoo
# rows with n_sessions == 100_000_000.
# NOTE(review): the saved output lists identical statistics for
# "Neural PBM - Biased" and "Pointwise IPS - Biased" - verify that these are
# really separate runs.
yahoo_df[yahoo_df.n_sessions == 100_000_000].groupby("model").nDCG.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Neural PBM - Biased,10.0,0.762229,0.001275,0.760259,0.761508,0.762247,0.762747,0.764821
Neural PBM - Estimated bias,10.0,0.764404,0.001441,0.762808,0.763153,0.764049,0.765549,0.766621
Neural PBM - Unbiased,10.0,0.768009,0.002314,0.763507,0.767292,0.767851,0.769041,0.771344
Pointwise IPS - Biased,10.0,0.762229,0.001275,0.760259,0.761508,0.762247,0.762747,0.764821
Pointwise IPS - Unbiased,10.0,0.775648,0.001273,0.773328,0.775274,0.775525,0.776777,0.777176
