In [None]:
import altair as alt
import numpy as np
import pandas as pd

from itertools import combinations
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import r2_score

from src.data import WandbLoader
from src.score import repeated_cross_val_r2_score, get_metric_combinations

In [None]:
# Experiment / run identifiers handed to the loader below.
experiment_name = "sigir-cmip"
run_name = "sweep"

# NOTE(review): "your-entity" / "your-project" look like placeholders —
# replace them with the real account and project before running.
loader = WandbLoader("your-entity", "your-project", experiment_name, run_name)

In [None]:
# Load the metrics table through the loader; the cells below index it like a
# pandas DataFrame (boolean masks, column selection, merge).
df = loader.load_metrics()
df.head()

In [None]:
def filter_data(df, user_model, train_policy, test_policy, drop_na=False):
    """Pair in-distribution and out-of-distribution results for one setup.

    Rows of ``df`` are restricted to the given ``user_model`` and
    ``train_policy`` and then viewed twice:

    * in-distribution: rows whose ``test_policy`` equals ``train_policy``
      (the model is evaluated on the policy it was trained on); only
      ``test/ppl`` is kept, renamed to ``"in-distribution PPL"``.
    * out-of-distribution: rows whose ``test_policy`` equals the requested
      ``test_policy``; ``test/nDCG``, ``test/cmi`` and ``test/ppl`` are kept
      and renamed to ``"nDCG"``, ``"CMIP"`` and ``"out-of-distribution PPL"``.

    Both views are inner-joined on ``model``, ``user_model``,
    ``train_policy`` and ``random_state``.

    Args:
        df: Frame with the columns referenced above.
        user_model: Value to match in the ``user_model`` column.
        train_policy: Value to match in the ``train_policy`` column.
        test_policy: Value to match in ``test_policy`` for the
            out-of-distribution view.
        drop_na: When true, rows containing any missing value are removed
            from the joined result.

    Returns:
        The joined DataFrame (a new object; ``df`` itself is not modified).
    """
    join_keys = ["model", "user_model", "train_policy", "random_state"]
    in_dist_columns = ["model", "user_model", "train_policy", "random_state", "test/ppl"]
    out_dist_columns = [
        "model",
        "user_model",
        "train_policy",
        "test_policy",
        "random_state",
        "test/nDCG",
        "test/cmi",
        "test/ppl",
    ]

    # Shared part of both selections: same user model, same training policy.
    same_setup = (df["user_model"] == user_model) & (df["train_policy"] == train_policy)

    # In-distribution: evaluated on the very policy the model was trained on.
    in_dist = df.loc[
        same_setup & (df["test_policy"] == train_policy), in_dist_columns
    ].rename(columns={"test/ppl": "in-distribution PPL"})

    # Out-of-distribution: evaluated on the requested test policy.
    out_dist = df.loc[
        same_setup & (df["test_policy"] == test_policy), out_dist_columns
    ].rename(
        columns={
            "test/nDCG": "nDCG",
            "test/cmi": "CMIP",
            "test/ppl": "out-of-distribution PPL",
        }
    )

    merged = in_dist.merge(out_dist, on=join_keys)

    if drop_na:
        # Keep only rows in which every column is populated.
        merged = merged.dropna(how="any")

    return merged

In [None]:
# One concrete (user model, train policy, test policy) configuration, used by
# the preview cell that follows.
user_model = "GradedPBM"
train_policy = "NoisyOraclePolicy"
test_policy = "UniformPolicy"

In [None]:
# Preview the joined in-/out-of-distribution table for the configuration
# selected above, keeping only rows without missing values.
chart_df = filter_data(df, user_model, train_policy, test_policy, drop_na=True)
chart_df.head()

# Evaluate fit of Decision Tree Regressor using R2

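We measure the proportion of variation in `out-of-distribution PPL` that can be explained by each combination of `["in-distribution PPL", "nDCG", "CMIP"]`, using the adjusted $R^2$ of a decision tree regressor under repeated cross-validation.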

In [None]:
from itertools import product

# Experimental grid: every user model crossed with every
# (training policy, evaluation policy) pair.
user_models = ["GradedPBM", "GradedDBN", "MixtureDBN", "GradedCarousel"]
train_policies = ["NoisyOraclePolicy", "LightGBMRanker"]
test_policies = ["NoisyOraclePolicy", "LightGBMRanker", "UniformPolicy"]

# Every non-empty subset of the three predictor metrics.
metric_combinations = [
    ["in-distribution PPL"],
    ["nDCG"],
    ["CMIP"],
    ["in-distribution PPL", "nDCG"],
    ["in-distribution PPL", "CMIP"],
    ["nDCG", "CMIP"],
    ["in-distribution PPL", "nDCG", "CMIP"],
]

# Regression target; identical for every configuration, so defined once.
target = "out-of-distribution PPL"

rows = []

# The loop variables deliberately do NOT reuse the names `user_model`,
# `train_policy` and `test_policy`: those hold the single configuration chosen
# earlier in the notebook, and overwriting them here would change what the
# preview cell shows when it is re-run.
for grid_user_model, grid_train_policy, grid_test_policy in product(
    user_models, train_policies, test_policies
):
    # Same train and test policy is the in-distribution case, not a shift.
    if grid_train_policy == grid_test_policy:
        continue

    filter_df = filter_data(
        df, grid_user_model, grid_train_policy, grid_test_policy, drop_na=True
    )

    # Nothing to fit when no rows exist for this configuration.
    if len(filter_df) == 0:
        continue

    # The index prefix makes the labels sort in the order of the lists above.
    labels = {
        "user_model": f"{user_models.index(grid_user_model)}_{grid_user_model}",
        "train_policy": f"{train_policies.index(grid_train_policy)}_{grid_train_policy}",
        "test_policy": f"{test_policies.index(grid_test_policy)}_{grid_test_policy}",
    }
    y = filter_df[target].values

    for metric_combination in metric_combinations:
        X = filter_df[metric_combination].values
        # NOTE(review): no seed is passed here, so scores may differ between
        # runs — confirm whether the helper accepts a random state.
        scores = repeated_cross_val_r2_score(
            X,
            y,
            DecisionTreeRegressor,
            n_splits=2,
            n_repeats=1000,
            use_adjusted_r2=True,
        )

        combination_label = ", ".join(sorted(metric_combination))
        # One output row per returned score, so the score distributions can be
        # compared later on.
        rows.extend(
            {**labels, "metric_combination": combination_label, "r2_score": score}
            for score in scores
        )

result_df = pd.DataFrame(rows)
result_df.head()

In [None]:
# Columns identifying one experimental configuration.
index_columns = ["user_model", "train_policy", "test_policy"]
# Metric combinations in the order they first appear in result_df.
combination_order = list(result_df.metric_combination.unique())

# Mean score per configuration and metric combination.
mean_scores = (
    result_df
    .groupby(index_columns + ["metric_combination"], sort=False)["r2_score"]
    .mean()
    .reset_index()
)

# Wide layout: one row per configuration, one column per metric combination.
metric_df = (
    mean_scores
    .pivot_table(
        index=index_columns,
        columns="metric_combination",
        values="r2_score",
        sort=True,
    )
    .reset_index()
    .round(3)
)

# Restore the original column order (pivot_table sorts columns alphabetically).
metric_df = metric_df[index_columns + combination_order]
metric_df.style.background_gradient(axis=1)

In [None]:
# Column-wise mean of every metric column, formatted as one LaTeX table row.
# metric_df also carries the string label columns (user_model, train_policy,
# test_policy); numeric_only=True keeps them out of the reduction, which would
# otherwise raise a TypeError on pandas >= 2.0.
" & ".join(map(str, metric_df.mean(axis=0, numeric_only=True).round(3).values))

## Significance testing

In [None]:
from itertools import product

from scipy.stats import ttest_ind

alpha = 0.0001
# Every candidate combination is compared against this one.
target_metric = "in-distribution PPL, nDCG"
candidate_metrics = ['in-distribution PPL', 'nDCG', 'CMIP', 'in-distribution PPL, nDCG', 'CMIP, in-distribution PPL', 'CMIP, nDCG', 'CMIP, in-distribution PPL, nDCG']

# Bonferroni correction: split alpha across the comparisons made per
# configuration.
corrected_alpha = alpha / len(candidate_metrics)

rows = []

# The loop variables deliberately do NOT reuse the names `user_model`,
# `train_policy` and `test_policy`, so the single configuration chosen earlier
# in the notebook is not overwritten.
for grid_user_model, grid_train_policy, grid_test_policy in product(
    user_models, train_policies, test_policies
):
    # Labels exactly as they were written into result_df ("<index>_<name>").
    labels = {
        "user_model": f"{user_models.index(grid_user_model)}_{grid_user_model}",
        "train_policy": f"{train_policies.index(grid_train_policy)}_{grid_train_policy}",
        "test_policy": f"{test_policies.index(grid_test_policy)}_{grid_test_policy}",
    }

    # Exact label equality rather than substring/regex matching, so a name
    # that is contained in another name can never pool two groups together.
    source = result_df[
        (result_df["user_model"] == labels["user_model"])
        & (result_df["train_policy"] == labels["train_policy"])
        & (result_df["test_policy"] == labels["test_policy"])
    ]

    # Configurations with no scores in result_df have nothing to compare.
    if len(source) == 0:
        continue

    target_scores = source.loc[source["metric_combination"] == target_metric, "r2_score"]

    for candidate_metric in candidate_metrics:
        candidate_scores = source.loc[
            source["metric_combination"] == candidate_metric, "r2_score"
        ]

        # Welch's t-test (unequal variances), two-sided.
        t, p = ttest_ind(
            candidate_scores,
            target_scores,
            alternative="two-sided",
            equal_var=False,
        )

        reject_null = p < corrected_alpha

        # +1: candidate significantly higher than target, -1: significantly
        # lower, 0: no significant difference (this also covers a NaN p-value).
        if reject_null and t > 0:
            effect = 1
        elif reject_null and t < 0:
            effect = -1
        else:
            effect = 0

        rows.append({
            **labels,
            "metric_combination": candidate_metric,
            "is_significant": reject_null,
            "p": p,
            "t": t,
            "effect": effect,
        })

In [None]:
# Columns identifying one experimental configuration.
config_columns = ["user_model", "train_policy", "test_policy"]

# Wide layout of the significance effects: one row per configuration, one
# column per metric combination.
stats_df = (
    pd.DataFrame(rows)
    .pivot_table(
        index=config_columns,
        columns="metric_combination",
        values="effect",
        sort=True,
    )
    .reset_index()
)

# Same column order as the metric table: order of first appearance in result_df.
stats_df = stats_df[config_columns + list(result_df.metric_combination.unique())]
stats_df.style.background_gradient(axis=1)

In [None]:
def get_mark(i):
    """Return the LaTeX superscript marker for a significance effect.

    ``i`` is converted with ``float``; ``1`` maps to an upward triangle,
    ``-1`` to a downward triangle, and every other value (including NaN)
    to the empty string.
    """
    markers = {
        1.0: "$^\\blacktriangle$",
        -1.0: "$^\\blacktriangledown$",
    }
    return markers.get(float(i), "")

# Metric combinations in the order they first appear in result_df.
metric_columns = list(result_df.metric_combination.unique())

# Print one LaTeX table row per configuration. The score and effect tables are
# paired by row position, so both are expected to list the same configurations
# in the same order.
for row_position in range(len(metric_df)):
    score_values = metric_df.iloc[row_position][metric_columns].values
    effect_values = stats_df.iloc[row_position][metric_columns].values

    cells = [
        f"{m:.3f}{get_mark(s)}"
        for m, s in zip(score_values, effect_values)
    ]
    print(" & ".join(cells))