In [96]:
!pip install --quiet altair


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [249]:
import altair as alt
import pandas as pd
import rax

from src.theme import theme
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [98]:
alt.themes.register("latex", theme)
alt.themes.enable("latex")

ThemeRegistry.enable('latex')

# Relevance Label Distribution

In [99]:
annotation_df = pd.read_parquet("output/annotations.parquet")

In [100]:
source = annotation_df.groupby(["label"]).agg(
    documents=("title", "count"),
).reset_index()

source["perc_documents"] = (source.documents / source.documents.sum()).round(4)
source

Unnamed: 0,label,documents,perc_documents
0,0,219305,0.5516
1,1,36622,0.0921
2,2,112759,0.2836
3,3,28172,0.0709
4,4,714,0.0018


In [108]:
base = alt.Chart(source, width=400, height=200).encode(
    x=alt.X("label:O", title="Relevance label").axis(labelAngle=0),
    y=alt.Y("perc_documents:Q", title="% of documents"),
    text=alt.Text("perc_documents:Q", format=".2%"),
)

base.mark_bar() + base.mark_text(align="center", dy=-5, size=12)

In [109]:
directory = Path("figures")
name = "relevance_labels"

svg_file = directory / f"{name}.svg"
pdf_file = directory / f"{name}.pdf"

# to .svg
chart.save(svg_file)
# to .pdf
!rsvg-convert -f pdf {svg_file} > {pdf_file}

## Navigational Queries
Only documents that are the destination for a navigational query are marked with "4", thus, we can analyze navigational queries.

In [194]:
annotation_df["is_navigational"] = annotation_df["label"] == 4

query_df = annotation_df.groupby(["query_no", "frequency_bucket"]).agg(
    navigational_query=("is_navigational", "max"),
    navigational_documents=("is_navigational", "sum"),
).reset_index()

In [195]:
print(f"Share of queries that are navigational: {query_df['navigational_query'].mean():.2%}")
print(f"Average destination docs for a navigational query: {query_df[query_df['navigational_query']]['navigational_documents'].mean():.2f}")
print(f"P95 destination docs for a navigational query: {query_df[query_df['navigational_query']]['navigational_documents'].quantile(0.95)}")
print(f"Max destination docs for a navigational query: {query_df[query_df['navigational_query']]['navigational_documents'].max()}")

Share of queries that are navigational: 4.56%
Average destination docs for a navigational query: 2.23
P95 destination docs for a navigational query: 5.0
Max destination docs for a navigational query: 21


In [203]:
source = query_df.groupby(["frequency_bucket"]).agg(navigational_query=("navigational_query", "mean")).reset_index()

alt.Chart(source, title="Navigational queries and query frequency",  width=600, height=250).mark_bar().encode(
    x=alt.X("frequency_bucket:O", title="Query frequency (high to low)").axis(labelAngle=0),
    y=alt.Y("navigational_query", title="Rate of navigational queries").axis(format="%"),
    tooltip=list(source.columns),
)

## Relevance distribution and query frequency
H: Frequent queries have more relevant annotated docs

In [215]:
source = annotation_df.groupby(["frequency_bucket", "label"]).agg(total_documents=("title", "count")).reset_index()

alt.Chart(source, width=600, height=250).mark_bar().encode(
    x=alt.X("frequency_bucket:O", title="Query frequency (high to low)").axis(labelAngle=0),
    y=alt.Y("total_documents", title="% of documents with relevance").stack("normalize"),
    color=alt.Color("label:N", title="Relevance")
)

## Impact of queries on DCG

In [344]:
import numpy as np
import rax
from functools import partial

metric = partial(rax.dcg_metric, topn=10)

def random(metric, x):
    x = np.tile(x, (100, 1))
    return np.mean(metric(np.random.rand(*x.shape), x))

query_df = annotation_df.groupby(["query_no", "frequency_bucket"]).agg(labels=("label", list)).reset_index()
query_df["labels"] = query_df["labels"].map(np.array)
query_df["max"] = query_df["labels"].map(lambda x: metric(x, x))
query_df["min"] = query_df["labels"].map(lambda x: metric(4 - x, x))
query_df["diff"] = query_df["max"] - query_df["min"]
query_df["random"] = query_df["labels"].map(lambda x: random(metric, x))

In [345]:
print("Max attainable metric (perfect ranking):", query_df["max"].mean())
print("Min attainable metric (inverse ranking):", query_df["min"].mean())
print("Random ranking:", query_df["random"].mean())

Max attainable metric (perfect ranking): 17.461441244650498
Min attainable metric (inverse ranking): 0.6935725888240014
Random ranking: 6.67127039497147


In [352]:
def compute_importance(min, max):
    idx = np.argsort(-max)
    min = min[idx]
    max = max[idx]
    
    cum_metric = []

    cum_metric.append(min.mean())

    for i in range(len(min)):
        min[i] = max[i]
        cum_metric.append(min.mean())

    return pd.DataFrame({
        "x": range(len(min) + 1),
        "y": cum_metric,
    })

source = compute_importance(np.array(list(query_df["random"])), np.array(list(query_df["max"])))
source.head()

Unnamed: 0,x,y
0,0,6.67127
1,1,6.678887
2,2,6.683491
3,3,6.689999
4,4,6.696552


In [353]:
# Create a selection that chooses the nearest point & selects based on x-value
nearest = alt.selection_point(nearest=True, on="pointerover",
                              fields=["x"], empty=False)

# The basic line
line = alt.Chart(source.round(2)).mark_line(interpolate="basis").encode(
    x=alt.X("x:Q", title="Queries sorted by impact (Max DCG - Random DCG)"),
    y=alt.Y("y:Q", title="DCG@10")
)

selectors = alt.Chart(source).mark_point().encode(
    x="x:Q",
    opacity=alt.value(0),
).add_params(
    nearest
)

# Draw points on the line, and highlight based on selection
points = line.mark_point().encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)

# Draw text labels near the points, and highlight based on selection
text = line.mark_text(align="left", dx=5, dy=-5).encode(
    text=alt.condition(nearest, "y:Q", alt.value(" "))
)

# Draw a rule at the location of the selection
rules = alt.Chart(source).mark_rule(color="gray").encode(
    x="x:Q",
).transform_filter(
    nearest
)

# Put the five layers into a chart and bind the data
alt.layer(
    line, selectors, points, rules, text
).properties(
    width=600, height=300
)

In [354]:
# Some queries are not affected by their ranking (no relevant docs, or same relevance docs):
print(f"Number of test queries that are invariant the under ranking model: {len(query_df[query_df['diff'] == 0])}")
print(f"% of test queries that are invariant the under ranking model: {len(query_df[query_df['diff'] == 0]) / len(query_df):.2%}")

Number of test queries that are invariant the under ranking model: 515
% of test queries that are invariant the under ranking model: 7.35%


Note: Under MRR this share increases to almost 9% as Rax MRR treats all documents with label > 0 as relevant, meaning these additional queries contain no irrelevant document.
- Number of test queries that are invariant the under ranking model: 624
- % of test queries that are invariant the under ranking model: 8.90%