In [1]:
!pip install --quiet altair


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import altair as alt
import pandas as pd

In [3]:
# User click logs: only the five listed columns are read from the parquet file.
click_df = pd.read_parquet(
    "output/clicks.parquet",
    columns=["query", "title", "abstract", "position", "click"],
)

# Expert annotations: every column stored in the file is loaded.
annotation_df = pd.read_parquet("output/annotations.parquet")

## Utils

In [4]:
def match(df1, df2, columns, unique=False):
    """Join two DataFrames on a shared set of key columns.

    Uses ``DataFrame.merge`` with its default join type, so only key
    combinations present in both frames survive.

    Args:
        df1: Left DataFrame; must contain every column in ``columns``.
        df2: Right DataFrame; must contain every column in ``columns``.
        columns: List of column names used as the join keys.
        unique: When True, each frame is first reduced to its distinct key
            combinations (only the key columns are kept), so every shared
            combination appears exactly once in the result.

    Returns:
        The merged DataFrame. With ``unique=False`` all columns of both
        frames are kept and a key combination that occurs m times in ``df1``
        and n times in ``df2`` contributes m * n rows.
    """
    if not unique:
        # Plain row-level join: duplicates on either side multiply.
        return df1.merge(df2, on=columns)

    left_keys = df1[columns].drop_duplicates(subset=columns)
    right_keys = df2[columns].drop_duplicates(subset=columns)
    return left_keys.merge(right_keys, on=columns)

def missing_text(df, column, missing_token="21429"):
    """Report how often ``df[column]`` equals ``missing_token``.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to inspect.
        missing_token: Value treated as "missing". NOTE(review): the default
            "21429" is presumably the placeholder the upstream pipeline
            writes for absent text -- confirm against the data source.

    Returns:
        A dict with two entries: ``"sum"`` is the number of rows equal to
        the token and ``"mean"`` is the fraction of rows equal to it.
    """
    is_missing = df[column] == missing_token
    stats = is_missing.agg(["sum", "mean"])
    return stats.to_dict()

In [5]:
# Headline statistics of the expert-annotation dataset, written with a single
# print call that puts each statistic on its own line.
# Note that the last statistic pairs `query_id` (not the query text) with `title`.
print(
    "# Expert annotations",
    f"Unique queries (by text): {annotation_df['query'].nunique():,}",
    f"Unique queries (by id): {annotation_df['query_id'].nunique():,}",
    f"Query-doc pairs: {len(annotation_df):,}",
    f"Unique query/title comb.: {len(annotation_df[['query_id', 'title']].drop_duplicates()):,}",
    sep="\n",
)

# Expert annotations
Unique queries (by text): 7,008
Unique queries (by id): 5,201
Query-doc pairs: 397,572
Unique query/title comb.: 368,792


In [6]:
# Distinct query texts in each dataset. NaN is dropped so the set sizes agree
# with what `Series.nunique()` reports.
annotation_queries = set(annotation_df["query"].dropna().unique())
click_queries = set(click_df["query"].dropna().unique())

print("# User clicks")
print(f"Unique queries (by text): {len(click_queries):,}")
# Use a real set difference: subtracting the two unique counts is only correct
# when every query in the click data also occurs in the annotations.
print(f"Unique queries (by text) not in click dataset: {len(annotation_queries - click_queries):,}")
print(f"Query-doc pairs: {len(click_df):,}")

# User clicks
Unique queries (by text): 3,366
Unique queries (by text) not in click dataset: 3,642
Query-doc pairs: 59,891,217


In [None]:
print("# Text analysis")
# The same three text columns are checked for the missing-text token in both
# datasets; each dataset section is followed by a blank line.
for heading, frame in (("## Expert annotations", annotation_df), ("## User clicks", click_df)):
    print(heading)
    for text_column in ("query", "title", "abstract"):
        print(f"Missing {text_column}: {missing_text(frame, column=text_column)}")
    print()

print("# Text match")
pair_columns = ["query", "title"]
# Row-level join: duplicated pairs on either side multiply, so this counts
# matching row combinations rather than distinct pairs.
print(f"Query/title comb. in both datasets: {len(match(annotation_df, click_df, columns=pair_columns)):,}")

# The de-duplicated join is computed once and reused for the count and the share.
shared_pairs = len(match(annotation_df, click_df, columns=pair_columns, unique=True))
annotated_pairs = len(annotation_df[pair_columns].drop_duplicates())
print(f"Unique query/title comb. in both datasets: {shared_pairs:,}")
# The label announces a percentage, so format the ratio as one instead of
# printing the raw fraction.
print(f"% of query/title comb. of the test set occur in both datasets: {shared_pairs / annotated_pairs:.2%}")

# NOTE(review): the saved output of this cell has no line for the statement
# below -- presumably the run was interrupted before it finished; confirm.
print(f"Query/title/abstract comb. in both datasets: {len(match(annotation_df, click_df, columns=['query', 'title', 'abstract'])):,}")

# Text analysis
## Expert annotations
Missing query: {'sum': 0.0, 'mean': 0.0}
Missing title: {'sum': 0.0, 'mean': 0.0}
Missing abstract: {'sum': 0.0, 'mean': 0.0}

## User clicks
Missing query: {'sum': 0.0, 'mean': 0.0}
Missing title: {'sum': 9744019.0, 'mean': 0.1626952913646754}
Missing abstract: {'sum': 56333874.0, 'mean': 0.940603260741888}

# Text match
Query/title comb. in both datasets: 51,746,434
Unique query/title comb. in both datasets: 8,672
% of query/title comb. of the test set occur in both datasets: 0.023512888435790996
