In [249]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.express as px

In [14]:
esci_data_path = "../esci-data/shopping_queries_dataset/"

# Parquet file with one (query, product) example per row.
examples_file = esci_data_path + "shopping_queries_dataset_examples.parquet"
df_examples = pd.read_parquet(examples_file)

In [15]:
# Preview the first rows of the examples table.
df_examples.head(20)

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split
0,0,revent 80 cfm,0,B000MOO21W,us,I,0,1,train
1,1,revent 80 cfm,0,B07X3Y6B1V,us,E,0,1,train
2,2,revent 80 cfm,0,B07WDM7MQQ,us,E,0,1,train
3,3,revent 80 cfm,0,B07RH6Z8KW,us,E,0,1,train
4,4,revent 80 cfm,0,B07QJ7WYFQ,us,E,0,1,train
5,5,revent 80 cfm,0,B076Q7V5WX,us,E,0,1,train
6,6,revent 80 cfm,0,B075ZBF9HG,us,E,0,1,train
7,7,revent 80 cfm,0,B06W2LB17J,us,E,0,1,train
8,8,revent 80 cfm,0,B07JY1PQNT,us,E,0,1,train
9,9,revent 80 cfm,0,B01MZIK0PI,us,E,0,1,train


In [16]:
# Summary (count, unique, top, freq) restricted to the object-dtype columns.
df_examples.describe(include='object')

Unnamed: 0,query,product_id,product_locale,esci_label,split
count,2621288,2621288,2621288,2621288,2621288
unique,130193,1802772,3,4,2
top,airpods,B01HFFXLNA,us,E,train
freq,233,120,1818825,1708158,1983272


In [17]:
# Number of distinct query ids (missing values, if any, count as one value).
df_examples["query_id"].nunique(dropna=False)

130652

## Query set

In [28]:
# Seed NumPy's global RNG: the query sampling (np.random.choice) and the label
# noise (np.random.rand) below both draw from it.
np.random.seed(10)

In [185]:
# Sample query set
n_query_set = 20
# todo: sampling proportional to frequency
distinct_queries = df_examples["query"].unique()
query_set = np.random.choice(distinct_queries, n_query_set, replace=False)

# todo: perhaps consider product_locale? what about small/large version?

# Every sampled query gets the same weight of 1.
df_query_set = pd.DataFrame({"query": query_set, "weight": 1})
df_query_set.head(10)

Unnamed: 0,query,weight
0,3d tree wall owl,1
1,handlebars,1
2,joyce meyer,1
3,mct oil capsules,1
4,nike air force 1 men,1
5,level tools,1
6,solar pool covers for inground pools,1
7,cute vans for women,1
8,classic black dresses for women,1
9,cuadro para oficina,1


## Judgments

In [186]:
# Select judgments
# Map esci_label to score
# create judgments per day in range
# create noise in score

# Position of each ESCI label within the string "ESCI".
label_num = {"E": 0, "S": 1, "C": 2, "I": 3}
# Judgment score per label, indexed by the position defined in `label_num`.
label_score = [1, 0.1, 0.01, 0]
# Probability that `label_noise` perturbs a label.
label_p_noise = 0.1


def label_noise(label):
    """Return `label`, possibly moved to a neighbouring ESCI label.

    One uniform number is drawn from NumPy's global RNG per call. When it is
    below `label_p_noise`, the label's position in "ESCI" moves by one step:
    +1 for the lower half of that range, -1 for the upper half. The position
    is clamped to 0..3, so a step past either end leaves the label unchanged.
    """
    position = label_num[label]
    draw = np.random.rand()
    if draw >= label_p_noise:
        return "ESCI"[position]
    step = 1 if draw < label_p_noise / 2 else -1
    return "ESCI"[min(max(position + step, 0), 3)]


def label_to_score(label):
    """Return the numeric judgment score configured for an ESCI label."""
    position = label_num[label]
    return label_score[position]


# Create dataset for 7 days
n_days = 7
end_date = datetime(2024,7,1)

# The rows of the sampled queries are the same for every day, so select them
# once instead of re-filtering `df_examples` on each iteration.
df_query_examples = df_examples[df_examples["query"].isin(query_set)]

dfs = []

for day_offset in range(n_days):
    # Days are generated backwards in time, starting at `end_date`.
    col_date = end_date - timedelta(days=day_offset)
    df_day = df_query_examples.copy()
    # Independent label noise is drawn for every row of every day.
    df_day["judgment"] = df_day.esci_label.apply(lambda label: label_to_score(label_noise(label)))
    df_day["document"] = df_day.product_id
    df_day["datetime"] = col_date
    dfs.append(df_day[["datetime", "query_id", "query", "document", "judgment"]])

# One row per (day, query, document) with a fresh 0..n-1 index.
df_judge = pd.concat(dfs).reset_index(drop=True)
df_judge.head(20)

Unnamed: 0,datetime,query_id,query,document,judgment
0,2024-07-01,3885,3d tree wall owl,B0797N6X4C,0.1
1,2024-07-01,3885,3d tree wall owl,B07W73XXQX,0.1
2,2024-07-01,3885,3d tree wall owl,B07TLKZT2Z,0.1
3,2024-07-01,3885,3d tree wall owl,B07YSCJSL3,1.0
4,2024-07-01,3885,3d tree wall owl,B07MDC94SD,0.1
5,2024-07-01,3885,3d tree wall owl,B07DLB4YKY,0.1
6,2024-07-01,3885,3d tree wall owl,B07BH3XQT2,1.0
7,2024-07-01,3885,3d tree wall owl,B075QGM8ZF,0.1
8,2024-07-01,3885,3d tree wall owl,B01E2Y7OZS,1.0
9,2024-07-01,3885,3d tree wall owl,B072DY7WJZ,0.1


## Fake Search Algorithms

In [233]:
import hashlib

def digest_str(s):
    """Map a string to a deterministic float in [0, 1).

    The value is the first byte of the SHA-256 digest of the encoded string
    divided by 256, so only 256 distinct results are possible.
    """
    first_byte = hashlib.sha256(s.encode()).digest()[0]
    return first_byte / 256

class DropAndTopSearch:
    """
    Fake search that drops documents from the judgments and scores the rest.

    Each (query, document, config name) triple is hashed to a number via
    `digest_str`; documents whose number is below `p_keep` are kept and get
    the score `1 - number`, all other documents are dropped with score 0.
    This emulates selecting some of the documents present in the judgments
    and showing them in the top positions.
    """

    def __init__(self, name, p_keep):
        # name: identifier of the config; it is also part of the hash input.
        self.name = name
        # p_keep: threshold on the hashed number below which a document is kept.
        self.p_keep = p_keep

    def run_search(self, df_judgment):
        """
        inputs
          df_judgment: query results dataframe,
            it contains the columns `query`, `document`.

        returns a copy of `df_judgment` with the additional columns
          keep: whether the document is returned as part of the search
          score: what is the score by which the document is ranked
        """
        hash_inputs = df_judgment["query"] + df_judgment.document + self.name
        hash_values = hash_inputs.apply(digest_str)
        is_kept = hash_values < self.p_keep

        result = df_judgment.copy()
        result["keep"] = is_kept
        result["score"] = np.where(is_kept, 1 - hash_values, 0)
        return result

# Fake search configurations, each built from a name and a keep threshold.
search_configs = [
    DropAndTopSearch("research_1", 0.1),
    DropAndTopSearch("research_2", 0.3),
    DropAndTopSearch("baseline", 0.3),
    DropAndTopSearch("challenger_1", 0.4),
]
# Individual handles, in the same order as the list above.
ts1, ts2, ts3, ts4 = search_configs

ts2.run_search(df_judge).head(20)

Unnamed: 0,datetime,query_id,query,document,judgment,keep,score
0,2024-07-01,3885,3d tree wall owl,B0797N6X4C,0.1,False,0.0
1,2024-07-01,3885,3d tree wall owl,B07W73XXQX,0.1,True,0.894531
2,2024-07-01,3885,3d tree wall owl,B07TLKZT2Z,0.1,False,0.0
3,2024-07-01,3885,3d tree wall owl,B07YSCJSL3,1.0,False,0.0
4,2024-07-01,3885,3d tree wall owl,B07MDC94SD,0.1,False,0.0
5,2024-07-01,3885,3d tree wall owl,B07DLB4YKY,0.1,False,0.0
6,2024-07-01,3885,3d tree wall owl,B07BH3XQT2,1.0,False,0.0
7,2024-07-01,3885,3d tree wall owl,B075QGM8ZF,0.1,False,0.0
8,2024-07-01,3885,3d tree wall owl,B01E2Y7OZS,1.0,False,0.0
9,2024-07-01,3885,3d tree wall owl,B072DY7WJZ,0.1,False,0.0


## Metrics

### Metrics definitions

Metrics evaluate the performance of a search configuration.
To this end, each metric function receives a dataframe that contains one search result per row.
Metric functions expect the following columns:
 * keep: whether the result is kept by the search config under evaluation
 * score: the score given to the search result (higher is better)
 * judgment: the judgment value for that row


In [242]:
def dcg(df, sorting_col="score", mask="keep"):
    """Discounted cumulative gain of the rows of `df` ranked by `sorting_col`.

    Rows are ranked by `sorting_col` in descending order; the row at 1-based
    position i contributes (2 ** judgment - 1) / log2(i + 1).

    inputs
      df: dataframe with a `judgment` column, the `sorting_col` column and,
        when `mask` is given, the `mask` column.
      sorting_col: name of the column to rank by (higher values rank first).
      mask: name of a boolean column; rows where it is False contribute
        nothing but still occupy a position. Pass None to count every row.

    returns the sum of the per-row gains.
    """
    ranked = df.sort_values(sorting_col, ascending=False)
    # Count rows with len() so that no column beyond those listed above is needed.
    positions = np.arange(1, len(ranked) + 1)
    gains = (2 ** ranked.judgment - 1) / np.log2(positions + 1)
    if mask:
        gains = gains * ranked[mask]
    return np.sum(gains)

def ndcg(df):
    """Normalized DCG: DCG of the search ranking divided by the ideal DCG.

    The ideal DCG ranks every row of `df` by `judgment` and applies no mask.
    When the ideal DCG is 0 (for example when every judgment is 0) the ratio
    is undefined; 0.0 is returned in that case instead of dividing by zero.
    """
    ideal = dcg(df, "judgment", None)
    if ideal == 0:
        return 0.0
    return dcg(df) / ideal

def precision(df, k):
    """Sum of the judgments of kept rows among the `k` top-scored rows, divided by `k`.

    Rows are ranked by `score` in descending order. Rows whose `keep` is False
    contribute 0. The divisor is always `k`, even when `df` has fewer rows.
    """
    top_k = df.sort_values("score", ascending=False).head(k)
    return (top_k.judgment * top_k.keep).sum() / k

def jaccard(df):
    """Jaccard index between the rows kept by the search and by the reference.

    Expects boolean columns `keep` and `ref_keep`; returns the number of rows
    kept by both divided by the number of rows kept by at least one of them.
    NOTE(review): when neither keeps any row the division is 0 / 0 — confirm
    that the resulting NaN is the intended value for that case.
    """
    in_both = (df.keep & df.ref_keep).sum()
    in_either = (df.keep | df.ref_keep).sum()
    return in_both / in_either

# Each entry is (metric name, metric function, reference search config).
# The reference is None for metrics that do not compare against another config.
metrics = [
    ("dcg", dcg, None),
    ("ndcg", ndcg, None),
    ("prec@1", lambda df: precision(df, 1), None),
    ("jaccard", jaccard, ts3),
]

### Metrics Calculation

Nested loop over:
 * Search configurations
 * Metrics
 * Judgments dataset (grouped by date and query)

In [246]:
group_cols = ["datetime", "query_id", "query"]
# Split the judgments once; the groups are the same for every config and metric.
judgment_groups = list(df_judge.groupby(group_cols))

metric_records = []

for search in search_configs:
    # Rank each group once per search config and reuse the result for all
    # metrics, so every metric of a config is computed on the same ranking.
    ranked_groups = [
        (key, df_sq, search.run_search(df_sq)) for key, df_sq in judgment_groups
    ]
    for m_name, m_function, ref_search in metrics:
        for (dt, q_id, query), df_sq, ranked_df_sq in ranked_groups:
            if ref_search is not None:
                ref_ranked_df_sq = ref_search.run_search(df_sq)
                # assign() returns a new frame, so the cached ranking stays untouched.
                ranked_df_sq = ranked_df_sq.assign(
                    ref_keep=ref_ranked_df_sq.keep,
                    ref_score=ref_ranked_df_sq.score,
                )
            metric_records.append({
                "datetime": dt,
                "search_config": search.name,
                "query_id": q_id,
                "query": query,
                "metric": m_name,
                "value": m_function(ranked_df_sq),
            })

# Build the frame once from plain records: this avoids creating and
# concatenating one single-row DataFrame per result and yields a unique index.
df_metrics = pd.DataFrame(
    metric_records,
    columns=["datetime", "search_config", "query_id", "query", "metric", "value"],
)

### Metrics dataset

The metrics dataset contains the result of evaluating a search config on judgments.
Each row contains the evaluation of a search config on a query on a given metric.

It contains the following fields:
 * datetime: the date and time of the judgment
 * search_config: search config under evaluation
 * query_id: query_id from which judgment was derived
 * query: query from which judgment was derived
 * metric: name of metric used for evaluation
 * value: evaluation result

In [248]:
# Inspect a random sample of metric rows.
df_metrics.sample(10)

Unnamed: 0,datetime,search_config,query_id,query,metric,value
0,2024-06-25,challenger_1,63476,long johns for men,jaccard,0.125
0,2024-06-28,challenger_1,66671,mct oil capsules,jaccard,0.25
0,2024-06-26,research_2,61683,level tools,ndcg,0.435728
0,2024-06-30,challenger_1,26561,classic black dresses for women,dcg,5.123781
0,2024-06-30,research_1,30507,cuadro para oficina,jaccard,0.0
0,2024-06-30,research_1,48936,handlebars,prec@1,1.0
0,2024-07-01,research_1,104600,toyota white touch up,jaccard,0.176471
0,2024-06-30,research_2,32916,diabetic massager,jaccard,0.181818
0,2024-06-26,challenger_1,61683,level tools,prec@1,1.0
0,2024-06-30,research_2,63476,long johns for men,jaccard,0.125


In [240]:
# Mean value of every metric per search config, over all queries and days.
df_metrics.groupby(["search_config", "metric"])[["value"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,value
search_config,metric,Unnamed: 2_level_1
baseline,dcg,2.590198
baseline,jaccard,1.0
baseline,ndcg,0.411534
baseline,prec@1,0.790071
challenger_1,dcg,3.175085
challenger_1,jaccard,0.224918
challenger_1,ndcg,0.504038
challenger_1,prec@1,0.774071
research_1,dcg,1.156757
research_1,jaccard,0.066443


In [260]:
# Mean value of every metric per search config and day, over all queries.
df_metrics.groupby(["search_config", "metric", "datetime"])[["value"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value
search_config,metric,datetime,Unnamed: 3_level_1
baseline,dcg,2024-06-25,2.523220
baseline,dcg,2024-06-26,2.548838
baseline,dcg,2024-06-27,2.679400
baseline,dcg,2024-06-28,2.593999
baseline,dcg,2024-06-29,2.578848
...,...,...,...
research_2,prec@1,2024-06-27,0.765500
research_2,prec@1,2024-06-28,0.725000
research_2,prec@1,2024-06-29,0.810500
research_2,prec@1,2024-06-30,0.765500


# Visualizations

## Visualization of single configuration

In [269]:
# Daily mean of the selected metrics for one search config.
cols = ["search_config", "datetime", "metric"]
sel = (
    df_metrics.search_config.isin(["research_1"])
    & df_metrics.metric.isin(["prec@1", "jaccard"])
)
df = df_metrics[sel].groupby(cols)[["value"]].mean().reset_index()

# One line per metric.
fig = px.line(df, x="datetime", y="value", color="metric")
fig.update_yaxes(matches=None)
fig

## Evaluating multiple search configurations

In [273]:
# Daily mean of the selected metrics for several search configs.
cols = ["search_config", "datetime", "metric"]
sel = (
    df_metrics.search_config.isin(["research_1", "research_2", "challenger_1"])
    & df_metrics.metric.isin(["prec@1", "dcg"])
)
df = df_metrics[sel].groupby(cols)[["value"]].mean().reset_index()

# One facet row per metric, one line per search config; y-axes are not shared.
fig = px.line(df, x="datetime", y="value", color="search_config", facet_row="metric")
fig.update_yaxes(matches=None)
fig

In [289]:
# Distribution of the per-query, per-day jaccard values for each search config.
sel = (
    df_metrics.search_config.isin(["research_1", "research_2", "challenger_1"])
    & df_metrics.metric.isin(["jaccard"])
)
df = df_metrics[sel][["value", "metric", "search_config", "query", "datetime"]]

fig = px.violin(df, y="value", x="search_config", points="all", color="metric", hover_data=df.columns)
fig

In [292]:
# Distribution of the per-query, per-day dcg values for each search config.
sel = (
    df_metrics.search_config.isin(["research_1", "research_2", "challenger_1"])
    & df_metrics.metric.isin(["dcg"])
)
df = df_metrics[sel][["value", "metric", "search_config", "query", "datetime"]]

fig = px.violin(df, y="value", x="search_config", points="all", color="metric", hover_data=df.columns)
fig

## Comparing two search configurations

In [293]:
# Distribution of the per-query, per-day dcg values: baseline vs. challenger_1.
sel = (
    df_metrics.search_config.isin(["baseline", "challenger_1"])
    & df_metrics.metric.isin(["dcg"])
)
df = df_metrics[sel][["value", "metric", "search_config", "query", "datetime"]]

fig = px.violin(df, y="value", x="search_config", points="all", color="metric", hover_data=df.columns)
fig