In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.express as px

In [2]:
esci_data_path = "../../../esci-data/shopping_queries_dataset/"

df_examples = pd.read_parquet(esci_data_path + "shopping_queries_dataset_examples.parquet")

In [3]:
df_examples.head(20)

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split
0,0,revent 80 cfm,0,B000MOO21W,us,I,0,1,train
1,1,revent 80 cfm,0,B07X3Y6B1V,us,E,0,1,train
2,2,revent 80 cfm,0,B07WDM7MQQ,us,E,0,1,train
3,3,revent 80 cfm,0,B07RH6Z8KW,us,E,0,1,train
4,4,revent 80 cfm,0,B07QJ7WYFQ,us,E,0,1,train
5,5,revent 80 cfm,0,B076Q7V5WX,us,E,0,1,train
6,6,revent 80 cfm,0,B075ZBF9HG,us,E,0,1,train
7,7,revent 80 cfm,0,B06W2LB17J,us,E,0,1,train
8,8,revent 80 cfm,0,B07JY1PQNT,us,E,0,1,train
9,9,revent 80 cfm,0,B01MZIK0PI,us,E,0,1,train


In [4]:
df_examples.describe(include='object')

Unnamed: 0,query,product_id,product_locale,esci_label,split
count,2621288,2621288,2621288,2621288,2621288
unique,130193,1802772,3,4,2
top,airpods,B01HFFXLNA,us,E,train
freq,233,120,1818825,1708158,1983272


In [5]:
df_examples.query_id.unique().size

130652

## Query set

A query set has the columns:
 * query
 * weight

There is currently no date corresponding to the query set.
And currently the sampling is not done based on frequency

In [6]:
np.random.seed(10)

In [7]:
# Sample query set
n_query_set = 20
# todo: sampling proportional to frequency
query_set = np.random.choice(df_examples["query"].unique(), n_query_set, replace=False)

# todo: perhaps consider product_locale? what about small/large version?

df_query_set = pd.DataFrame({"query": query_set})
df_query_set["weight"] = 1
df_query_set.head(10)

Unnamed: 0,query,weight
0,ゆうびん番号枠入り はがき 切手なし,1
1,habitrail ovo twist,1
2,green wall decals,1
3,cruz comunion niño,1
4,fishing pole,1
5,citizen limited edition,1
6,nail light,1
7,舞台 アイシャドウ,1
8,explorer k2,1
9,bluray movie collection,1


In [8]:
df_query_set.to_csv("query_set.csv")

## Judgments

The judgments dataset has a row per query instance and document and has the following columns:
 * datetime: date of query/document instance
 * query_id: identifier of query instance
 * query: the query
 * document: identifier of a document result
 * judgment: Here we use the proposed ESCI mapping for DCG: `{"E": 0, "S": 1, "C": 2, "I": 3}`

In [9]:
# Select judgments
# Map esci_label to score
# create judgments per day in range
# create noise in score

label_num = {"E": 0, "S": 1, "C": 2, "I": 3}
label_score = [1, 0.1, 0.01, 0]
label_p_noise = 0.1

def label_noise(label):
    index = label_num[label]
    x = np.random.rand()
    if x < label_p_noise:
        index += 1 if x < label_p_noise / 2 else -1
        index = np.clip(index, 0, 3)
    return "ESCI"[index]

def label_to_score(label):
    return label_score[label_num[label]]


# Create dataset for 7 days
n_days = 7
end_date = datetime(2024,7,1)

dfs = []


for x in range(n_days):
    col_date = end_date - timedelta(days=x)
    df_judge = df_examples[df_examples["query"].isin(query_set)].copy()
    df_judge["judgment"] = df_judge.esci_label.apply(lambda x: label_to_score(label_noise(x)))
    df_judge["document"] = df_judge.product_id
    df_judge["datetime"] = col_date
    dfs.append(df_judge[["datetime", "query_id", "query", "document", "judgment"]])

df_judge = pd.concat(dfs).reset_index(drop=True)
df_judge.head(20)

Unnamed: 0,datetime,query_id,query,document,judgment
0,2024-07-01,4428,4t camo sweat shirt,B08229CW85,1.0
1,2024-07-01,4428,4t camo sweat shirt,B07ZND27W8,0.1
2,2024-07-01,4428,4t camo sweat shirt,B07NJSHRGQ,0.1
3,2024-07-01,4428,4t camo sweat shirt,B07MY27Q2W,0.1
4,2024-07-01,4428,4t camo sweat shirt,B07MMT8RWR,1.0
5,2024-07-01,4428,4t camo sweat shirt,B07HDZXWHY,1.0
6,2024-07-01,4428,4t camo sweat shirt,B07H9X9QPF,0.1
7,2024-07-01,4428,4t camo sweat shirt,B07FXDR1QN,0.1
8,2024-07-01,4428,4t camo sweat shirt,B0773ZZB9V,1.0
9,2024-07-01,4428,4t camo sweat shirt,B079FHZ7WN,0.1


In [10]:
df_judge.to_csv("judgments.csv")

## Fake Search Algorithms

These are not real search algorithms, rather fake algorithms in order to compute metrics.
An algorithm receives a document result set for a query and returns which documents it keeps
and what is the score that is assigned to them.

In [11]:
import hashlib

def digest_str(s):
    m = hashlib.sha256()
    m.update(str.encode(s))
    return float(m.digest()[0]) / 256

class DropAndTopSearch:
    """
    Randomly drops documents from judgments and returns scores for kept documents.
    Emulates selecting some documents presents in judgments and showing them in top positions.
    """
    def __init__(self, name, p_keep):
        self.name = name
        self.p_keep = p_keep
    def run_search(self, df_judgment):
        """
        inputs
          df_judgment: query results dataframe for a single query,
            it contains the columns `query`, `document`.
            
        returns additional columns
          keep: whether the document is returned as part of the search
          score: what is the score by which the document is ranked
        """
        seed = df_judgment["query"] + df_judgment.document + self.name
        num = seed.apply(digest_str)
        df_judgment = df_judgment.copy()
        df_judgment["keep"] = num < self.p_keep
        df_judgment["score"] = np.where(num < self.p_keep, 1-num, 0)
        return df_judgment

ts1 = DropAndTopSearch("research_1", 0.1)
ts2 = DropAndTopSearch("research_2", 0.3)
ts3 = DropAndTopSearch("baseline", 0.3)
ts4 = DropAndTopSearch("challenger_1", 0.4)

search_configs = [ts1, ts2, ts3, ts4]

ts2.run_search(df_judge).head(20)

Unnamed: 0,datetime,query_id,query,document,judgment,keep,score
0,2024-07-01,4428,4t camo sweat shirt,B08229CW85,1.0,False,0.0
1,2024-07-01,4428,4t camo sweat shirt,B07ZND27W8,0.1,False,0.0
2,2024-07-01,4428,4t camo sweat shirt,B07NJSHRGQ,0.1,False,0.0
3,2024-07-01,4428,4t camo sweat shirt,B07MY27Q2W,0.1,True,0.980469
4,2024-07-01,4428,4t camo sweat shirt,B07MMT8RWR,1.0,False,0.0
5,2024-07-01,4428,4t camo sweat shirt,B07HDZXWHY,1.0,False,0.0
6,2024-07-01,4428,4t camo sweat shirt,B07H9X9QPF,0.1,True,0.878906
7,2024-07-01,4428,4t camo sweat shirt,B07FXDR1QN,0.1,False,0.0
8,2024-07-01,4428,4t camo sweat shirt,B0773ZZB9V,1.0,True,0.890625
9,2024-07-01,4428,4t camo sweat shirt,B079FHZ7WN,0.1,True,0.941406


## Metrics

### Metrics definitions

Metrics evaluate the performance of a search configuration
To this purpose, they receive a dataframe that contains a search result per row
Metrics functions expect the following columns:
 * keep: whether the result is kept by the search config under evaluation
 * score: the score given to the search result (higher is better)
 * judgment: the judgment value for that row


In [12]:
def dcg(df, sorting_col="score", mask="keep"):
    df = df.sort_values(sorting_col, ascending=False)
    positions = np.arange(df.query_id.size) + 1
    dcgs = (2 ** df.judgment - 1) / np.log2(positions+1)
    dcgs = dcgs * df[mask] if mask else dcgs
    return np.sum(dcgs)

def ndcg(df):
    return dcg(df) / dcg(df, "judgment", None)

def precision(df, k):
    df = df.sort_values("score", ascending=False)
    return (df.judgment * df.keep).head(k).sum() / k

def jaccard(df):
    common_count = (df.keep&df.ref_keep).sum()
    union_count = (df.keep|df.ref_keep).sum()
    return common_count / union_count if common_count > 0.0 else common_count

metrics = [
    ("dcg", dcg, None),
    ("ndcg", ndcg, None),
    ("prec@1", lambda x: precision(x, 1), None),
    ("jaccard", jaccard, ts3),
]

### Metrics Calculation

Nested loop over:
 * Search configurations
 * Metrics
 * Judgments dataset (grouped by date and query)

In [13]:
df_metrics = []

for search in search_configs:
    for m_name, m_function, ref_search in metrics:
        for (dt, q_id, query), df_sq in df_judge.groupby(["datetime", "query_id", "query"]):
            ranked_df_sq = search.run_search(df_sq)
            if ref_search:
                ref_ranked_df_sq = ref_search.run_search(df_sq)
                ranked_df_sq["ref_keep"] = ref_ranked_df_sq.keep
                ranked_df_sq["ref_score"] = ref_ranked_df_sq.score
            metric = m_function(ranked_df_sq)
            df_metrics.append(pd.DataFrame({
                "datetime": [dt],
                "search_config": [search.name],
                "query_id": [q_id],
                "query": [query],
                "metric": [m_name],
                "value": [metric],
            }))
df_metrics = pd.concat(df_metrics)

### Metrics dataset calculation

The metrics dataset contains the result of evaluating a search config on judgments.
Each row contains the evaluation of a search config on a query on a given metric.

It contains the following fields:
 * datetime: the date and time of the judgment
 * search_config: search config under evaluation
 * query_id: query_id from which judgment was derived
 * query: query from which judgment was derived
 * metric: name of metric used for evaluation
 * value: evaluation result

In [14]:
df_metrics.sample(10)

Unnamed: 0,datetime,search_config,query_id,query,metric,value
0,2024-06-28,research_1,94694,small dog muzzle,jaccard,0.5
0,2024-06-27,research_2,120498,スマホ レーダー探知機,prec@1,0.01
0,2024-06-29,challenger_1,120498,スマホ レーダー探知機,jaccard,0.333333
0,2024-07-01,baseline,40757,fishing pole,prec@1,0.1
0,2024-06-26,research_2,38683,explorer k2,jaccard,0.157895
0,2024-06-29,baseline,46483,gold shirt women,ndcg,0.435409
0,2024-06-25,research_1,26436,citizen limited edition,ndcg,0.025632
0,2024-07-01,research_2,118297,エプロン 子供,jaccard,0.222222
0,2024-06-29,research_1,94694,small dog muzzle,dcg,2.13093
0,2024-06-26,baseline,17666,bluray movie collection,dcg,2.13093


In [15]:
df_metrics.to_csv("metrics.csv")