In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.express as px
pd.set_option('display.float_format','{:.2f}'.format)

In [2]:
esci_data_path = "../../../esci-data/shopping_queries_dataset/"

df_examples = pd.read_parquet(esci_data_path + "shopping_queries_dataset_examples.parquet")

In [3]:
df_examples.head(20)

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split
0,0,revent 80 cfm,0,B000MOO21W,us,I,0,1,train
1,1,revent 80 cfm,0,B07X3Y6B1V,us,E,0,1,train
2,2,revent 80 cfm,0,B07WDM7MQQ,us,E,0,1,train
3,3,revent 80 cfm,0,B07RH6Z8KW,us,E,0,1,train
4,4,revent 80 cfm,0,B07QJ7WYFQ,us,E,0,1,train
5,5,revent 80 cfm,0,B076Q7V5WX,us,E,0,1,train
6,6,revent 80 cfm,0,B075ZBF9HG,us,E,0,1,train
7,7,revent 80 cfm,0,B06W2LB17J,us,E,0,1,train
8,8,revent 80 cfm,0,B07JY1PQNT,us,E,0,1,train
9,9,revent 80 cfm,0,B01MZIK0PI,us,E,0,1,train


In [4]:
df_examples.describe(include='object')

Unnamed: 0,query,product_id,product_locale,esci_label,split
count,2621288,2621288,2621288,2621288,2621288
unique,130193,1802772,3,4,2
top,airpods,B01HFFXLNA,us,E,train
freq,233,120,1818825,1708158,1983272


In [5]:
df_examples.query_id.unique().size

130652

## Query set

A query set has the columns:
 * query_set_id
 * query

Potentially other columns:
 * (head/torso/tail)

There is currently no date corresponding to the query set.
And currently the sampling is not done based on frequency

In [6]:
np.random.seed(10)

In [7]:
# Sample query sets

query_sets = [("sampled_queries", 200), ("top_queries", 20)]

res = []

for query_set_id, n_query_set in query_sets:
    # todo: sampling proportional to frequency
    query_set = np.random.choice(df_examples["query"].unique(), n_query_set, replace=False)

    # todo: perhaps consider product_locale? what about small/large version?

    df = pd.DataFrame({"query": query_set})
    df["query_set_id"] = query_set_id
    res.append(df)
df_query_set = pd.concat(res)
df_query_set.head(10)

Unnamed: 0,query,query_set_id
0,ゆうびん番号枠入り はがき 切手なし,sampled_queries
1,habitrail ovo twist,sampled_queries
2,green wall decals,sampled_queries
3,cruz comunion niño,sampled_queries
4,fishing pole,sampled_queries
5,citizen limited edition,sampled_queries
6,nail light,sampled_queries
7,舞台 アイシャドウ,sampled_queries
8,explorer k2,sampled_queries
9,bluray movie collection,sampled_queries


In [8]:
df_query_set.to_csv("query_set.csv", index=False)

## Judgments

The judgments dataset has a row per query instance and document and has the following columns:
 * datetime: date of query/document instance
 * query_id: identifier of query instance
 * query: the query
 * document: identifier of a document result
 * judgment: Here we use the proposed ESCI mapping for DCG: `{"E": 0, "S": 1, "C": 2, "I": 3}`

In [9]:
# Select judgments
# Map esci_label to score
# create judgments per day in range
# create noise in score

label_num = {"E": 0, "S": 1, "C": 2, "I": 3}
label_score = [1, 0.1, 0.01, 0]
label_p_noise = 0.1

def label_to_score(label):
    return label_score[label_num[label]]

df_judge = df_examples[df_examples["query"].isin(set(df_query_set["query"].values))].copy()
df_judge["judgment"] = df_judge.esci_label.apply(lambda x: label_to_score(x))
df_judge["document"] = df_judge.product_id
df_judge = df_judge[["query", "document", "judgment"]].reset_index(drop=True)
df_judge.head(20)

Unnamed: 0,query,document,judgment
0,00g 目立たない,B08TTRRTKY,0.1
1,00g 目立たない,B08THGRFN5,0.1
2,00g 目立たない,B08HGRMFF3,0.1
3,00g 目立たない,B07XG2BK5P,1.0
4,00g 目立たない,B08F6ZJQBV,0.1
5,00g 目立たない,B08X6M1XDH,0.1
6,00g 目立たない,B07RXZ7MND,1.0
7,00g 目立たない,B08HD6H9QZ,1.0
8,00g 目立たない,B0928VX2MR,0.1
9,00g 目立たない,B09B6YWQLL,1.0


In [10]:
df_judge.to_csv("judgments.csv", index=False)

## Fake Search Algorithms

These are not real search algorithms, rather fake algorithms in order to compute metrics.
An algorithm receives a document result set for a query and returns which documents it keeps
and what is the score that is assigned to them.

In [11]:
import hashlib

def digest_str(s):
    m = hashlib.sha256()
    m.update(str.encode(s))
    return float(m.digest()[0]) / 256

class DropAndTopSearch:
    """
    Randomly drops documents from judgments and returns scores for kept documents.
    Emulates selecting some documents presents in judgments and showing them in top positions.
    """
    def __init__(self, name, p_keep, noise):
        self.name = name
        self.p_keep = p_keep
        self.noise_magnitude = noise
    def run_search(self, df_judgment, date):
        """
        inputs
          df_judgment: query results dataframe for a single query,
            it contains the columns `query`, `document`.
            
        returns additional columns
          keep: whether the document is returned as part of the search
          score: what is the score by which the document is ranked
        """
        seed = df_judgment["query"] + df_judgment.document + self.name
        num = seed.apply(digest_str)
        noise_seed = self.name + date
        noise = (digest_str(noise_seed) - 0.5) * 2.0 * self.noise_magnitude
        df_judgment = df_judgment.copy()
        df_judgment["keep"] = num + noise < self.p_keep
        df_judgment["score"] = np.where(num + noise < self.p_keep, 1-num-noise, 0)
        return df_judgment

ts1 = DropAndTopSearch("research_1", 0.1, 0.05)
ts2 = DropAndTopSearch("research_2", 0.3, 0.1)
ts3 = DropAndTopSearch("baseline", 0.4, 0.01)
ts4 = DropAndTopSearch("challenger_1", 0.5, 0.02)

search_configs = [ts1, ts2, ts3, ts4]

ts2.run_search(df_judge, str(datetime(2024,7,1))).head(20)

Unnamed: 0,query,document,judgment,keep,score
0,00g 目立たない,B08TTRRTKY,0.1,False,0.0
1,00g 目立たない,B08THGRFN5,0.1,False,0.0
2,00g 目立たない,B08HGRMFF3,0.1,False,0.0
3,00g 目立たない,B07XG2BK5P,1.0,False,0.0
4,00g 目立たない,B08F6ZJQBV,0.1,False,0.0
5,00g 目立たない,B08X6M1XDH,0.1,False,0.0
6,00g 目立たない,B07RXZ7MND,1.0,False,0.0
7,00g 目立たない,B08HD6H9QZ,1.0,False,0.0
8,00g 目立たない,B0928VX2MR,0.1,True,0.76
9,00g 目立たない,B09B6YWQLL,1.0,False,0.0


## Metrics

### Metrics definitions

Metrics evaluate the performance of a search configuration
To this purpose, they receive a dataframe that contains a search result per row
Metrics functions expect the following columns:
 * keep: whether the result is kept by the search config under evaluation
 * score: the score given to the search result (higher is better)
 * judgment: the judgment value for that row


In [12]:
def dcg(df, sorting_col="score", mask="keep"):
    df = df.sort_values(sorting_col, ascending=False)
    positions = np.arange(df[sorting_col].size) + 1
    dcgs = (2 ** df.judgment - 1) / np.log2(positions+1)
    dcgs = dcgs * df[mask] if mask else dcgs
    return np.sum(dcgs)

def ndcg(df):
    return dcg(df) / dcg(df, "judgment", None)

def precision(df, k):
    df = df.sort_values("score", ascending=False)
    return (df.judgment * df.keep).head(k).sum() / k

def jaccard(df):
    common_count = (df.keep&df.ref_keep).sum()
    union_count = (df.keep|df.ref_keep).sum()
    return common_count / union_count if common_count > 0.0 else common_count

metrics = [
    ("dcg", dcg, None),
    ("ndcg", ndcg, None),
    ("prec@1", lambda x: precision(x, 1), None),
    ("prec@5", lambda x: precision(x, 1), None),
    ("prec@10", lambda x: precision(x, 10), None),
    ("jaccard", jaccard, ts3),
]

### Metrics Calculation

Nested loop over:
 * Search configurations
 * Metrics
 * Judgments dataset (grouped by date and query)

In [13]:
# Create dataset for 7 days
n_days = 7
end_date = datetime(2024,7,1)

df_metrics = []

for search in search_configs:
    for m_name, m_function, ref_search in metrics:
        for x in range(n_days):
            for (query_set_id, query), df_one_query_set in df_query_set.groupby(["query_set_id", "query"]):
                curr_date = end_date - timedelta(days=x)
                queries = df_one_query_set["query"].values
                judgments = df_judge[df_judge["query"]==query]
                ranked_df_sq = search.run_search(judgments, str(curr_date))
                if ref_search:
                    ref_ranked_df_sq = ref_search.run_search(judgments, str(curr_date))
                    ranked_df_sq["ref_keep"] = ref_ranked_df_sq.keep
                    ranked_df_sq["ref_score"] = ref_ranked_df_sq.score
                metric = m_function(ranked_df_sq)
                df_metrics.append(pd.DataFrame({
                    "datetime": [curr_date],
                    "search_config": [search.name],
                    "query_set_id": [query_set_id],
                    "query": [query],
                    "metric": [m_name],
                    "value": [metric],
                }))
df_metrics = pd.concat(df_metrics)

In [14]:
pd.crosstab([df_metrics.query_set_id, df_metrics.search_config], df_metrics.metric, df_metrics.value, aggfunc="mean")

Unnamed: 0_level_0,metric,dcg,jaccard,ndcg,prec@1,prec@10,prec@5
query_set_id,search_config,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
sampled_queries,baseline,2.46,1.0,0.47,0.64,0.46,0.64
sampled_queries,challenger_1,2.79,0.28,0.53,0.65,0.53,0.65
sampled_queries,research_1,0.77,0.07,0.14,0.52,0.1,0.52
sampled_queries,research_2,2.03,0.21,0.4,0.65,0.36,0.65
top_queries,baseline,3.05,1.0,0.51,0.63,0.6,0.63
top_queries,challenger_1,3.51,0.27,0.59,0.82,0.66,0.82
top_queries,research_1,0.91,0.05,0.14,0.54,0.13,0.54
top_queries,research_2,2.59,0.21,0.44,0.82,0.45,0.82


In [15]:
# Sometimes baseline has a jaccard of 0 because of the way that fake searches are defined
# (sometimes baseline returns no documents and it messes up the jaccard calculation)

# ix = df_metrics.search_config=="baseline"
# ix = ix & (df_metrics.metric=="jaccard")
# ix = ix & (df_metrics.value < 1)
# df_metrics[ix].head()

### Metrics dataset calculation

The metrics dataset contains the result of evaluating a search config on judgments.
Each row contains the evaluation of a search config on a query on a given metric.

It contains the following fields:
 * datetime: the date and time of the evaluation
 * search_config: search config under evaluation
 * query_set_id: id of the query set used for evaluation
 * query: query from which judgment was derived
 * metric: name of metric used for evaluation
 * value: evaluation result

In [16]:
df_metrics.sample(10)

Unnamed: 0,datetime,search_config,query_set_id,query,metric,value
0,2024-06-30,baseline,top_queries,瞬間せっちゃくざい 白くならない,prec@10,0.55
0,2024-07-01,research_1,sampled_queries,urban books series paperback,ndcg,0.35
0,2024-06-25,baseline,sampled_queries,ballet protectors,jaccard,1.0
0,2024-06-28,research_2,sampled_queries,razer wireless mouse,ndcg,0.35
0,2024-06-25,research_1,sampled_queries,princess revlon,prec@5,0.1
0,2024-06-29,research_2,sampled_queries,メバル ルアー,ndcg,0.29
0,2024-06-30,research_2,sampled_queries,ナーズ,dcg,3.3
0,2024-07-01,research_1,sampled_queries,assassins creed woman costume,prec@10,0.0
0,2024-06-26,baseline,sampled_queries,baby nursery nightlight,jaccard,1.0
0,2024-06-29,baseline,sampled_queries,回転灯 ネットワーク,prec@5,1.0


In [20]:
_16.iloc[1:5]

Unnamed: 0,datetime,search_config,query_set_id,query,metric,value
0,2024-07-01,research_1,sampled_queries,urban books series paperback,ndcg,0.35
0,2024-06-25,baseline,sampled_queries,ballet protectors,jaccard,1.0
0,2024-06-28,research_2,sampled_queries,razer wireless mouse,ndcg,0.35
0,2024-06-25,research_1,sampled_queries,princess revlon,prec@5,0.1


In [17]:
df_metrics.to_csv("metrics.csv", index=False)