# 3) Baseline Search & Metrics

In [2]:
import pandas as pd
import numpy as np
import mercury as mr
import requests
import json
import pytrec_eval
from opensearchpy import OpenSearch

In [3]:
DATA_DIR = '/Users/danielwrigley/work/Testing/git_repos/esci-data/shopping_queries_dataset/'

In [4]:
df_examples = pd.read_parquet(DATA_DIR + '/shopping_queries_dataset_examples.parquet')

## Query set
A query set has the columns:

* query_set_id
* query

Potentially other columns:

* (head/torso/tail)

There is currently no date corresponding to the query set. And currently the sampling is not done based on frequency

In [5]:
# We only use English queries for now
df_queries_us = df_examples[df_examples['product_locale'] == 'us']

In [6]:
np.random.seed(10)

In [7]:
# Sample query sets

query_sets = [("sampled_queries", 200), ("top_queries", 20)]

res = []

for query_set_id, n_query_set in query_sets:
    # todo: sampling proportional to frequency
    query_set = np.random.choice(df_queries_us["query"].unique(), n_query_set, replace=False)

    # todo: perhaps consider product_locale? what about small/large version?

    df = pd.DataFrame({"query": query_set})
    df["query_set_id"] = query_set_id
    res.append(df)
df_query_set = pd.concat(res)
df_query_set.head(10)

Unnamed: 0,query,query_set_id
0,runtz,sampled_queries
1,trooper bandana shoe,sampled_queries
2,tcl a1x phone case straight talk,sampled_queries
3,bose headphones replacement cord,sampled_queries
4,uniball vision elite,sampled_queries
5,definitely not paid enough for this,sampled_queries
6,raid deep reach fogger,sampled_queries
7,usb camera,sampled_queries
8,reusable produce bags,sampled_queries
9,latex dental dam,sampled_queries


## Judgments
The judgments dataset has a row per query instance and document and has the following columns:

* datetime: date of query/document instance
* query_id: identifier of query instance
* query: the query
* document: identifier of a document result
* judgment: Here we use the proposed ESCI mapping for DCG: `{"E": 0, "S": 1, "C": 2, "I": 3}` --> this looks odd. These look more like labels since `E` means "Exact Match" and would be the lowest score according to that wording

In [8]:
# Select judgments
# Map esci_label to score
# create judgments per day in range
# create noise in score

label_num = {"E": 0, "S": 1, "C": 2, "I": 3}
label_score = [3, 2, 1, 0]
label_p_noise = 0.1

def label_to_score(label):
    return label_score[label_num[label]]

df_judge = df_examples[df_examples["query"].isin(set(df_query_set["query"].values))].copy()
df_judge["judgment"] = df_judge.esci_label.apply(lambda x: label_to_score(x))
df_judge["document"] = df_judge.product_id
df_judge = df_judge[["query", "document", "judgment"]].reset_index(drop=True)
df_judge.head(20)

Unnamed: 0,query,document,judgment
0,$30 roblox gift card not digital,B07RX6FBFR,3
1,$30 roblox gift card not digital,B09194H44R,0
2,$30 roblox gift card not digital,B08R5N6W6B,2
3,$30 roblox gift card not digital,B07Y693ND1,0
4,$30 roblox gift card not digital,B07RZ75JW3,2
5,$30 roblox gift card not digital,B07RZ74VLR,2
6,$30 roblox gift card not digital,B07M9XQ9YB,0
7,$30 roblox gift card not digital,B078RJ1KZ6,0
8,$30 roblox gift card not digital,B01N6RK9UE,0
9,$30 roblox gift card not digital,B016Y2BVKA,3


# Transform the judgments to the qrels format that `trec_eval` can work with

### Group by queries and export to a file with the index to have queries and query ids

In [47]:
df_queries = df_judge.groupby(by='query', as_index=False).agg({
    'judgment': ['count']
})
df_query_idx = df_queries['query']
name = 'queries.txt'

df_query_idx.to_csv(name, sep="\t", header=False)

### Go through the queries and update the original ratings with the query ids

In [45]:
df_query_idx = pd.DataFrame(df_query_idx)

In [46]:
df_query_idx

Unnamed: 0,query
0,$30 roblox gift card not digital
1,(fiction without frontiers)
2,100
3,10x10x6 cake box without window
4,15 inch light weight laptop that has lots of m...
...,...
215,wooden stool
216,woodwick wax melt
217,world of warcraft anniversary collector's edition
218,wowled


In [12]:
df_query_idx = df_query_idx.reset_index().rename(columns={'index': 'idx'})

df_merged = pd.merge(df_judge, df_query_idx, on='query', how='left')
df_merged['Q0'] = 0
df_ratings = df_merged[['idx', 'Q0', 'document', 'judgment']]
df_ratings.columns = ['idx', 'Q0', 'docid', 'rating']

In [13]:
name = 'ratings.qrels'

df_ratings.to_csv(name, sep="\t", header=False, index=False)
df_ratings

Unnamed: 0,idx,Q0,docid,rating
0,0,0,B07RX6FBFR,3
1,0,0,B09194H44R,0
2,0,0,B08R5N6W6B,2
3,0,0,B07Y693ND1,0
4,0,0,B07RZ75JW3,2
...,...,...,...,...
4060,219,0,B00JX10Q2O,2
4061,219,0,B00KY41UHO,3
4062,219,0,B00QXJOUL2,3
4063,219,0,B00UY14WCM,3


## Query OpenSearch with the Baseline Configuration

We use a simple `multi_match` query with a couple of fields and field weights.

This will serve as our baseline. We get the first 10 results for each query we have in `df_judge` and store the results in a format that `trec_eval`can work with later when evaluating the results.

This gives us a quantification of the quality of the baseline configuration.

We will use this to compare our hybrid search configurations against.

In [14]:
url = "http://localhost:9200/ecommerce/_search"

headers = {
    'Content-Type': 'application/json'
}

df_relevance = pd.DataFrame()

for query in df_query_idx.itertuples():

    payload = {
      "_source": {
        "excludes": [
          "title_embedding"
        ]
      },
      "query": {
        "multi_match" : {
          "type":       "best_fields",
          "fields":     [
              "product_id^100",
            "product_bullet_point^3",
            "product_color^2",
            "product_brand^5",
            "product_description",
            "product_title^10"
          ],
          "operator":   "and",
          "query":      query[2]
        }
      }
    }

    response = requests.request("POST", url, headers=headers, data=json.dumps(payload)).json()
    #print(query[1])
    #print(query[2])
    #mr.JSON(response, level=0)
    
    position = 0
    for hit in response['hits']['hits']:
        # create a new row for the DataFrame and append it
        row = { 'query_id' : str(query[1]), 'query_string': query[2],  'Q0' : "Q0", 'product_id' : hit["_id"], 'position' : str(position), 'relevance' : hit["_score"], 'run': 'default' }
        #df_relevance = df_relevance.append(row, ignore_index=True)
        #df_relevance.loc[len(df_relevance)] = row

        new_row_df = pd.DataFrame([row])
        df_relevance = pd.concat([df_relevance, new_row_df], ignore_index=True)
        #print("%(id)s %(title)s: %(name)s" % hit["_source"])
        position += 1
    
# store the DataFrame without header and index, with tabs as delimiters
#name = '../data/default_result'
#df_relevance.to_csv(name, sep="\t", header=False, index=False)

# work with two for loops:
# 1) one to iterate over the list of queries and have a query id instead of a query
# 2) another one to iterate over the result sets to have the position of the result in the result set 

# DataFrame with columns:
# query_id: the id of the query as the trec_eval tool needs a numeric id rather than a query string as an identifier
# Q0: all lines have Q0, currently unused by trec_eval
# product_id: the id of the product in the hit list
# position: the position of the product in the result set
# relevance: relevance as given by the search engine
# run: the name of the query run

0
$30 roblox gift card not digital


1
(fiction without frontiers)


2
100


3
10x10x6 cake box without window


4
15 inch light weight laptop that has lots of memory storage and RAM.  It should include a HDMI port as well as a USB drive.  


5
15 microsoft card


6
40 amp 240 volt timer


7
45 women birthday party supplies


8
4khdip168nv-1


9
7 days without a pun makes one weak


10
a deed without a name


11
adaptador control xbox 360 a pc


12
adidas soccer cleats


13
alexa leak detector


14
alphabet stickers


15
american flag 5x8 outdoor made in usa


16
andis t edger


17
back to december taylor swift


18
backpacks lunchbox combo


19
bambu utensil set


20
barbie puppy movies


21
baseball shirt 6 under armour


22
battery samsung galaxy s7 edge


23
beeswax wood polish


24
belief without borders


25
best ride on cars mercedes 3-in-1 push car in white


26
black and white checkered curtains 2 panels


27
body shop gift set for men


28
boho earrings


29
boo costume monsters inc


30
bookshelf 4 tier


31
bose headphones replacement cord


32
boys smart watch


33
bra shiny


34
bridal shower decorations


35
bulwark flame resistant jacket


36
card holders for four decksplaying cards


37
cargo pants men big and tall


38
carhartt extreme cold weather boot socks


39
carhartt windbreaker


40
casio illuminator watches for men


41
cat balls


42
ceramtic coating


43
charger wireless for iphone


44
christmas dress for petite women


45
coil incense


46
cole haan grand crosscourt


47
collagen supplements


48
columbia ice maiden ii womens insulated snow boots


49
computer software programs adobe


50
computer table


51
concealer color changing foundation


52
contact lens colored for eyes


53
converse sneakers for women


54
corn hole game wooden


55
cousin it costume


56
custom 3 panel wall art


57
cwine glasses


58
dachshund case


59
daves killer bread


60
definitely not paid enough for this


61
dental floss 0.3


62
detective pikachu digital


63
dieter roth


64
dildo for sex


65
dinosaur decorations orange


66
dji goggles


67
do not enter tape


68
double couple famous painting compression socks


69
drone with live camera for adults


70
drywall anchors heavy duty


71
dummy door handle brass


72
echo dot flush mount


73
eye mask without strap


74
ez bow maker for ribbon


75
fabric aprons


76
fast charging wireless charger


77
fire 7 kids edition tablet


78
fitness rollers deep tissue massage


79
fletchings for arrows


80
floral wallpaper stick and peel


81
fluoride coated scissors


82
four paws magic coat instant dog mat remover


83
frixion pens


84
frolic flowers


85
futon queen


86
galaxy 8 note case


87
gatorade bottle


88
gift teen girl


89
girls bible


90
girls tights


91
glitter lamp color change


92
go chef 8 in 1 cooker


93
golf sets for men left handed


94
ground lift box


95
halloween glow bracelets kids


96
heavy duty plastic folders with pockets


97
helen kaminski hats


98
henley shirts for men


99
hey water antoinette portis


100
hug machine


101
if lost or drunk please return to bestie


102
ikea wood desk


103
ink 61 black and color


104
instant pot air fryer


105
iphone 11 alpha glass screen protector


106
iphone grip


107
jbl bluetooth speakers


108
jobs evergreen tree spikes


109
kid animal plates


110
kids play rug pj mask


111
kobe baby jersey


112
laptop refurbished


113
latex dental dam


114
lg v20 case


115
lights bulbs color changing


116
linkin park


117
long workout tank tops for women loose fit


118
macaw toys


119
margarita glasses


120
mary meyer baby mat


121
maxi pads without wings


122
men school bag


123
mens ralph lauren puffer jacket


124
military face paint kit


125
mora wood carving knife


126
mortara


127
moto play screen protector


128
mre beef


129
muzzy broadheads


130
my tv


131
navistar international


132
neuroquell for neuropathy


133
newborn onesies


134
ninja 4qt air fryer black af101


135
old fashion mix


136
opi linkin park after dark


137
orange textured paint


138
organicbone broth


139
outdoor baby swing


140
pack abd play sheet


141
panties


142
patio chair swing


143
phoenix model a330


144
pilot knee board


145
pine tree garland


146
pink ski mask


147
plastic divider container


148
play kitchen for toddlers accessories


149
plus size black shirt


150
plywood clips for windows


151
pocket storage


152
pokemon shield nintendo switch


153
porch shades outdoor roll up


154
posh moomy


155
pot organizer


156
powdered lemonade drink mix


157
power ball hand exerciser


158
psp


159
pumping backpack


160
puzzle interlocking


161
quick grip american tool mini


162
quickbooks'


163
rae dunn christmas mug


164
rag doll kindergarten


165
raid deep reach fogger


166
razer kishi


167
red flannel nightgowns for women


168
red long sleeve dress


169
renogy 100w solar panel


170
reusable produce bags


171
ribeye steaks


172
rose petals for romantic night


173
rudin by ivan turgenev


174
runa energy drink


175
runtz


176
rv sink drain


177
samsung galaxy note 8 case


178
science diet


179
shark vacmop


180
slarm window sensor


181
slip in sneakers for men


182
small cigarette case


183
smarty jr


184
snuff bullet


185
spice variety pack


186
ss officer


187
starbucks mugs you are here


188
still eyeliner


189
stuffed animal washer bag


190
sundiata


191
super nes wires


192
surgical pathology


193
sushi go card game


194
tcl a1x phone case straight talk


195
telescopic handle


196
tetley


197
the first years gate parts


198
the gallows


199
thrive mascara


200
trooper bandana shoe


201
ultra fast keto boost 800mg


202
uniball vision elite


203
usb camera


204
used pc desktop


205
wallets for men


206
water brushes


207
wella 7r 810


208
white mesh bodysuit


209
wiggle scooters for kids


210
winter cycling shoe covers


211
winter jacket


212
wireless keyboard mouse combo


213
women sexy dresses for party


214
women shawl for winter


215
wooden stool


216
woodwick wax melt


217
world of warcraft anniversary collector's edition


218
wowled


219
yarn purple and pink


## Transform data to meet the `pytrec_eval` requirements

### Convert string ids to integer values

In [15]:
df_relevance

Unnamed: 0,query_id,query_string,Q0,product_id,position,relevance,run
0,1,(fiction without frontiers),Q0,1787581780,0,309.121900,default
1,1,(fiction without frontiers),Q0,B082VGLV18,1,309.121900,default
2,1,(fiction without frontiers),Q0,B07GJVWWWR,2,309.121900,default
3,1,(fiction without frontiers),Q0,B08C5MQFCY,3,309.121900,default
4,1,(fiction without frontiers),Q0,1787583325,4,298.643860,default
...,...,...,...,...,...,...,...
1614,219,yarn purple and pink,Q0,B079J4V4G8,5,49.612446,default
1615,219,yarn purple and pink,Q0,B07HG28KLX,6,49.244354,default
1616,219,yarn purple and pink,Q0,B08B9K99R9,7,44.207966,default
1617,219,yarn purple and pink,Q0,B075ZM4TTT,8,43.978270,default


In [16]:
name = 'baseline_results'
df_relevance.to_csv(name, sep="\t", header=False, index=False)

In [17]:
!trec_eval ratings.qrels baseline_results -m ndcg_cut

trec_eval.form_res_qrels: duplicate docs withouttrec_eval: Can't calculate measure 'ndcg_cut'


In [18]:
df_relevance[df_relevance['product_id'] == 'B076JH5NHZ']

Unnamed: 0,query_id,query_string,Q0,product_id,position,relevance,run


In [19]:
!head baseline_results

1	(fiction without frontiers)	Q0	1787581780	0	309.1219	default
1	(fiction without frontiers)	Q0	B082VGLV18	1	309.1219	default
1	(fiction without frontiers)	Q0	B07GJVWWWR	2	309.1219	default
1	(fiction without frontiers)	Q0	B08C5MQFCY	3	309.1219	default
1	(fiction without frontiers)	Q0	1787583325	4	298.64386	default
1	(fiction without frontiers)	Q0	B07KL6QTN1	5	298.64386	default
1	(fiction without frontiers)	Q0	B07QCVBDRS	6	298.64386	default
1	(fiction without frontiers)	Q0	B07SM9NJCP	7	298.64386	default
1	(fiction without frontiers)	Q0	B08C5LDHMS	8	298.64386	default
1	(fiction without frontiers)	Q0	1787585107	9	298.64386	default


In [20]:
!head ratings.qrels

0	0	B07RX6FBFR	3
0	0	B09194H44R	0
0	0	B08R5N6W6B	2
0	0	B07Y693ND1	0
0	0	B07RZ75JW3	2
0	0	B07RZ74VLR	2
0	0	B07M9XQ9YB	0
0	0	B078RJ1KZ6	0
0	0	B01N6RK9UE	0
0	0	B016Y2BVKA	3


In [21]:
df_query_idx[df_query_idx['idx'] == 16]

Unnamed: 0,idx,query
16,16,andis t edger


url = "http://localhost:9200/ecommerce/_search"

headers = {
    'Content-Type': 'application/json'
}

df_relevance = pd.DataFrame()

payload = {
  "_source": {
    "excludes": [
      "title_embedding"
    ]
  },
  "query": {
    "multi_match" : {
      "type":       "best_fields",
      "fields":     [
          "product_id^100",
        "product_bullet_point^3",
        "product_color^2",
        "product_brand^5",
        "product_description",
        "product_title^10"
      ],
      "operator":   "and",
      "query":      "america wood shapes"
    }
  }
}

response = requests.request("POST", url, headers=headers, data=json.dumps(payload)).json()
#print(query[1])
#mr.JSON(response.json(), level=4)

#for i in range (0, 10):
position = 0
#print(response['hits'])
for hit in response['hits']['hits']:
    # create a new row for the DataFrame and append it
    row = { 'query_id' : str(query[0]), 'Q0' : "Q0", 'product_id' : hit["_id"], 'position' : str(position), 'relevance' : hit["_score"], 'run': 'default' }
    #df_relevance = df_relevance.append(row, ignore_index=True)
    #df_relevance.loc[len(df_relevance)] = row

    new_row_df = pd.DataFrame([row])
    df_relevance = pd.concat([df_relevance, new_row_df], ignore_index=True)
    #print("%(id)s %(title)s: %(name)s" % hit["_source"])
    position += 1

mr.JSON(response, level=1)

In [22]:
df_judge[df_judge['query'] == 'america wood shapes']

Unnamed: 0,query,document,judgment


In [23]:
!trec_eval ratings_test.qrels baseline_results_test -m ndcg_cut

ndcg_cut_5            	all	0.5087
ndcg_cut_10           	all	0.6157
ndcg_cut_15           	all	0.6157
ndcg_cut_20           	all	0.6157
ndcg_cut_30           	all	0.6157
ndcg_cut_100          	all	0.6157
ndcg_cut_200          	all	0.6157
ndcg_cut_500          	all	0.6157
ndcg_cut_1000         	all	0.6157


In [24]:
df_ratings

Unnamed: 0,idx,Q0,docid,rating
0,0,0,B07RX6FBFR,3
1,0,0,B09194H44R,0
2,0,0,B08R5N6W6B,2
3,0,0,B07Y693ND1,0
4,0,0,B07RZ75JW3,2
...,...,...,...,...
4060,219,0,B00JX10Q2O,2
4061,219,0,B00KY41UHO,3
4062,219,0,B00QXJOUL2,3
4063,219,0,B00UY14WCM,3


## Transform data to meet the `pytrec_eval` requirements

### Convert string ids to integer values

In [25]:
unique_ids = pd.Series(pd.concat([df_relevance['product_id'], df_ratings['docid']]).unique())

# Step 2: Create a mapping of each unique identifier to an integer
id_to_int = {id_val: idx for idx, id_val in enumerate(unique_ids, start=1)}

# Step 3: Map the identifiers in both DataFrames
df_relevance['product_id_int'] = df_relevance['product_id'].map(id_to_int)
df_ratings['docid_int'] = df_ratings['docid'].map(id_to_int)

In [26]:
df_relevance.head(3)

Unnamed: 0,query_id,query_string,Q0,product_id,position,relevance,run,product_id_int
0,1,(fiction without frontiers),Q0,1787581780,0,309.1219,default,1
1,1,(fiction without frontiers),Q0,B082VGLV18,1,309.1219,default,2
2,1,(fiction without frontiers),Q0,B07GJVWWWR,2,309.1219,default,3


In [27]:
df_ratings

Unnamed: 0,idx,Q0,docid,rating,docid_int
0,0,0,B07RX6FBFR,3,1613
1,0,0,B09194H44R,0,1614
2,0,0,B08R5N6W6B,2,1615
3,0,0,B07Y693ND1,0,1616
4,0,0,B07RZ75JW3,2,1617
...,...,...,...,...,...
4060,219,0,B00JX10Q2O,2,5024
4061,219,0,B00KY41UHO,3,5025
4062,219,0,B00QXJOUL2,3,5026
4063,219,0,B00UY14WCM,3,5027


In [28]:
# Drop the Q0 column as it is not needed
df_pytrec_qrels = df_ratings.drop(columns=['Q0', 'docid'])

df_pytrec_qrels['docid_int'] = df_pytrec_qrels['docid_int'].astype(str)

# Initialize an empty dictionary to store the final qrel structure
qrel = {}

# Group by 'idx' (which corresponds to 'q1', 'q2', etc.)
for idx, group in df_pytrec_qrels.groupby('idx'):
    # Create a dictionary for each group where 'docid' is the key and 'rating' is the value
    qrel[str(idx)] = dict(zip(group['docid_int'], group['rating']))
#print(qrel)

In [29]:
df_pytrec_results = df_relevance.drop(columns=['Q0', 'position', 'run', 'product_id'])

df_pytrec_results['relevance'] = df_pytrec_results['relevance'].astype(int)
df_pytrec_results['product_id_int'] = df_pytrec_results['product_id_int'].astype(str)

# Initialize an empty dictionary to store the final 'run' structure
run = {}

# Group by 'query_id' (which corresponds to 'q1', 'q2', etc.)
for query_id, group in df_pytrec_results.groupby('query_id'):
    # Create a dictionary for each group where 'product_id' is the key and 'relevance' is the value
    run[query_id] = dict(zip(group['product_id_int'], group['relevance']))

# Print the resulting run structure
#print(run)

In [38]:
evaluator = pytrec_eval.RelevanceEvaluator(
    qrel, {'map', 'ndcg'})

data = evaluator.evaluate(run)

print(json.dumps(data, indent=1))

{
 "1": {
  "map": 0.14285714285714285,
  "ndcg": 0.2998871210371002
 },
 "10": {
  "map": 0.7219907407407408,
  "ndcg": 0.8479117780988195
 },
 "100": {
  "map": 0.4,
  "ndcg": 0.5577409001984587
 },
 "101": {
  "map": 0.625,
  "ndcg": 0.7441140557478908
 },
 "102": {
  "map": 0.18181818181818182,
  "ndcg": 0.2275100331754748
 },
 "103": {
  "map": 0.3279761904761905,
  "ndcg": 0.5427847748201621
 },
 "104": {
  "map": 0.022222222222222223,
  "ndcg": 0.11033110561045088
 },
 "105": {
  "map": 0.1,
  "ndcg": 0.29740901205864334
 },
 "106": {
  "map": 0.0,
  "ndcg": 0.0
 },
 "107": {
  "map": 0.020833333333333332,
  "ndcg": 0.08805659432350128
 },
 "109": {
  "map": 0.06212797619047619,
  "ndcg": 0.18379934077708113
 },
 "112": {
  "map": 0.0,
  "ndcg": 0.0
 },
 "113": {
  "map": 0.5414930555555556,
  "ndcg": 0.6627929946637657
 },
 "114": {
  "map": 0.04375,
  "ndcg": 0.150670619038414
 },
 "115": {
  "map": 0.046875,
  "ndcg": 0.15499424445272175
 },
 "116": {
  "map": 0.2581845238095

In [41]:
ndcg_sum = 0
map_sum = 0
num_queries = len(data)

# Iterate over the dictionary and sum the 'ndcg' and 'map' values
for query, metrics in data.items():
    ndcg_sum += metrics['ndcg']
    map_sum += metrics['map']

# Calculate the averages
average_ndcg = ndcg_sum / num_queries
average_map = map_sum / num_queries

# Print the results
print("Baseline metrics")
print(f"Average NDCG: {average_ndcg}")
print(f"Average MAP: {average_map}")

Baseline metrics
Average NDCG: 0.29164067313268893
Average MAP: 0.18442964264614412


In [49]:
unique_ids = pd.Series(pd.concat([df_relevance['product_id'], df_ratings['docid']]).unique())

# Create a mapping of each unique identifier to an integer
id_to_int = {id_val: idx for idx, id_val in enumerate(unique_ids, start=1)}

# Map the identifiers in both DataFrames
df_relevance['product_id_int'] = df_relevance['product_id'].map(id_to_int)
df_ratings['docid_int'] = df_ratings['docid'].map(id_to_int)

In [50]:
# Drop the Q0 column as it is not needed
df_pytrec_qrels = df_ratings.drop(columns=['Q0', 'docid'])

df_pytrec_qrels['docid_int'] = df_pytrec_qrels['docid_int'].astype(str)

# Initialize an empty dictionary to store the final qrel structure
qrel = {}

# Group by 'idx' (which corresponds to 'q1', 'q2', etc.)
for idx, group in df_pytrec_qrels.groupby('idx'):
    # Create a dictionary for each group where 'docid' is the key and 'rating' is the value
    qrel[str(idx)] = dict(zip(group['docid_int'], group['rating']))
#print(qrel)

In [51]:
df_pytrec_results = df_relevance.drop(columns=['Q0', 'position', 'run', 'product_id'])

df_pytrec_results['relevance'] = df_pytrec_results['relevance'].astype(int)
df_pytrec_results['product_id_int'] = df_pytrec_results['product_id_int'].astype(str)

# Initialize an empty dictionary to store the final 'run' structure
run = {}

# Group by 'query_id' (which corresponds to 'q1', 'q2', etc.)
for query_id, group in df_pytrec_results.groupby('query_id'):
    # Create a dictionary for each group where 'product_id' is the key and 'relevance' is the value
    run[query_id] = dict(zip(group['product_id_int'], group['relevance']))

# Print the resulting run structure
#print(run)

In [52]:
evaluator = pytrec_eval.RelevanceEvaluator(
    qrel, {'map', 'ndcg'})

data = evaluator.evaluate(run)

print(json.dumps(data, indent=1))

{
 "1": {
  "map": 0.14285714285714285,
  "ndcg": 0.2998871210371002
 },
 "10": {
  "map": 0.7219907407407408,
  "ndcg": 0.8479117780988195
 },
 "100": {
  "map": 0.4,
  "ndcg": 0.5577409001984587
 },
 "101": {
  "map": 0.625,
  "ndcg": 0.7441140557478908
 },
 "102": {
  "map": 0.18181818181818182,
  "ndcg": 0.2275100331754748
 },
 "103": {
  "map": 0.3279761904761905,
  "ndcg": 0.5427847748201621
 },
 "104": {
  "map": 0.022222222222222223,
  "ndcg": 0.11033110561045088
 },
 "105": {
  "map": 0.1,
  "ndcg": 0.29740901205864334
 },
 "106": {
  "map": 0.0,
  "ndcg": 0.0
 },
 "107": {
  "map": 0.020833333333333332,
  "ndcg": 0.08805659432350128
 },
 "109": {
  "map": 0.06212797619047619,
  "ndcg": 0.18379934077708113
 },
 "112": {
  "map": 0.0,
  "ndcg": 0.0
 },
 "113": {
  "map": 0.5414930555555556,
  "ndcg": 0.6627929946637657
 },
 "114": {
  "map": 0.04375,
  "ndcg": 0.150670619038414
 },
 "115": {
  "map": 0.046875,
  "ndcg": 0.15499424445272175
 },
 "116": {
  "map": 0.2581845238095

In [53]:
ndcg_sum = 0
map_sum = 0
num_queries = len(data)

# Iterate over the dictionary and sum the 'ndcg' and 'map' values
for query, metrics in data.items():
    ndcg_sum += metrics['ndcg']
    map_sum += metrics['map']

# Calculate the averages
average_ndcg = ndcg_sum / num_queries
average_map = map_sum / num_queries

# Print the results
print("Baseline metrics")
print(f"Average NDCG: {average_ndcg}")
print(f"Average MAP: {average_map}")

Baseline metrics
Average NDCG: 0.29164067313268893
Average MAP: 0.18442964264614412
