In [75]:
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from datasets import load_dataset
# from tqdm import tqdm 
import json
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import time
from tqdm import tqdm # Progress bar
import collections
import math

In [41]:
# configurations and data sources

# # connect to to the local instance (run bin/elasticsearch in another terminal)
es = Elasticsearch("http://localhost:9200")

# setting the index name
index_name = "trec_product_search"

# data source paths
corpus_path = "product_catalogue_esci.jsonl"
query_pathl = "qid2query.tsv"
qrels_path = "product-search-dev.qrels.txt"

# downloading nltk wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

# obligatory connection test
if es.ping():
    info = es.info()
    print("\nelasticsearch connected! \n")
else:
    print("\nelasticearch ERROR \n")


elasticsearch connected! 



[nltk_data] Downloading package wordnet to /home/marvin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/marvin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /home/marvin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
index_settings = {
  "settings": {
    "analysis": {
      "analyzer": {
        "trec_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "my_stop_filter", "english_stemmer"]
        }
      },
      "filter": {
        "english_stemmer": {"type": "stemmer", "language": "english"},
        "my_stop_filter": {"type": "stop", "stopwords": "_english_"}
      }
    }
  },
  "mappings": {
    "properties": {
      "id": {"type": "keyword"},
      "title": {"type": "text", "analyzer": "trec_analyzer"},
      "contents": {"type": "text", "analyzer": "trec_analyzer"},
      "brand": {"type": "text", "analyzer": "trec_analyzer"}
    }
  }
}

In [44]:
# delete index in case we break something
if es.indices.exists(index=index_name):
   es.indices.delete(index=index_name)

# recreate index
es.indices.create(index=index_name, body=index_settings)
print(f"fresh index '{index_name}' has been created ")

fresh index 'trec_product_search' has been created 


In [45]:
# first look into the corpus
with open(corpus_path, 'r', encoding='utf-8') as f:
   for i in range(20):
      print(json.loads(f.readline()))
      

# we find the fields 'product_id', 'product_title',  'product_description', 'product_bullet_point', 'product_brand', 'product_color_name', 'product_locale', 'trecid'

{'product_id': 'B003O0MNGC', 'product_title': 'Delta BreezSignature VFB25ACH 80 CFM Exhaust Bath Fan with Humidity Sensor', 'product_description': None, 'product_bullet_point': 'Virtually silent at less than 0.3 sones\nPrecision engineered with DC brushless motor for extended reliability\nEasily switch in and out of humidity sensing mode by toggling wall switch\nENERGY STAR qualified for efficient cost-saving operation\nPrecision engineered with DC brushless motor for extended reliability, this fan will outlast many household appliances', 'product_brand': 'DELTA ELECTRONICS (AMERICAS) LTD.', 'product_color_name': 'White', 'product_locale': 'us', 'trecid': 201460}
{'product_id': 'B00MARNO5Y', 'product_title': 'Aero Pure AP80RVLW Super Quiet 80 CFM Recessed Fan/Light Bathroom Ventilation Fan with White Trim Ring', 'product_description': None, 'product_bullet_point': 'Super quiet 80CFM energy efficient fan virtually disappears into the ceiling leaving only a recessed light in view\nMay be

In [46]:
def generate_docs():
  print(f"Reading {corpus_path}...")
  with open(corpus_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
      try:
        doc = json.loads(line)

        # 1. ID MAPPING
        # Prioritize 'trecid' for the assignment grading
        doc_id = str(doc.get("trecid") or doc.get("product_id"))

        # 2. KEY FIELDS
        brand = doc.get("product_brand", "") or ""
        title = doc.get("product_title", "") or ""

        # 3. CONTENT MERGING (The "All-In" Strategy)
        # Combine Description + Bullets + Color into one searchable blob
        desc = doc.get("product_description", "") or ""
        bullets = doc.get("product_bullet_point", "") or ""
        color = doc.get("product_color_name", "") or ""
        
        full_contents = f"{desc} {bullets} {color}"

        yield {
          "_index": index_name,
          "_id": doc_id,
          "_source": {
            "id": doc_id,
            "title": title,
            "contents": full_contents,
            "brand": brand,
            # Metadata (not searched, but stored)
            "asin": doc.get("product_id"), 
            "locale": doc.get("product_locale") 
          }
        }
      except json.JSONDecodeError:
        continue

In [47]:
# BULK INDEXING LETS GO
# !!!! change chunk_size if it doesn't perform well !!!!
# took me 8m 15s with chunk_size = 5000
success, failed = helpers.bulk(
    es, 
    generate_docs(),
    stats_only =True,
    chunk_size=5000)

print(f"\n done! success: {success}, failed: {failed}")


Reading product_catalogue_esci.jsonl...

 done! success: 1118990, failed: 0


In [65]:
# create a mask for english stop words
english_stopwords = set(stopwords.words('english'))

# cashing for bettre performance
synonym_cache = {}

def get_synonyms(word):

    # big brain logic: check the cache 
    if word in synonym_cache:
        return synonym_cache[word]
    
    # filter out stop words into an empty list
    if word.lower() in english_stopwords or len(word) < 3:
        synonym_cache[word] = []
        return []

    
    # it is recommended to filter out short words as they carry no helpful meanin
    
    # again we use set so we avoid duplicates
    found_synonyms = set()

    # we use two loops with wordnet. one retrieves the synset (group), so the meaning,  and the second retieves the synonyms. e.g. bank has multiple meanings, like financial institution or river bank. the lemma loop retrives the synonyms in each of those meanings.
    for group in wordnet.synsets(word):
        for lemma in group.lemmas():
            text = lemma.name()

            # wordnet uses underscores so we replace that with a space
            clean_text = text.replace('_', ' ')

            # skip the words used in query
            if clean_text.lower() != word.lower():
                found_synonyms.add(clean_text)

    results = list(found_synonyms)
    synonym_cache[word] = results
    return results


def expand_query(text):
    """
    Takes a user query string (e.g., "iphone case") 
    and adds synonyms (e.g., "iphone case telephone shell").
    """
    tokens = text.split()
    final = []
    for t in tokens:
        final.append(t) # Keep original word
        final.extend(get_synonyms(t)) # Add synonyms
    
    # Remove duplicates and join back to string
    return " ".join(list(set(final)))


In [66]:
def load_qrels(qrels_path):
    """
    Reads the QRELS (Answer Key) file.
    Returns a dictionary: { query_id: {relevant_doc_1, relevant_doc_2, ...} }
    """
    qrels = collections.defaultdict(set)
    print(f"Loading QRELS from {qrels_path}...")
    
    with open(qrels_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            # The file format is: query_id  iteration  doc_id  relevance_score
            # We only care about pairs where relevance > 0
            if len(parts) == 4 and int(parts[3]) > 0:
                qid = parts[0]
                doc_id = parts[2]
                qrels[qid].add(doc_id)
                
    print(f"Loaded answers for {len(qrels)} queries.")
    return qrels

In [67]:
# 5. DEFINE SEARCH WITH SYNONYMS (PLAN B)
def search_products(user_query, top_k=10):
    
    # 1. Expand the query
    # "samsung case" -> "samsung case casing shell"
    expanded_query = expand_query(user_query)
    
    # 2. Construct the Search
    query_body = {
        "size": top_k,
        "query": {
            "multi_match": {
                "query": expanded_query, 
                
                # We search in title and contents.
                # Title is boosted (^3) so exact matches appear higher
                "fields": ["title^3", "contents", "brand^2"],
                
                # CRITICAL: Use "or". If we used "and", the document would need 
                # to contain ALL synonyms (impossible).
                "operator": "or" 
            }
        }
    }

    response = es.search(index=index_name, body=query_body)
    return response['hits']['hits']

# --- FINAL TEST ---
print("--- SEARCH TEST ---")
results = search_products("mobile phone")

for i, hit in enumerate(results):
    print(f"{i+1}. {hit['_source']['title'][:80]}...")

--- SEARCH TEST ---
1. AKOAK Loud Sound RJ11 Telephone Ring Ringer Amplifier Telephone Answering Access...
2. Loud Sound RJ11 Telephone Ring Ringer Amplifier...
3. Clip Type Earphones，Portable Stereophone Headphones,with Microphone and Call Con...
4. AKOAK Loud Sound Telephone Ring Ringer Amplifier Volume Enhancer with Light Flas...
5. Jabra Evolve 20 UC Wired Headset, Stereo Professional Telephone Headphones for G...
6. Work from Home Office Telephone Call Center Dial Key Pad Phone + Headset Headpho...
7. 2PCS 95dB Extra-Loud Telephone Ringer Phone Ring Amplifier Ringing Help Strobe L...
8. Jabra Evolve 20 UC Wired Headset, Stereo Professional Telephone Headphones for G...
9. Future Call Amplified Telephone Ringer With Visual Indicator...
10. IFYOO Gaming Keyboard and Mouse Combo Set for Mobile Games Controller, Compatibl...


In [76]:
import math

def calculate_scores(run_results, qrels, run_name):
    """
    Calculates Recall@100, MRR, and NDCG@100 as required by the assignment.
    """
    recall_sum = 0
    mrr_sum = 0
    ndcg_sum = 0
    count = 0
    
    for qid, relevant_docs in qrels.items():
        if qid not in run_results: continue
        
        # We evaluate the top 100 results (as requested by rubric)
        found_docs = run_results[qid][:100]
        count += 1
        
        # --- 1. RECALL@100 ---
        # (Relevant docs found) / (Total relevant docs exist)
        hits = len(set(found_docs) & relevant_docs)
        recall_sum += hits / len(relevant_docs)
        
        # --- 2. RECIPROCAL RANK (MRR) ---
        # Score is 1/Rank of the FIRST relevant item found
        rr = 0
        for i, docid in enumerate(found_docs):
            if docid in relevant_docs:
                rr = 1.0 / (i + 1)
                break
        mrr_sum += rr
        
        # --- 3. NDCG@100 ---
        # A complex metric that rewards putting correct items at the very top
        dcg = 0.0
        idcg = 0.0
        
        # Calculate DCG (Your Score)
        for i, docid in enumerate(found_docs):
            if docid in relevant_docs:
                # Logarithmic decay: lower rank = less points
                dcg += 1.0 / math.log2(i + 2)
        
        # Calculate IDCG (The Perfect Score)
        # If we had put all relevant docs at the top, what would the score be?
        num_relevant = len(relevant_docs)
        num_found_limit = min(num_relevant, 100) # We only look at top 100
        
        for i in range(num_found_limit):
            idcg += 1.0 / math.log2(i + 2)
            
        if idcg > 0:
            ndcg_sum += dcg / idcg

    print(f"\n--- RESULTS: {run_name} ---")
    if count > 0:
        print(f"Evaluated {count} queries.")
        print(f"Recall@100:      {recall_sum/count:.4f}")
        print(f"Reciprocal Rank: {mrr_sum/count:.4f}")
        print(f"NDCG@100:        {ndcg_sum/count:.4f}")
    else:
        print("⚠️ ERROR: No queries evaluated.")

In [77]:
from tqdm import tqdm
import collections

def run_final_submission():
    print("1. Loading Data...")
    qrels = collections.defaultdict(set)
    with open("product-search-dev.qrels.txt", 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 4 and int(parts[3]) > 0:
                qrels[parts[0]].add(parts[2])
    
    with open("qid2query.tsv", 'r') as f:
        all_lines = f.readlines()
        
    print(f"2. Generating Run Files for {len(qrels)} queries...")
    
    f_base = open("run_baseline.txt", "w")
    f_opt  = open("run_optimized.txt", "w")
    
    results_base = {}
    results_opt = {}
    
    # Run fully on the lines that have answers
    for line in tqdm(all_lines):
        parts = line.strip().split('\t')
        if len(parts) < 2: continue
        qid, qtext = parts[0], parts[1]
        
        if qid not in qrels: continue

        # === A. BASELINE (Standard BM25) ===
        # "Compare different retrieval models" -> Standard Model
        hits_base = es.search(index=index_name, size=100, body={
            "query": {"multi_match": {
                "query": qtext,
                "fields": ["title", "contents", "brand"]
            }}
        })['hits']['hits']
        
        results_base[qid] = [h['_id'] for h in hits_base]
        for i, h in enumerate(hits_base):
            # Format: query_id Q0 doc_id rank score run_tag
            f_base.write(f"{qid} Q0 {h['_id']} {i+1} {h['_score']:.4f} baseline\n")
            
        # === B. OPTIMIZED (Boosted BM25) ===
        # "Try re-ranking" / "Parameter Tuning" -> Boosted Model
        hits_opt = es.search(index=index_name, size=100, body={
            "query": {"multi_match": {
                "query": qtext,
                "fields": ["title^3", "contents", "brand^2"],
                "type": "best_fields"
            }}
        })['hits']['hits']
        
        results_opt[qid] = [h['_id'] for h in hits_opt]
        for i, h in enumerate(hits_opt):
            f_opt.write(f"{qid} Q0 {h['_id']} {i+1} {h['_score']:.4f} optimized\n")

    f_base.close()
    f_opt.close()
    
    print("\n3. Calculating Final Grades (per rubric requirements)...")
    calculate_scores(results_base, qrels, "Baseline Run")
    calculate_scores(results_opt, qrels, "Optimized Run")

run_final_submission()

1. Loading Data...
2. Generating Run Files for 8935 queries...


  hits_base = es.search(index=index_name, size=100, body={
  hits_opt = es.search(index=index_name, size=100, body={
100%|██████████| 30734/30734 [02:47<00:00, 183.07it/s]   



3. Calculating Final Grades (per rubric requirements)...

--- RESULTS: Baseline Run ---
Evaluated 8934 queries.
Recall@100:      0.3886
Reciprocal Rank: 0.4554
NDCG@100:        0.3152

--- RESULTS: Optimized Run ---
Evaluated 8934 queries.
Recall@100:      0.4035
Reciprocal Rank: 0.4955
NDCG@100:        0.3357
