In [None]:
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from datasets import load_dataset
# from tqdm import tqdm 
import json
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet

In [None]:
# configurations and data sources

# # connect to to the local instance (run bin/elasticsearch in another terminal)
es = Elasticsearch("http://localhost:9200")

# setting the index name
index_name = "trec_product_search"

# data source paths
corpus_path = "product_catalogue_esci.jsonl"
query_pathl = "qid2query.tsv"
qrels_path = "product-search-dev.qrels.txt"

# downloading nltk wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

# obligatory connection test
if es.ping():
    info = es.info()
    print("elasticsearch connected!")
else:
    print("ERROR")

In [None]:
index_settings = {
  "settings": {
    "analysis": {
      "analyzer": {
        "trec_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "my_stop_filter", "english_stemmer"]
        }
      },
      "filter": {
        "english_stemmer": {"type": "stemmer", "language": "english"},
        "my_stop_filter": {"type": "stop", "stopwords": "_english_"}
      }
    }
  },
  "mappings": {
    "properties": {
      "id": {"type": "keyword"},
      "title": {"type": "text", "analyzer": "trec_analyzer"},
      "contents": {"type": "text", "analyzer": "trec_analyzer"},
      "brand": {"type": "text", "analyzer": "trec_analyzer"}
    }
  }
}

In [None]:
# delete index in case we break something
if es.indices.exists(index=index_name):
   es.indices.delete(index=index_name)

# recreate index
es.indices.create(index=index_name, body=index_settings)
print(f"index '{index_name}' created ")

In [None]:
# first look into the corpus
with open(corpus_path, 'r', encoding='utf-8') as f:
   for i in range(20):
      print(json.loads(f.readline()))
      

# we find the fields 'product_id', 'product_title',  'product_description', 'product_bullet_point', 'product_brand', 'product_color_name', 'product_locale', 'trecid'

In [None]:
def generate_docs():
  print(f"Reading {corpus_path}...")
  with open(corpus_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
      try:
        doc = json.loads(line)

        # 1. ID MAPPING
        # Prioritize 'trecid' for the assignment grading
        doc_id = str(doc.get("trecid") or doc.get("product_id"))

        # 2. KEY FIELDS
        brand = doc.get("product_brand", "") or ""
        title = doc.get("product_title", "") or ""

        # 3. CONTENT MERGING (The "All-In" Strategy)
        # Combine Description + Bullets + Color into one searchable blob
        desc = doc.get("product_description", "") or ""
        bullets = doc.get("product_bullet_point", "") or ""
        color = doc.get("product_color_name", "") or ""
        
        full_contents = f"{desc} {bullets} {color}"

        yield {
          "_index": index_name,
          "_id": doc_id,
          "_source": {
            "id": doc_id,
            "title": title,
            "contents": full_contents,
            "brand": brand,
            # Metadata (not searched, but stored)
            "asin": doc.get("product_id"), 
            "locale": doc.get("product_locale") 
          }
        }
      except json.JSONDecodeError:
        continue

In [None]:
# BULK INDEXING LETS GO
# !!!! change chunk_size if it doesn't perform well !!!!
# took me 8m 15s with chunk_size = 5000
success, failed = helpers.bulk(
    es, 
    generate_docs(),
    stats_only =True,
    chunk_size=5000)

print(f"\n done! success: {success}, failed: {failed}")


In [None]:
# create a mask for english stop words
english_stopwords = set(stopwords.words('english'))

def get_synonyms(word):
    # filter out stop words into an empty list
    if word.lower() in english_stopwords:
        return []
    
    # it is recommended to filter out short words as they carry no helpful meaning
    if len(word) < 3:
        return []
    
    # again we use set so we avoid duplicates
    found_synonyms = set()

    # we use two loops with wordnet. one retrieves the synset (group), so the meaning,  and the second retieves the synonyms. e.g. bank has multiple meanings, like financial institution or river bank. the lemma loop retrives the synonyms in each of those meanings.
    for group in wordnet.synsets(word):
        for lemma in group.lemmas():
            text = lemma.name()

            # wordnet uses underscores so we replace that with a space
            clean_text = text.replace('_', ' ')

            # skip the words used in query
            if clean_text.lower() != word.lower():
                found_synonyms.add(clean_text)

    return list(found_synonyms)


In [None]:
def expand_query(user_query):
    # split the query into tokens
    tokens = user_query.split()
    
    # Step 2: Create a bucket to hold our new, massive list of words
    final_tokens = []
    
    for token in tokens:
        # Always keep the original word!
        final_tokens.append(token)
        
        # Get the synonyms for this specific word
        synonyms = get_synonyms(token)
        
        # Add the synonyms to our bucket
        # (We use 'extend' to add a list to a list)
        final_tokens.extend(synonyms)
        
    # Step 3: Glue all the words back together into one giant string
    return " ".join(final_tokens)

# --- 4. TEST IT ---
print("\n--- SYNONYM TEST ---")
test_phrase = "mobile phone"
print(f"Original: {test_phrase}")
print(f"Expanded: {expand_query(test_phrase)}")

In [None]:
# 5. DEFINE SEARCH WITH SYNONYMS (PLAN B)
def search_products(user_query, top_k=10):
    
    # 1. Expand the query
    # "samsung case" -> "samsung case casing shell"
    expanded_query = expand_query(user_query)
    
    # 2. Construct the Search
    query_body = {
        "size": top_k,
        "query": {
            "multi_match": {
                "query": expanded_query, 
                
                # We search in title and contents.
                # Title is boosted (^3) so exact matches appear higher
                "fields": ["title^3", "contents", "brand^2"],
                
                # CRITICAL: Use "or". If we used "and", the document would need 
                # to contain ALL synonyms (impossible).
                "operator": "or" 
            }
        }
    }

    response = es.search(index=index_name, body=query_body)
    return response['hits']['hits']

# --- FINAL TEST ---
print("--- SEARCH TEST ---")
results = search_products("mobile phone")

for i, hit in enumerate(results):
    print(f"{i+1}. {hit['_source']['title'][:80]}...")

In [36]:
import math
import collections
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import time

# --- 1. SETUP RESOURCES ---
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    print("Downloading WordNet...")
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    nltk.download('stopwords')

english_stops = set(stopwords.words('english'))

# --- 2. FAST SYNONYM LOGIC ---
# We use a cache so we don't look up the same word (like "phone") 10,000 times
synonym_cache = {}

def get_synonyms(word):
    if word in synonym_cache:
        return synonym_cache[word]
    
    if word.lower() in english_stops or len(word) < 3:
        synonym_cache[word] = []
        return []
    
    found = set()
    # OPTIMIZATION: Nouns only, top 2 meanings
    try:
        for group in wordnet.synsets(word, pos=wordnet.NOUN)[:2]:
            for lemma in group.lemmas():
                clean = lemma.name().replace('_', ' ')
                if clean.lower() != word.lower(): found.add(clean)
    except:
        pass # Safety for weird characters
        
    result = list(found)
    synonym_cache[word] = result
    return result

def expand_query(text):
    tokens = text.split()
    final = []
    for t in tokens:
        final.append(t)
        final.extend(get_synonyms(t))
    return " ".join(list(set(final)))

# --- 3. EVALUATOR ---
class TrecEvaluator:
    def __init__(self, qrels_path):
        self.qrels = collections.defaultdict(set)
        # Load QRELS first so we know which queries matter
        with open(qrels_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) == 4 and int(parts[3]) > 0:
                    self.qrels[parts[0]].add(parts[2])

    def score(self, run_results, name):
        recall, mrr, ndcg, count = 0, 0, 0, 0
        for qid, rel_docs in self.qrels.items():
            if qid not in run_results: continue
            found = run_results[qid]
            count += 1
            
            # Recall
            hits = len(set(found) & rel_docs)
            recall += hits / len(rel_docs)
            
            # MRR
            for i, d in enumerate(found):
                if d in rel_docs:
                    mrr += 1.0 / (i + 1); break
            
            # NDCG
            dcg, idcg = 0.0, 0.0
            for i, d in enumerate(found):
                if d in rel_docs: dcg += 1.0 / math.log2(i + 2)
            for i in range(min(len(found), len(rel_docs))):
                idcg += 1.0 / math.log2(i + 2)
            if idcg > 0: ndcg += dcg / idcg

        print(f"\n--- RESULTS: {name} ---")
        if count == 0:
            print("WARNING: No matching queries found. Check IDs!")
        else:
            print(f"Recall@100: {recall/count:.4f}")
            print(f"MRR:        {mrr/count:.4f}")
            print(f"NDCG@100:   {ndcg/count:.4f}")

# --- 4. EXECUTION WITH PROGRESS BAR ---
def run_project_fast():
    print("1. Loading Answer Key (QRELS)...")
    # Load QRELS first to filter queries
    valid_qids = set()
    with open("product-search-dev.qrels.txt", 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 1:
                valid_qids.add(parts[0])
    
    print(f"   Found {len(valid_qids)} queries with answers.")
    print("2. Starting Search Loop (showing progress every 100 queries)...")
    
    f1 = open("run_baseline.txt", "w")
    f2 = open("run_optimized.txt", "w")
    
    results_base = {}
    results_opt = {}
    
    processed_count = 0
    start_time = time.time()
    
    with open("qid2query.tsv", 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 2: continue
            qid, qtext = parts[0], parts[1]
            
            # OPTIMIZATION: Skip queries that won't be graded
            if qid not in valid_qids:
                continue
            
            processed_count += 1
            if processed_count % 100 == 0:
                elapsed = time.time() - start_time
                rate = processed_count / elapsed
                print(f"   Processed {processed_count} queries... ({rate:.1f} q/sec)")
            
            # === A. BASELINE ===
            hits = es.search(index=index_name, size=100, body={
                "query": {"multi_match": {"query": qtext, "fields": ["title", "contents", "brand"]}}
            })['hits']['hits']
            results_base[qid] = [h['_id'] for h in hits]
            for i, h in enumerate(hits):
                f1.write(f"{qid} Q0 {h['_id']} {i+1} {h['_score']:.4f} baseline\n")
                
            # === B. OPTIMIZED ===
            hits_opt = es.search(index=index_name, size=100, body={
                "query": {"multi_match": {
                    "query": expand_query(qtext),
                    "fields": ["title^3", "contents", "brand^2"],
                    "operator": "or"
                }}
            })['hits']['hits']
            results_opt[qid] = [h['_id'] for h in hits_opt]
            for i, h in enumerate(hits_opt):
                f2.write(f"{qid} Q0 {h['_id']} {i+1} {h['_score']:.4f} optimized\n")
                
    f1.close(); f2.close()
    
    print("\n3. Calculating Grades...")
    evaluator = TrecEvaluator("product-search-dev.qrels.txt")
    evaluator.score(results_base, "Baseline Run")
    evaluator.score(results_opt, "Optimized Run")

run_project_fast()

Downloading WordNet...
1. Loading Answer Key (QRELS)...
   Found 8954 queries with answers.
2. Starting Search Loop (showing progress every 100 queries)...


[nltk_data] Downloading package wordnet to /home/marvin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/marvin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /home/marvin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  hits = es.search(index=index_name, size=100, body={
  hits_opt = es.search(index=index_name, size=100, body={


   Processed 100 queries... (63.5 q/sec)
   Processed 200 queries... (68.5 q/sec)
   Processed 300 queries... (70.7 q/sec)
   Processed 400 queries... (73.0 q/sec)
   Processed 500 queries... (74.3 q/sec)
   Processed 600 queries... (75.3 q/sec)
   Processed 700 queries... (75.8 q/sec)
   Processed 800 queries... (76.0 q/sec)
   Processed 900 queries... (76.0 q/sec)
   Processed 1000 queries... (76.3 q/sec)
   Processed 1100 queries... (76.6 q/sec)
   Processed 1200 queries... (76.9 q/sec)
   Processed 1300 queries... (77.2 q/sec)
   Processed 1400 queries... (77.4 q/sec)
   Processed 1500 queries... (77.2 q/sec)
   Processed 1600 queries... (77.4 q/sec)
   Processed 1700 queries... (77.6 q/sec)
   Processed 1800 queries... (67.4 q/sec)
   Processed 1900 queries... (68.0 q/sec)
   Processed 2000 queries... (68.6 q/sec)
   Processed 2100 queries... (69.2 q/sec)
   Processed 2200 queries... (69.6 q/sec)
   Processed 2300 queries... (69.5 q/sec)
   Processed 2400 queries... (70.0 q/sec)
 