# README

### **Running this notebook might take up to 5 minutes!**

### I found this assignment a bit tricky. Here are my solutions:

1. I used a custom pipeline with elasticsearch's built-in features *(standard tokenizer, english stop words, lowercase)*

2. I prefer notebooks to be **portable**, so instead of downloading the collection locally, I have built a generator that streams and transforms the collection on the fly with elasticsearch's helpers. this way my laptop doesn't explode and **you can run this notebook on your device without risk.**

3. Similar approach for the query list. Using requests and the io + csv trick to store the data into a variable without physically downloading the data

4. Then matched query against collection

note: you might need to edit this config
> xpack.security.enabled: false

I used a remote session to my linux subsystem for this notebook





In [1]:
from elasticsearch import Elasticsearch, ConnectionError
import json
import gzip
import requests
from elasticsearch import helpers
from tqdm import tqdm
import csv
import io 

In [None]:
es = Elasticsearch("http://localhost:9200")
corpus_url = "https://huggingface.co/datasets/trec-product-search/Product-Search-Corpus-v0.1/resolve/main/data/trec/collection.trec.gz"
index_name = "trec_products"
query_url = "https://huggingface.co/datasets/trec-product-search/product-search-corpus/resolve/main/data/qid2query.tsv"
my_topic_id = "24280"
my_query_text = ""

info = es.info()
print(info)

{'name': 'mkueverlingasus', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'GwKghPX8Q8GPYkzqbcWUhg', 'version': {'number': '9.1.4', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '0b7fe68d2e369469ff9e9f344ab6df64ab9c5293', 'build_date': '2025-09-16T22:05:19.073893347Z', 'build_snapshot': False, 'lucene_version': '10.2.2', 'minimum_wire_compatibility_version': '8.19.0', 'minimum_index_compatibility_version': '8.0.0'}, 'tagline': 'You Know, for Search'}


In [None]:
# configs for tokenizer and filters
index_config = {
  "settings": {
    "analysis": {
      "analyzer": {
        "trec_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "english_stop"
          ]
        }
      },
      # use built-in english stopwords
      "filter": {
        "english_stop": {
          "type": "stop",
          "stopwords": "_english_"
        }
      }
    }
  },
  # schema
  "mappings": {
    "properties": {
      "name": {
        "type": "text",
        "analyzer": "trec_analyzer"
      },
      "description": {
        "type": "text",
        "analyzer": "trec_analyzer"
      },
      "brand": {
        "type": "keyword"
      }
    }
  }
}

try:
    # delete if exist
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)

    # recreate index
    response = es.indices.create(index=index_name, body=index_config)

except Exception as e:
    print({e})

{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "trec_products"
}


In [4]:
# generator function to yield documents for bulk indexing
def generate_actions(url):
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        
        with gzip.open(r.raw, 'rt', encoding='utf-8', errors='ignore') as f:
            
            # parsing each line in the corpus
            for line in tqdm(f, desc="Reading corpus"):
                try:
                    # removing nulls
                    clean_line = line.replace('\0', '')
                    
                    # data is tabular, split by tabs
                    columns = clean_line.split('\t')

                    # correct variable names
                    doc_id = columns[0]
                    name = columns[1]
                    # IMPORTANT missing descriptions are possible
                    description = columns[2] if len(columns) > 2 else "" 

                    # laptop crash prevention
                    yield {
                      "_index": index_name,
                      "_id": doc_id,
                      "_source": {
                          "name": name,
                          "description": description
                      }
                    }
                # skip and log errors
                except Exception as e:
                    print(f"Skipping row due to error: {e}")

# BULK INDEXING LETS GO
try:
    print("starting bulk indexing... this will take a little bit ...")
    
    # helpers.bulk is super powerful here for bulk operations (speed and lightweight memory)
    success, failed = helpers.bulk(
        es, 
        generate_actions(corpus_url), 
        chunk_size=1000,
        raise_on_error=False,  
        stats_only=True
    )
    
    print(f"\n done! success: {success}, failed: {failed}")

    # refresh index (like commit in solr)
    es.indices.refresh(index=index_name)

except Exception as e:
    print(f"something went wrong: {e}")

starting bulk indexing... coffee time ...


Reading corpus: 1118658it [03:21, 5558.54it/s]



 done! success: 1118658, failed: 0


In [5]:
print(f"finding query for topic ID: {my_topic_id}...")

response = requests.get(query_url)
response.raise_for_status()

# read the text content of the response
file_content = response.text
f = io.StringIO(file_content)

# use the csv reader to handle the tsv
reader = csv.reader(f, delimiter='\t')

for row in reader:
    if row[0] == my_topic_id:
        my_query_text = row[1]
        break

if my_query_text:
    print(f"found query: '{my_query_text}'")
else:
    print(f"noquery for topic ID {my_topic_id}")

finding query for topic ID: 24280...
found query: 'cellar air conditioner'


In [6]:
print(f"searching for: '{my_query_text}' from topic ID {my_topic_id}...")

# result body
query_body = {
  "query": {
    "multi_match": {
      "query": my_query_text,
      "fields": ["name", "description"]
    }
  }
}

# RUN THE SEARCH 
response = es.search(
    index=index_name,
    body=query_body,
    size=10 # top 10 results
)

print("\n--- SEARCH RESULTS ---")

results = response.body

# pretty print
print(f"{results['hits']['total']['value']} total matching documents")
print("Top 10 hits:")

for i, hit in enumerate(results['hits']['hits']):
    print(f"\n--- Result {i+1} (Score: {hit['_score']}) ---")
    print(f"  ID: {hit['_id']}")
    print(f"  Name: {hit['_source'].get('name')}")
    
    description = hit['_source'].get('description', '')
    if description:
          # only print first 150 chars
          print(f"  Description: {description[:150]}...") 
    else:
          print("  Description: [EMPTY]")

searching for: 'cellar air conditioner' from topic ID 24280...


  response = es.search(



--- SEARCH RESULTS ---
10000 total matching documents
Top 10 hits:

--- Result 1 (Score: 19.53284) ---
  ID: 7691
  Name: WhisperKOOL SC PRO 3000 Wine Cooling Unit
  Description: Product Description SC PRO 2000 SC PRO 3000 SC PRO 4000 SC PRO 8000 Cellar Size (cubic ft) 300 cu. ft. 650 cu. ft. 1000 cu. ft. 2000 cu. ft. Dimension...

--- Result 2 (Score: 19.53284) ---
  ID: 204027
  Name: WhisperKOOL SC PRO 4000 Wine Cooling Unit
  Description: Product Description SC PRO 2000 SC PRO 3000 SC PRO 4000 SC PRO 8000 Cellar Size (cubic ft) 300 cu. ft. 650 cu. ft. 1000 cu. ft. 2000 cu. ft. Dimension...

--- Result 3 (Score: 19.53284) ---
  ID: 727683
  Name: WhisperKOOL SC PRO 2000 Wine Cooling Unit
  Description: Product Description SC PRO 2000 SC PRO 3000 SC PRO 4000 SC PRO 8000 Cellar Size (cubic ft) 300 cu. ft. 650 cu. ft. 1000 cu. ft. 2000 cu. ft. Dimension...

--- Result 4 (Score: 18.187454) ---
  ID: 307978
  Name: Ironwood Gourmet 28221 Appalachian Salt Cellar, Acacia Wood
  Descriptio