In [None]:
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from datasets import load_dataset
from tqdm import tqdm 

In [2]:
es = Elasticsearch("http://localhost:9200")
index_name = "trec_product_search"

In [3]:
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"Deleted existing index: {index_name}")

Deleted existing index: trec_product_search


In [4]:
index_name = "trec_product_search"

# Delete index if exists to ensure a clean slate
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"‚ôªÔ∏è  Deleted old index: {index_name}")

settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "analyzer": {
                # LAB BOOK REQUIREMENT: HTML Stripping + English Processing
                "html_english_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "char_filter": ["html_strip"],  # 1. Strips <br>, <div>, etc.
                    "filter": [
                        "lowercase",                # 2. Case insensitive
                        "english_stop",             # 3. Removes stopwords
                        "english_stemmer"           # 4. Stemming (run -> running)
                    ]
                }
            },
            "filter": {
                "english_stop": {"type": "stop", "stopwords": "_english_"},
                "english_stemmer": {"type": "stemmer", "language": "english"}
            }
        }
    },
    "mappings": {
        "properties": {
            "docid": {"type": "keyword"},
            "title": {"type": "text", "analyzer": "html_english_analyzer"},
            "brand": {"type": "keyword"}, # Keyword for Faceting/Filtering
            
            # --- PLAN B STRATEGY: METADATA AUGMENTATION ---
            # We combine Title + Brand + Bullets here for maximum recall.
            "search_content": {
                "type": "text", 
                "analyzer": "html_english_analyzer"
            }
        }
    }
}

es.indices.create(index=index_name, body=settings)
print(f"‚úÖ Created index '{index_name}' with Metadata Augmentation strategy.")

‚úÖ Created index 'trec_product_search' with Metadata Augmentation strategy.


In [6]:
# ---------------------------------------------------------
# 3. STREAM & INDEX CORPUS (The "Direct Link" Fix)
# ---------------------------------------------------------
print("\nüåä Connecting to Hugging Face Data Stream...")

# 1. We define the direct link to the raw data file
# This bypasses the "Dataset scripts are no longer supported" error
corpus_url = "https://huggingface.co/datasets/trec-product-search/product-search-corpus/resolve/main/corpus.jsonl.gz"

# 2. We load it as a generic "json" file instead of a custom dataset
# split="train" ensures we get the data generator
ds = load_dataset("json", data_files=corpus_url, split="train", streaming=True)

def generate_actions():
    # Iterate through the stream
    for row in ds:
        # Note: The raw JSON keys might slightly differ from the python script version
        # We use .get() to be safe.
        docid = row.get("docid")
        title = row.get("title") or ""
        brand = row.get("brand") or ""
        
        # Handle bullet points (sometimes list, sometimes None)
        bullets = row.get("bullet_points")
        if isinstance(bullets, list):
            bullets = " ".join(bullets)
        elif bullets is None:
            bullets = ""

        # LAB BOOK LOGIC: Concatenate fields
        combined_text = f"{title} {brand} {bullets}"

        yield {
            "_index": index_name,
            "_id": docid,
            "_source": {
                "docid": docid,
                "title": title,
                "brand": brand,
                "search_content": combined_text  # <-- Plan B Field
            }
        }

print("üöÄ Starting indexing... (This takes a few minutes)")

# We use standard tqdm with a manual update
successes, failed = helpers.bulk(es, tqdm(generate_actions(), mininterval=1.0), stats_only=True)

print(f"\n‚úÖ Indexing Complete!")
print(f"Indexed: {successes} documents")
print(f"Failed: {failed} documents")


üåä Connecting to Hugging Face Data Stream...
üöÄ Starting indexing... (This takes a few minutes)


1118658it [04:28, 4162.88it/s]


‚úÖ Indexing Complete!
Indexed: 1118658 documents
Failed: 0 documents





In [7]:
print("\nüì• Loading Queries for verification...")
# Using the resolved raw link
query_url = "https://huggingface.co/datasets/trec-product-search/product-search-2024-queries/resolve/main/2024_test_queries.tsv"

queries_df = pd.read_csv(query_url, sep="\t")
print(f"‚úÖ Loaded {len(queries_df)} queries.")
print(queries_df.head(3))


üì• Loading Queries for verification...
‚úÖ Loaded 116 queries.
   299                       peplum top
0  314  women light weight bikini pants
1  312   drawstring shorts women casual
2  170            avaivy facial product
