In [1]:
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from datasets import load_dataset
from tqdm import tqdm 
import json
from collections import Counter

In [2]:
# configuration: elasticsearch client, index name, input files and run label

# client for the local elasticsearch node (start it with bin/elasticsearch in another terminal)
es = Elasticsearch("http://localhost:9200")

# name of the index every later cell reads from / writes to
index_name = "trec_product_search"

# input files, relative to the notebook's working directory
corpus_path = "corpus.jsonl"
# NOTE(review): `query_pathl` looks like a typo for `query_path`; the name is
# left unchanged because other cells may already reference it
query_pathl = "2024_test_queries.tsv"

# label for this run
run_name = "es_bm25"

# connectivity check: only fetch the cluster info when the node answers the ping
if not es.ping():
    print("ERROR")
else:
    info = es.info()
    print("elasticsearch connected!")

elasticsearch connected!


In [3]:
# token filter: english stemming, referenced by name from the analyzer below
_english_stemmer = {"type": "stemmer", "language": "english"}

# custom analyzer: standard tokenizer, then lowercase, then the english stemmer
_trec_analyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["lowercase", "english_stemmer"],
}

# field mappings: the id is an exact-match keyword, every other field is
# full text analyzed with trec_analyzer (one fresh dict per field)
_properties = {"id": {"type": "keyword"}}
for _field in ("title", "contents", "brand"):
    _properties[_field] = {"type": "text", "analyzer": "trec_analyzer"}

# request body used when the index is created
index_settings = {
    "settings": {
        "analysis": {
            "analyzer": {"trec_analyzer": _trec_analyzer},
            "filter": {"english_stemmer": _english_stemmer},
        }
    },
    "mappings": {"properties": _properties},
}

In [4]:
# start from a clean slate: drop any previous copy of the index
index_already_exists = es.indices.exists(index=index_name)
if index_already_exists:
    es.indices.delete(index=index_name)

# build the index again from index_settings
es.indices.create(index=index_name, body=index_settings)
print(f"index '{index_name}' created ")

index 'trec_product_search' created 


In [5]:
# first look into the corpus: print the first few records
PREVIEW_ROWS = 10

with open(corpus_path, 'r', encoding='utf-8') as f:
    # zip stops as soon as either side runs out, so a corpus shorter than
    # PREVIEW_ROWS ends the loop instead of passing '' to json.loads
    for _, line in zip(range(PREVIEW_ROWS), f):
        # blank lines are not valid json; skip them rather than crash
        if line.strip():
            print(json.loads(line))

# we find the fields 'docid', 'title' and 'text'
# let's augment our metadata

{'docid': 1, 'title': 'FYY Leather Case with Mirror for Samsung Galaxy S8 Plus, Leather Wallet Flip Folio Case with Mirror and Wrist Strap for Samsung Galaxy S8 Plus Black', 'text': "Cell Phones & Accessories Cases, Holsters & Sleeves Flip Cases P r o d u c t   D e s c r i p t i o n   P r e m i u m   P U   L e a t h e r   T o p   q u a l i t y .   M a d e   w i t h   P r e m i u m   P U   L e a t h e r .   R e c e i v e r   d e s i g n .   A c c u r a t e   c u t - o u t   f o r   r e c e i v e r .   C o n v e n i e n t   t o   A n s w e r   t h e   p h o n e   w i t h o u t   o p e n   t h e   c a s e .   H a n d   s t r a p   m a k e s   i t   e a s y   t o   c a r r y   a r o u n d .   R F I D   T e c h n i q u e   R F I D   T e c h n i q u e :   R a d i o   F r e q u e n c y   I d e n t i f i c a t i o n   t e c h n o l o g y ,   t h r o u g h   r a d i o   s i g n a l s   t o   i d e n t i f y   s p e c i f i c   t a r g e t s   a n d   t o   r e a d   a n d   c o p y   e l e c t 

In [None]:
# we start from a list of known brands
# the goal is to add a field called 'brand' to every document that we can use later to improve precision
# NOTE(review): the list was meant to exclude brands that are common english words,
# but entries such as "apple", "shark", "coach" or "dove" are still in it; review before relying on them

# AI generated list of brands (Gemini)
known_brands = {
    # tech, electronics
    "samsung", "apple", "sony", "hp", "dell", "asus", "lenovo", "microsoft",
    "canon", "nikon", "nintendo", "playstation", "xbox", "logitech", "lg",
    "philips", "epson", "brother", "panasonic", "toshiba", "acer", "msi",
    "seagate", "wd", "sandisk", "kingston", "corsair", "razer", "garmin",
    "fitbit", "gopro", "roku", "google", "amazon", "oneplus", "xiaomi",
    "huawei", "motorola", "nokia", "bose", "jbl", "sennheiser", "skullcandy",
    "anker", "belkin", "tp-link", "netgear", "ubiquiti",

    # hardware
    "intel", "amd", "nvidia", "gigabyte", "evga", "coolermaster", "thermaltake",

    # home/kitchen appliances
    "dyson", "shark", "irobot", "kitchenaid", "cuisinart", "keurig", "nespresso",
    "instant pot", "ninja", "vitamix", "breville", "black+decker", "hamilton beach",
    "whirlpool", "ge", "bosch", "miele", "honeywell", "t-fal", "pyrex",

    # garden tools ("bosch" is already listed under appliances)
    "dewalt", "makita", "milwaukee", "ryobi", "craftsman", "hitachi",
    "stihl", "husqvarna", "toro", "karcher",

    # clothes
    "nike", "adidas", "puma", "under armour", "reebok", "new balance", "skechers",
    "asics", "converse", "vans", "timberland", "crocs", "birkenstock", "clarks",
    "levis", "wrangler", "lee", "calvin klein", "tommy hilfiger", "ralph lauren",
    "the north face", "columbia", "patagonia", "arcteryx", "marmot", "hanes",
    "fruit of the loom", "champion", "fila", "lululemon", "oakley", "ray-ban",

    # toys, hobbies
    "lego", "hasbro", "mattel", "barbie", "hot wheels", "fisher-price", "nerf",
    "funko", "bandai", "pokemon", "disney", "marvel", "star wars", "crayola",
    "melissa & doug", "vtech",

    # beauty and care
    "loreal", "maybelline", "garnier", "neutrogena", "olay", "dove", "nivea",
    "gillette", "braun", "philips sonicare", "oral-b", "colgate", "crest",
    "revlon", "covergirl", "mac", "clinique", "estee lauder",

    # luxury
    "gucci", "prada", "versace", "rolex", "casio", "seiko", "citizen", "fossil",
    "timex", "michael kors", "coach", "kate spade"
}

In [7]:
# let's check how many of the known brands actually show up in product titles

found_brands = Counter()

# working copy of the brand list used for the lookups below
target_brands = set(known_brands)

# upper bound on the number of corpus records to sample
limit_rows = 50000
# number of records actually read (smaller than limit_rows for a short corpus)
rows_scanned = 0

with open(corpus_path, 'r', encoding='utf-8') as f:
    for line in f:
        if rows_scanned >= limit_rows:
            break
        # blank lines are not valid json; skip them rather than crash
        if not line.strip():
            continue
        rows_scanned += 1

        # normalize the title to lower case; `or ""` also covers records
        # whose title is present but null
        doc = json.loads(line)
        title = (doc.get("title") or "").lower()

        # split on whitespace so only whole words match, and count each brand
        # at most once per title.
        # NOTE(review): because of this tokenization, brands containing a space
        # ("under armour", "new balance", ...) and tokens with attached
        # punctuation ("samsung,") can never match
        found_brands.update(target_brands.intersection(title.split()))

# results
print(" -- top 50 --")
if not found_brands:
    print("target brand not found in sample")
else:
    for brand, count in found_brands.most_common(50):
        print(f"{brand}: {count}")
print(f" -- sample size: {rows_scanned} --")

# every key in the counter was seen at least once, so the keys are the active brands
active_brands = set(found_brands)
print(f"also removed brands that haven't been found in the sample {len(known_brands)} to {len(active_brands)} active brands.")

 -- top 50 --
amazon: 423
samsung: 364
apple: 319
nike: 265
adidas: 253
disney: 213
hp: 123
champion: 116
hanes: 116
intel: 104
nintendo: 99
sony: 99
xbox: 97
skechers: 95
lg: 92
lego: 91
wrangler: 86
marvel: 79
funko: 77
crocs: 75
columbia: 70
playstation: 63
canon: 62
google: 61
lee: 55
puma: 55
vans: 53
shark: 52
dell: 51
mac: 51
asus: 49
clarks: 40
reebok: 39
asics: 38
keurig: 38
dewalt: 37
amd: 36
timberland: 36
nerf: 34
lenovo: 34
motorola: 33
philips: 32
ge: 32
fitbit: 32
brother: 31
garmin: 31
craftsman: 29
nvidia: 29
microsoft: 28
nikon: 27
 -- sample size: 50000 --
also removed brands that haven't been found in the sample 163 to 142 active brands.


In [8]:
# look up the brand of a single product title using the active brand list
def extract_brand(title, brands=None):
    """Return the brand mentioned in ``title``, or "" when there is none.

    The title is lower-cased and split on whitespace, and the resulting tokens
    are intersected with the brand collection. Only whole tokens match, so
    "pineapple" does not match "apple". Brands that contain a space and tokens
    with attached punctuation (e.g. "samsung,") cannot match.

    Args:
        title: product title; a falsy value (None, "") yields "".
        brands: iterable of lower-case brand names to look for. Defaults to
            the module-level ``active_brands``.

    Returns:
        The longest matching brand. Ties in length are broken alphabetically,
        so the result does not depend on set iteration order (which varies
        between kernel restarts because of string hash randomization).
    """
    # an empty or missing title cannot contain a brand
    if not title:
        return ""
    if brands is None:
        brands = active_brands

    # a set of tokens: whole-word matching, duplicates removed
    title_tokens = set(title.lower().split())

    # check for matches
    found = title_tokens.intersection(brands)
    if not found:
        return ""

    # several brands in one title: prefer the longest, then alphabetical order
    return min(found, key=lambda brand: (-len(brand), brand))

In [9]:
# using a generator because we want to prevent the ram from overloading
# this streams the data one line at a time
def generate_docs():
    """Yield one action dict per corpus record, for use with ``helpers.bulk``.

    Reads ``corpus_path`` line by line (one JSON object per line), renames the
    corpus fields ('docid' -> 'id', 'text' -> 'contents') and adds a 'brand'
    field computed from the title with ``extract_brand``. Blank lines are
    skipped.

    Yields:
        dict with the keys '_index', '_id' and '_source'.

    Raises:
        ValueError: if a record has no 'docid'. Without this check the record
            would be stored under the literal id "None" and every such record
            would overwrite the previous one.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # open the 1gb+ dataset in read mode; utf-8 handles special characters
    with open(corpus_path, 'r', encoding='utf-8') as f:
        # loop through the file one line at a time
        for line_number, line in enumerate(f, start=1):
            # a blank line is not valid json and would abort the whole bulk run
            if not line.strip():
                continue

            # translate the line into a dictionary
            doc = json.loads(line)

            # extract and rename (`doc_id` avoids shadowing the builtin `id`)
            raw_id = doc.get("docid")
            if raw_id is None:
                raise ValueError(
                    f"line {line_number} of {corpus_path} has no 'docid'"
                )
            doc_id = str(raw_id)
            title = doc.get("title", "")
            description = doc.get("text", "")

            # find a brand name inside the current title
            brand_extracted = extract_brand(title)

            # yield is what makes this a generator
            yield {
                "_index": index_name,
                "_id": doc_id,
                # _source contains the actual data fields we want to make searchable
                "_source": {
                    "id": doc_id,
                    "title": title,
                    "contents": description,
                    "brand": brand_extracted
                }
            }
    

In [None]:
# bulk-index the whole corpus, streamed from generate_docs()
# !!!! tune chunk_size if this performs badly on your machine !!!!
# reference timing: 4 min 20 s with chunk_size = 5000
# stats_only=True makes helpers.bulk return two counters instead of a list of errors
bulk_counts = helpers.bulk(es, generate_docs(), chunk_size=5000, stats_only=True)
success, failed = bulk_counts

print(f"\n done! success: {success}, failed: {failed}")



 done! success: 1118658, failed: 0
