# Investigation: Are Top 5 "Alcohol Prep Pads" Products in Ground Truth?

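This notebook searches the full ESCI shopping-queries dataset (every query–product judgment joined to its product metadata) for the top 5 search results, matching each one by product title, to confirm whether it is labeled for the query "alcohol prep pads".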


In [7]:
from pathlib import Path

import numpy as np
import pandas as pd

# Load the data
# Both parquet files live in the same directory; keeping that location in one
# constant means the notebook can be pointed at another checkout by editing a
# single line instead of two long literals.
DATA_DIR = Path("/home/ubuntu/environments/semantic-search-grainger/esci-data/shopping_queries_dataset")

prod = pd.read_parquet(DATA_DIR / "shopping_queries_dataset_products.parquet")
quer = pd.read_parquet(DATA_DIR / "shopping_queries_dataset_examples.parquet")

# Left join: every row of `quer` is kept, and product columns are filled in
# wherever (product_locale, product_id) has a match in `prod`.
# The key columns carry the same names on both sides, so `on=` is equivalent
# to spelling out identical `left_on` / `right_on` lists.
df_examples_products = pd.merge(
    quer,
    prod,
    how='left',
    on=['product_locale', 'product_id'],
)

# Quick sanity check of what was loaded: row/column counts and column names
print(f"Dataset shape: {df_examples_products.shape}")
print(f"\nColumns: {df_examples_products.columns.tolist()}")


Dataset shape: (2621288, 14)

Columns: ['example_id', 'query', 'query_id', 'product_id', 'product_locale', 'esci_label', 'small_version', 'large_version', 'split', 'product_title', 'product_description', 'product_bullet_point', 'product_brand', 'product_color']


In [8]:
# These are the top 5 products from the search results
# NOTE: the entries are title fragments, not product IDs. The first four stop
# at exactly 80 characters (presumably cut off by the search-result display —
# TODO confirm), so they can only be matched as substrings of a full title.
# The fifth entry is shorter and looks like a complete title.
products_to_check = [
    "Winner Alcohol Prep Pads, 4-Ply Square Cotton Pads Well-saturated in Alcohol, 20",
    "Alcohol Prep Pads 400 Pack 6X3cm - Sterile 75% Alcohol Wipes - Thick Cotton Indi",
    "Alcohol Prep Pads | Medium 2-Ply - 200 Alcohol Wipes, individually wrapped Cotto",
    "Alcohol Prep Pads, Medium 2-Ply - 400 Alcohol Wipes, individually wrapped Swabs,",
    "Alcohol Prep Pads, Thick Alcohol Swabs (Pack of 400) - CUR45585RB"
]

# The query whose ground-truth labels are being investigated.
# It is compared against the `query` column with exact string equality.
query = "alcohol prep pads"


In [9]:
# Search for each product in the FULL dataset, then narrow the hits down to
# the rows that were judged for `query`.
RULE_WIDTH = 120             # width of the "=" / "-" separator rules
MAX_OTHER_QUERIES_SHOWN = 5  # cap on how many non-target queries are listed per product

print("SEARCHING FOR TOP 5 PRODUCTS IN FULL DATASET:\n")
print("=" * RULE_WIDTH)

for i, title_fragment in enumerate(products_to_check, 1):
    # Keep only the text before a "..." truncation marker (if any) and drop
    # stray surrounding whitespace so it cannot defeat the substring match.
    search_term = title_fragment.split('...')[0].strip()

    print(f"\n{i}. Searching for: '{title_fragment}'")
    print("-" * RULE_WIDTH)

    if not search_term:
        # An empty pattern is contained in every non-null title, so searching
        # for it would report the whole dataset as a match. Skip it instead of
        # printing a false positive.
        print("   ⚠️ SKIPPED: empty search term after removing truncation marker")
        continue

    # Literal (non-regex) substring match on the title; rows with a missing
    # title count as non-matching (na=False).
    matching_products = df_examples_products[
        df_examples_products['product_title'].str.contains(search_term, regex=False, na=False)
    ]

    if matching_products.empty:
        print("   ❌ NOT FOUND in the entire dataset")
        continue

    print(f"   ✓ Found {len(matching_products)} matching row(s) in dataset")

    # Now check if any of these matches are for the target query
    matching_for_query = matching_products[matching_products['query'] == query]

    if matching_for_query.empty:
        print(f"   ❌ NOT labeled for query '{query}'")
        print("   → This product is in the dataset for OTHER queries:")
        other_queries = matching_products['query'].unique()
        for q in other_queries[:MAX_OTHER_QUERIES_SHOWN]:
            print(f"      • {q}")
        if len(other_queries) > MAX_OTHER_QUERIES_SHOWN:
            print(f"      ... and {len(other_queries) - MAX_OTHER_QUERIES_SHOWN} more queries")
        continue

    print(f"   ✅ FOUND for query '{query}'!")
    # Only three columns are printed, so iterate over just those as tuples.
    labeled_columns = matching_for_query[['product_id', 'esci_label', 'product_title']]
    for row in labeled_columns.itertuples(index=False):
        print(f"      Product ID: {row.product_id}")
        print(f"      ESCI Label: {row.esci_label}")
        print(f"      Full Title: {row.product_title}")

print("\n" + "=" * RULE_WIDTH)


SEARCHING FOR TOP 5 PRODUCTS IN FULL DATASET:


1. Searching for: 'Winner Alcohol Prep Pads, 4-Ply Square Cotton Pads Well-saturated in Alcohol, 20'
------------------------------------------------------------------------------------------------------------------------
   ✓ Found 2 matching row(s) in dataset
   ❌ NOT labeled for query 'alcohol prep pads'
   → This product is in the dataset for OTHER queries:
      • alcohol pads
      • alcohol wipes

2. Searching for: 'Alcohol Prep Pads 400 Pack 6X3cm - Sterile 75% Alcohol Wipes - Thick Cotton Indi'
------------------------------------------------------------------------------------------------------------------------
   ✓ Found 1 matching row(s) in dataset
   ❌ NOT labeled for query 'alcohol prep pads'
   → This product is in the dataset for OTHER queries:
      • antiseptic wipes individually wrapped

3. Searching for: 'Alcohol Prep Pads | Medium 2-Ply - 200 Alcohol Wipes, individually wrapped Cotto'
--------------------------------

In [10]:
# Reverse check: list every row the ground truth holds for the target query
is_target_query = df_examples_products['query'].eq(query)
df_alcohol_query = df_examples_products.loc[is_target_query]

separator = "=" * 120

print(f"PRODUCTS ACTUALLY LABELED FOR '{query}':\n")
print(separator)
print(f"\nTotal: {len(df_alcohol_query)} products\n")

# Pull out just the four printed fields and walk them as plain tuples
listing = df_alcohol_query[['esci_label', 'product_title', 'product_id', 'product_brand']]
for rank, (label, title, product_id, brand) in enumerate(
    listing.itertuples(index=False, name=None), start=1
):
    print(f"{rank:2d}. [{label}] {title}")
    print(f"    Product ID: {product_id}, Brand: {brand}")
    print()

print(separator)


PRODUCTS ACTUALLY LABELED FOR 'alcohol prep pads':


Total: 16 products

 1. [E] Medpride Alcohol Prep Pads| 100 Pack| Medical-Grade, Sterile, Individually-Wrapped, Isopropyl Cotton Swabs| Disposable, Medium Square Size, 2ply, Latex Free & Antiseptic| for Medical & First-Aid Kits
    Product ID: B07F2N14FV, Brand: MED PRIDE

 2. [E] Care Touch Alcohol Prep Pads, Medium 2-Ply - 300 Alcohol Wipes
    Product ID: B01MDMA1ZB, Brand: Care Touch

 3. [E] Alcohol Prep Pads, Thick Alcohol Swabs (Pack of 400) - CUR45585RB
    Product ID: B07MYM8MXV, Brand: Curad

 4. [E] Dynarex 1113 Latex Free Sterile Alcohol Prep Pad (Box of 200)
    Product ID: B01HOISXWW, Brand: Dynarex

 5. [E] 100 Pcs Alcohol Prep Pads, 75% Alcohol Cotton Slices, Alcohol Gauze Pads Individually Wrapped Swap Pad, 6 x 6cm/2.4in x 2.4in
    Product ID: B087XYMMZX, Brand: Little Martin’s Drawer

 6. [E] 100 Pcs 75% Alcohol Cotton Slices, Alcohol Gauze Pads Individually Wrapped Swap Pad for Cleaning Care Mobile Phone Nail Comp