<a href="https://colab.research.google.com/github/navneetkrc/Deep_learning_experiments/blob/master/query_rewriting_using_pyterrier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# PyTerrier Query Expansion Tutorial (Modified for Google Colab)
# Last updated: October 26, 2023 (Adaptable for 2025+)

!pip install -q --upgrade python-terrier  # Install the latest pyterrier (important!)

In [25]:
import pandas as pd
import random

product_names = [
    # (Samsung product names - same as before) ...
    "Samsung Galaxy S25 Ultra", "Samsung Galaxy S25+", "Samsung Galaxy S25",
    "Samsung Galaxy S26 Ultra", "Samsung Galaxy S24 Ultra", "Samsung Galaxy S24+", "Samsung Galaxy S24",
    "Samsung Galaxy S24 Fan Edition (FE)", "Samsung Galaxy S22 Ultra",
    "Samsung Galaxy Z Fold6", "Samsung Galaxy Z Flip6", "Samsung Galaxy Z Fold Special Edition",
    "Samsung Galaxy A16", "Samsung Galaxy A16 5G", "Samsung Galaxy A14", "Samsung Galaxy A06",
    "Samsung Galaxy A34 5G", "Samsung Galaxy A35", "Samsung Galaxy A54 5G", "Samsung Galaxy A55",
    "Samsung Galaxy M05", "Samsung Galaxy M14 4G", "Samsung Galaxy M15",
    "Samsung Galaxy M35", "Samsung Galaxy M55", "Samsung Galaxy M55s",
    "Samsung Galaxy F05", "Samsung Galaxy F14 4G", "Samsung Galaxy F15", "Samsung Galaxy F55",
    "Samsung Galaxy C55", "Samsung Galaxy XCover7",
    "Samsung Galaxy Tab S10 Ultra", "Samsung Galaxy Tab S10+",
    "Samsung Galaxy Tab S10 FE (Wi-Fi)", "Samsung Galaxy Tab S10 FE (5G)",
    "Samsung Galaxy Tab S10 FE Plus (Wi-Fi)", "Samsung Galaxy Tab S10 FE Plus (5G)",
    "Samsung Galaxy Tab S9 Ultra", "Samsung Galaxy Tab S9+",
    "Samsung Galaxy Tab S9 FE+", "Samsung Galaxy Tab S9 FE",
    "Samsung Galaxy Tab Active5 (2024)",
    "Samsung Galaxy Tab A9+", "Samsung Galaxy Tab A9",
    "Samsung Galaxy Tab S6 Lite (2024)",
    "Samsung Galaxy Watch Ultra",
    "Samsung Galaxy Watch7",
    "Samsung Galaxy Watch Fan Edition (FE)",
    "Samsung Galaxy Watch6 Classic",
    "Samsung Galaxy Watch6",
    "Samsung Galaxy Watch5 Pro",
    "Samsung Galaxy Watch5",
    "Samsung Galaxy Fit3",
    "Samsung Galaxy Buds Pro",
    "Samsung Galaxy Buds2 Pro",
    "Samsung Galaxy Buds Fan Edition (FE)",
    "Samsung Galaxy Buds2",
    "Samsung Galaxy Buds",
    "Samsung Bespoke 4-Door Flex Refrigerator",
    "Samsung RF28T5021SR (28 cu. ft. Large Capacity 3-Door French Door Refrigerator with AutoFill Water Pitcher)",
    "Samsung Family Hub Refrigerator",
    "Samsung 27 cu. ft. Large Capacity 3-Door French Door Refrigerator",
    "Samsung Bespoke Bottom Freezer Refrigerator",
    "Samsung WA54R7600AV (High-Efficiency Top Load Washer with Super Speed)",
    "Samsung 5.4 cu. ft. High-Efficiency Top Load Washer",
    "Samsung Smart Dial Front Load Washer",
    "Samsung Bespoke Ultra Capacity Front Load Washer",
    "Samsung Top Load Washer with Active WaterJet",
    "Samsung QN900C Neo QLED 8K TV",
    "Samsung The Frame QLED 4K TV",
    "Samsung OLED S95C TV",
    "Samsung 65-inch Class Crystal UHD 4K Smart TV",
    "Samsung Neo QLED 4K QN90B TV Series",
    "Samsung The Terrace Outdoor TV Series",
    "Samsung The Serif TV Series",
    "Samsung Bespoke Slide-in Gas Range",
    'Samsung NV51K7770DG (Double Oven Range)',
    'Samsung Smart Induction Cooktop',
    'Samsung Smart Freestanding Gas Range',
    'Samsung Electric Range with Air Fry',
    'Samsung Over-the-Range Microwave with Sensor Cook',
     'Samsung StormWash Dishwasher',
     'Samsung Bespoke Smart Dishwasher',
     'Samsung Odyssey G9 Gaming Monitor',
     'Samsung ViewFinity S9 Monitor Series',
     'Samsung ViewFinity S65VC Series (34-inch)',
     'Samsung 27-inch M80C UHD 4K Smart Monitor'
]

categories = {
    "Smartphones": ["phone", "mobile", "smartphone", "android", "5G", "4G", "LTE", "camera", "display", "processor", "storage", "RAM", "battery", "screen"],
    "Tablets": ["tablet", "android", "display", "pen", "stylus", "Wi-Fi", "5G", "storage", "RAM", "battery", "portable", "screen"],
    "Wearables": ["smartwatch", "watch", "fitness tracker", "health", "activity", "heart rate", "sleep tracking", "GPS", "Bluetooth", "wearable", "wrist"],
    "Audio": ["earbuds", "headphones", "wireless", "Bluetooth", "noise cancelling", "ANC", "audio", "sound", "music", "bass", "buds"],
    "Refrigerators": ["refrigerator", "fridge", "freezer", "cooling", "smart", "French door", "side-by-side", "top freezer", "bottom freezer", "stainless steel", "capacity", "cubic feet", "cu. ft."],
    "Washing Machines": ["washing machine", "washer", "dryer", "laundry", "front load", "top load", "high-efficiency", "smart", "capacity", "steam", "cubic feet", "cu. ft."],
    "TVs": ["TV", "television", "QLED", "OLED", "4K", "8K", "smart TV", "HDR", "display", "screen", "inches", "resolution", "refresh rate", "Hz"],
    "Kitchen Appliances": ["range", "cooktop", "oven", "microwave", "dishwasher", "smart", "stainless steel", "induction", "gas", "electric", "convection"],
    "Monitors": ["monitor", "display", "gaming", "4K", "UHD", "curved", "screen", "inches", "resolution", "Hz", "response time", "panel"]
}

def generate_description(product_name):
    description = f"{product_name}."
    category_keywords = []
    for cat, keywords in categories.items():
        if any(keyword.lower() in product_name.lower() for keyword in keywords):
            category_keywords = keywords
            break
    if not category_keywords:
        category_keywords = ["electronics", "device", "Samsung", "quality", "reliable"]
    num_keywords = random.randint(3, 6)
    selected_keywords = random.sample(category_keywords, min(num_keywords, len(category_keywords)))
    description += " Features: " + ", ".join(selected_keywords) + "."
    if random.random() < 0.7:
        if "phone" in category_keywords or "tablet" in category_keywords:
            colors = ["Black", "White", "Silver", "Blue", "Titanium", "Gray", "Phantom Black", "Cream", "Lavender", "Graphite"]
            storage = ["128GB", "256GB", "512GB", "1TB"]
            description += f" Available in {random.choice(colors)} with {random.choice(storage)} storage."
        elif "refrigerator" in category_keywords:
            colors = ["Stainless Steel", "Black Stainless Steel", "White", "Bespoke White Glass", "Bespoke Navy Glass", "Champagne Stainless Steel"]
            description += f" Available in {random.choice(colors)}."
        elif "washing machine" in category_keywords or "dishwasher" in category_keywords:
            colors = ["White", "Stainless Steel", "Black Stainless Steel", "Champagne", "Platinum"]
            description += f" Available in {random.choice(colors)}."
        elif "TV" in category_keywords or "monitor" in category_keywords:
          sizes = ["32-inch", "43-inch", "50-inch", "55-inch", "65-inch", "75-inch", "85-inch"]
          description += f" Available in a {random.choice(sizes)} size."
    return description

# --- Create the Samsung DataFrame ---
data_samsung = []
doc_id_counter = 1
for product_name in product_names:
    description = generate_description(product_name)
    data_samsung.append({"docno": f"d{doc_id_counter}", "text": description})
    doc_id_counter += 1

df_samsung = pd.DataFrame(data_samsung)

# --- Example DataFrame (for comparison) ---
data_example = [
    {"docno": "d1", "text": "red running shoes nike"},
    {"docno": "d2", "text": "blue sneakers adidas for men"},
    {"docno": "d3", "text": "women's white casual shoes"},
    {"docno": "d4", "text": "black leather boots for winter"},
    {"docno": "d5", "text": "running shoes for marathon training"},
    {"docno": "d6", "text": "cheap nike shoes on sale"},
    {"docno": "d7", "text": "adidas ultraboost running sneakers"},
]
df_example = pd.DataFrame(data_example)

# --- Demonstrating Identical Structure ---
print("Samsung DataFrame Head:")
print(df_samsung.head())
print("\nSamsung DataFrame Info:")
print(df_samsung.info())  # Show column names and types

print("\nExample DataFrame Head:")
print(df_example.head())
print("\nExample DataFrame Info:")
print(df_example.info())  # Show column names and types

print("\nAre the column names the same?", df_samsung.columns.equals(df_example.columns))

Samsung DataFrame Head:
  docno                                               text
0    d1  Samsung Galaxy S25 Ultra. Features: device, Sa...
1    d2  Samsung Galaxy S25+. Features: electronics, re...
2    d3  Samsung Galaxy S25. Features: device, electron...
3    d4  Samsung Galaxy S26 Ultra. Features: electronic...
4    d5  Samsung Galaxy S24 Ultra. Features: quality, r...

Samsung DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   docno   88 non-null     object
 1   text    88 non-null     object
dtypes: object(2)
memory usage: 1.5+ KB
None

Example DataFrame Head:
  docno                                 text
0    d1               red running shoes nike
1    d2         blue sneakers adidas for men
2    d3           women's white casual shoes
3    d4       black leather boots for winter
4    d5  running shoes for marathon training

Example Da

In [28]:
# PyTerrier Query Expansion Tutorial (Modified for Google Colab)
# Last updated: October 27, 2023

# !pip install --upgrade python-terrier  # Install the latest pyterrier
import pyterrier as pt
import pandas as pd
import os
import random  # Import the 'random' module

if not pt.started():
    pt.init()

# 1. Create the Samsung dataset (using the improved generation code)
product_names = [
    # Smartphones
    "Samsung Galaxy S25 Ultra", "Samsung Galaxy S25+", "Samsung Galaxy S25",
    "Samsung Galaxy S26 Ultra", "Samsung Galaxy S24 Ultra", "Samsung Galaxy S24+", "Samsung Galaxy S24",
    "Samsung Galaxy S24 Fan Edition (FE)", "Samsung Galaxy S22 Ultra",
    "Samsung Galaxy Z Fold6", "Samsung Galaxy Z Flip6", "Samsung Galaxy Z Fold Special Edition",
    "Samsung Galaxy A16", "Samsung Galaxy A16 5G", "Samsung Galaxy A14", "Samsung Galaxy A06",
    "Samsung Galaxy A34 5G", "Samsung Galaxy A35", "Samsung Galaxy A54 5G", "Samsung Galaxy A55",
    "Samsung Galaxy M05", "Samsung Galaxy M14 4G", "Samsung Galaxy M15",
    "Samsung Galaxy M35", "Samsung Galaxy M55", "Samsung Galaxy M55s",
    "Samsung Galaxy F05", "Samsung Galaxy F14 4G", "Samsung Galaxy F15", "Samsung Galaxy F55",
    "Samsung Galaxy C55", "Samsung Galaxy XCover7",

    # Tablets
    "Samsung Galaxy Tab S10 Ultra", "Samsung Galaxy Tab S10+",
    "Samsung Galaxy Tab S10 FE (Wi-Fi)", "Samsung Galaxy Tab S10 FE (5G)",
    "Samsung Galaxy Tab S10 FE Plus (Wi-Fi)", "Samsung Galaxy Tab S10 FE Plus (5G)",
    "Samsung Galaxy Tab S9 Ultra", "Samsung Galaxy Tab S9+",
    "Samsung Galaxy Tab S9 FE+", "Samsung Galaxy Tab S9 FE",
    "Samsung Galaxy Tab Active5 (2024)",
    "Samsung Galaxy Tab A9+", "Samsung Galaxy Tab A9",
    "Samsung Galaxy Tab S6 Lite (2024)",

    # Wearables
    "Samsung Galaxy Watch Ultra",
    "Samsung Galaxy Watch7",
    "Samsung Galaxy Watch Fan Edition (FE)",
    "Samsung Galaxy Watch6 Classic",
    "Samsung Galaxy Watch6",
    "Samsung Galaxy Watch5 Pro",
    "Samsung Galaxy Watch5",
    "Samsung Galaxy Fit3",

    # Audio
    "Samsung Galaxy Buds Pro",
    "Samsung Galaxy Buds2 Pro",
    "Samsung Galaxy Buds Fan Edition (FE)",
    "Samsung Galaxy Buds2",
    "Samsung Galaxy Buds",

    # Refrigerators
    "Samsung Bespoke 4-Door Flex Refrigerator",
    "Samsung RF28T5021SR (28 cu. ft. Large Capacity 3-Door French Door Refrigerator with AutoFill Water Pitcher)",
    "Samsung Family Hub Refrigerator",
    "Samsung 27 cu. ft. Large Capacity 3-Door French Door Refrigerator",
    "Samsung Bespoke Bottom Freezer Refrigerator",

    # Washing Machines
    "Samsung WA54R7600AV (High-Efficiency Top Load Washer with Super Speed)",
    "Samsung 5.4 cu. ft. High-Efficiency Top Load Washer",
    "Samsung Smart Dial Front Load Washer",
    "Samsung Bespoke Ultra Capacity Front Load Washer",
    "Samsung Top Load Washer with Active WaterJet",

    # TVs
    "Samsung QN900C Neo QLED 8K TV",
    "Samsung The Frame QLED 4K TV",
    "Samsung OLED S95C TV",
    "Samsung 65-inch Class Crystal UHD 4K Smart TV",
    "Samsung Neo QLED 4K QN90B TV Series",
    "Samsung The Terrace Outdoor TV Series",
    "Samsung The Serif TV Series",

    # Kitchen Appliances
    # Ranges & Cooktops
    "Samsung Bespoke Slide-in Gas Range",
    'Samsung NV51K7770DG (Double Oven Range)',
    'Samsung Smart Induction Cooktop',
    'Samsung Smart Freestanding Gas Range',
    'Samsung Electric Range with Air Fry',

     # Microwaves & Dishwashers
     'Samsung Over-the-Range Microwave with Sensor Cook',
     'Samsung StormWash Dishwasher',
     'Samsung Bespoke Smart Dishwasher',

     # Monitors
     'Samsung Odyssey G9 Gaming Monitor',
     'Samsung ViewFinity S9 Monitor Series',
     'Samsung ViewFinity S65VC Series (34-inch)',
     'Samsung 27-inch M80C UHD 4K Smart Monitor'
]

categories = {
    "Smartphones": ["phone", "mobile", "smartphone", "android", "5G", "4G", "LTE", "camera", "display", "processor", "storage", "RAM", "battery", "screen"],
    "Tablets": ["tablet", "android", "display", "pen", "stylus", "Wi-Fi", "5G", "storage", "RAM", "battery", "portable", "screen"],
    "Wearables": ["smartwatch", "watch", "fitness tracker", "health", "activity", "heart rate", "sleep tracking", "GPS", "Bluetooth", "wearable", "wrist"],
    "Audio": ["earbuds", "headphones", "wireless", "Bluetooth", "noise cancelling", "ANC", "audio", "sound", "music", "bass", "buds"],
    "Refrigerators": ["refrigerator", "fridge", "freezer", "cooling", "smart", "French door", "side-by-side", "top freezer", "bottom freezer", "stainless steel", "capacity", "cubic feet", "cu. ft."],
    "Washing Machines": ["washing machine", "washer", "dryer", "laundry", "front load", "top load", "high-efficiency", "smart", "capacity", "steam", "cubic feet", "cu. ft."],
    "TVs": ["TV", "television", "QLED", "OLED", "4K", "8K", "smart TV", "HDR", "display", "screen", "inches", "resolution", "refresh rate", "Hz"],
    "Kitchen Appliances": ["range", "cooktop", "oven", "microwave", "dishwasher", "smart", "stainless steel", "induction", "gas", "electric", "convection"],
    "Monitors": ["monitor", "display", "gaming", "4K", "UHD", "curved", "screen", "inches", "resolution", "Hz", "response time", "panel"]
}

def generate_description(product_name):
    description = f"{product_name}."
    category_keywords = []
    for cat, keywords in categories.items():
        if any(keyword.lower() in product_name.lower() for keyword in keywords):
            category_keywords = keywords
            break
    if not category_keywords:
        category_keywords = ["electronics", "device", "Samsung", "quality", "reliable"]
    num_keywords = random.randint(3, 6)
    selected_keywords = random.sample(category_keywords, min(num_keywords, len(category_keywords)))
    description += " Features: " + ", ".join(selected_keywords) + "."
    if random.random() < 0.7:
        if "phone" in category_keywords or "tablet" in category_keywords:
            colors = ["Black", "White", "Silver", "Blue", "Titanium", "Gray", "Phantom Black", "Cream", "Lavender", "Graphite"]
            storage = ["128GB", "256GB", "512GB", "1TB"]
            description += f" Available in {random.choice(colors)} with {random.choice(storage)} storage."
        elif "refrigerator" in category_keywords:
            colors = ["Stainless Steel", "Black Stainless Steel", "White", "Bespoke White Glass", "Bespoke Navy Glass", "Champagne Stainless Steel"]
            description += f" Available in {random.choice(colors)}."
        elif "washing machine" in category_keywords or "dishwasher" in category_keywords:
            colors = ["White", "Stainless Steel", "Black Stainless Steel", "Champagne", "Platinum"]
            description += f" Available in {random.choice(colors)}."
        elif "TV" in category_keywords or "monitor" in category_keywords:
          sizes = ["32-inch", "43-inch", "50-inch", "55-inch", "65-inch", "75-inch", "85-inch"]
          description += f" Available in a {random.choice(sizes)} size."
    return description

# Create the Samsung DataFrame
data_samsung = []
doc_id_counter = 1
for product_name in product_names:
    description = generate_description(product_name)
    data_samsung.append({"docno": f"d{doc_id_counter}", "text": description})
    doc_id_counter += 1

df = pd.DataFrame(data_samsung)  # Use the Samsung data


# 2. Index the dataset
index_dir = "./tmp_index"
if not os.path.exists(index_dir):
  os.makedirs(index_dir)

indexer = pt.IterDictIndexer(index_dir, overwrite=True)
index_ref = indexer.index(df.to_dict(orient='records'))

# 3. Load the index.
index = pt.IndexFactory.of(index_ref)

# 4. Example Usage (Query Expansion - Bo1)
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
bo1 = pt.rewrite.Bo1QueryExpansion(index)
pipeline = bm25 >> bo1

query = "Samsung phone"  # Use a query relevant to the Samsung data
expanded_query = pipeline.transform(pd.DataFrame([{'qid' : '1', 'query' : query}]))
print(f"Original Query: {query}")
print(f"Expanded Query (Bo1): {expanded_query.loc[0, 'query']}")

# 5. Example Usage (Query Expansion - RM3)
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
rm3 = pt.rewrite.RM3(index)
pipeline = bm25 >> rm3

query = "Samsung phone"  # Use a query relevant to the Samsung data
expanded_query_rm3 = pipeline.transform(pd.DataFrame([{'qid' : '1', 'query' : query}]))
print(f"Original Query: {query}")
print(f"Expanded Query (RM3): {expanded_query_rm3.loc[0, 'query']}")

# 6. Example Usage (Simple Retrieval)
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
results = bm25.search("Samsung phone")  # Use a query relevant to the Samsung data
print("\nBM25 Retrieval Results:")
print(results)

  if not pt.started():


Original Query: Samsung phone
Expanded Query (Bo1): applypipeline:off samsung^1.243326763 phone^1.743486008 4g^0.859252218 ram^0.515320091 5g^0.439349177 featur^0.269962742 galaxi^0.227733774 stainless^0.000000000 fridg^0.000000000 smartphon^0.000000000
Original Query: Samsung phone
Expanded Query (RM3): applypipeline:off ram^0.071827300 fridg^0.000841117 4g^0.107837841 camera^0.035816755 french^0.001682235 samsung^0.300000012 smartphon^0.035816755 displai^0.035816755 door^0.002523352 phone^0.371827304 processor^0.036010545

BM25 Retrieval Results:
   qid  docid docno  rank      score          query
0    1     27   d28     0  -2.405999  Samsung phone
1    1     21   d22     1  -2.495913  Samsung phone
2    1     62   d63     2  -5.464291  Samsung phone
3    1     65   d66     3  -5.464291  Samsung phone
4    1     72   d73     4  -5.464291  Samsung phone
..  ..    ...   ...   ...        ...            ...
83   1     29   d30    83 -11.250368  Samsung phone
84   1     30   d31    84 -11

  bm25 = pt.BatchRetrieve(index, wmodel="BM25")
  bm25 = pt.BatchRetrieve(index, wmodel="BM25")
  bm25 = pt.BatchRetrieve(index, wmodel="BM25")


In [39]:
# !pip install --upgrade python-terrier  # Install the latest pyterrier
import pyterrier as pt
import pandas as pd
import os
import random

if not pt.started():
    pt.init()

# --- Data Generation (Improved) ---
product_names = [
    "Samsung Galaxy S25 Ultra", "Samsung Galaxy S25+", "Samsung Galaxy S25",
    "Samsung Galaxy S26 Ultra", "Samsung Galaxy S24 Ultra", "Samsung Galaxy S24+", "Samsung Galaxy S24",
    "Samsung Galaxy S24 Fan Edition (FE)", "Samsung Galaxy S22 Ultra",
    "Samsung Galaxy Z Fold6", "Samsung Galaxy Z Flip6", "Samsung Galaxy Z Fold Special Edition",
    "Samsung Galaxy A16", "Samsung Galaxy A16 5G", "Samsung Galaxy A14", "Samsung Galaxy A06",
    "Samsung Galaxy A34 5G", "Samsung Galaxy A35", "Samsung Galaxy A54 5G", "Samsung Galaxy A55",
    "Samsung Galaxy M05", "Samsung Galaxy M14 4G", "Samsung Galaxy M15",
    "Samsung Galaxy M35", "Samsung Galaxy M55", "Samsung Galaxy M55s",
    "Samsung Galaxy F05", "Samsung Galaxy F14 4G", "Samsung Galaxy F15", "Samsung Galaxy F55",
    "Samsung Galaxy C55", "Samsung Galaxy XCover7",
    "Samsung Galaxy Tab S10 Ultra", "Samsung Galaxy Tab S10+",
    "Samsung Galaxy Tab S10 FE (Wi-Fi)", "Samsung Galaxy Tab S10 FE (5G)",
    "Samsung Galaxy Tab S10 FE Plus (Wi-Fi)", "Samsung Galaxy Tab S10 FE Plus (5G)",
    "Samsung Galaxy Tab S9 Ultra", "Samsung Galaxy Tab S9+",
    "Samsung Galaxy Tab S9 FE+", "Samsung Galaxy Tab S9 FE",
    "Samsung Galaxy Tab Active5 (2024)",
    "Samsung Galaxy Tab A9+", "Samsung Galaxy Tab A9",
    "Samsung Galaxy Tab S6 Lite (2024)",
    "Samsung Galaxy Watch Ultra",
    "Samsung Galaxy Watch7",
    "Samsung Galaxy Watch Fan Edition (FE)",
    "Samsung Galaxy Watch6 Classic",
    "Samsung Galaxy Watch6",
    "Samsung Galaxy Watch5 Pro",
    "Samsung Galaxy Watch5",
    "Samsung Galaxy Fit3",
    "Samsung Galaxy Buds Pro",
    "Samsung Galaxy Buds2 Pro",
    "Samsung Galaxy Buds Fan Edition (FE)",
    "Samsung Galaxy Buds2",
    "Samsung Galaxy Buds",
    "Samsung Bespoke 4-Door Flex Refrigerator",
    "Samsung RF28T5021SR",
    "Samsung Family Hub Refrigerator",
    "Samsung 27 cu. ft. Large Capacity Refrigerator",
    "Samsung Bespoke Bottom Freezer Refrigerator",
    "Samsung WA54R7600AV",
    "Samsung 5.4 cu. ft. High-Efficiency Washer",
    "Samsung Smart Dial Front Load Washer",
    "Samsung Bespoke Ultra Capacity Front Load Washer",
    "Samsung Top Load Washer with Active WaterJet",
    "Samsung QN900C Neo QLED 8K TV",
    "Samsung The Frame QLED 4K TV",
    "Samsung OLED S95C TV",
    "Samsung 65-inch Class Crystal UHD 4K Smart TV",
    "Samsung Neo QLED 4K QN90B TV Series",
    "Samsung The Terrace Outdoor TV Series",
    "Samsung The Serif TV Series",
    "Samsung Bespoke Slide-in Gas Range",
    'Samsung NV51K7770DG',
    'Samsung Smart Induction Cooktop',
    'Samsung Smart Freestanding Gas Range',
    'Samsung Electric Range with Air Fry',
    'Samsung Over-the-Range Microwave with Sensor Cook',
     'Samsung StormWash Dishwasher',
     'Samsung Bespoke Smart Dishwasher',
     'Samsung Odyssey G9 Gaming Monitor',
     'Samsung ViewFinity S9 Monitor Series',
     'Samsung ViewFinity S65VC Series (34-inch)',
     'Samsung 27-inch M80C UHD 4K Smart Monitor'
]

# More focused keywords, and more of them.  Removed very generic terms.
categories = {
    "Smartphones": ["phone", "mobile", "smartphone", "android", "5G", "4G", "LTE", "camera", "display", "processor", "storage", "RAM", "battery", "screen", "megapixel", "AMOLED", "refresh rate", "chipset"],
    "Tablets": ["tablet", "android", "display", "pen", "stylus", "Wi-Fi", "5G", "storage", "RAM", "battery", "portable", "screen", "AMOLED", "S Pen", "keyboard"],
    "Wearables": ["smartwatch", "watch", "fitness tracker", "health", "activity", "heart rate", "sleep tracking", "GPS", "Bluetooth", "wearable", "wrist", "ECG", "SpO2", "fitness"],
    "Audio": ["earbuds", "headphones", "wireless", "Bluetooth", "noise cancelling", "ANC", "audio", "sound", "music", "bass", "buds", "True Wireless", "Dolby Atmos"],
    "Refrigerators": ["refrigerator", "fridge", "freezer", "cooling", "smart", "French door", "side-by-side", "top freezer", "bottom freezer", "stainless steel", "capacity", "cubic feet", "cu. ft.", "ice maker", "water dispenser", "FlexZone"],
    "Washing Machines": ["washing machine", "washer", "dryer", "laundry", "front load", "top load", "high-efficiency", "smart", "capacity", "steam", "cubic feet", "cu. ft.", "Super Speed", "Vibration Reduction", "self clean"],
    "TVs": ["TV", "television", "QLED", "OLED", "4K", "8K", "smart TV", "HDR", "display", "screen", "inches", "resolution", "refresh rate", "Hz", "Neo QLED", "Quantum Dot", "HDMI 2.1"],
    "Kitchen Appliances": ["range", "cooktop", "oven", "microwave", "dishwasher", "smart", "stainless steel", "induction", "gas", "electric", "convection", "air fry", "sensor cook", "Steam Clean"],
    "Monitors": ["monitor", "display", "gaming", "4K", "UHD", "curved", "screen", "inches", "resolution", "Hz", "response time", "panel", "IPS", "VA", "FreeSync", "G-Sync"]
}

def generate_description(product_name):
    description = f"{product_name}."
    category_keywords = []
    for cat, keywords in categories.items():
        if any(keyword.lower() in product_name.lower() for keyword in keywords):
            category_keywords = keywords
            break
    if not category_keywords:
        category_keywords = ["electronics", "device", "Samsung", "quality", "reliable"]  #Keep fallback

    # Increased keyword count:
    num_keywords = random.randint(4, 8)  # More keywords per description
    selected_keywords = random.sample(category_keywords, min(num_keywords, len(category_keywords)))
    description += " Features: " + ", ".join(selected_keywords) + "."

    if random.random() < 0.8:  # Increased probability
        if "phone" in category_keywords or "tablet" in category_keywords:
            colors = ["Black", "White", "Silver", "Blue", "Titanium", "Gray", "Phantom Black", "Cream", "Lavender", "Graphite"]
            storage = ["128GB", "256GB", "512GB", "1TB"]
            # Add more specific details
            description += f" Available in {random.choice(colors)} with {random.choice(storage)} storage.  "
            if "phone" in category_keywords:
                description += f" It has a powerful {random.choice(['Snapdragon', 'Exynos'])} processor."
            elif "tablet" in category_keywords:
                description += f" Includes support for the S Pen."

        elif "refrigerator" in category_keywords:
            colors = ["Stainless Steel", "Black Stainless Steel", "White", "Bespoke White Glass", "Bespoke Navy Glass", "Champagne Stainless Steel"]
            description += f" Available in {random.choice(colors)}. "
            if random.random() < 0.5:
                description += " Features a built-in ice maker."

        elif "washing machine" in category_keywords or "dishwasher" in category_keywords:
            colors = ["White", "Stainless Steel", "Black Stainless Steel", "Champagne", "Platinum"]
            description += f" Available in {random.choice(colors)}. "
            if "washing machine" in category_keywords:
                description += f" Offers a {random.choice(['5.0', '5.4', '4.5'])} cu. ft. capacity."
            elif "dishwasher" in category_keywords:
                description += " Features StormWash technology."
        elif "TV" in category_keywords or "monitor" in category_keywords:
          sizes = ["32-inch", "43-inch", "50-inch", "55-inch", "65-inch", "75-inch", "85-inch"]
          description += f" Available in a {random.choice(sizes)} size. "
          if "TV" in category_keywords:
            description += f" Features {random.choice(['4K', '8K'])} resolution."
          elif "monitor" in category_keywords:
              description += f" Features a {random.choice(['144Hz', '240Hz', '60Hz'])} refresh rate"

    return description


# Create the Samsung DataFrame
data_samsung = []
doc_id_counter = 1
for product_name in product_names:
    description = generate_description(product_name)
    data_samsung.append({"docno": f"d{doc_id_counter}", "text": description})
    doc_id_counter += 1

df = pd.DataFrame(data_samsung)

# --- Indexing ---
index_dir = "./tmp_index"
if not os.path.exists(index_dir):
  os.makedirs(index_dir)

indexer = pt.IterDictIndexer(index_dir, overwrite=True, meta={'docno': 20, 'text': 5000}) #Specify meta lengths
index_ref = indexer.index(df.to_dict(orient='records'))
index = pt.IndexFactory.of(index_ref)

# --- Query Expansion and Retrieval ---

# 1.  BM25 (with term frequency control)
bm25 = pt.BatchRetrieve(index, wmodel="BM25", controls={"bm25.k_1": 1.2, "bm25.b": 0.75})  # Tune BM25 parameters

# 2.  Bo1 (with control over number of terms and documents)
bo1 = pt.rewrite.Bo1QueryExpansion(index, fb_terms=10, fb_docs=5) # Limit terms and docs
pipeline_bo1 = bm25 >> bo1

query = "ultra fast cgarzer"
expanded_query_bo1 = pipeline_bo1.transform(pd.DataFrame([{'qid' : '1', 'query' : query}]))
print(f"Original Query: {query}")
print(f"Expanded Query (Bo1): {expanded_query_bo1.loc[0, 'query']}")

# 3.  RM3 (with control over number of terms, documents)
rm3 = pt.rewrite.RM3(index, fb_terms=10, fb_docs=5) #Removed ,original_query_weight=0.5
pipeline_rm3 = bm25 >> rm3

query = "ultra fast cgarzer"
expanded_query_rm3 = pipeline_rm3.transform(pd.DataFrame([{'qid' : '1', 'query' : query}]))
print(f"Original Query: {query}")
print(f"Expanded Query (RM3): {expanded_query_rm3.loc[0, 'query']}")

# 4.  Simple Retrieval (using tuned BM25)
results = bm25.search("ultra fast cgarzer")
print("\nBM25 Retrieval Results:")
print(results)

  if not pt.started():


Original Query: ultra fast cgarzer
Expanded Query (Bo1): applypipeline:off ultra^1.852803792 fast^1.000000000 cgarzer^1.000000000 reliabl^0.416716506 samsung^0.414200473 electron^0.411661969 qualiti^0.411661969 galaxi^0.346094135 devic^0.338587216 featur^0.262126809 s26^0.000000000 s24^0.000000000
16:12:15.125 [main] WARN org.terrier.querying.RM1 -- Did not identify any usable candidate expansion terms from docid 3 among 10 possibilities
16:12:15.131 [main] WARN org.terrier.querying.RM1 -- Did not identify any usable candidate expansion terms from docid 8 among 10 possibilities
Original Query: ultra fast cgarzer
Expanded Query (RM3): applypipeline:off s10^0.105186790 s25^0.130329758 s24^0.164483473 ultra^0.200000018

BM25 Retrieval Results:
  qid  docid docno  rank     score               query
0   1      4    d5     0  3.753579  ultra fast cgarzer
1   1      0    d1     1  3.626198  ultra fast cgarzer
2   1      3    d4     2  3.626198  ultra fast cgarzer
3   1      8    d9     3  3.6

  bm25 = pt.BatchRetrieve(index, wmodel="BM25", controls={"bm25.k_1": 1.2, "bm25.b": 0.75})  # Tune BM25 parameters


In [34]:
df["text"]

Unnamed: 0,text
0,"Samsung Galaxy S25 Ultra. Features: reliable, ..."
1,"Samsung Galaxy S25+. Features: quality, electr..."
2,"Samsung Galaxy S25. Features: device, electron..."
3,"Samsung Galaxy S26 Ultra. Features: Samsung, e..."
4,Samsung Galaxy S24 Ultra. Features: electronic...
...,...
83,Samsung Bespoke Smart Dishwasher. Features: st...
84,Samsung Odyssey G9 Gaming Monitor. Features: r...
85,Samsung ViewFinity S9 Monitor Series. Features...
86,Samsung ViewFinity S65VC Series (34-inch). Fea...


In [41]:
rm3 = pt.rewrite.RM3(index, fb_terms=10, fb_docs=5)  # Remove original_query_weight
rm3

QueryExpansion(/content/tmp_index/data.properties,5,10,<org.terrier.querying.RM3 at 0x79289e0477d0 jclass=org/terrier/querying/RM3 jself=<LocalRef obj=0x3a6a610 at 0x79289ead3070>>)

In [35]:
df["docno"]

Unnamed: 0,docno
0,d1
1,d2
2,d3
3,d4
4,d5
...,...
83,d84
84,d85
85,d86
86,d87


In [45]:
bo1 = pt.rewrite.Bo1QueryExpansion(
    index,
    fb_terms=15,  # Number of expansion terms
    fb_docs=5     # Number of feedback documents
)

pipeline = pt.BatchRetrieve(index, wmodel="BM25") >> bo1 >> pt.BatchRetrieve(index, wmodel="BM25")

# Example usage
query = "smartphone battery life improvement"
expanded_query = pipeline.search(query)
print(f"Expanded Query: {expanded_query.iloc[0]['query']}")


Expanded Query: applypipeline:off smartphon^1.700035099 batteri^1.402260067 life^1.000000000 improv^1.000000000 storag^0.756857235 ram^0.700035099 5g^0.683429196 android^0.644188463 camera^0.601506712 screen^0.538504689 4g^0.467451717 s10^0.379592516 processor^0.379592516 power^0.379592516 refresh^0.360742151 galaxi^0.346094135 fe^0.330742528


  pipeline = pt.BatchRetrieve(index, wmodel="BM25") >> bo1 >> pt.BatchRetrieve(index, wmodel="BM25")


In [49]:
# Updated code using the recommended Retriever class
bo1 = pt.rewrite.Bo1QueryExpansion(
    index,
    fb_terms=10,    # Reduced to focus on more relevant terms
    fb_docs=10      # Increased for broader document feedback
)


# Replace deprecated BatchRetrieve with Retriever
# pipeline = pt.terrier.Retriever(index, wmodel="BM25") >> bo1 >> pt.terrier.Retriever(index, wmodel="BM25")
# KL model often provides different term selection
kl = pt.rewrite.KLQueryExpansion(index, fb_terms=10, fb_docs=10)
kl_pipeline = pt.terrier.Retriever(index, wmodel="BM25") >> kl >> pt.terrier.Retriever(index, wmodel="BM25")



# Example usage
query = "smartphone battery life improvement"
results = pipeline.search(query)
print(f"Expanded Query: {results.iloc[0]['query']}")


Expanded Query: applypipeline:off smartphon^1.000000000 batteri^1.682541987 life^1.000000000 improv^1.000000000 storag^0.925472620 5g^0.697679668 camera^0.546547786 s10^0.515107099 processor^0.515107099 power^0.515107099 screen^0.488882456 ram^0.477803347 amol^0.477803347


In [56]:
# Create multiple retrieval pipelines
bm25_pipeline = pt.terrier.Retriever(index, wmodel="BM25")
expanded_pipeline = pt.terrier.Retriever(index, wmodel="BM25") >> bo1 >> pt.terrier.Retriever(index, wmodel="BM25")

# Combine results using fusion
fusion_pipeline = bm25_pipeline ^ expanded_pipeline
# Example usage
query = "smartphone battery life improvement"
results = fusion_pipeline.search(query)
print(f"Expanded Query: {results.iloc[0]['query']}")

Expanded Query: smartphone battery life improvement


In [68]:
# Improved KL query expansion with updated PyTerrier API
kl = pt.rewrite.KLQueryExpansion(
    index,
    fb_terms=10,
    fb_docs=3
)

# Fix deprecation warning by using Retriever instead of BatchRetrieve
pipeline_kl = pt.terrier.Retriever(index, wmodel="DPH") >> kl >> pt.terrier.Retriever(index, wmodel="DPH")

# Test with product query and display expanded query
query = "4K TV color accuracy"
results = pipeline_kl.search(query)
print(f"Original query: {query}")
print(f"Expanded query: {results.iloc[0]['query']}")
print("\nTop 5 results:")
display(results[['docno', 'score']].head(5))


Original query: 4K TV color accuracy
Expanded query: applypipeline:off 4k^1.670103013 tv^1.798008875 color^1.000000000 accuraci^1.000000000 resolut^0.576587455 inch^0.404813523 screen^0.358821532 size^0.358821532 neo^0.288293727 8k^0.231550534 qled^0.231550534 smart^0.109515213

Top 5 results:


Unnamed: 0,docno,score
0,d74,7.183903
1,d70,6.869778
2,d72,6.737439
3,d76,5.672228
4,d75,4.566427


In [69]:
import pyterrier as pt
if not pt.started():
    pt.init()

# Improved KL query expansion with updated PyTerrier API
kl = pt.rewrite.KLQueryExpansion(
    index,
    fb_terms=10,
    fb_docs=3
)

# Fix deprecation warning by using Retriever instead of BatchRetrieve
pipeline_kl = pt.terrier.Retriever(index, wmodel="DPH") >> kl >> pt.terrier.Retriever(index, wmodel="DPH")

# Test with product query and display expanded query
query = "4K TV color accuracy"
results = pipeline_kl.search(query)

# Display the original query, expanded query, and top 5 results with text content
print(f"Original query: {query}")
print(f"Expanded query: {results.iloc[0]['query']}")

print("\nTop 5 results with text content:")
for i, row in results.head(5).iterrows():
    docno = row['docno']
    score = row['score']
    # Get document text from the index
    text_content = index.getMetadata("text", index.getDocumentId(docno))
    print(f"Docno: {docno}, Score: {score}")
    print(f"Text: {text_content[:200]}...\n")  # Display first 200 characters of text


Original query: 4K TV color accuracy
Expanded query: applypipeline:off 4k^1.670103013 tv^1.798008875 color^1.000000000 accuraci^1.000000000 resolut^0.576587455 inch^0.404813523 screen^0.358821532 size^0.358821532 neo^0.288293727 8k^0.231550534 qled^0.231550534 smart^0.109515213

Top 5 results with text content:


  if not pt.started():


AttributeError: 'org.terrier.structures.Index' object has no attribute 'getMetadata'

In [58]:
sdm = pt.rewrite.SequentialDependence(
    prox_model="dirichlet",  # Dirichlet LM proximity model
    remove_stopwords=True
)

sdm_pipeline = sdm >> pt.BatchRetrieve(index, wmodel="BM25")

# Example with device query
query = "Samsung Galaxy S25 waterproof rating"
rewritten = sdm_pipeline.transform(pt.new.queries([query]))
print("Rewritten Query Structure:\n", rewritten.iloc[0]['query'])


  sdm_pipeline = sdm >> pt.BatchRetrieve(index, wmodel="BM25")


JavaException: JVM exception occurred: java.lang.ClassNotFoundException: org.terrier.matching.models.dependence.dirichlet java.lang.RuntimeException

In [59]:
rm3 = pt.rewrite.RM3(
    index,
    fb_terms=20,
    fb_docs=5
)

rm3_pipeline = (
    pt.BatchRetrieve(index, wmodel="BM25")
    >> rm3
    >> pt.BatchRetrieve(index, wmodel="BM25")
)

# Application example
results = rm3_pipeline.search("wireless earbuds noise cancellation")
print("Top Expanded Terms:", results.iloc[0]['query'].split()[:10])


Top Expanded Terms: ['applypipeline:off', 'fan^0.000001243', 'edit^0.000001243', 'music^0.039985519', 'audio^0.000133845', 'sound^0.039986718', 'anc^0.000133803', 'dolbi^0.039986718', 'cancel^0.189986721', 'nois^0.189986721']


  pt.BatchRetrieve(index, wmodel="BM25")
  >> pt.BatchRetrieve(index, wmodel="BM25")


In [60]:
linear_expansion = pt.rewrite.linear(
    weightCurrent=0.7,
    weightPrevious=0.3,
    format="terrierql"
)

# Usage in pipeline
linear_pipe = (
    pt.BatchRetrieve(index, wmodel="DPH")
    >> linear_expansion
    >> pt.BatchRetrieve(index, wmodel="DPH")
)


  pt.BatchRetrieve(index, wmodel="DPH")
  >> pt.BatchRetrieve(index, wmodel="DPH")


In [61]:
# Using Wikipedia for external knowledge
wiki_index = pt.IndexFactory.of("/path/to/wikipedia/index")

cross_collection_pipe = (
    pt.BatchRetrieve(wiki_index, wmodel="BM25")
    >> pt.rewrite.Bo1QueryExpansion(wiki_index)
    >> pt.BatchRetrieve(index, wmodel="BM25")
)

# Example: Expanding technical queries
results = cross_collection_pipe.search("OLED burn-in prevention")


JavaException: JVM exception occurred: No IndexLoaders were supported for indexref /path/to/wikipedia/index; It may be your ref has the wrong location. Alternatively, Terrier is misconfigured - did you import the correct package to deal with this indexref? java.lang.UnsupportedOperationException

In [None]:
# Using Wikipedia for external knowledge
wiki_index = pt.IndexFactory.of("/path/to/wikipedia/index")

cross_collection_pipe = (
    pt.BatchRetrieve(wiki_index, wmodel="BM25")
    >> pt.rewrite.Bo1QueryExpansion(wiki_index)
    >> pt.BatchRetrieve(index, wmodel="BM25")
)

# Example: Expanding technical queries
results = cross_collection_pipe.search("OLED burn-in prevention")


In [62]:
controlled_pipeline = (
    pt.BatchRetrieve(index, wmodel="BM25")
    >> pt.rewrite.RM3(index)
    >> pt.rewrite.reset()  # Revert to original query
    >> pt.BatchRetrieve(index, wmodel="BM25")
)


  pt.BatchRetrieve(index, wmodel="BM25")
  >> pt.BatchRetrieve(index, wmodel="BM25")


In [63]:
stash_pipe = (
    pt.BatchRetrieve(index, wmodel="DPH")
    >> pt.rewrite.stash_results()
    >> pt.rewrite.KLQueryExpansion(index)
    >> pt.rewrite.reset_results()
    >> pt.BatchRetrieve(index, wmodel="DPH")
)

# Test with complex query
stash_results = stash_pipe.search("smartwatch heart rate monitoring accuracy")


  pt.BatchRetrieve(index, wmodel="DPH")
  >> pt.BatchRetrieve(index, wmodel="DPH")


ValueError: Input resultset has neither docid nor docno

In [64]:
custom_tokenizer = pt.rewrite.tokenise(
    lambda query: [token.upper() for token in query.split()]
)

token_pipe = custom_tokenizer >> pt.BatchRetrieve(index)

# Handle model numbers
results = token_pipe.search("QA55QN90AKXXL 4K Neo QLED TV")


  token_pipe = custom_tokenizer >> pt.BatchRetrieve(index)


In [65]:
# Example evaluation setup
from pyterrier.measures import *

dataset = pt.get_dataset("vaswani")
experiment = pt.Experiment(
    [pipeline, pipeline_kl, rm3_pipeline],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_measures=[MAP, P@10, Recall@100]
)
print(experiment)


Downloading vaswani topics to /root/.pyterrier/corpora/vaswani/query-text.trec


query-text.trec:   0%|          | 0.00/3.05k [00:00<?, ?iB/s]

Downloading vaswani qrels to /root/.pyterrier/corpora/vaswani/qrels


qrels:   0%|          | 0.00/6.63k [00:00<?, ?iB/s]

TypeError: Experiment() missing 1 required positional argument: 'eval_metrics'