# Experiment 04b: ESCI Ranking — Condition Examples

This notebook shows the actual text for each experimental condition, using a real example drawn from the ESCI dataset (`tasksource/esci`). No GPU is needed.

In [1]:
import os, sys, json
import numpy as np
from pathlib import Path

sys.path.insert(0, "../..")

# Seed for the NumPy shuffle and the cap on how many usable queries are kept.
SEED = 42
N_QUERIES = 400

from datasets import load_dataset
ds = load_dataset("tasksource/esci", split="train")

# Group product texts and their ESCI labels under each query string.
queries = {}
for row in ds:
    q = row.get('query', '')
    label = row.get('esci_label', '')
    product_title = row.get('product_title', '')
    product_desc = row.get('product_description', '')
    # Rows without both a query and a product title cannot be used.
    if not (q and product_title):
        continue
    # Only descriptions longer than 20 characters are appended to the title.
    if product_desc and len(product_desc) > 20:
        text = product_title + " " + product_desc
    else:
        text = product_title
    # Skip product texts with fewer than 10 whitespace-separated tokens.
    if len(text.split()) < 10:
        continue
    bucket = queries.setdefault(q, {'products': [], 'labels': []})
    bucket['products'].append(text)
    bucket['labels'].append(label)

# Keep a query only when it has an exact match, an irrelevant match, and at
# least three products. Labels may be short ('E'/'I') or spelled out.
usable = []
for q, data in queries.items():
    seen_labels = set(data['labels'])
    has_exact = bool(seen_labels & {'E', 'Exact'})
    has_irrel = bool(seen_labels & {'I', 'Irrelevant'})
    if len(data['products']) < 3:
        continue
    if has_exact and has_irrel:
        usable.append({'query': q, **data})

if not usable:
    # Report the label vocabulary that was actually collected so a mismatch
    # with the label values the filter expects is easy to diagnose.
    all_labels = set()
    for data in queries.values():
        all_labels.update(data['labels'])
    print(f"WARNING: No usable queries found! Unique labels in dataset: {all_labels}")
    print(f"Total queries collected: {len(queries)}")
    # The code that follows indexes into `usable`; stop here with a clear
    # message instead of failing later with an unexplained IndexError.
    raise RuntimeError(
        "No usable ESCI queries found; cannot build condition examples."
    )

# Seeded in-place shuffle, then keep at most N_QUERIES entries.
np.random.seed(SEED)
np.random.shuffle(usable)
usable = usable[:N_QUERIES]

print(f"Loaded {len(usable)} ESCI queries with products")
del ds  # drop the reference to the raw dataset

# Use the first shuffled query as the worked example and locate its first
# exact and first irrelevant product (spelled-out label preferred if present).
ex = usable[0]
exact_idx = ex['labels'].index('Exact' if 'Exact' in ex['labels'] else 'E')
irrel_idx = ex['labels'].index('Irrelevant' if 'Irrelevant' in ex['labels'] else 'I')
rel_product, irrel_product = ex['products'][exact_idx], ex['products'][irrel_idx]

# English function words that extract_keywords() drops.
STOP_WORDS = {
    # articles
    'a', 'an', 'the',
    # forms of "to be"
    'is', 'are', 'was', 'were', 'be', 'been', 'being',
    # auxiliaries and modals
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
    'should', 'may', 'might', 'can', 'shall',
    # prepositions and particles
    'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'as',
    'into', 'through', 'during', 'before', 'after', 'above', 'below',
    'between', 'up', 'out', 'about',
    # conjunctions and negation
    'and', 'but', 'or', 'not', 'no', 'if', 'then', 'than', 'so',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    'how', 'when', 'where', 'why',
    # pronouns
    'it', 'its', 'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he',
    'him', 'his', 'she', 'her', 'they', 'them', 'their',
    # quantifiers and degree adverbs
    'much', 'many', 'some', 'any', 'all', 'each',
    'also', 'just', 'more', 'most', 'very', 'too', 'only',
}
import re


def extract_keywords(text):
    """Return the content words of *text*, in order of appearance.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, and the result is split on
    whitespace. Tokens of two characters or fewer and tokens listed in
    STOP_WORDS are discarded. Duplicates are kept.
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    return [
        token
        for token in cleaned.split()
        if len(token) > 2 and token not in STOP_WORDS
    ]
from collections import Counter
def make_surrogate_from_doc(passage):
    """Build a keyword surrogate query from *passage*.

    Returns the (up to) five most frequent keywords reported by
    extract_keywords(), joined by single spaces. Falls back to the
    string "information" when the passage yields no keywords.
    """
    keywords = extract_keywords(passage)
    if keywords:
        top_terms = Counter(keywords).most_common(5)
        return " ".join(term for term, _freq in top_terms)
    return "information"
def make_surrogate_template(passage):
    """Build a "What is <keyword>?" surrogate query from *passage*.

    The keyword is the single most frequent one reported by
    extract_keywords(). Falls back to "What is this about?" when the
    passage yields no keywords.
    """
    keywords = extract_keywords(passage)
    if not keywords:
        return "What is this about?"
    top_term, _freq = Counter(keywords).most_common(1)[0]
    return f"What is {top_term}?"

# Product title as surrogate (natural for ad-serving). The first eight
# whitespace-separated tokens of the product text approximate the title.
title = rel_product.split()[:8]
surr_title = " ".join(title)

# Show the example query with 120-character previews of both products.
print(
    f"Query: {ex['query']}",
    f"Relevant product (E): {rel_product[:120]}...",
    f"Irrelevant product (I): {irrel_product[:120]}...",
    "",
    sep="\n",
)

print(
    "QUERY-LIKELIHOOD RANKING on graded-relevance product search.",
    "  Decoder scores the QUERY (not an answer) given encoded product.",
    "",
    sep="\n",
)


def show_sample(s, doc_key='passage', n=0):
    """Print a short summary of one sample.

    Args:
        s: Mapping with 'query' and 'answer' entries plus the document
            text stored under ``doc_key``.
        doc_key: Key under which the document text is stored in ``s``.
        n: Sample number shown in the banner.

    The document is previewed by its first 100 characters, followed by
    its whitespace-separated word count.
    """
    document = s[doc_key]
    rule = '=' * 80
    print(rule)
    print(f"SAMPLE {n}")
    print(rule)
    print(f"  Query:    {s['query']}")
    print(f"  Answer:   {s['answer']}")
    print(f"  Document: {document[:100]}...")
    word_count = len(document.split())
    print(f"  Doc words: {word_count}")
    print()

def show_conditions(conditions, doc_text):
    """Print a table previewing the encoder input for each condition.

    Args:
        conditions: Iterable of ``(name, description, prefix_text)`` tuples.
            ``prefix_text`` is ``None`` for bare conditions, in which case
            the encoder input is the document alone.
        doc_text: Document text; for prefixed conditions the encoder input
            is the prefix, a newline, then this text.

    Each row shows the condition name, the prefix length in
    whitespace-separated words (or "(none)"), and the first 70 characters
    of the encoder input. A non-empty description is printed on its own
    line underneath the row.
    """
    print(f"{'Condition':<30} {'Prefix':<14} {'Encoder input (first 70 chars)'}")
    print(f"{'-'*100}")
    for name, desc, prefix_text in conditions:
        if prefix_text is None:
            prefix_label = '(none)'
            enc_text = doc_text
        else:
            prefix_label = str(len(prefix_text.split())) + 'w'
            enc_text = prefix_text + "\n" + doc_text
        # Render newlines as a visible escape: a raw newline (such as the
        # prefix/document separator) would split the row across two lines.
        preview = enc_text[:70].replace("\n", "\\n")
        print(f"{name:<30} {prefix_label:<14} {preview}...")
        if desc:
            print(f"  {'':>28} ^ {desc}")
    print()

# Encoder prefixes for the non-bare conditions, all built for the example's
# relevant product.
oracle_prefix = ex['query']
doc_kw_prefix = make_surrogate_from_doc(rel_product)
template_prefix = make_surrogate_template(rel_product)
# First 20 words of the first product belonging to a different query.
random_prefix = " ".join(usable[1]['products'][0].split()[:20])

# Each entry is (condition name, description, encoder prefix or None for bare).
conditions = [
    ("bare", "Product text only", None),
    ("oracle_trunc", "Real search query", oracle_prefix),
    ("surr_title_trunc", "Product title (natural surrogate)", surr_title),
    ("surr_doc_trunc", "Top-5 TF keywords from product", doc_kw_prefix),
    ("surr_template_trunc", "'What is [kw]?'", template_prefix),
    ("random_trunc", "~20w from another query's product", random_prefix),
]
print("For the RELEVANT product:")
show_conditions(conditions, rel_product)

print("KEY DIFFERENCE from Exp 04A: decoder scores the query, not a gold answer.")




Loaded 400 ESCI queries with products
Query: 長縄跳び 子供用 3メートル
Relevant product (E): 縄跳び トレーニング用 長さ調整可 エクササイズ フィットネス ダイエット 大人用 子供用 有酸素運動 シェイプアップ なわとび スポーツ 筋トレ カウンター 室内 屋外 運動会 体育祭 スポーツ用品 (Red&Black) <strong...
Irrelevant product (I): exitora 大繩 長なわ 大縄跳び 大なわとび 長なわとび なわとび 団体競技 団体 競技 スポーツ (白, 7m) "体育や運動会、イベントなどで大人数で楽しめる大縄跳び用のロープです。<br>手になじみやすい木製グリップなので、回し...

QUERY-LIKELIHOOD RANKING on graded-relevance product search.
  Decoder scores the QUERY (not an answer) given encoded product.

For the RELEVANT product:
Condition                      Prefix         Encoder input (first 70 chars)
----------------------------------------------------------------------------------------------------
bare                           (none)         縄跳び トレーニング用 長さ調整可 エクササイズ フィットネス ダイエット 大人用 子供用 有酸素運動 シェイプアップ なわとび スポーツ ...
                               ^ Product text only
oracle_trunc                   3w             長縄跳び 子供用 3メートル
縄跳び トレーニング用 長さ調整可 エクササイズ フィットネス ダイエット 大人用 子供用 有酸素運動 シェイ...
                      