# HyDE Table Retrieval

Hypothetical Document Embeddings (HyDE) for table search:
1. Generate a text description of each table from its metadata using an LLM
2. Generate a hypothetical table description for each query using an LLM
3. Encode both with MiniLM-L6-v2 and retrieve with FAISS

## Setup

In [None]:
import json
import re
import time
from collections import defaultdict
from functools import reduce

import faiss
import numpy as np
import pandas as pd
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Credentials are taken from the OPENAI_API_KEY environment variable (the
# OpenAI client reads it by default when no api_key argument is passed), so
# no secret ever has to be pasted into, or committed with, this notebook.
client = OpenAI()


  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [3]:
# Read the table corpus from disk, then report its size and schema.
tables_df = pd.read_csv('data/wikitables_mini.csv')
num_tables = len(tables_df)
column_names = list(tables_df.columns)
print(f"Loaded {num_tables} tables")
print(f"Columns: {column_names}")
# Preview the first two rows (rendered as the cell's output).
tables_df.head(2)

Loaded 2932 tables
Columns: ['table_id', 'page_title', 'section_title', 'table_caption', 'headers', 'sample_data']


Unnamed: 0,table_id,page_title,section_title,table_caption,headers,sample_data
0,table-0001-249,Auburn Tigers swimming and diving,Summer Olympic Games Beijing 2008,Summer Olympic Games Beijing 2008,"[""Athlete"", ""Nation"", ""Total"", ""Gold"", ""Silver...","[[""[Fr\u00e9d\u00e9rick_Bousquet|Fr\u00e9d\u00..."
1,table-0001-400,Bisphenol A,Low-dose exposure in animals,Low-dose exposure in animals,"[""Dose (\u00b5g/kg/day)"", ""[Environmental_Work...","[[""0.025"", ""\""Permanent changes to genital tra..."


## Generate Table Descriptions (HyDE)

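Use an LLM to generate natural-language descriptions from table metadata. Note: the function below does not call an LLM yet; as a placeholder it concatenates the metadata fields and truncates the result.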

In [None]:
def generate_table_description_with_llm(row, max_chars=256):
    """
    Build a short text description of a table from its metadata.

    TODO: Replace this with actual LLM call. For now no LLM is involved: the
    description is the space-joined metadata fields, truncated to `max_chars`.

    Args:
        row: Mapping-like table record (e.g. a DataFrame row) with the keys
            'table_caption', 'page_title', 'section_title', 'headers' and
            'sample_data'. The last two are expected to be JSON-encoded
            strings; values that cannot be decoded are skipped.
        max_chars: Maximum length of the returned description.

    Returns:
        str: "Caption: ... Page: ... Section: ... Columns: ... Sample: ..."
        containing only the parts that were available, cut to `max_chars`.
    """
    # Failures that simply mean "this optional field is missing or malformed".
    # Anything else (notably KeyboardInterrupt / SystemExit, which a bare
    # `except:` would swallow) is allowed to propagate.
    skippable_errors = (TypeError, ValueError, KeyError, IndexError)

    # Plain-text fields: include each one only when it is not NaN/None.
    metadata = []
    if pd.notna(row['table_caption']):
        metadata.append(f"Caption: {row['table_caption']}")
    if pd.notna(row['page_title']):
        metadata.append(f"Page: {row['page_title']}")
    if pd.notna(row['section_title']):
        metadata.append(f"Section: {row['section_title']}")

    # Column headers: at most the first 10, comma separated.
    try:
        headers = json.loads(row['headers'])
        if headers:
            metadata.append(f"Columns: {', '.join([str(h) for h in headers[:10]])}")
    except skippable_errors:
        pass

    # Sample data: first 5 cells of the first data row, as a Python list repr.
    try:
        sample_data = json.loads(row['sample_data'])
        if sample_data and len(sample_data) > 0:
            sample_str = str(sample_data[0][:5])
            metadata.append(f"Sample: {sample_str}")
    except skippable_errors:
        pass

    # Fallback: concatenate metadata (replace with actual LLM response)
    description = ' '.join(metadata)

    return description[:max_chars]

# Smoke test: show the descriptions produced for the first three tables.
print("Example LLM-generated descriptions:")
print("=" * 80)
for number in range(1, 4):
    preview = generate_table_description_with_llm(tables_df.iloc[number - 1])
    print(f"\n{number}. {preview}")

Example LLM-generated descriptions:

1. Caption: Summer Olympic Games Beijing 2008 Page: Auburn Tigers swimming and diving Section: Summer Olympic Games Beijing 2008 Columns: Athlete, Nation, Total, Gold, Silver, Bronze, Events Sample: ['[Frédérick_Bousquet|Frédérick Bousquet]', '[France|FRA]', 

2. Caption: Low-dose exposure in animals Page: Bisphenol A Section: Low-dose exposure in animals Columns: Dose (µg/kg/day), [Environmental_Working_Group|Environmental Working Group], Study Year Sample: ['0.025', '"Permanent changes to genital tract"', '2005']

3. Caption: Players Page: Charlotte Bobcats all-time roster Section: Players Columns: *, *, [2004_NBA_Expansion_Draft|2004 Expansion Draft], [2004_NBA_Expansion_Draft|2004 Expansion Draft], [2004_NBA_Expansion_Draft|2004 Expansion Draft], [2004_NBA_Expansion_


In [5]:
# Build the description corpus: one description per table, with a parallel
# list of table ids kept in the same order.
print("Generating LLM descriptions for all tables...")
print("NOTE: This will make ~3K LLM API calls. Consider batch processing or caching.")
print()

table_descriptions, table_ids = [], []

for _, table_row in tqdm(tables_df.iterrows(), total=len(tables_df)):
    table_descriptions.append(generate_table_description_with_llm(table_row))
    table_ids.append(table_row['table_id'])

# Character-length statistics of the generated descriptions.
description_lengths = [len(text) for text in table_descriptions]
print(f"Generated {len(table_descriptions)} descriptions")
print(f"Length stats - Mean: {np.mean(description_lengths):.1f}, Max: {max(description_lengths)}")

Generating LLM descriptions for all tables...
NOTE: This will make ~3K LLM API calls. Consider batch processing or caching.



100%|██████████| 2932/2932 [00:00<00:00, 36674.41it/s]

Generated 2932 descriptions
Length stats - Mean: 224.0, Max: 256





## Encode Tables

In [6]:
# Instantiate the sentence encoder and report its embedding size.
model_name = 'all-MiniLM-L6-v2'
print(f"Loading {model_name}...")
encoder = SentenceTransformer(model_name)
embedding_dim = encoder.get_sentence_embedding_dimension()
print(f"Dimension: {embedding_dim}")

Loading all-MiniLM-L6-v2...
Dimension: 384


In [7]:
# Embed every table description; normalize_embeddings=True is requested so
# the vectors can be scored with an inner-product index.
print("Encoding...")
encode_options = dict(
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
table_embeddings = encoder.encode(table_descriptions, **encode_options)
print(f"Shape: {table_embeddings.shape}")

Encoding...


Batches: 100%|██████████| 92/92 [00:07<00:00, 11.67it/s]

Shape: (2932, 384)





## Build FAISS Index

In [8]:
# Create a flat inner-product index sized to the encoder's output dimension
# and load all table vectors (cast to float32) into it.
print("Building index...")
index_dim = encoder.get_sentence_embedding_dimension()
index = faiss.IndexFlatIP(index_dim)
table_vectors = table_embeddings.astype('float32')
index.add(table_vectors)
print(f"✓ Index built with {index.ntotal} tables")

Building index...
✓ Index built with 2932 tables


### Test Custom Queries

In [9]:
# Test custom queries - change test_query and run
test_query = "olympic medals table"
top_k = 5

print(f"Query: '{test_query}'")
print("=" * 80)

# HyDE step: ask the LLM to imagine a table that would answer the query, then
# flatten the JSON it returns into the same "Caption/Page/Section/Columns/
# Sample" text layout used for the indexed table descriptions.

# Pause (seconds) before the API call as crude rate limiting.
sleep_timer = 5
time.sleep(sleep_timer)

system_msg = 'You are a helpful assistant.'

# One-shot prompt: instructions followed by a fully worked example answer.
user_msg = """
I will give you a keyword search query, and I want you to generate a description of an example table relevant to the query.
Follow the guidelines below when generating your description:
1. First generate a short description of the table, no more than 2 sentences.
2. Next generate a short caption for the table.
3. Then, suppose this table is on a website like wikipedia. Generate the title of the web page that this table would exist in.
4. Also, generate the section name of the web page that this table would exist in.
5. Generate the column names of this table.
6. Generate one row of example data. Ensure the data is of the appropriate data type.
7. Finally, output a JSON object with the following fields, 'description', 'table_caption', 'page_title', 'section_title', 'columns', 'sample_data'.

[Query]
world interest rates table

[Answer]

**Reasoning:**
I need to generate metadata associated with an example table relevant to the query. A table containing information about Eurozone interest rates would be highly relevant to this query.

**Description:**
A table summarizing historical Eurozone interest rates, showing—by date—the levels of the deposit facility rate, the main refinancing operations rate, and the marginal lending facility rate set by the European Central Bank.

**Table Caption:**
Interest rates

**Page Title:**
Eurozone

**Section Title:**
Interest rates

**Columns:**
['Date', 'Deposit facility', 'Main refinancing operations', 'Marginal lending facility']

**Sample Row:**
['1999-01-01', '2.00', '3.00', '4.50']

**JSON:**
{
  "description": "A table summarizing historical Eurozone interest rates, listing by date the European Central Bank’s key policy rates: the deposit facility rate, the main refinancing operations rate, and the marginal lending facility rate.",
  "table_caption": "Interest rates",
  "page_title": "Eurozone",
  "section_title": "Interest rates",
  "columns": [
    "Date",
    "Deposit facility",
    "Main refinancing operations",
    "Marginal lending facility"
  ],
  "sample_data": [
      "1999-01-01",
      "2.00",
      "3.00",
      "4.50"
  ]
}

"""

# Append the actual query in the same layout, leaving the answer open.
user_msg += f"""
[Query]
{test_query}

[Answer]

**Reasoning:**
"""

# Regex to extract a JSON object (handles quoted strings with escaped characters).
# It only matches a flat object: arrays are fine, nested {...} objects are not.
json_regex = re.compile(
    r'\{(?:[^{}"]|"[^"\\]*(?:\\.[^"\\]*)*")*\}',
    re.DOTALL
)

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "system", "content": system_msg}, {"role": "user", "content": user_msg}],
    max_tokens=4000,
    temperature=0.0,
)

# message.content may be None; treat that like an empty answer.
llm_output = response.choices[0].message.content or ""

# Default to the raw query; it is only replaced when a complete JSON answer
# could be recovered from the LLM output.
hypothetical_description = test_query

match = json_regex.search(llm_output)
if match:
    try:
        data = json.loads(match.group(0))
        metadata = [
            f"Caption: {data['table_caption']}",
            f"Page: {data['page_title']}",
            f"Section: {data['section_title']}",
            f"Columns: {data['columns']}",
            f"Sample: {data['sample_data']}",
        ]
        hypothetical_description = ' '.join(metadata)
    except (json.JSONDecodeError, KeyError) as parse_error:
        # Malformed JSON or a missing field: keep the raw-query fallback
        # instead of crashing the cell.
        print(f"Could not parse LLM output ({parse_error!r}); using the raw query instead.")

print(f"Hypothetical table description: {hypothetical_description}\n")

# Encode and search
test_emb = encoder.encode([hypothetical_description], convert_to_numpy=True, normalize_embeddings=True).astype('float32')
scores_test, indices_test = index.search(test_emb, top_k)

for rank in range(top_k):
    idx = indices_test[0][rank]
    if idx < 0:
        # FAISS pads the result with -1 when fewer than top_k vectors exist;
        # without this guard table_ids[-1] would be reported as a hit.
        break
    table_id = table_ids[idx]
    row = tables_df[tables_df['table_id'] == table_id].iloc[0]

    print(f"\n{rank + 1}. {table_id} (score: {scores_test[0][rank]:.4f})")
    print(f"   Page: {row['page_title']}")
    print(f"   Caption: {row['table_caption']}")
    print(f"   Description: {table_descriptions[idx][:120]}...")
    print("-" * 80)

Query: 'olympic medals table'
Hypothetical table description: Caption: Olympic Medal Count Page: Summer Olympics 2024 Section: Medal Tally Columns: ['Country', 'Gold', 'Silver', 'Bronze', 'Total'] Sample: ['USA', 39, 41, 33, 113]


1. table-0407-338 (score: 0.8251)
   Page: Israel at the Paralympics
   Caption: Medal tables
   Description: Caption: Medal tables Page: Israel at the Paralympics Section: Medal tables Columns: Team (IOC code), № Summer, Gold, Br...
--------------------------------------------------------------------------------

2. table-0529-770 (score: 0.7449)
   Page: Lists of Olympic medalists
   Caption: Summer Olympic Games
   Description: Caption: Summer Olympic Games Page: Lists of Olympic medalists Section: Summer Olympic Games Columns: Games, Medal, Meda...
--------------------------------------------------------------------------------

3. table-0375-830 (score: 0.7427)
   Page: Ice hockey at the Olympic Games
   Caption: Medal winners
   Description: Caption: M

## Evaluation

### Load Queries and Relevance Judgments

In [10]:
# Queries file: one query per line, "<query_id> <query text>".
queries = {}
with open('data/queries.txt', 'r') as queries_file:
    for raw_line in queries_file:
        fields = raw_line.strip().split(None, 1)
        if len(fields) != 2:
            # Skip blank lines and lines without any query text.
            continue
        qid, text = fields
        queries[qid] = text

print(f"Loaded {len(queries)} queries")
print("Examples:", list(queries.items())[:3])

# Relevance judgments: whitespace-separated lines where field 0 is the query
# id, field 2 the table id and field 3 the integer relevance grade.
qrels = {}
with open('data/qrels.txt', 'r') as qrels_file:
    for raw_line in qrels_file:
        fields = raw_line.strip().split()
        if len(fields) < 4:
            continue
        qid, tid, grade = fields[0], fields[2], fields[3]
        qrels.setdefault(qid, {})[tid] = int(grade)

print(f"Loaded qrels for {len(qrels)} queries")

Loaded 60 queries
Examples: [('1', 'world interest rates table'), ('2', '2008 beijing olympics'), ('3', 'fast cars')]
Loaded qrels for 60 queries


---

### Generate Hypothetical Descriptions from Queries (HyDE)

In [None]:
query_ids = list(queries.keys())
query_texts = [queries[qid] for qid in query_ids]

print(f"Generating hypothetical table descriptions for {len(query_texts)} queries...")
print("NOTE: This will make ~60 LLM API calls.")
print()

# --- Loop-invariant setup (built once instead of on every iteration) ---

# Pause (seconds) before each API call as crude rate limiting.
sleep_timer = 5

system_msg = 'You are a helpful assistant.'

# One-shot prompt prefix: instructions followed by a fully worked example.
# NOTE: the indentation inside this string is part of the text sent to the
# model; it is kept exactly as it was so the prompt is unchanged.
few_shot_prompt = """
    I will give you a keyword search query, and I want you to generate a description of an example table relevant to the query.
    Follow the guidelines below when generating your description:
    1. First generate a short description of the table, no more than 2 sentences.
    2. Next generate a short caption for the table.
    3. Then, suppose this table is on a website like wikipedia. Generate the title of the web page that this table would exist in.
    4. Also, generate the section name of the web page that this table would exist in.
    5. Generate the column names of this table.
    6. Generate one row of example data. Ensure the data is of the appropriate data type.
    7. Finally, output a JSON object with the following fields, 'description', 'table_caption', 'page_title', 'section_title', 'columns', 'sample_data'.

    [Query]
    world interest rates table

    [Answer]

    **Reasoning:**
    I need to generate metadata associated with an example table relevant to the query. A table containing information about Eurozone interest rates would be highly relevant to this query.

    **Description:**
    A table summarizing historical Eurozone interest rates, showing—by date—the levels of the deposit facility rate, the main refinancing operations rate, and the marginal lending facility rate set by the European Central Bank.

    **Table Caption:**
    Interest rates

    **Page Title:**
    Eurozone

    **Section Title:**
    Interest rates

    **Columns:**
    ['Date', 'Deposit facility', 'Main refinancing operations', 'Marginal lending facility']

    **Sample Row:**
    ['1999-01-01', '2.00', '3.00', '4.50']

    **JSON:**
    {
    "description": "A table summarizing historical Eurozone interest rates, listing by date the European Central Bank’s key policy rates: the deposit facility rate, the main refinancing operations rate, and the marginal lending facility rate.",
    "table_caption": "Interest rates",
    "page_title": "Eurozone",
    "section_title": "Interest rates",
    "columns": [
        "Date",
        "Deposit facility",
        "Main refinancing operations",
        "Marginal lending facility"
    ],
    "sample_data": [
        "1999-01-01",
        "2.00",
        "3.00",
        "4.50"
    ]
    }

    """

# Regex to extract a JSON object (handles quoted strings with escaped characters).
# It only matches a flat object: arrays are fine, nested {...} objects are not.
json_regex = re.compile(
    r'\{(?:[^{}"]|"[^"\\]*(?:\\.[^"\\]*)*")*\}',
    re.DOTALL
)

hypothetical_descriptions = []

for query_text in tqdm(query_texts):
    time.sleep(sleep_timer)

    # Append the current query in the same layout, leaving the answer open.
    user_msg = few_shot_prompt + f"""
    [Query]
    {query_text}

    [Answer]

    **Reasoning:**
    """

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": system_msg}, {"role": "user", "content": user_msg}],
        max_tokens=4000,
        temperature=0.0,
    )

    # message.content may be None; treat that like an empty answer.
    llm_output = response.choices[0].message.content or ""

    # Default to the raw query; it is only replaced when a complete JSON
    # answer could be recovered from the LLM output.
    hypothetical_description = query_text

    match = json_regex.search(llm_output)
    if match:
        try:
            data = json.loads(match.group(0))
            metadata = [
                f"Caption: {data['table_caption']}",
                f"Page: {data['page_title']}",
                f"Section: {data['section_title']}",
                f"Columns: {data['columns']}",
                f"Sample: {data['sample_data']}",
            ]
            hypothetical_description = ' '.join(metadata)
        except (json.JSONDecodeError, KeyError) as parse_error:
            # One malformed answer must not abort the whole run and throw
            # away the descriptions collected so far: fall back to the query.
            print(f"Could not parse LLM output for query '{query_text}' ({parse_error!r}); using the raw query instead.")

    print(f"Hypothetical table description: {hypothetical_description}\n")

    hypothetical_descriptions.append(hypothetical_description)

print(f"Generated {len(hypothetical_descriptions)} hypothetical descriptions")
print(f"\nExamples:")
# Slicing keeps this safe when fewer than three queries were loaded.
for example_query, example_description in zip(query_texts[:3], hypothetical_descriptions[:3]):
    print(f"  Query: {example_query}")
    print(f"  Hypothetical: {example_description}")
    print()

Generating hypothetical table descriptions for 60 queries...
NOTE: This will make ~60 LLM API calls.



  2%|▏         | 1/60 [00:08<08:20,  8.48s/it]

Hypothetical table description: Caption: Global Central Bank Interest Rates Page: World Interest Rates Section: Current Interest Rates by Country Columns: ['Country', 'Central Bank', 'Interest Rate (%)'] Sample: ['United States', 'Federal Reserve', '5.25']



  3%|▎         | 2/60 [00:16<07:54,  8.18s/it]

Hypothetical table description: Caption: 2008 Beijing Olympics Medal Tally Page: 2008 Beijing Olympics Section: Medal Tally Columns: ['Country', 'Gold', 'Silver', 'Bronze', 'Total'] Sample: ['China', 48, 22, 30, 100]



  5%|▌         | 3/60 [00:24<07:41,  8.09s/it]

Hypothetical table description: Caption: Fastest Production Cars Page: Fast Cars Section: Top Speed Records Columns: ['Car Model', 'Manufacturer', 'Top Speed (mph)', 'Year'] Sample: ['Chiron Super Sport 300+', 'Bugatti', '304', '2019']



  7%|▋         | 4/60 [00:31<07:19,  7.85s/it]

Hypothetical table description: Caption: Clothing Size Conversion Chart Page: Clothing Size Guide Section: Size Conversion Tables Columns: ['Category', 'US Size', 'EU Size', 'UK Size'] Sample: ['Men', 'M', '48', '38']



  8%|▊         | 5/60 [00:39<07:00,  7.64s/it]

Hypothetical table description: Caption: Phases of the Moon Page: Lunar Phases Section: Moon Phases Calendar Columns: ['Date', 'Phase'] Sample: ['2023-10-14', 'New Moon']



 10%|█         | 6/60 [00:46<06:47,  7.55s/it]

Hypothetical table description: Caption: Population by State Page: Demographics of the United States Section: State Populations Columns: ['State', 'Population', 'Census Year'] Sample: ['California', '39,538,223', '2020']



 12%|█▏        | 7/60 [00:55<06:59,  7.91s/it]

Hypothetical table description: Caption: Prime Ministers of the United Kingdom Page: List of Prime Ministers of the United Kingdom Section: Prime Ministers Columns: ['Name', 'Term Start', 'Term End', 'Political Party', 'Notable Achievements'] Sample: ['Winston Churchill', '1940-05-10', '1945-07-26', 'Conservative', 'Led Britain to victory in World War II']



 13%|█▎        | 8/60 [01:02<06:47,  7.84s/it]

Hypothetical table description: Caption: iPod Models and Specifications Page: iPod Section: Models and Specifications Columns: ['Model', 'Release Date', 'Storage Capacity', 'Display Size', 'Available Colors'] Sample: ['iPod Classic', '2001-10-23', '5 GB', '2 inches', 'White']



 15%|█▌        | 9/60 [01:11<06:46,  7.97s/it]

Hypothetical table description: Caption: Popular BitTorrent Clients Page: BitTorrent Section: BitTorrent Clients Columns: ['Client Name', 'Developer', 'Platform', 'License'] Sample: ['qBittorrent', 'qBittorrent Project', 'Windows, macOS, Linux', 'GPLv2']



 17%|█▋        | 10/60 [01:20<06:55,  8.31s/it]

Hypothetical table description: Caption: Olympus Digital SLR Camera Models Page: Olympus Digital SLR Cameras Section: Camera Models and Specifications Columns: ['Model', 'Release Date', 'Sensor Type', 'Megapixels'] Sample: ['Olympus E-1', '2003-10-01', 'Four Thirds', '5.0']



 18%|█▊        | 11/60 [01:28<06:46,  8.29s/it]

Hypothetical table description: Caption: Elemental Composition of the Sun Page: Sun Section: Composition Columns: ['Element', 'Symbol', 'Percentage by Mass'] Sample: ['Hydrogen', 'H', '73.46']



 20%|██        | 12/60 [01:36<06:29,  8.11s/it]

Hypothetical table description: Caption: Running Shoe Models Page: Running Shoes Section: Shoe Models and Specifications Columns: ['Brand', 'Model Name', 'Type', 'Weight (g)', 'Price (USD)'] Sample: ['Nike', 'Air Zoom Pegasus 38', 'Road', 285, 120]



 22%|██▏       | 13/60 [01:45<06:41,  8.54s/it]

Hypothetical table description: Caption: Fuel Consumption of Car Models Page: Automobile Fuel Efficiency Section: Fuel Consumption Data Columns: ['Make', 'Model', 'Year', 'Fuel Type', 'City MPG', 'Highway MPG'] Sample: ['Toyota', 'Camry', '2020', 'Gasoline', '28', '39']



 23%|██▎       | 14/60 [01:55<06:52,  8.97s/it]

Hypothetical table description: Caption: Daily Stock Quotes Page: Company Stock Information Section: Historical Stock Quotes Columns: ['Date', 'Open', 'Close', 'High', 'Low', 'Volume'] Sample: ['2023-10-01', '150.00', '155.00', '157.00', '149.00', '1,200,000']



 25%|██▌       | 15/60 [02:05<06:53,  9.19s/it]

Hypothetical table description: Caption: Top Grossing Movies Worldwide Page: List of Highest-Grossing Films Section: Top Grossing Movies Columns: ['Title', 'Release Year', 'Worldwide Gross (in billions)', 'Director'] Sample: ['Avatar', '2009', '2.923', 'James Cameron']



 27%|██▋       | 16/60 [02:13<06:32,  8.92s/it]

Hypothetical table description: Caption: Nutritional Values of Common Foods Page: Nutrition Information Section: Nutritional Values Columns: ['Food Item', 'Calories (kcal)', 'Protein (g)', 'Carbohydrates (g)', 'Fats (g)'] Sample: ['Apple', '52', '0.3', '14', '0.2']



 28%|██▊       | 17/60 [02:21<06:11,  8.64s/it]

Hypothetical table description: Caption: State Capitals and Largest Cities Page: United States Geography Section: State Capitals and Largest Cities Columns: ['State', 'Capital', 'Largest City'] Sample: ['California', 'Sacramento', 'Los Angeles']



 30%|███       | 18/60 [02:30<05:59,  8.56s/it]

Hypothetical table description: Caption: Professional Wrestlers Page: Professional Wrestling Section: Notable Wrestlers Columns: ['Ring Name', 'Real Name', 'Weight Class', 'Debut Year'] Sample: ['The Rock', 'Dwayne Johnson', 'Heavyweight', '1996']



 32%|███▏      | 19/60 [02:40<06:08,  8.99s/it]

Hypothetical table description: Caption: Income Statement Page: Financial Statements of XYZ Corporation Section: Income Statement Columns: ['Fiscal Year', 'Revenue', 'Cost of Goods Sold', 'Gross Profit', 'Operating Expenses', 'Net Income'] Sample: ['2022', '5000000', '2000000', '3000000', '1000000', '2000000']



 33%|███▎      | 20/60 [02:47<05:41,  8.55s/it]

Hypothetical table description: Caption: Dog Breeds Information Page: List of Dog Breeds Section: Breed Characteristics Columns: ['Breed Name', 'Country of Origin', 'Size', 'Temperament'] Sample: ['Labrador Retriever', 'Canada', 'Large', 'Friendly']



 35%|███▌      | 21/60 [02:55<05:21,  8.26s/it]

Hypothetical table description: Caption: Ibanez Guitar Models Page: Ibanez Guitars Section: Guitar Models Columns: ['Model Name', 'Type', 'Year Introduced', 'Notable Features'] Sample: ['RG550', 'Electric', '1987', 'Super Wizard neck, Edge tremolo system']



 37%|███▋      | 22/60 [03:03<05:18,  8.39s/it]

Hypothetical table description: Caption: Used Cellphones for Sale Page: Used Cellphones Marketplace Section: Available Listings Columns: ['Brand', 'Model', 'Condition', 'Price', 'Seller Location'] Sample: ['Apple', 'iPhone 12', 'Good', '500', 'New York, NY']



 38%|███▊      | 23/60 [03:12<05:08,  8.33s/it]

Hypothetical table description: Caption: Major World Religions Page: World Religions Section: Overview of Major Religions Columns: ['Religion', 'Adherents (millions)', 'Primary Regions', 'Founding Date'] Sample: ['Christianity', '2400', 'Worldwide', '1st century AD']



 40%|████      | 24/60 [03:19<04:53,  8.16s/it]

Hypothetical table description: Caption: Daily Stock Performance Page: Stock Market Overview Section: Stock Performance Data Columns: ['Date', 'Closing Price', 'Volume', 'Percentage Change'] Sample: ['2023-10-01', '150.25', '1,200,000', '1.5%']



 42%|████▏     | 25/60 [03:27<04:44,  8.14s/it]

Hypothetical table description: Caption: Best Picture Winners Page: Academy Awards Section: Best Picture Columns: ['Year', 'Film', 'Director', 'Production Company'] Sample: ['2022', 'CODA', 'Sian Heder', 'Apple Original Films']



 43%|████▎     | 26/60 [03:36<04:45,  8.39s/it]

Hypothetical table description: Caption: 2008 Olympic Gold Medal Winners Page: 2008 Summer Olympics Section: Gold Medalists Columns: ['Athlete', 'Country', 'Sport', 'Event'] Sample: ['Michael Phelps', 'USA', 'Swimming', '100m Butterfly']



 45%|████▌     | 27/60 [03:44<04:34,  8.30s/it]

Hypothetical table description: Caption: Official Currencies by Country Page: List of World Currencies Section: Currencies by Country Columns: ['Country', 'Currency', 'Currency Code'] Sample: ['Japan', 'Yen', 'JPY']



 47%|████▋     | 28/60 [03:53<04:26,  8.32s/it]

Hypothetical table description: Caption: Significant Scientific Discoveries Page: History of Scientific Discoveries Section: Major Discoveries Columns: ['Year', 'Discovery', 'Scientist(s)', 'Field of Science'] Sample: ['1928', 'Penicillin', 'Alexander Fleming', 'Biology']



 48%|████▊     | 29/60 [04:01<04:17,  8.32s/it]

Hypothetical table description: Caption: PGA Tournament Leaderboard Page: PGA Tour Section: Current Leaderboard Columns: ['Position', 'Player', 'Score', 'Round', 'Total'] Sample: ['1', 'John Doe', '-10', '68', '278']



 50%|█████     | 30/60 [04:09<04:08,  8.29s/it]

Hypothetical table description: Caption: Common Pain Medications Page: Pain Management Section: Medications Columns: ['Medication Name', 'Active Ingredient', 'Typical Uses', 'Potential Side Effects'] Sample: ['Ibuprofen', 'Ibuprofen', 'Pain relief, inflammation reduction', 'Stomach upset, dizziness']



 52%|█████▏    | 31/60 [04:17<03:57,  8.20s/it]

Hypothetical table description: Caption: Football Clubs and Their Cities Page: List of Football Clubs by City Section: Football Clubs and Cities Columns: ['Club Name', 'City', 'Country', 'Founded Year'] Sample: ['Manchester United', 'Manchester', 'United Kingdom', '1878']



 53%|█████▎    | 32/60 [04:25<03:46,  8.09s/it]

Hypothetical table description: Caption: Average Cost of Healthy Foods by Region Page: Healthy Eating Section: Cost of Healthy Foods Columns: ['Region', 'Food Item', 'Average Cost (USD)', 'Unit'] Sample: ['North America', 'Quinoa', '4.50', 'per pound']



 55%|█████▌    | 33/60 [04:33<03:35,  7.99s/it]

Hypothetical table description: Caption: Tourist Attractions in World Capitals Page: World Capitals Section: Tourist Attractions Columns: ['City', 'Attraction', 'Description'] Sample: ['Paris', 'Eiffel Tower', 'An iconic iron lattice tower located on the Champ de Mars.']



 57%|█████▋    | 34/60 [04:41<03:26,  7.95s/it]

Hypothetical table description: Caption: Mortality Rates by Disease Page: Global Disease Mortality Statistics Section: Mortality Rates Columns: ['Year', 'Disease', 'Deaths per 100,000'] Sample: ['2020', 'Ischemic Heart Disease', '150.0']



 58%|█████▊    | 35/60 [04:49<03:22,  8.10s/it]

Hypothetical table description: Caption: Cigarette Brands Market Share Page: Tobacco Industry in [Country] Section: Market Share of Cigarette Brands Columns: ['Brand', 'Market Share (%)', 'Country', 'Year'] Sample: ['Marlboro', '34.5', 'USA', '2022']



 60%|██████    | 36/60 [04:58<03:19,  8.30s/it]

Hypothetical table description: Caption: Apple's Global Smartphone Market Share Page: Apple Inc. Section: Market Share Columns: ['Year', 'Market Share (%)', 'Units Shipped (Millions)', 'Rank'] Sample: ['2022', '15.6', '230.0', '2']



 62%|██████▏   | 37/60 [05:06<03:07,  8.14s/it]

Hypothetical table description: Caption: Nutritional Values of Healthy Foods Page: Healthy Foods Section: Nutritional Information Columns: ['Food Item', 'Calories (kcal)', 'Protein (g)', 'Carbohydrates (g)', 'Fats (g)'] Sample: ['Broccoli', '34', '2.8', '6.6', '0.4']



 63%|██████▎   | 38/60 [05:14<02:57,  8.06s/it]

Hypothetical table description: Caption: Hormones and Their Effects Page: Human Endocrine System Section: Hormonal Functions and Effects Columns: ['Hormone', 'Source Gland', 'Primary Function', 'Physiological Effects'] Sample: ['Insulin', 'Pancreas', 'Regulates blood glucose levels', 'Lowers blood sugar by facilitating cellular glucose uptake']



 65%|██████▌   | 39/60 [05:23<02:55,  8.36s/it]




 67%|██████▋   | 40/60 [05:31<02:43,  8.20s/it]

Hypothetical table description: Caption: Lake Altitudes Page: List of Lakes by Altitude Section: Altitude Information Columns: ['Lake Name', 'Country', 'Altitude (meters)'] Sample: ['Lake Titicaca', 'Peru/Bolivia', '3812']



 68%|██████▊   | 41/60 [05:39<02:36,  8.25s/it]

Hypothetical table description: Caption: Laptop CPU Specifications Page: Laptop Processors Section: CPU Specifications Columns: ['Model', 'Base Clock Speed (GHz)', 'Cores', 'TDP (W)'] Sample: ['Intel Core i7-1165G7', '2.80', '4', '28']



 70%|███████   | 42/60 [05:48<02:30,  8.38s/it]

Hypothetical table description: Caption: Asian Countries and Their Currencies Page: Currencies of Asia Section: Currency Information by Country Columns: ['Country', 'Currency', 'Currency Code', 'Symbol'] Sample: ['Japan', 'Yen', 'JPY', '¥']



 72%|███████▏  | 43/60 [05:55<02:19,  8.21s/it]

Hypothetical table description: Caption: Disease Risk Factors Page: Health and Diseases Section: Risk Factors Columns: ['Disease', 'Primary Risk Factors', 'Preventive Measures'] Sample: ['Heart Disease', 'Smoking, High Blood Pressure, High Cholesterol', 'Regular Exercise, Healthy Diet, Smoking Cessation']



 73%|███████▎  | 44/60 [06:04<02:11,  8.19s/it]

Hypothetical table description: Caption: External Drive Specifications Page: External Drives Section: Drive Capacities and Specifications Columns: ['Model', 'Capacity (TB)', 'Type', 'Price (USD)'] Sample: ['Seagate Backup Plus', '2', 'HDD', '59.99']



 75%|███████▌  | 45/60 [06:11<01:59,  7.98s/it]

Hypothetical table description: Caption: Baseball Team Captains Page: Baseball Team Leadership Section: Team Captains Columns: ['Captain Name', 'Team', 'Years as Captain', 'Position'] Sample: ['Derek Jeter', 'New York Yankees', '2003-2014', 'Shortstop']



 77%|███████▋  | 46/60 [06:18<01:48,  7.78s/it]

Hypothetical table description: Caption: Maryland Counties Population Page: Demographics of Maryland Section: County Populations Columns: ['County', 'Population', 'Year'] Sample: ['Montgomery County', 1052567, 2022]



 78%|███████▊  | 47/60 [06:26<01:42,  7.85s/it]

Hypothetical table description: Caption: Countries and Capitals Page: List of Countries and Capitals Section: Countries and Capitals Columns: ['Country', 'Capital', 'Continent', 'Population of Capital'] Sample: ['France', 'Paris', 'Europe', '2,148,000']



 80%|████████  | 48/60 [06:35<01:37,  8.15s/it]

Hypothetical table description: Caption: Disease Incidence Rates Page: Global Disease Statistics Section: Incidence Rates Columns: ['Disease', 'Region', 'Year', 'Incidence Rate (per 100,000)'] Sample: ['Influenza', 'North America', '2022', '150']



 82%|████████▏ | 49/60 [06:44<01:32,  8.39s/it]

Hypothetical table description: Caption: EU Member Countries and Year of Accession Page: European Union Enlargement Section: Member Countries and Accession Years Columns: ['Country', 'Year Joined'] Sample: ['Germany', 1958]



 83%|████████▎ | 50/60 [06:52<01:22,  8.22s/it]

Hypothetical table description: Caption: County Areas in Ireland Page: Counties of Ireland Section: Geographical Data Columns: ['County', 'Area (sq km)'] Sample: ['Dublin', '921']



 85%|████████▌ | 51/60 [07:00<01:13,  8.16s/it]

Hypothetical table description: Caption: Nutritional Values of Cereals Page: Cereal Nutrition Section: Nutritional Information Columns: ['Cereal Name', 'Calories (kcal)', 'Protein (g)', 'Carbohydrates (g)', 'Fat (g)'] Sample: ['Corn Flakes', '100', '2', '24', '0.2']



 87%|████████▋ | 52/60 [07:08<01:04,  8.01s/it]

Hypothetical table description: Caption: ERP Systems Pricing Page: Enterprise Resource Planning (ERP) Systems Section: Pricing Comparison Columns: ['Vendor', 'Product Name', 'Pricing Model', 'Starting Price'] Sample: ['SAP', 'SAP S/4HANA', 'Subscription', '$3,000/month']



 88%|████████▊ | 53/60 [07:15<00:55,  7.93s/it]

Hypothetical table description: Caption: Average Lifespan of Cat Breeds Page: Cat Breeds Section: Lifespan Columns: ['Breed', 'Average Lifespan (years)', 'Notable Health Considerations'] Sample: ['Siamese', '12-15', 'Prone to respiratory issues']



 90%|█████████ | 54/60 [07:23<00:46,  7.82s/it]

Hypothetical table description: Caption: Broadway Musicals and Directors Page: Broadway Musicals Section: Directors Columns: ['Musical Title', 'Director', 'Premiere Year', 'Theater'] Sample: ['Hamilton', 'Thomas Kail', '2015', 'Richard Rodgers Theatre']



 92%|█████████▏| 55/60 [07:31<00:39,  7.84s/it]

Hypothetical table description: Caption: Common Infections and Treatments Page: Infectious Diseases Section: Treatment Guidelines Columns: ['Infection Name', 'Type', 'Recommended Medication', 'Treatment Duration'] Sample: ['Strep Throat', 'Bacterial', 'Amoxicillin', '10 days']



 93%|█████████▎| 56/60 [07:39<00:31,  7.98s/it]

Hypothetical table description: Caption: Types of Food Page: Food Classification Section: Food Types Columns: ['Category', 'Typical Ingredients', 'Common Examples'] Sample: ['Fruit', 'Natural sugars, fiber, vitamins', 'Apple, Banana, Orange']



 95%|█████████▌| 57/60 [07:47<00:24,  8.08s/it]

Hypothetical table description: Caption: Board Games and Player Counts Page: Board Games Section: Number of Players Columns: ['Game Name', 'Minimum Players', 'Maximum Players'] Sample: ['Catan', 3, 4]



 97%|█████████▋| 58/60 [08:02<00:20, 10.17s/it]

Hypothetical table description: Caption: Google Product Reviews Page: Google Products Section: User Reviews Columns: ['Product Name', 'Average Rating', 'Number of Reviews', 'Review Summary'] Sample: ['Google Pixel 6', '4.5', '1500', 'Users praise the camera quality and battery life.']



 98%|█████████▊| 59/60 [08:10<00:09,  9.47s/it]

Hypothetical table description: Caption: Closest Constellations to Earth Page: Constellations Section: Closest Constellations Columns: ['Constellation Name', 'Closest Star', 'Distance from Earth (light-years)'] Sample: ['Centaurus', 'Proxima Centauri', '4.24']



100%|██████████| 60/60 [08:19<00:00,  8.33s/it]

Hypothetical table description: Caption: Video Game Age Ratings Page: Video Game Ratings Section: Age Ratings Columns: ['Game Title', 'Release Date', 'Age Rating', 'Rating Board'] Sample: ['The Legend of Zelda: Breath of the Wild', '2017-03-03', 'E10+', 'ESRB']

Generated 60 hypothetical descriptions

Examples:
  Query: world interest rates table
  Hypothetical: Caption: Global Central Bank Interest Rates Page: World Interest Rates Section: Current Interest Rates by Country Columns: ['Country', 'Central Bank', 'Interest Rate (%)'] Sample: ['United States', 'Federal Reserve', '5.25']

  Query: 2008 beijing olympics
  Hypothetical: Caption: 2008 Beijing Olympics Medal Tally Page: 2008 Beijing Olympics Section: Medal Tally Columns: ['Country', 'Gold', 'Silver', 'Bronze', 'Total'] Sample: ['China', 48, 22, 30, 100]

  Query: fast cars
  Hypothetical: Caption: Fastest Production Cars Page: Fast Cars Section: Top Speed Records Columns: ['Car Model', 'Manufacturer', 'Top Speed (mph)', 'Year']




### Encode Hypothetical Descriptions and Retrieve

In [12]:
# Encode hypothetical descriptions (NOT raw queries): the search vector for each
# query is the embedding of its LLM-written pseudo-table description.
print(f"Encoding {len(hypothetical_descriptions)} hypothetical descriptions...")
query_embeddings = encoder.encode(
    hypothetical_descriptions, batch_size=32, show_progress_bar=True,
    convert_to_numpy=True, normalize_embeddings=True
)

# Search top 100
k = 100
print(f"Searching top-{k}...")
scores, indices = index.search(query_embeddings.astype('float32'), k)

# FAISS reports a label of -1 for result slots it could not fill (fewer than k
# neighbours available). Skip those: table_ids[-1] would otherwise silently
# resolve to the last table and be counted as a retrieved result.
results = {
    qid: [table_ids[idx] for idx in indices[i] if idx >= 0]
    for i, qid in enumerate(query_ids)
}
print(f"✓ Retrieved {len(results)} query results")

# Show examples
print("\n" + "=" * 80)
print("EXAMPLE RESULTS (HyDE)")
print("=" * 80)
for qid in list(queries.keys())[:3]:
    # Row of this query in hypothetical_descriptions / scores (looked up once per query)
    query_row = query_ids.index(qid)
    print(f"\nQuery {qid}: '{queries[qid]}'")
    print(f"Hypothetical: {hypothetical_descriptions[query_row][:80]}...")
    for rank, tid in enumerate(results[qid][:3], 1):
        # Tables without a judgment for this query are shown with relevance 0
        rel = qrels.get(qid, {}).get(tid, 0)
        score_val = scores[query_row][rank-1]
        page = tables_df[tables_df['table_id'] == tid].iloc[0]['page_title']
        print(f"  {rank}. {tid} (score: {score_val:.3f}, rel: {rel}) - {page}")
print("=" * 80)

Encoding 60 hypothetical descriptions...


Batches: 100%|██████████| 2/2 [00:00<00:00,  3.08it/s]

Searching top-100...
✓ Retrieved 60 query results

EXAMPLE RESULTS (HyDE)

Query 1: 'world interest rates table'
Hypothetical: Caption: Global Central Bank Interest Rates Page: World Interest Rates Section: ...
  1. table-0730-168 (score: 0.705, rel: 0) - List of Renminbi exchange rates
  2. table-0370-614 (score: 0.702, rel: 2) - Eurozone
  3. table-0552-599 (score: 0.673, rel: 0) - Rupee

Query 2: '2008 beijing olympics'
Hypothetical: Caption: 2008 Beijing Olympics Medal Tally Page: 2008 Beijing Olympics Section: ...
  1. table-0407-338 (score: 0.741, rel: 0) - Israel at the Paralympics
  2. table-0620-231 (score: 0.740, rel: 1) - Inna Zhukova
  3. table-0876-75 (score: 0.739, rel: 1) - Irina Risenzon

Query 3: 'fast cars'
Hypothetical: Caption: Fastest Production Cars Page: Fast Cars Section: Top Speed Records Colu...
  1. table-0990-862 (score: 0.739, rel: 1) - Speed Dreams
  2. table-1254-979 (score: 0.731, rel: 2) - Carbir Race Cars
  3. table-1275-187 (score: 0.678, rel: 2) - Pi




### Calculate Metrics

In [13]:
# Evaluation functions
def recall_at_k(retrieved, relevant, k):
    """
    Recall@k: share of the relevant items that appear in the first k results.

    Args:
        retrieved: ranked sequence of retrieved ids (best first).
        relevant: set of ids considered relevant.
        k: cutoff rank.

    Returns:
        len(top-k ∩ relevant) / len(relevant), or 0.0 when `relevant` is empty.
    """
    if not len(relevant):
        return 0.0
    top_k = set(retrieved[:k])
    hits = relevant & top_k
    return len(hits) / len(relevant)

def ndcg_at_k(retrieved, relevance, k):
    """
    nDCG@k using the graded relevance as gain and a log2 rank discount.

    Args:
        retrieved: ranked list of retrieved ids (best first).
        relevance: dict mapping id -> graded relevance; ids not in the dict count as 0.
        k: cutoff rank.

    Returns:
        DCG@k divided by the ideal DCG@k. 0.0 when `relevance` is empty or the
        ideal DCG is not positive.
    """
    if not len(relevance):
        return 0.0

    def discounted_sum(gains):
        # 0-based position p is discounted by log2(p + 2)
        return sum(gain / np.log2(pos + 2) for pos, gain in enumerate(gains))

    # Only the first k retrieved items contribute; a non-positive k contributes none
    cutoff = max(k, 0)
    dcg = discounted_sum(relevance.get(item, 0) for item in retrieved[:cutoff])

    # Ideal ranking: the judged grades in descending order, cut at k
    best_grades = sorted(relevance.values(), reverse=True)[:k]
    idcg = discounted_sum(best_grades)

    if idcg > 0:
        return dcg / idcg
    return 0.0

In [14]:
# Evaluate: score every judged query at each cutoff, then report the means
k_values = [1, 5, 10, 20]
metrics = defaultdict(list)

for query_id, retrieved in results.items():
    # Queries without relevance judgments cannot be scored
    if query_id in qrels:
        relevance = qrels[query_id]
        # For recall, any positive grade counts as relevant
        relevant = {tid for tid, rel in relevance.items() if rel > 0}

        for k in k_values:
            metrics[f'Recall@{k}'].append(recall_at_k(retrieved, relevant, k))
            metrics[f'nDCG@{k}'].append(ndcg_at_k(retrieved, relevance, k))

# Print results
print("\n" + "="*60, "HYDE EVALUATION RESULTS", "="*60, "\nRecall:", sep="\n")
for k in k_values:
    mean_recall = np.mean(metrics[f'Recall@{k}'])
    print(f"  Recall@{k:2d}: {mean_recall:.4f}")
print("\nnDCG:")
for k in k_values:
    mean_ndcg = np.mean(metrics[f'nDCG@{k}'])
    print(f"  nDCG@{k:2d}  : {mean_ndcg:.4f}")
print("="*60)


HYDE EVALUATION RESULTS

Recall:
  Recall@ 1: 0.1071
  Recall@ 5: 0.2647
  Recall@10: 0.3811
  Recall@20: 0.5166

nDCG:
  nDCG@ 1  : 0.5167
  nDCG@ 5  : 0.5187
  nDCG@10  : 0.5200
  nDCG@20  : 0.5427


## Inspect Results

In [15]:
# Inspect specific query results: print the top 5 retrieved tables for one
# query together with their metadata and the LLM-generated description.
query_id = '1'

# Row of this query in hypothetical_descriptions / scores (looked up once)
query_row = query_ids.index(query_id)

print(f"Query {query_id}: {queries[query_id]}")
print(f"Hypothetical description: {hypothetical_descriptions[query_row]}")
print("="*80)

for i, table_id in enumerate(results[query_id][:5], 1):
    row = tables_df[tables_df['table_id'] == table_id].iloc[0]
    # Tables without a judgment for this query are shown with relevance 0
    rel = qrels.get(query_id, {}).get(table_id, 0)
    score_val = scores[query_row][i-1]
    
    print(f"\n{i}. {table_id} (score: {score_val:.4f}, relevance: {rel})")
    print(f"   Page: {row['page_title']}")
    print(f"   Section: {row['section_title']}")
    print(f"   Caption: {row['table_caption']}")
    
    # headers / sample_data are parsed as JSON. A missing or malformed value
    # must not abort the inspection, but only the expected parse/shape errors
    # are caught (a bare except would also swallow KeyboardInterrupt), and the
    # reason is shown instead of the field silently disappearing.
    try:
        headers = json.loads(row['headers'])
        print(f"   Headers: {headers[:5]}{'...' if len(headers) > 5 else ''}")
    except (ValueError, TypeError, KeyError) as err:
        print(f"   Headers: <unavailable: {type(err).__name__}>")
    
    try:
        sample = json.loads(row['sample_data'])
        print(f"   Sample ({len(sample)} rows): {sample[0][:3]}...")
    except (ValueError, TypeError, IndexError, KeyError) as err:
        # IndexError covers an empty sample list (no first row to show)
        print(f"   Sample: <unavailable: {type(err).__name__}>")
    
    print(f"   LLM Description: {table_descriptions[table_ids.index(table_id)]}")
    print("-" * 80)

Query 1: world interest rates table
Hypothetical description: Caption: Global Central Bank Interest Rates Page: World Interest Rates Section: Current Interest Rates by Country Columns: ['Country', 'Central Bank', 'Interest Rate (%)'] Sample: ['United States', 'Federal Reserve', '5.25']

1. table-0730-168 (score: 0.7047, relevance: 0)
   Page: List of Renminbi exchange rates
   Section: List of World Bank nominal exchange rates
   Caption: List of World Bank nominal exchange rates
   Headers: ['World Bank annual average middle exchange rate for US dollar to Chinese yuan ( 1 US dollar to Chinese yuan )', 'World Bank annual average middle exchange rate for US dollar to Chinese yuan ( 1 US dollar to Chinese yuan )', 'World Bank annual average middle exchange rate for US dollar to Chinese yuan ( 1 US dollar to Chinese yuan )', 'World Bank annual average middle exchange rate for US dollar to Chinese yuan ( 1 US dollar to Chinese yuan )', 'World Bank annual average middle exchange rate for US