# HyDE Table Retrieval

Hypothetical Document Embeddings for table search:
1. Generate table descriptions using an LLM
2. Generate hypothetical table descriptions from queries using an LLM
3. Encode both with `all-MiniLM-L6-v2` and retrieve with FAISS

## Setup

In [None]:
import pandas as pd
import numpy as np
import json
import faiss
from sentence_transformers import SentenceTransformer
from collections import defaultdict
from tqdm import tqdm

# TODO: Add LLM library imports (e.g., openai, anthropic, etc.)
# import openai

## Load Data

In [None]:
# Load the table corpus from CSV. Later cells read the columns table_id,
# table_caption, page_title, section_title, headers and sample_data from it.
tables_df = pd.read_csv('data/wikitables_mini.csv')
print(f"Loaded {len(tables_df)} tables")
print(f"Columns: {list(tables_df.columns)}")
# Trailing bare expression: shown as the cell output when run in a notebook.
tables_df.head(2)

## Generate Table Descriptions (HyDE)

Use an LLM to generate natural-language descriptions from table metadata.

In [None]:
def generate_table_description_with_llm(row, max_chars=256):
    """
    Generate a short text description of a table from its metadata.

    TODO: Replace the fallback at the bottom with an actual LLM call.

    Args:
        row: Mapping-like record (e.g. one row of the tables DataFrame) that
            provides 'table_caption', 'page_title', 'section_title',
            'headers' and 'sample_data'. The last two are parsed with
            json.loads; a missing, NaN or malformed value is skipped.
        max_chars: Maximum number of characters in the returned description.

    Returns:
        The description truncated to max_chars characters. Until the LLM
        call is wired in, this is the collected metadata joined by spaces.

    Raises:
        KeyError: If one of the caption/page/section keys is absent from row.
    """

    # Prepare table metadata as prompt context; NaN/None fields are skipped.
    metadata = []
    for label, column in (
        ('Caption', 'table_caption'),
        ('Page', 'page_title'),
        ('Section', 'section_title'),
    ):
        value = row[column]
        if pd.notna(value):
            metadata.append(f"{label}: {value}")

    # Header and sample parsing is best-effort. Only the errors that missing
    # or malformed metadata can raise are suppressed (json.JSONDecodeError is
    # a ValueError; a NaN cell makes json.loads raise TypeError), so
    # KeyboardInterrupt and unrelated failures are no longer hidden.
    try:
        headers = json.loads(row['headers'])
        if headers:
            # Only the first 10 column names go into the description.
            metadata.append(f"Columns: {', '.join(str(h) for h in headers[:10])}")
    except (KeyError, IndexError, TypeError, ValueError):
        pass

    try:
        sample_data = json.loads(row['sample_data'])
        if sample_data:
            # First five cells of the first sample row.
            sample_str = str(sample_data[0][:5])
            metadata.append(f"Sample: {sample_str}")
    except (KeyError, IndexError, TypeError, ValueError):
        pass

    # Newline-separated form intended for the LLM prompt below; unused until
    # the call is enabled.
    metadata_str = '\n'.join(metadata)

    # ============================================================
    # TODO: LLM CALL HERE
    # ============================================================
    # Prompt: "Given the following table metadata, write a natural 
    # description of what this table contains in 1-2 sentences.
    # Keep it under 256 characters.\n\n{metadata_str}"
    #
    # Example LLM call:
    # response = openai.ChatCompletion.create(
    #     model="gpt-4",
    #     messages=[
    #         {"role": "system", "content": "You are a helpful assistant that describes tables."},
    #         {"role": "user", "content": f"Given the following table metadata, write a natural description of what this table contains in 1-2 sentences. Keep it under 256 characters.\n\n{metadata_str}"}
    #     ],
    #     max_tokens=100,
    #     temperature=0.3
    # )
    # description = response.choices[0].message.content.strip()
    # ============================================================

    # Fallback: concatenate metadata (replace with actual LLM response)
    description = ' '.join(metadata)

    return description[:max_chars]

# Smoke test: print the descriptions generated for the first few tables.
print("Example LLM-generated descriptions:")
print("=" * 80)
# Bounded by the corpus size so a table file with fewer than 3 rows does not
# raise IndexError from iloc.
for i in range(min(3, len(tables_df))):
    desc = generate_table_description_with_llm(tables_df.iloc[i])
    print(f"\n{i+1}. {desc}")

In [None]:
# Describe every table in the corpus. table_descriptions and table_ids are
# filled in lockstep, so position i in one matches position i in the other.
print("Generating LLM descriptions for all tables...")
print("NOTE: This will make ~3K LLM API calls. Consider batch processing or caching.")
print()

table_descriptions, table_ids = [], []

for _, row in tqdm(tables_df.iterrows(), total=len(tables_df)):
    description = generate_table_description_with_llm(row)
    table_descriptions.append(description)
    table_ids.append(row['table_id'])

# Character lengths, computed once for both summary statistics.
description_lengths = [len(text) for text in table_descriptions]
print(f"Generated {len(table_descriptions)} descriptions")
print(f"Length stats - Mean: {np.mean(description_lengths):.1f}, Max: {max(description_lengths)}")

## Encode Tables

In [None]:
# Load the sentence encoder. The same `encoder` object is used by later cells
# to embed both the table descriptions and the (hypothetical) query texts.
model_name = 'all-MiniLM-L6-v2'
print(f"Loading {model_name}...")
encoder = SentenceTransformer(model_name)
# Report the embedding size; the FAISS index is later created with this dimension.
print(f"Dimension: {encoder.get_sentence_embedding_dimension()}")

In [None]:
# Encode all table descriptions into one embedding matrix (row i belongs to
# table_descriptions[i]). Embeddings are requested normalized
# (normalize_embeddings=True) because the index built afterwards ranks by
# inner product.
print("Encoding...")
table_embeddings = encoder.encode(
    table_descriptions, batch_size=32, show_progress_bar=True,
    convert_to_numpy=True, normalize_embeddings=True
)
print(f"Shape: {table_embeddings.shape}")

## Build FAISS Index

In [None]:
# Build an inner-product FAISS index over the table embeddings. Vectors are
# added in table_descriptions order, so a result position returned by
# index.search is used by later cells as an index into table_ids.
print("Building index...")
index = faiss.IndexFlatIP(encoder.get_sentence_embedding_dimension())
# Cast to float32 before adding to the index.
index.add(table_embeddings.astype('float32'))
print(f"✓ Index built with {index.ntotal} tables")

### Test Custom Queries

In [None]:
# Test custom queries - change test_query and run
test_query = "olympic medals table"
top_k = 5

print(f"Query: '{test_query}'")
print("=" * 80)

# ============================================================
# TODO: LLM CALL HERE - Generate hypothetical table description from query
# ============================================================
# Prompt: "Given the search query: '{test_query}', generate a description 
# of what a relevant table would contain. Describe the table structure, 
# columns, and type of data it would have. Keep it under 256 characters."
#
# Example LLM call:
# response = openai.ChatCompletion.create(
#     model="gpt-4",
#     messages=[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": f"Given the search query: '{test_query}', generate a description of what a relevant table would contain. Describe the table structure, columns, and type of data it would have. Keep it under 256 characters."}
#     ],
#     max_tokens=100,
#     temperature=0.3
# )
# hypothetical_description = response.choices[0].message.content.strip()
# ============================================================

# Fallback: use query directly (replace with LLM-generated description)
hypothetical_description = test_query
print(f"Hypothetical table description: {hypothetical_description}\n")

# Encode the hypothetical description the same way as the table descriptions
# (normalized, float32) and search the index.
test_emb = encoder.encode([hypothetical_description], convert_to_numpy=True, normalize_embeddings=True).astype('float32')
scores_test, indices_test = index.search(test_emb, top_k)

for rank, (idx, score) in enumerate(zip(indices_test[0], scores_test[0]), start=1):
    # FAISS fills missing result slots with -1 when fewer than top_k vectors
    # are indexed; indexing table_ids with -1 would silently show the last
    # table, so those slots are skipped.
    if idx < 0:
        continue
    table_id = table_ids[idx]
    row = tables_df[tables_df['table_id'] == table_id].iloc[0]

    print(f"\n{rank}. {table_id} (score: {score:.4f})")
    print(f"   Page: {row['page_title']}")
    print(f"   Caption: {row['table_caption']}")
    print(f"   Description: {table_descriptions[idx][:120]}...")
    print("-" * 80)

## Evaluation

### Load Queries and Relevance Judgments

In [None]:
# Load queries: each usable line is "<query_id> <query text>", split on the
# first run of whitespace. Lines without both parts are ignored.
# The encoding is pinned to UTF-8 so the parse does not depend on the
# platform's default locale encoding.
queries = {}
with open('data/queries.txt', 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split(None, 1)
        if len(parts) == 2:
            query_id, query_text = parts
            queries[query_id] = query_text

print(f"Loaded {len(queries)} queries")
print("Examples:", list(queries.items())[:3])

# Load qrels (relevance judgments): whitespace-separated lines with at least
# four fields, of which field 0 is the query id, field 2 the table id and
# field 3 the integer relevance grade. Shorter lines are ignored.
qrels = defaultdict(dict)
with open('data/qrels.txt', 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split()
        if len(parts) >= 4:
            query_id, table_id, relevance = parts[0], parts[2], int(parts[3])
            qrels[query_id][table_id] = relevance

# Freeze into a plain dict so later lookups of unknown query ids do not
# create empty entries.
qrels = dict(qrels)
print(f"Loaded qrels for {len(qrels)} queries")

---

### Generate Hypothetical Descriptions from Queries (HyDE)

In [None]:
# Fix the query order once: query_ids[i], query_texts[i] and
# hypothetical_descriptions[i] all refer to the same query.
query_ids = list(queries.keys())
query_texts = [queries[qid] for qid in query_ids]

print(f"Generating hypothetical table descriptions for {len(query_texts)} queries...")
print("NOTE: This will make ~60 LLM API calls.")
print()

hypothetical_descriptions = []

for query_text in tqdm(query_texts):
    # ============================================================
    # TODO: LLM CALL HERE - Generate hypothetical table description
    # ============================================================
    # Prompt: "Given the search query: '{query_text}', generate a description 
    # of what a relevant table would contain. Describe the table structure, 
    # columns, and type of data it would have. Keep it under 256 characters."
    #
    # Example LLM call:
    # response = openai.ChatCompletion.create(
    #     model="gpt-4",
    #     messages=[
    #         {"role": "system", "content": "You are a helpful assistant."},
    #         {"role": "user", "content": f"Given the search query: '{query_text}', generate a description of what a relevant table would contain. Describe the table structure, columns, and type of data it would have. Keep it under 256 characters."}
    #     ],
    #     max_tokens=100,
    #     temperature=0.3
    # )
    # hyp_desc = response.choices[0].message.content.strip()
    # ============================================================

    # Fallback: use query directly (replace with LLM response)
    hyp_desc = query_text
    hypothetical_descriptions.append(hyp_desc)

print(f"Generated {len(hypothetical_descriptions)} hypothetical descriptions")
print("\nExamples:")
# Slicing (rather than range(3)) keeps this from raising IndexError when
# fewer than three queries were loaded.
for example_query, example_hyp in zip(query_texts[:3], hypothetical_descriptions[:3]):
    print(f"  Query: {example_query}")
    print(f"  Hypothetical: {example_hyp}")
    print()

### Encode Hypothetical Descriptions and Retrieve

In [None]:
# Encode hypothetical descriptions (NOT raw queries)
print(f"Encoding {len(hypothetical_descriptions)} hypothetical descriptions...")
query_embeddings = encoder.encode(
    hypothetical_descriptions, batch_size=32, show_progress_bar=True,
    convert_to_numpy=True, normalize_embeddings=True
)

# Search top 100
k = 100
print(f"Searching top-{k}...")
scores, indices = index.search(query_embeddings.astype('float32'), k)

# Map result positions back to table ids, query by query. FAISS pads a result
# row with -1 when the index holds fewer than k vectors; those slots are
# dropped, otherwise table_ids[-1] would inject the last table into every
# ranking and distort the metrics. Padding sits at the tail, so results[qid][r]
# still lines up with scores[i][r].
results = {
    qid: [table_ids[idx] for idx in indices[i] if idx >= 0]
    for i, qid in enumerate(query_ids)
}
print(f"✓ Retrieved {len(results)} query results")

# Show examples
print("\n" + "=" * 80)
print("EXAMPLE RESULTS (HyDE)")
print("=" * 80)
# enumerate gives the row of `scores` / `hypothetical_descriptions` directly,
# instead of a linear query_ids.index() lookup per printed line.
for i, qid in enumerate(query_ids[:3]):
    print(f"\nQuery {qid}: '{queries[qid]}'")
    print(f"Hypothetical: {hypothetical_descriptions[i][:80]}...")
    for rank, tid in enumerate(results[qid][:3], 1):
        rel = qrels.get(qid, {}).get(tid, 0)  # unjudged tables count as relevance 0
        score_val = scores[i][rank-1]
        page = tables_df[tables_df['table_id'] == tid].iloc[0]['page_title']
        print(f"  {rank}. {tid} (score: {score_val:.3f}, rel: {rel}) - {page}")
print("=" * 80)

### Calculate Metrics

In [None]:
# Evaluation functions
def recall_at_k(retrieved, relevant, k):
    """Return the fraction of relevant ids found in the first k retrieved ids.

    Args:
        retrieved: Ranked sequence of retrieved ids (best first).
        relevant: Set of ids judged relevant.
        k: Cutoff rank.

    Returns:
        Recall in [0, 1]; 0.0 when there are no relevant ids.
    """
    total_relevant = len(relevant)
    if total_relevant == 0:
        return 0.0
    # A set removes duplicate ids from the top-k window before intersecting.
    hits = set(retrieved[:k]) & relevant
    return len(hits) / total_relevant

def ndcg_at_k(retrieved, relevance, k):
    """Return nDCG@k of a ranking, using the relevance grade itself as gain.

    Args:
        retrieved: Ranked sequence of retrieved ids (best first).
        relevance: Dict mapping id -> relevance grade; ids absent from the
            dict contribute a gain of 0.
        k: Cutoff rank.

    Returns:
        DCG of the top-k ranking divided by the ideal DCG, or 0.0 when there
        are no judgments or the ideal DCG is not positive.
    """
    if len(relevance) == 0:
        return 0.0

    # DCG: each gain is discounted by log2(position + 2), position 0-based.
    dcg = 0
    for pos in range(min(k, len(retrieved))):
        dcg += relevance.get(retrieved[pos], 0) / np.log2(pos + 2)

    # Ideal DCG: the same discounted sum over the k largest judged grades.
    best_grades = sorted(relevance.values(), reverse=True)[:k]
    idcg = 0
    for pos, grade in enumerate(best_grades):
        idcg += grade / np.log2(pos + 2)

    if idcg > 0:
        return dcg / idcg
    return 0.0

In [None]:
# Score every query that has relevance judgments at each cutoff in k_values.
# metrics maps a label such as 'Recall@5' to the list of per-query values.
k_values = [1, 5, 10, 20]
metrics = defaultdict(list)

for query_id, retrieved in results.items():
    relevance = qrels.get(query_id)
    if relevance is None:
        # No judgments for this query: it does not enter the averages.
        continue
    # Recall treats any positive grade as relevant; nDCG uses the grades.
    relevant = {tid for tid, rel in relevance.items() if rel > 0}

    for k in k_values:
        metrics[f'Recall@{k}'].append(recall_at_k(retrieved, relevant, k))
        metrics[f'nDCG@{k}'].append(ndcg_at_k(retrieved, relevance, k))

# Report the mean of each metric over the evaluated queries.
print("\n" + "="*60)
print("HYDE EVALUATION RESULTS")
print("="*60)
# (section heading, metric prefix, padding that aligns the colons)
for heading, prefix, pad in (("\nRecall:", "Recall", ""), ("\nnDCG:", "nDCG", "  ")):
    print(heading)
    for k in k_values:
        print(f"  {prefix}@{k:2d}{pad}: {np.mean(metrics[f'{prefix}@{k}']):.4f}")
print("="*60)

## Inspect Results

In [None]:
# Inspect specific query results: change query_id and re-run to see the top-5
# tables retrieved for that query, with scores, judgments and metadata.
query_id = '1'

print(f"Query {query_id}: {queries[query_id]}")
# Row of this query in `scores` / `hypothetical_descriptions`; looked up once
# instead of once per printed result.
query_pos = query_ids.index(query_id)
print(f"Hypothetical description: {hypothetical_descriptions[query_pos]}")
print("="*80)

for i, table_id in enumerate(results[query_id][:5], 1):
    row = tables_df[tables_df['table_id'] == table_id].iloc[0]
    rel = qrels.get(query_id, {}).get(table_id, 0)  # unjudged tables count as 0
    score_val = scores[query_pos][i-1]

    print(f"\n{i}. {table_id} (score: {score_val:.4f}, relevance: {rel})")
    print(f"   Page: {row['page_title']}")
    print(f"   Section: {row['section_title']}")
    print(f"   Caption: {row['table_caption']}")

    # Headers and sample rows are optional extras: a missing, NaN or malformed
    # JSON field is skipped. Only those failure modes are suppressed, so a
    # KeyboardInterrupt or an unrelated bug is no longer swallowed.
    try:
        headers = json.loads(row['headers'])
        print(f"   Headers: {headers[:5]}{'...' if len(headers) > 5 else ''}")
    except (KeyError, IndexError, TypeError, ValueError):
        pass

    try:
        sample = json.loads(row['sample_data'])
        print(f"   Sample ({len(sample)} rows): {sample[0][:3]}...")
    except (KeyError, IndexError, TypeError, ValueError):
        pass

    print(f"   LLM Description: {table_descriptions[table_ids.index(table_id)]}")
    print("-" * 80)