In [1]:
# Install the notebook's dependencies into the running kernel's environment.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases — consider pinning.
%pip install faiss-cpu pandas numpy sentence-transformers tqdm

Note: you may need to restart the kernel to use updated packages.


In [12]:
# Imports and environment check

import json
import time
import numpy as np
import pandas as pd
import faiss
import re  # <-- ADD THIS IMPORT
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
import sys

# Report the Python and FAISS versions so the environment is recorded next to the results
print(f"Python version: {sys.version}")
print(f"FAISS version: {faiss.__version__}")

Python version: 3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]
FAISS version: 1.12.0


In [13]:
# --- Configuration ---
# Input artefacts: the CUAD dataset JSON, the chunk table (must contain a
# 'chunk_text' column) and the pre-computed chunk embeddings.
# NOTE(review): row i of the embeddings is assumed to describe row i of the chunk table — confirm.
INPUT_CUAD_FILE = '../Data/CUADv1.json'
INPUT_CHUNKS_FILE = '../Data/processed_chunks.csv'
INPUT_EMBEDDINGS_FILE = '../Data/embeddings.npy'
# Only the first N contracts in the CUAD file contribute evaluation questions.
NUM_CONTRACTS_TO_PROCESS = 100  # Must match the number from your first script
# Neighbours fetched per query; this is also the K in Recall@K and Precision@K.
K = 10  # Number of chunks to retrieve (increased from 5 to improve recall)


In [14]:
print("Loading all data sources...")

# Load the embeddings
embeddings = np.load(INPUT_EMBEDDINGS_FILE).astype('float32') # FAISS needs float32
if embeddings.ndim != 2:
    raise ValueError(
        f"Expected a 2-D (num_chunks, dim) embeddings array, got shape {embeddings.shape}"
    )
# Normalize embeddings for cosine similarity (L2-normalize); the epsilon floor
# keeps an all-zero row from turning into NaNs.
emb_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / np.maximum(emb_norms, 1e-12)
print(f"Loaded and normalized embeddings. Shape: {embeddings.shape}")

# Load the chunk metadata
df_chunks = pd.read_csv(INPUT_CHUNKS_FILE)
# FAISS returns row positions of the embeddings matrix, and those positions are
# used as keys into the chunk table, so the two files must have the same length.
# Fail here rather than silently pairing a vector with the wrong text later.
if len(df_chunks) != embeddings.shape[0]:
    raise ValueError(
        f"Embeddings/chunks mismatch: {embeddings.shape[0]} embedding rows "
        f"but {len(df_chunks)} chunk rows"
    )
# Create a simple mapping from index (0, 1, 2...) to chunk_text.
# Empty CSV cells are parsed as NaN; coerce everything to str so the regex-based
# text cleaning used during evaluation never receives a float.
chunk_id_to_text = df_chunks['chunk_text'].fillna('').astype(str).to_dict()

print(f"Loaded {len(df_chunks)} chunks.")

Loading all data sources...
Loaded and normalized embeddings. Shape: (2009, 384)
Loaded 2009 chunks.


In [15]:
# Load the sentence-embedding model used to encode questions and gold answers below.
# NOTE(review): presumably the same model that produced INPUT_EMBEDDINGS_FILE — confirm,
# since query vectors and chunk vectors are compared directly.
print("Loading SBERT model (all-MiniLM-L6-v2)...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully.")

Loading SBERT model (all-MiniLM-L6-v2)...
Model loaded successfully.
Model loaded successfully.


In [16]:
print("Building FAISS index...")
# Vector width is taken from the loaded embeddings matrix (second axis).
dimension = embeddings.shape[1]  # Should be 384 for this model

# Use IndexFlatIP (inner product) on L2-normalized vectors -> cosine similarity.
# The embeddings were L2-normalized when they were loaded, which is what makes
# the inner product equal to the cosine similarity here.
index = faiss.IndexFlatIP(dimension)

# Add all our normalized embeddings to the index; the ids returned by a search
# are the row positions of this matrix.
index.add(embeddings)

print(f"Successfully built index with {index.ntotal} vectors (IndexFlatIP, normalized vectors -> cosine similarity).")

Building FAISS index...
Successfully built index with 2009 vectors (IndexFlatIP, normalized vectors -> cosine similarity).


In [17]:
print(f"Extracting questions and answers for the first {NUM_CONTRACTS_TO_PROCESS} contracts...")

# Read the dataset as UTF-8 explicitly: without `encoding`, open() falls back to
# the platform's locale encoding, which can mis-decode or reject non-ASCII
# contract text on systems whose default is not UTF-8.
with open(INPUT_CUAD_FILE, 'r', encoding='utf-8') as f:
    cuad_data = json.load(f)

# Only the first NUM_CONTRACTS_TO_PROCESS contracts are evaluated. The max()
# keeps a negative setting meaning "no contracts" instead of turning it into a
# from-the-end slice.
contracts_to_eval = cuad_data['data'][:max(NUM_CONTRACTS_TO_PROCESS, 0)]

# One record per answerable question; a question may carry several gold answers.
qa_pairs = [
    {
        'title': article['title'],
        'question': qa['question'],
        'gold_answers': [ans['text'] for ans in qa['answers']],
    }
    for article in contracts_to_eval
    for paragraph in article['paragraphs']
    for qa in paragraph['qas']
    if not qa['is_impossible']  # We only care about answerable questions
]

print(f"Found {len(qa_pairs)} answerable questions.")

Extracting questions and answers for the first 100 contracts...
Found 1260 answerable questions.


In [18]:
print(f"Running evaluation for {len(qa_pairs)} questions...")

SIM_THRESHOLD = 0.66  # Cosine similarity threshold for semantic match (tune as needed)
MAX_DIAG = 5  # Maximum number of failed queries kept for manual inspection


def _l2_normalize_rows(matrix):
    """Scale every row of a 2-D array to unit length.

    The epsilon floor mirrors the normalization applied to the chunk embeddings
    and keeps an all-zero row from producing NaNs.
    """
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.maximum(norms, 1e-12)


def _strip_punct_lower(text):
    """Drop punctuation and lowercase `text` for the substring-based match."""
    return re.sub(r"[^\w\s]", '', text).lower()


results = []       # one dict per question: found_hit, first_hit_rank, num_relevant_hits
total_latency = 0.0  # accumulated encode + search time, in seconds
diagnostics = []   # a few failed queries, kept for inspection

for qa in tqdm(qa_pairs):
    question_text = qa['question']
    gold_answers = qa['gold_answers']

    # 1. Embed and normalize the question, 2. search the index
    #    (normalized vectors + IndexFlatIP -> cosine). Both steps are timed
    #    together; perf_counter is monotonic and high-resolution, unlike
    #    time.time(), which can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    question_embedding = _l2_normalize_rows(model.encode([question_text]).astype('float32'))
    distances, indices = index.search(question_embedding, K)
    total_latency += time.perf_counter() - start_time

    # FAISS pads the result with -1 when the index holds fewer than K vectors;
    # drop those so they are never used as chunk ids.
    retrieved_chunk_indices = [int(idx) for idx in indices[0] if idx >= 0]
    # Clean each retrieved chunk once; reused for matching and for diagnostics.
    top_texts = [_strip_punct_lower(chunk_id_to_text[idx]) for idx in retrieved_chunk_indices]

    # Gold answers: whitespace-collapsed form for embedding, punctuation-free
    # lowercase form for substring matching.
    gold_clean = [re.sub(r'\s+', ' ', gold).strip() for gold in gold_answers]
    gold_strs = [_strip_punct_lower(gold) for gold in gold_clean]
    if gold_clean:
        # Encode all gold answers of this question in a single batch.
        gold_embs = _l2_normalize_rows(model.encode(gold_clean).astype('float32'))
    else:
        gold_embs = np.zeros((0, embeddings.shape[1]), dtype='float32')

    found_hit = False
    first_hit_rank = 0  # 0 means "no relevant chunk in the top K"
    num_relevant_hits = 0

    for rank, (chunk_index, chunk_text_clean) in enumerate(
        zip(retrieved_chunk_indices, top_texts), start=1
    ):
        # A chunk is relevant if ANY gold answer matches it, either
        # 1) as an exact-ish substring of the cleaned chunk text, or
        # 2) semantically: cosine similarity between the gold-answer embedding
        #    and the chunk embedding reaches SIM_THRESHOLD.
        chunk_is_relevant = any(
            gold and gold in chunk_text_clean for gold in gold_strs
        ) or bool((gold_embs @ embeddings[chunk_index] >= SIM_THRESHOLD).any())

        if chunk_is_relevant:
            num_relevant_hits += 1
            if not found_hit:
                found_hit = True
                first_hit_rank = rank

    results.append({
        'found_hit': found_hit,
        'first_hit_rank': first_hit_rank,
        'num_relevant_hits': num_relevant_hits
    })

    # Save a few diagnostic failures
    if (not found_hit) and (len(diagnostics) < MAX_DIAG):
        diagnostics.append({
            'question': question_text,
            'gold_answers': gold_answers,
            'top_texts': top_texts
        })

print("Evaluation complete.")

# Print a few diagnostics
if diagnostics:
    print('\nSample failed queries (question, gold_answers, top retrieved chunk texts):')
    for d in diagnostics:
        print('\nQuestion:', d['question'])
        print('Gold answers:', d['gold_answers'])
        print('Top retrieved (cleaned):')
        for t in d['top_texts']:
            print(' -', t[:250].strip())


Running evaluation for 1260 questions...


  0%|          | 0/1260 [00:00<?, ?it/s]

Evaluation complete.

Sample failed queries (question, gold_answers, top retrieved chunk texts):

Question: Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract
Gold answers: ['DISTRIBUTOR AGREEMENT']
Top retrieved (cleaned):
 - or deletions thereto whether by alterations addenda or otherwise shall be of no force and effect unless expressly consented to by edge in writing the provisions of this agreement supersede all contemporaneous oral agreements and all prior oral and wr
 - redacted copy confidential treatment requested confidential portions of this document have been redacted and have been separately filed with the commission 1 endorsement agreement this agreement is entered into on january 13 2005 between professional
 - during the term of this agreement the contractor shall provide all information relating to the offering the renewable note program or the note portfolio reasonably requeste

In [None]:
print("\n--- FAISS Evaluation Results ---")

# Every metric is averaged over the questions that were actually evaluated.
# `results` gets one entry per timed query, so it is also the right denominator
# for the latency (it stays correct even if the evaluation loop was interrupted).
total_questions = len(results)

if total_questions == 0:
    # Nothing was evaluated: report that instead of failing with ZeroDivisionError.
    avg_latency = recall_at_k = mrr = precision_at_k = float('nan')
    print("No questions were evaluated; metrics are undefined.")
else:
    # Average Latency
    avg_latency = (total_latency / total_questions) * 1000 # in milliseconds
    print(f"Average Query Latency: {avg_latency:.2f} ms")

    # Index Build Time (for FAISS, this is just the 'index.add()' time, which is fast)
    print("Index Build Time: (Near-instant for FAISS 'add' step in Cell 6)")

    # Recall@K: share of questions with at least one relevant chunk in the top K
    recall_at_k = sum(1 for r in results if r['found_hit']) / total_questions
    print(f"Recall@{K}: {recall_at_k * 100:.2f}%")

    # MRR (Mean Reciprocal Rank): first_hit_rank == 0 marks "no hit" and contributes 0
    mrr = sum(1.0 / r['first_hit_rank'] if r['first_hit_rank'] > 0 else 0 for r in results) / total_questions
    print(f"MRR (Mean Reciprocal Rank): {mrr:.4f}")

    # Precision@K: mean fraction of the K retrieved chunks judged relevant
    precision_at_k = sum(r['num_relevant_hits'] / K for r in results) / total_questions
    print(f"Precision@{K}: {precision_at_k * 100:.2f}%")


--- FAISS Evaluation Results ---
Average Query Latency: 5.65 ms
Index Build Time: (Near-instant for FAISS 'add' step in Cell 6)
Recall@10: 25.56%
MRR (Mean Reciprocal Rank): 0.1361
Precision@10: 6.19%


: 