# Embedding Score Stability: Adding Documents Should Not Change Absolute Scores

This notebook tests a fundamental property of vector similarity search:

> **Hypothesis**: Adding new documents to a ChromaDB collection should NOT change the absolute
> similarity scores of already-indexed documents. Only the relative ranking (position) may change
> if the new documents contain better-matching content.

This matters because it validates that our RAG system's scoring is **stable and trustworthy**:
a score of 0.85 today should still mean 0.85 after ingesting more documents.

In [None]:
import shutil
import tempfile
from pathlib import Path

import setup_notebook  # noqa: F401 - fixes import path
from rag_pipeline.config.parameter_sets import get_param_set
from rag_pipeline.core.embeddings import chunk_pdf, embed_chunks, query_embeddings
from rag_pipeline.utils.directory_utils import get_test_data_dir

## Configuration

We use two topically different PDFs and a fresh temporary ChromaDB directory to ensure isolation.

In [None]:
# Shared inputs: the test-data directory and the "fast" parameter preset.
test_data = get_test_data_dir()
params = get_param_set("fast")

# Document A: first PDF to index (baseline)
# Document B: added later to test score stability
doc_a_path = test_data / "2303.18223v16.pdf"
doc_b_path = test_data / "2005.11401v4.pdf"

# Fail fast with a clear message if an input PDF is missing, before a temporary
# directory is created or any chunking/embedding work starts.
missing_pdfs = [str(pdf) for pdf in (doc_a_path, doc_b_path) if not pdf.is_file()]
if missing_pdfs:
    raise FileNotFoundError(f"Required test PDF(s) not found: {', '.join(missing_pdfs)}")

# Limit pages for faster processing (passed to chunk_pdf as max_pages)
MAX_PAGES = 5

# Queries to test (mix of topics relevant to either document)
TEST_QUERIES = [
    "Large Language Models",
    "Healthcare applications",
    "Machine learning training",
    "Neural network architecture",
]

# Use a large top_k to capture as many Document A chunks as possible in both rounds
TOP_K = 100

# Maximum absolute before/after score difference still treated as "unchanged"
SCORE_TOLERANCE = 1e-6

# Create a fresh temporary directory for this experiment so that no previously
# persisted collection can influence the results.
tmp_dir = Path(tempfile.mkdtemp(prefix="score_stability_"))
persist_dir = tmp_dir / "chroma"
persist_dir.mkdir(parents=True, exist_ok=True)

print(f"Document A: {doc_a_path.name}")
print(f"Document B: {doc_b_path.name}")
print(f"Parameters: {params.chunking.chunk_size} chunk size, {params.chunking.chunk_overlap} overlap")
print(f"Embedding model: {params.embedding.model_name}")
print(f"Temp directory: {tmp_dir}")

## Step 1: Chunk and embed Document A (baseline)

In [None]:
# Chunk Document A with the preset's chunking settings, capped at MAX_PAGES pages.
chunk_settings = params.chunking
chunking_a = chunk_pdf(
    doc_a_path,
    chunk_size=chunk_settings.chunk_size,
    chunk_overlap=chunk_settings.chunk_overlap,
    max_pages=MAX_PAGES,
)

# Embed the chunks into the experiment's persist directory.
# The call returns a pair that is reported below as (chunks created, records stored).
embed_result_a = embed_chunks(
    chunking_a,
    params.embedding.model_name,
    persist_dir=str(persist_dir),
    deduplicate=True,
)
chunks_a, records_a = embed_result_a

print(f"Document A: {chunks_a} chunks created, {records_a} records stored")

## Step 2: Query baseline (only Document A in the collection)

In [None]:
# Baseline snapshot, taken while only Document A is in the collection.
# Layout: {query: {record_id: similarity_score}}
scores_before: dict[str, dict[str, float]] = {}

for query in TEST_QUERIES:
    results = query_embeddings(
        query,
        params.embedding.model_name,
        persist_dir=str(persist_dir),
        top_k=TOP_K,
    )
    hits = results["all_results"]

    # Index this query's scores by record ID so they can be matched up later.
    scores_before[query] = dict((hit["record_id"], hit["similarity_score"]) for hit in hits)

    n_results = len(hits)
    # The first hit is reported as the top score; 0 is shown when nothing came back.
    top_score = 0 if n_results == 0 else hits[0]["similarity_score"]
    print(f"Query: {query!r:40s} -> {n_results} results, top score: {top_score:.4f}")

print(f"\nBaseline recorded for {len(TEST_QUERIES)} queries")

## Step 3: Add Document B to the same collection

In [None]:
# Chunk Document B with exactly the same settings that were used for Document A.
chunk_settings = params.chunking
chunking_b = chunk_pdf(
    doc_b_path,
    chunk_size=chunk_settings.chunk_size,
    chunk_overlap=chunk_settings.chunk_overlap,
    max_pages=MAX_PAGES,
)

# Embed into the SAME persist directory, so the collection now holds both documents.
# The second returned value is reported below as the collection's total record count.
embed_result_b = embed_chunks(
    chunking_b,
    params.embedding.model_name,
    persist_dir=str(persist_dir),
    deduplicate=True,
)
chunks_b, records_total = embed_result_b

print(f"Document B: {chunks_b} chunks created")
print(f"Total records in collection: {records_total}")
print(f"(was {records_a} before adding Document B)")

## Step 4: Re-run the same queries (Document A + B in the collection)

In [None]:
# Second snapshot, taken with both Document A and Document B in the collection.
# Layout: {query: {record_id: similarity_score}}
scores_after: dict[str, dict[str, float]] = {}

for query in TEST_QUERIES:
    results = query_embeddings(
        query,
        params.embedding.model_name,
        persist_dir=str(persist_dir),
        top_k=TOP_K,
    )
    hits = results["all_results"]

    # Index this query's scores by record ID so they can be matched against the baseline.
    scores_after[query] = dict((hit["record_id"], hit["similarity_score"]) for hit in hits)

    n_results = len(hits)
    # The first hit is reported as the top score; 0 is shown when nothing came back.
    top_score = 0 if n_results == 0 else hits[0]["similarity_score"]
    print(f"Query: {query!r:40s} -> {n_results} results, top score: {top_score:.4f}")

print(f"\nPost-addition scores recorded for {len(TEST_QUERIES)} queries")

## Step 5: Compare absolute scores for Document A chunks

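For every Document A chunk that appears in both the before and after result sets,
the absolute similarity score must be identical (within floating-point tolerance).
Chunks that drop out of the top-`TOP_K` results once Document B is indexed cannot be
compared and are therefore skipped.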

In [None]:
# Compare before/after scores for every record that was returned in both rounds.
# `all_passed` and `comparison_data` are read by later cells.
all_passed = True
comparison_data = []  # One dict per (query, record_id) comparison; reused for visualization
scores_changed = False  # True once any score differs by more than SCORE_TOLERANCE
unverified_queries = []  # Queries where the two result sets shared no record IDs

for query in TEST_QUERIES:
    before = scores_before[query]
    after = scores_after[query]

    # Record IDs present in both result sets. The baseline was taken with only
    # Document A indexed, so every ID in `before` belongs to Document A.
    common_ids = set(before.keys()) & set(after.keys())

    mismatches = []
    for record_id in sorted(common_ids):
        score_before = before[record_id]
        score_after = after[record_id]
        diff = abs(score_before - score_after)

        comparison_data.append(
            {
                "query": query,
                "record_id": record_id,
                "score_before": score_before,
                "score_after": score_after,
                "diff": diff,
            }
        )

        if diff > SCORE_TOLERANCE:
            mismatches.append((record_id, score_before, score_after, diff))

    # An empty intersection means nothing was checked for this query. Reporting
    # PASS there would be a vacuous success, so it is treated as a failure.
    nothing_compared = not common_ids
    status = "PASS" if not (mismatches or nothing_compared) else "FAIL"
    print(f"Query: {query!r:40s} -> {len(common_ids)} common chunks, {status}")

    if nothing_compared:
        all_passed = False
        unverified_queries.append(query)
        print("  NO COMMON CHUNKS: nothing could be compared for this query")

    if mismatches:
        all_passed = False
        scores_changed = True
        for record_id, sb, sa, d in mismatches:
            print(f"  MISMATCH {record_id}: {sb:.6f} -> {sa:.6f} (diff={d:.2e})")

print()
if all_passed:
    print("ALL QUERIES PASSED: Absolute scores are stable after adding documents.")
else:
    if scores_changed:
        print("SOME QUERIES FAILED: Scores changed after adding documents!")
    if unverified_queries:
        print(f"SOME QUERIES UNVERIFIED: {len(unverified_queries)} query(ies) had no common chunks to compare.")

In [None]:
# Hard assertions - this cell fails if nothing was compared or if any score changed.

# Guard against a vacuous pass: `all_passed` stays True when no record ID appears in
# both result sets, because then no comparison can ever produce a mismatch.
assert comparison_data, (
    "No score comparisons were made: the before/after result sets share no record IDs, "
    "so score stability was not actually verified."
)
assert all_passed, (
    "Absolute similarity scores changed after adding documents! "
    "This violates the expected behavior of cosine similarity in ChromaDB."
)
print(f"Verified: {len(comparison_data)} score comparisons all within tolerance ({SCORE_TOLERANCE})")

## Step 6: Show how rankings can change

While absolute scores stay the same, the ranking position of Document A chunks
may change because Document B chunks may now appear with higher scores.

In [None]:
# For each query, report how Document A chunks shifted position once Document B
# chunks started competing for places in the result list.
for query in TEST_QUERIES:
    before = scores_before[query]
    after = scores_after[query]

    # 1-based rank of every record, highest score first, in each round.
    ordered_before = sorted(before, key=before.get, reverse=True)
    ordered_after = sorted(after, key=after.get, reverse=True)
    rank_before = {rid: pos for pos, rid in enumerate(ordered_before, start=1)}
    rank_after = {rid: pos for pos, rid in enumerate(ordered_after, start=1)}

    # Every ID from the baseline round is a Document A chunk.
    doc_a_ids = set(before.keys())
    still_present = sorted(doc_a_ids & set(rank_after.keys()))

    # (record_id, old_rank, new_rank) for each Document A chunk whose rank changed.
    moved = [
        (rid, rank_before[rid], rank_after[rid])
        for rid in still_present
        if rank_before[rid] != rank_after[rid]
    ]

    # IDs that only appear after the addition are counted as Document B chunks.
    new_ids = set(after.keys()) - doc_a_ids

    print(f"\nQuery: {query!r}")
    print(f"  Document B chunks in results: {len(new_ids)}")
    print(f"  Document A chunks that changed rank: {len(moved)}")
    # Show at most the first 5 changes (ordered by record ID, not by size of the move).
    for rid, rb, ra in moved[:5]:
        direction = "up" if ra < rb else "down"
        print(f"    {rid}: rank {rb} -> {ra} ({direction})")

## Step 7: Visualize score stability

In [None]:
import matplotlib.pyplot as plt

# Plot before/after scores and the distribution of their absolute differences.
# With no comparisons, min()/max() and the mean below are undefined, so the
# whole visualization is skipped instead of raising.
if not comparison_data:
    print("No score comparisons available - skipping plots and difference statistics.")
else:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Plot 1: Before vs After scores (should be on the diagonal)
    ax1 = axes[0]
    before_scores = [d["score_before"] for d in comparison_data]
    after_scores = [d["score_after"] for d in comparison_data]

    ax1.scatter(before_scores, after_scores, alpha=0.6, edgecolors="black", linewidth=0.5)

    # Perfect stability line: y = x across the observed score range
    score_min = min(min(before_scores), min(after_scores))
    score_max = max(max(before_scores), max(after_scores))
    ax1.plot([score_min, score_max], [score_min, score_max], "r--", label="Perfect stability")

    ax1.set_xlabel("Score before adding Document B")
    ax1.set_ylabel("Score after adding Document B")
    ax1.set_title("Absolute Score Stability")
    ax1.legend()
    ax1.set_aspect("equal")

    # Plot 2: Distribution of score differences, with the tolerance marked
    ax2 = axes[1]
    diffs = [d["diff"] for d in comparison_data]
    ax2.hist(diffs, bins=30, edgecolor="black", alpha=0.7)
    ax2.axvline(x=SCORE_TOLERANCE, color="r", linestyle="--", label=f"Tolerance ({SCORE_TOLERANCE:.0e})")
    ax2.set_xlabel("Absolute score difference")
    ax2.set_ylabel("Count")
    ax2.set_title("Score Difference Distribution")
    ax2.legend()

    plt.tight_layout()
    # The figure is written to the current working directory.
    plt.savefig("embedding_score_stability.png", dpi=150, bbox_inches="tight")
    plt.show()

    print(f"Max difference: {max(diffs):.2e}")
    print(f"Mean difference: {sum(diffs) / len(diffs):.2e}")

## Summary

In [None]:
# Final report for the experiment.
# `default` keeps the summary printable when no comparisons were made
# (max() of an empty sequence would otherwise raise ValueError).
max_score_diff = max((d["diff"] for d in comparison_data), default=float("nan"))

print("=" * 70)
print("EMBEDDING SCORE STABILITY EXPERIMENT")
print("=" * 70)
print(f"Document A: {doc_a_path.name} ({records_a} records)")
print(f"Document B: {doc_b_path.name} ({records_total - records_a} records added)")
print(f"Total collection size after: {records_total} records")
print(f"Queries tested: {len(TEST_QUERIES)}")
print(f"Score comparisons: {len(comparison_data)}")
print(f"Max score difference: {max_score_diff:.2e}")
print(f"Result: {'PASS - Scores are stable' if all_passed else 'FAIL - Scores changed!'}")
print("=" * 70)
print()
print("Conclusion:")
if all_passed:
    print("  Adding documents to a ChromaDB collection does NOT affect the")
    print("  absolute similarity scores of previously indexed documents.")
    print("  Only rankings may change when new, better-matching content is added.")
    print()
    # The rationale is only printed for a passing run; after a failure it would
    # contradict the "UNEXPECTED" message.
    print("This is expected because cosine similarity is computed independently")
    print("for each query-document pair. The score depends only on the query")
    print("embedding and the document chunk embedding, not on other documents")
    print("in the collection.")
else:
    print("  UNEXPECTED: Scores changed! This needs investigation.")

In [None]:
# Clean up the temporary directory. Removal stays best-effort (errors are ignored),
# but the outcome is checked so the message reflects what actually happened.
shutil.rmtree(tmp_dir, ignore_errors=True)
if tmp_dir.exists():
    print(f"WARNING: could not fully remove {tmp_dir}; delete it manually")
else:
    print(f"Cleaned up {tmp_dir}")
print("\u2705 Embedding score stability experiment completed successfully")