In [3]:
"""Visualize embeddings of source files from the parent repository.

Each Python or HTML file found under the repository root is embedded at
multiple truncation points (25%, 50%, 75%, 100% of lines); files with fewer
than four lines are embedded once, in full. Points from the same file are
connected with a line, showing how the embedding drifts as more of the file
is included.
"""

import glob
import os

from embedding_visualizer import PrincipalComponent, TextEmbedding, visualize_embeddings

# The repository root is taken to be the parent of the current working
# directory, so every path below is resolved relative to where this runs.
REPO_ROOT = os.path.dirname(os.getcwd())

# Plots saved by this script land in the working directory, which lies inside
# REPO_ROOT. Without this exclusion a re-run would glob its own HTML output
# and embed it alongside the real source files.
GENERATED_PLOTS = {
    os.path.normpath(os.path.join(os.getcwd(), name))
    for name in ("repo_tsne.html", "repo_custom.html")
}


# Collect all Python files followed by all HTML files (each group sorted on
# its own), excluding bytecode caches and this script's own generated plots.
# NOTE: despite its name, py_files also holds the HTML paths.
py_files = sorted(glob.glob(os.path.join(REPO_ROOT, "**/*.py"), recursive=True)) \
    + sorted(glob.glob(os.path.join(REPO_ROOT, "**/*.html"), recursive=True))
py_files = [
    f
    for f in py_files
    if "__pycache__" not in f and os.path.normpath(f) not in GENERATED_PLOTS
]

# Build one document per (file, truncation point). Every truncation of a file
# shares the same "line-id" (its repo-relative path) and the same "label" (its
# repo-relative directory); presumably the visualizer uses these to connect
# and group the points -- confirm against embedding_visualizer.
docs = []
for filepath in py_files:
    # Decode explicitly as UTF-8 rather than with the locale default, so the
    # same files read identically on every platform.
    with open(filepath, encoding="utf-8") as f:
        lines = f.readlines()

    rel_path = os.path.relpath(filepath, REPO_ROOT)
    # Files sitting directly in the repository root have an empty dirname.
    directory = os.path.dirname(rel_path) or "root"
    total = len(lines)

    # Quartile truncations need at least four lines to give four distinct
    # prefixes; shorter files are embedded once, in full.
    fractions = [0.25, 0.5, 0.75, 1.0] if total >= 4 else [1.0]

    for frac in fractions:
        # Keep at least one line per truncation, but never report more lines
        # than the file has (an empty file yields 0/0, not 1/0).
        n = min(total, max(1, int(total * frac)))
        text = "".join(lines[:n])
        pct = int(frac * 100)
        docs.append(
            {
                "text": text,
                "label": directory,
                "line-id": rel_path,
                "hover": f"{rel_path} ({n}/{total} lines, {pct}%)",
            }
        )

print(f"Created {len(docs)} documents from {len(py_files)} files\n")

# --- Plot 1: t-SNE ---
# Project all documents with the "t-sne" projection, show the figure, and
# write a standalone HTML copy into the working directory.
tsne_title = "Repository Source Files (t-SNE)"
plot = visualize_embeddings(title=tsne_title, projection="t-sne", docs=docs)
plot.display()
plot.to_html("repo_tsne.html")
print("Saved repo_tsne.html\n")

# --- Plot 2: Custom axes ---
# Each axis gets its own projection object. Going by the plot title, x is the
# first principal component and y is similarity to the query text -- the exact
# semantics live in embedding_visualizer.
x_axis = PrincipalComponent(1)
y_axis = TextEmbedding("API client and HTTP requests")
custom_title = "Repository Source Files (PC1 vs API similarity)"
plot2 = visualize_embeddings(
    docs=docs,
    title=custom_title,
    x_projection=x_axis,
    y_projection=y_axis,
)
plot2.display()
plot2.to_html("repo_custom.html")
print("Saved repo_custom.html")


Created 44 documents from 11 files

Computing embeddings for 44 documents...
Computing t-SNE (perplexity=30)...


Saved repo_tsne.html

Computing embeddings for 44 documents...
Computing custom projections...


Saved repo_custom.html
