In [22]:
import io
import zipfile
import requests
import frontmatter
import logging
from tqdm import tqdm
from minsearch import Index

def read_repo_data(repo_owner, repo_name, branch="main", timeout=30):
    """
    Download and parse all markdown files from a GitHub repository.

    This is a generator: nothing is downloaded until the first item is
    requested. The branch archive is downloaded in one request and kept in
    memory; the parsed documents are then yielded one at a time.

    Args:
        repo_owner (str): GitHub username or organization
        repo_name (str): Repository name
        branch (str): Branch name (default: main). When the default "main"
            branch answers with HTTP 404, "master" is tried instead.
        timeout (float): Timeout in seconds passed to ``requests.get`` so a
            stalled connection cannot hang forever (default: 30).

    Yields:
        dict: The result of ``post.to_dict()`` for one ``.md``/``.mdx`` file,
        extended with the keys "filename" (path inside the archive), "repo",
        "owner" and "branch" (the branch that was actually downloaded).

    Raises:
        Exception: If the archive download does not return HTTP 200.
    """
    url = f"https://codeload.github.com/{repo_owner}/{repo_name}/zip/refs/heads/{branch}"
    resp = requests.get(url, timeout=timeout)

    if resp.status_code == 404 and branch == "main":
        # Fall back to master. Because this function is a generator, the
        # fallback has to be delegated to with `yield from`; a plain
        # `return read_repo_data(...)` would discard every document.
        yield from read_repo_data(repo_owner, repo_name, branch="master", timeout=timeout)
        return

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: HTTP {resp.status_code}")

    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename
            if not filename.lower().endswith((".md", ".mdx")):
                continue
            # Only reading and parsing are guarded: one unreadable file is
            # logged and skipped instead of aborting the whole download.
            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode("utf-8", errors="replace")
                data = frontmatter.loads(content).to_dict()
            except Exception as e:
                logging.warning("Error processing %s: %s", filename, e)
                continue
            # Provenance fields are written last, so they win over front-matter
            # keys with the same names.
            data.update({
                "filename": filename,
                "repo": repo_name,
                "owner": repo_owner,
                "branch": branch
            })
            yield data

In [23]:
def sliding_window(seq, size, step):
    """
    Lazily split a sequence into fixed-size windows.

    Each yielded item is a dict with the window's offset under "start" and
    the slice ``seq[start:start + size]`` under "chunk". Offsets advance by
    ``step``; iteration ends with the first window that reaches the end of
    ``seq``, so the last chunk may be shorter than ``size``. An empty
    sequence yields nothing.

    Raises:
        ValueError: If ``size`` or ``step`` is not positive (raised when the
            generator is first advanced).
    """
    if not (size > 0 and step > 0):
        raise ValueError("size and step must be positive")

    total = len(seq)
    offset = 0
    while offset < total:
        end = offset + size
        yield {"start": offset, "chunk": seq[offset:end]}
        if end >= total:
            # This window already covers the tail; stop here.
            return
        offset += step

In [25]:
# Stream every markdown document from the evidentlyai/docs repository, split
# each document body into overlapping windows, and index the windows.
print(" Downloading and chunking documents...")
all_chunks = []

for doc in tqdm(read_repo_data("evidentlyai", "docs"), desc="Processing files"):
    # Work on a copy so the "content" key can be removed without touching `doc`.
    doc_copy = doc.copy()
    content = doc_copy.pop("content", "")
    # size=2000 with step=1000: each window starts 1000 positions after the
    # previous one, so consecutive windows share half of their span.
    for chunk in sliding_window(content, size=2000, step=1000):
        # Attach the remaining document fields to the chunk. dict.update lets a
        # document field named "start" or "chunk" overwrite the window's value.
        chunk.update(doc_copy)
        all_chunks.append(chunk)

print(f"Collected {len(all_chunks)} chunks. Building index...")

# NOTE(review): presumably text_fields are searched as free text and
# keyword_fields are exact-match filters — confirm against the minsearch docs.
index = Index(
    text_fields=["chunk", "title", "description", "filename"],
    keyword_fields=[]
)
index.fit(all_chunks)

print(" Indexing complete!")

def text_search(query, num_results=2):
    """
    Query the module-level ``index`` and print a summary of each hit.

    For every result this prints its 1-based rank, title and description
    (with placeholders when those keys are absent), the source filename, the
    first 400 characters of the chunk followed by "...", and an 80-dash
    separator line. Nothing is returned.

    Args:
        query (str): Search text passed to ``index.search``.
        num_results (int): Forwarded to ``index.search`` (default: 2).
    """
    separator = "-" * 80
    hits = index.search(query, num_results=num_results)
    for rank, hit in enumerate(hits, start=1):
        print(f"\n🔎 Result {rank}")
        title = hit.get("title", "(no title)")
        print(f"📄 Title: {title}")
        description = hit.get("description", "(no description)")
        print(f"📝 Description: {description}")
        source_file = hit["filename"]
        print(f"📂 File: {source_file}")
        preview = hit["chunk"][:400]
        print(f"📖 Content Preview:\n{preview}...")
        print(separator)


# Example search: run one sample question through text_search, which prints
# the top matches (two by default) from the index built above.
query = "What should be in a test dataset for AI evaluation?"
text_search(query)

📥 Downloading and chunking documents...


Processing files: 95it [00:01, 61.60it/s]

✅ Collected 575 chunks. Building index...
✅ Indexing complete!

🔎 Result 1
📄 Title: RAG evaluation dataset
📝 Description: Synthetic data for RAG.
📂 File: docs-main/synthetic-data/rag_data.mdx
📖 Content Preview:
Retrieval-Augmented Generation (RAG) systems rely on retrieving answers from a knowledge base before generating responses. To evaluate them effectively, you need a test dataset that reflects what the system *should* know.

Instead of manually creating test cases, you can generate them directly from your knowledge source, ensuring accurate and relevant ground truth data.

## Create a RAG test datas...
--------------------------------------------------------------------------------

🔎 Result 2
📄 Title: LLM Evaluation
📝 Description: Evaluate text outputs in under 5 minutes
📂 File: docs-main/quickstart_llm.mdx
📖 Content Preview:
 Inputs, context, and outputs (for RAG evaluation)
</Info>

<Info>
  **Collecting live data**. You can also trace inputs and outputs from your LLM app and d


