In [1]:
import io
import os
import zipfile
import requests
import frontmatter
import logging
import asyncio
from tqdm import tqdm
from minsearch import Index
from typing import List, Any
from dotenv import load_dotenv
from openai import OpenAI
from pydantic_ai import Agent

# Load environment variables
load_dotenv()

True

In [2]:
def read_repo_data(repo_owner, repo_name, branch="main", timeout=30):
    """
    Download a GitHub repository archive and yield its markdown documents.

    The zip archive of the branch is downloaded in full and held in memory;
    the parsed documents are then yielded one at a time. If the default
    "main" branch does not exist (HTTP 404), the "master" branch is tried.

    Args:
        repo_owner (str): GitHub username or organization
        repo_name (str): Repository name
        branch (str): Branch name (default: main)
        timeout (float): Seconds to wait for the download before giving up
            (default: 30)

    Yields:
        dict: The front matter fields and "content" of one ``.md``/``.mdx``
        file, extended with "filename", "repo", "owner" and "branch".

    Raises:
        Exception: If the archive cannot be downloaded (non-200 response).
            Because this is a generator, the error surfaces on first iteration.
    """
    url = f"https://codeload.github.com/{repo_owner}/{repo_name}/zip/refs/heads/{branch}"
    resp = requests.get(url, timeout=timeout)

    if resp.status_code == 404 and branch == "main":
        # Try fallback to master. This function is a generator, so a plain
        # `return read_repo_data(...)` would silently drop every document;
        # the fallback generator has to be delegated to with `yield from`.
        yield from read_repo_data(repo_owner, repo_name, branch="master", timeout=timeout)
        return

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: HTTP {resp.status_code}")

    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename
            if not filename.lower().endswith((".md", ".mdx")):
                continue
            try:
                with zf.open(file_info) as f_in:
                    # Undecodable bytes are replaced rather than failing the file.
                    content = f_in.read().decode("utf-8", errors="replace")
                post = frontmatter.loads(content)
                data = post.to_dict()
                data.update({
                    "filename": filename,
                    "repo": repo_name,
                    "owner": repo_owner,
                    "branch": branch
                })
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole repo.
                logging.warning("Error processing %s: %s", filename, e)
                continue
            yield data


In [3]:
def sliding_window(seq, size, step):
    """
    Cut a sequence into windows of ``size`` items taken every ``step`` items.

    Each yielded dict holds the window's offset under "start" and the slice
    itself under "chunk". Iteration ends with the first window that reaches
    the end of the sequence, so the last chunk may be shorter than ``size``.
    An empty sequence yields nothing.

    Raises:
        ValueError: If ``size`` or ``step`` is not positive (raised on first
            iteration, since this is a generator).
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)
    for offset in range(0, total, step):
        stop = offset + size
        yield {"start": offset, "chunk": seq[offset:stop]}
        if stop >= total:
            # This window already covers the tail; later ones would be redundant.
            return

In [4]:
print("📥 Downloading and chunking documents...")

# Flat list of chunk records: every record carries the window's "start" and
# "chunk" plus the parent document's metadata (all fields except "content").
# On a key clash the document metadata wins.
evidently_chunks = []

for doc in tqdm(read_repo_data("evidentlyai", "docs"), desc="Processing files"):
    metadata = {key: value for key, value in doc.items() if key != "content"}
    evidently_chunks.extend(
        {**window, **metadata}
        for window in sliding_window(doc.get("content", ""), size=2000, step=1000)
    )

print(f"✅ Collected {len(evidently_chunks)} chunks. Building index...")

# Text index over the chunk body and the descriptive metadata fields.
index = Index(
    text_fields=["chunk", "title", "description", "filename"],
    keyword_fields=[],
)
index.fit(evidently_chunks)

print("🎯 Indexing complete!")


📥 Downloading and chunking documents...


Processing files: 95it [00:01, 61.57it/s]

✅ Collected 575 chunks. Building index...
🎯 Indexing complete!





In [7]:
# OpenRouter is reached through the stock OpenAI client by pointing it at the
# OpenRouter base URL and giving it the OpenRouter key.
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not openrouter_api_key:
    # With api_key=None the OpenAI client falls back to OPENAI_API_KEY, which
    # would send the wrong credential to OpenRouter. Fail early and clearly.
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Add it to your .env file before running this cell."
    )

openai_client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=openrouter_api_key
)

# Test the connection (optional)
try:
    test_response = openai_client.chat.completions.create(
        model="deepseek/deepseek-r1:free",
        messages=[{"role": "user", "content": "Say 'Connection successful' if you can read this."}],
        max_tokens=50
    )
    test_reply = test_response.choices[0].message.content
    if test_reply and test_reply.strip():
        print(f"🔌 OpenRouter Connection Test: {test_reply}")
    else:
        # The request went through but the message carried no text.
        # NOTE(review): presumably the small max_tokens budget is used up
        # before the model emits its final answer — confirm and raise if so.
        print("🔌 OpenRouter Connection Test: request succeeded, but the model returned no text")
except Exception as e:
    print(f"❌ Connection Error: {e}")
    print("Make sure you have OPENROUTER_API_KEY in your .env file")


🔌 OpenRouter Connection Test: 


In [8]:
def text_search(query: str) -> List[Any]:
    """
    Look up the chunks in the module-level ``index`` that best match a query.

    A one-line summary with the number of hits is printed as a side effect.

    Args:
        query (str): Free-text search query.

    Returns:
        List[Any]: The records returned by ``index.search`` when asked for
        5 results.
    """
    hits = index.search(query, num_results=5)
    hit_count = len(hits)
    print(f"🔍 Found {hit_count} results for query: '{query}'")
    return hits

# Smoke-test the search and preview the first 200 characters of the top hit.
test_results = text_search("test dataset")
if test_results:
    top_chunk = test_results[0].get('chunk', '')
    print(f"Sample result: {top_chunk[:200]}...")


🔍 Found 5 results for query: 'test dataset'
Sample result: Retrieval-Augmented Generation (RAG) systems rely on retrieving answers from a knowledge base before generating responses. To evaluate them effectively, you need a test dataset that reflects what the ...


In [12]:
def answer_question_manual(question: str) -> str:
    """
    Answer a question by manually searching and then using the LLM.
    This approach doesn't require tool support from the model.

    The question is passed to ``text_search``; the returned chunks are
    formatted into a context block, embedded in a prompt, and sent to the
    chat-completions endpoint of ``openai_client``.

    Args:
        question (str): The user's question.

    Returns:
        str: The model's answer. If the request fails, or the model returns
        no choices or no text, a human-readable message describing the
        problem is returned instead (never ``None``).
    """
    # First, search for relevant information
    search_results = text_search(question)

    # Format the search results as context
    context = "\n\n---\n\n".join([
        f"Result {i+1} (from {result.get('filename', 'unknown')}):\n{result.get('chunk', '')}"
        for i, result in enumerate(search_results)
    ])

    # Create the prompt with the context
    prompt = f"""You are an expert assistant that answers questions about the Evidently project 
(https://github.com/evidentlyai/evidently) using ONLY the information provided in the context below.

Context from Evidently documentation:
{context}

User question: {question}

Instructions:
- Answer based ONLY on the provided context
- Be concise and clear
- If the answer is not in the context, say "I could not find this information in the Evidently documentation"
- Do not invent features or functionality

Answer:"""

    try:
        response = openai_client.chat.completions.create(
            model="deepseek/deepseek-r1:free",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions about Evidently."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=1000
        )
        if not response.choices:
            return "Error getting answer: the model returned no choices"

        answer_text = response.choices[0].message.content
        # The message content may be None or blank; keep the declared str
        # return type and tell the caller what happened.
        if answer_text is None or not answer_text.strip():
            return "Error getting answer: the model returned an empty response"
        return answer_text
    except Exception as e:
        # Best effort: report the failure as text instead of raising.
        return f"Error getting answer: {e}"

In [13]:
# Demo: run one question through the search-then-LLM pipeline and show the answer.
question = "What components are required in a test dataset to evaluate AI?"

for banner in (
    f"❓ Question: {question}",
    "🤔 Thinking (using manual search + LLM approach)...",
):
    print(banner)

answer = answer_question_manual(question)
print(f"\n💡 Answer:\n{answer}")

❓ Question: What components are required in a test dataset to evaluate AI?
🤔 Thinking (using manual search + LLM approach)...
🔍 Found 5 results for query: 'What components are required in a test dataset to evaluate AI?'

💡 Answer:
To evaluate an AI system using Evidently, the required components in a test dataset depend on the use case but generally include:

1. **User-like questions** (generated from your knowledge base or scenario descriptions).  
   - For RAG systems: questions paired with **ground truth answers** extracted from the knowledge source.  
   - Optionally, include the **context** used to generate answers (e.g., source documents).  

2. **Adversarial or edge-case inputs** (e.g., tricky queries to test robustness).  

3. **Persona-specific inputs** (e.g., questions tailored to specific user types).  

These components can be generated directly in Evidently Cloud by:  
- Uploading your knowledge base to create RAG test cases (questions + answers).  
- Describing scenarios 