In [4]:
import io
import zipfile
import requests
import frontmatter
import logging
from tqdm import tqdm
from minsearch import Index
from minsearch import VectorSearch
from sentence_transformers import SentenceTransformer

def read_repo_data(repo_owner, repo_name, branch="main"):
    """
      Download and parse all markdown files from a GitHub repository.
    Yields one document (dict) at a time to avoid loading everything into memory.
    
    Args:
        repo_owner (str): GitHub username or organization
        repo_name (str): Repository name
        branch (str): Branch name (default: main)

    """
    url = f"https://codeload.github.com/{repo_owner}/{repo_name}/zip/refs/heads/{branch}"
    resp = requests.get(url)

    if resp.status_code == 404 and branch == "main":
        # Try fallback to master
        return read_repo_data(repo_owner, repo_name, branch="master")
    
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: HTTP {resp.status_code}")
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename
            if not filename.lower().endswith((".md", ".mdx")):
                continue
            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode("utf-8", errors="replace")
                    post = frontmatter.loads(content)
                    data = post.to_dict()
                    data.update({
                        "filename": filename,
                        "repo": repo_name,
                        "owner": repo_owner,
                        "branch": branch
                    })
                    yield data
            except Exception as e:
                logging.warning("Error processing %s: %s", filename, e)
                continue

In [5]:
def sliding_window(seq, size, step):
    """Yield overlapping chunks from a long string."""
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")
    n = len(seq)
    for i in range(0, n, step):
        yield {"start": i, "chunk": seq[i:i+size]}
        if i + size >= n:
            break

In [7]:
print(" Downloading and chunking documents...")
evidently_chunks = []

for doc in tqdm(read_repo_data("evidentlyai", "docs"), desc="Processing files"):
    doc_copy = doc.copy()
    content = doc_copy.pop("content", "")
    for chunk in sliding_window(content, size=2000, step=1000):
        chunk.update(doc_copy)
        evidently_chunks.append(chunk)

print(f"Collected {len(evidently_chunks)} chunks. Building index...")

index = Index(
    text_fields=["chunk", "title", "description", "filename"],
    keyword_fields=[]
)
index.fit(evidently_chunks)

print(" Indexing complete!")


 Downloading and chunking documents...


Processing files: 95it [00:01, 54.99it/s]

Collected 575 chunks. Building index...
 Indexing complete!





In [8]:
# Initialize an empty list to store embeddings for each chunk
evidently_embeddings = []

# Load a pre-trained sentence transformer model for creating embeddings
# 'multi-qa-distilbert-cos-v1' is good for semantic search and question-answering
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

# Loop through each document chunk in evidently_chunks
for d in tqdm(evidently_chunks):  # tqdm shows a progress bar
    # Encode the chunk text into a vector (embedding)
    v = embedding_model.encode(d['chunk'])
    # Append the embedding to the list
    evidently_embeddings.append(v)

# Convert the list of embeddings into a NumPy array for efficient numerical operations
evidently_embeddings = np.array(evidently_embeddings)

# Initialize a vector search index (for semantic similarity search)
evidently_vindex = VectorSearch()

# Fit the vector index with embeddings and associate them with their corresponding chunks
# This prepares the index for fast vector-based search later
evidently_vindex.fit(evidently_embeddings, evidently_chunks)


100%|████████████████████████████████████████████| 575/575 [05:00<00:00,  1.91it/s]


NameError: name 'np' is not defined

In [None]:
# Function for simple text-based search using the pre-built index
def text_search(query):
    # Search the index for the query and return top 5 results
    return index.search(query, num_results=5)


# Function for semantic vector-based search
def vector_search(query):
    # Encode the query into a vector using the embedding model
    q = embedding_model.encode(query)
    # Search the vector index for the top 5 most similar chunks
    return evidently_vindex.search(q, num_results=5)


# Function that combines both text search and vector search results
def hybrid_search(query):
    # Perform text-based search
    text_results = text_search(query)
    # Perform vector-based search
    vector_results = vector_search(query)
    
    # Combine results and remove duplicates based on filename
    seen_ids = set()         # To track which files have already been added
    combined_results = []    # Final list of combined search results

    # Loop through all results from both searches
    for result in text_results + vector_results:
        # Only add the result if its filename hasn't been seen yet
        if result['filename'] not in seen_ids:
            seen_ids.add(result['filename'])
            combined_results.append(result)
    
    # Return the deduplicated combined results
    return combined_results



In [None]:
print('Question: What should a test dataset contain for AI evaluation? [text search]')
results = text_search("What should a test dataset contain for AI evaluation?")
for r in results:
    print(r['filename'], r['chunk'][:100])  # print first 200 characters


In [None]:
print('Question: What components are required in a test dataset to evaluate AI? [text search]')
results = text_search("How to install Evidently AI?")
for r in results:
    print(r['filename'], r['chunk'][:100])  # print first 200 characters


In [None]:

print('Question: What should a test dataset contain for AI evaluation? [Vector search]')
results = vector_search(" What should a test dataset contain for AI evaluation?")
for r in results:
    print(r['filename'], r['chunk'][:200])


In [None]:

print('Question:What components are required in a test dataset to evaluate AI? [Vector search]')
results = vector_search("What components are required in a test dataset to evaluate AI?")
for r in results:
    print(r['filename'], r['chunk'][:200])


In [None]:
print('Question: What should a test dataset contain for AI evaluation? [Hybrid search]')
results = hybrid_search("What should a test dataset contain for AI evaluation?")
for r in results:
    print(r['filename'], r['chunk'][:200])