In [23]:
import io
import os
import zipfile
import requests
import frontmatter
import logging
import asyncio
from tqdm import tqdm
from minsearch import Index
from typing import List, Any, Dict, Tuple, Optional
from dotenv import load_dotenv
from openai import OpenAI
from pydantic_ai import Agent
import json
import secrets
from pathlib import Path
from datetime import datetime
import numpy as np
from collections import defaultdict, Counter
from sentence_transformers import SentenceTransformer
from pydantic_ai.messages import ModelMessagesTypeAdapter

# Load environment variables
load_dotenv()

True

In [24]:
def read_repo_data(repo_owner, repo_name, branch="main"):
    """
    Download and parse all markdown files from a GitHub repository.
    Yields one document (dict) at a time to avoid loading everything into memory.

    Args:
        repo_owner (str): GitHub username or organization
        repo_name (str): Repository name
        branch (str): Branch name (default: main)
    """
    url = f"https://codeload.github.com/{repo_owner}/{repo_name}/zip/refs/heads/{branch}"
    resp = requests.get(url)

    if resp.status_code == 404 and branch == "main":
        # Try fallback to master
        return read_repo_data(repo_owner, repo_name, branch="master")

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: HTTP {resp.status_code}")

    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename
            if not filename.lower().endswith((".md", ".mdx")):
                continue
            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode("utf-8", errors="replace")
                    post = frontmatter.loads(content)
                    data = post.to_dict()
                    data.update({
                        "filename": filename,
                        "repo": repo_name,
                        "owner": repo_owner,
                        "branch": branch
                    })
                    yield data
            except Exception as e:
                logging.warning("Error processing %s: %s", filename, e)
                continue

In [25]:
def sliding_window(seq, size, step):
    """Yield overlapping chunks from a long string."""
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")
    n = len(seq)
    for i in range(0, n, step):
        yield {"start": i, "chunk": seq[i:i+size]}
        if i + size >= n:
            break

In [26]:
evidently_chunks = []

for doc in tqdm(read_repo_data("evidentlyai", "docs"), desc="Processing files"):
    doc_copy = doc.copy()
    content = doc_copy.pop("content", "")
    for chunk in sliding_window(content, size=2000, step=1000):
        chunk.update(doc_copy)
        evidently_chunks.append(chunk)

print(f"Collected {len(evidently_chunks)} chunks. Building index...")

# Build text search index
index = Index(
    text_fields=["chunk", "title", "description", "filename"],
    keyword_fields=[]
)
index.fit(evidently_chunks)

print("Text indexing complete!")

Processing files: 95it [00:01, 58.58it/s]

Collected 575 chunks. Building index...
Text indexing complete!





In [27]:
class VectorSearch:
    """
    Simple vector search implementation using cosine similarity.
    """
    def __init__(self):
        self.embeddings = None
        self.documents = None
    
    def fit(self, embeddings: np.ndarray, documents: List[Dict]):
        """
        Store embeddings and associated documents.
        """
        self.embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)  # Normalize
        self.documents = documents
    
    def search(self, query_embedding: np.ndarray, num_results: int = 5) -> List[Dict]:
        """
        Search for most similar documents using cosine similarity.
        """
        if self.embeddings is None:
            return []
        
        # Normalize query embedding
        query_norm = query_embedding / np.linalg.norm(query_embedding)
        
        # Calculate cosine similarities
        similarities = np.dot(self.embeddings, query_norm)
        
        # Get top k indices
        top_indices = np.argsort(similarities)[-num_results:][::-1]
        
        # Return documents with similarity scores
        results = []
        for idx in top_indices:
            doc = self.documents[idx].copy()
            doc['similarity_score'] = float(similarities[idx])
            results.append(doc)
        
        return results

In [28]:
print("Creating embeddings for semantic search...")

# Initialize an empty list to store embeddings for each chunk
evidently_embeddings = []

# Load a pre-trained sentence transformer model for creating embeddings
print("Loading embedding model...")
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

# Loop through each document chunk in evidently_chunks
print("Encoding document chunks...")
for d in tqdm(evidently_chunks, desc="Creating embeddings"):
    # Create a combined text for better context
    text_to_encode = d['chunk']
    if 'title' in d and d['title']:
        text_to_encode = f"{d['title']}. {text_to_encode}"
    if 'filename' in d:
        # Extract meaningful parts from filename
        filename_parts = d['filename'].replace('/', ' ').replace('_', ' ').replace('.mdx', '').replace('.md', '')
        text_to_encode = f"{filename_parts}. {text_to_encode}"
    
    # Encode the enhanced text into a vector (embedding)
    v = embedding_model.encode(text_to_encode, show_progress_bar=False)
    
    # Append the embedding to the list
    evidently_embeddings.append(v)

# Convert the list of embeddings into a NumPy array
evidently_embeddings = np.array(evidently_embeddings)

# Initialize and fit vector search index
evidently_vindex = VectorSearch()
evidently_vindex.fit(evidently_embeddings, evidently_chunks)

print("Vector indexing complete!")


Creating embeddings for semantic search...
Loading embedding model...
Encoding document chunks...


Creating embeddings: 100%|█████████████████████████████████████████| 575/575 [05:21<00:00,  1.79it/s]

Vector indexing complete!





In [29]:
openai_client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY")
)

# Test the connection
try:
    test_response = openai_client.chat.completions.create(
        model="deepseek/deepseek-r1:free",
        messages=[{"role": "user", "content": "Say 'Connection successful' if you can read this."}],
        max_tokens=50
    )
    print(f"OpenRouter Connection Test: {test_response.choices[0].message.content}")
except Exception as e:
    print(f"Connection Error: {e}")
    print("Make sure you have OPENROUTER_API_KEY in your .env file")


OpenRouter Connection Test: 


In [30]:
def text_search(query: str) -> List[Any]:
    """
    Perform a text-based search on the FAQ index.
    """
    results = index.search(query, num_results=5)
    print(f"Text search found {len(results)} results for query: '{query}'")
    return results

def vector_search(query: str) -> List[Any]:
    """
    Perform semantic vector-based search.
    """
    # Encode the query into a vector using the embedding model
    q = embedding_model.encode(query)
    # Search the vector index for the top 5 most similar chunks
    results = evidently_vindex.search(q, num_results=5)
    print(f"Vector search found {len(results)} results for query: '{query}'")
    return results

def hybrid_search(query: str, alpha: float = 0.5, num_results: int = 5) -> List[Any]:
    """
    Perform hybrid search combining text and vector search.
    
    Args:
        query: Search query
        alpha: Weight for text search (1-alpha for vector search)
        num_results: Number of results to return
    """
    # Get results from both search methods
    text_results = index.search(query, num_results=num_results*2)
    
    q_embedding = embedding_model.encode(query)
    vector_results = evidently_vindex.search(q_embedding, num_results=num_results*2)
    
    # Create a scoring dictionary
    doc_scores = {}
    
    # Add text search scores
    for i, doc in enumerate(text_results):
        doc_key = doc.get('filename', '') + str(doc.get('start', 0))
        # Use inverse rank as score
        text_score = 1.0 / (i + 1)
        doc_scores[doc_key] = {
            'doc': doc,
            'text_score': text_score * alpha,
            'vector_score': 0
        }
    
    # Add vector search scores
    for doc in vector_results:
        doc_key = doc.get('filename', '') + str(doc.get('start', 0))
        vector_score = doc.get('similarity_score', 0) * (1 - alpha)
        
        if doc_key in doc_scores:
            doc_scores[doc_key]['vector_score'] = vector_score
        else:
            doc_scores[doc_key] = {
                'doc': doc,
                'text_score': 0,
                'vector_score': vector_score
            }
    
    # Calculate combined scores
    for key in doc_scores:
        doc_scores[key]['combined_score'] = doc_scores[key]['text_score'] + doc_scores[key]['vector_score']
    
    # Sort by combined score and return top results
    sorted_docs = sorted(doc_scores.values(), key=lambda x: x['combined_score'], reverse=True)
    results = [item['doc'] for item in sorted_docs[:num_results]]
    
    print(f"Hybrid search found {len(results)} results for query: '{query}'")
    return results

# Test all search methods
test_query = "test dataset"
print("\n Testing search methods:")
text_results = text_search(test_query)
vector_results = vector_search(test_query)
hybrid_results = hybrid_search(test_query)



 Testing search methods:
Text search found 5 results for query: 'test dataset'
Vector search found 5 results for query: 'test dataset'
Hybrid search found 5 results for query: 'test dataset'


In [31]:
def answer_question_manual(question: str, search_method: str = 'hybrid') -> str:
    """
    Answer a question by searching and then using the LLM.
    
    Args:
        question: The question to answer
        search_method: 'text', 'vector', or 'hybrid'
    """
    # Select search method
    if search_method == 'text':
        search_results = text_search(question)
    elif search_method == 'vector':
        search_results = vector_search(question)
    else:  # hybrid
        search_results = hybrid_search(question)

    # Format the search results as context
    context = "\n\n---\n\n".join([
        f"Result {i+1} (from {result.get('filename', 'unknown')}):\n{result.get('chunk', '')}"
        for i, result in enumerate(search_results)
    ])

    # Create the prompt with the context
    prompt = f"""You are an expert assistant that answers questions about the Evidently project 
(https://github.com/evidentlyai/evidently) using ONLY the information provided in the context below.

Context from Evidently documentation:
{context}

User question: {question}

Instructions:
- Answer based ONLY on the provided context
- Be concise and clear
- If the answer is not in the context, say "I could not find this information in the Evidently documentation"
- Do not invent features or functionality

Answer:"""

    try:
        response = openai_client.chat.completions.create(
            model="deepseek/deepseek-r1:free",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions about Evidently."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=1000
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error getting answer: {e}"

# Test question answering
question = "What components are required in a test dataset to evaluate AI?"
print(f"\n❓ Question: {question}")
print("Thinking (using hybrid search + LLM approach)...")
answer = answer_question_manual(question, search_method='hybrid')
print(f"\n Answer:\n{answer}")



❓ Question: What components are required in a test dataset to evaluate AI?
Thinking (using hybrid search + LLM approach)...
Hybrid search found 5 results for query: 'What components are required in a test dataset to evaluate AI?'

 Answer:
To evaluate an AI system (specifically a RAG system) using Evidently, the test dataset should include:

1. **User-like questions** - Automatically generated queries that mimic real user inputs
2. **Ground truth answers** - Corresponding correct answers derived directly from the knowledge base
3. **Optional context** - Source material from your knowledge base that was used to generate each answer (can be included if needed)

These components are generated automatically when using Evidently's RAG test dataset creation tool. The dataset can be refined by adding variations, removing irrelevant cases, or manually editing questions/responses before use in evaluation.


In [32]:
LOG_DIR = Path(os.getenv('LOGS_DIRECTORY', 'logs'))
LOG_DIR.mkdir(exist_ok=True)

def log_entry(agent, messages, source="user"):
    tools = []
    for ts in agent.toolsets:
        tools.extend(ts.tools.keys())
    dict_messages = ModelMessagesTypeAdapter.dump_python(messages)
    return {
        "agent_name": agent.name,
        "system_prompt": agent._instructions,
        "provider": agent.model.system,
        "model": agent.model.model_name,
        "tools": tools,
        "messages": dict_messages,
        "source": source
    }

def serializer(obj):
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")

def log_interaction_to_file(agent, messages, source='user'):
    entry = log_entry(agent, messages, source)
    ts = entry['messages'][-1]['timestamp']
    ts_str = ts.strftime("%Y%m%d_%H%M%S")
    rand_hex = secrets.token_hex(3)
    filename = f"{agent.name}_{ts_str}_{rand_hex}.json"
    filepath = LOG_DIR / filename
    with filepath.open("w", encoding="utf-8") as f_out:
        json.dump(entry, f_out, indent=2, default=serializer)
    return filepath


In [33]:
def evaluate_search_quality(
    search_function, 
    test_queries: List[Tuple[str, List[str]]], 
    num_results: int = 5,
    log_results: bool = True
) -> Dict[str, Any]:
    """
    Evaluate search quality using hit rate and MRR metrics.
    """
    results = []
    timestamp = datetime.now()

    print(f"Starting evaluation with {len(test_queries)} test queries...")

    for idx, (query, expected_docs) in enumerate(test_queries, 1):
        print(f"  Query {idx}/{len(test_queries)}: '{query[:50]}...'")

        try:
            # Execute search
            if hasattr(search_function, '__self__'):  # Method
                search_results = search_function(query, num_results=num_results)
            else:  # Function
                search_results = search_function(query)[:num_results]

            # Extract filenames from results
            retrieved_docs = [doc.get('filename', '') for doc in search_results]

            # Calculate hit rate (binary: found at least one relevant doc)
            relevant_found = any(doc in expected_docs for doc in retrieved_docs)

            # Calculate MRR (Mean Reciprocal Rank)
            mrr = 0
            first_relevant_rank = None
            for i, doc in enumerate(retrieved_docs):
                if doc in expected_docs:
                    mrr = 1 / (i + 1)
                    first_relevant_rank = i + 1
                    break

            # Calculate Precision@k
            relevant_in_results = sum(1 for doc in retrieved_docs if doc in expected_docs)
            precision_at_k = relevant_in_results / len(retrieved_docs) if retrieved_docs else 0

            # Store detailed result
            result = {
                'query': query,
                'expected_docs': expected_docs,
                'retrieved_docs': retrieved_docs,
                'hit': relevant_found,
                'mrr': mrr,
                'precision_at_k': precision_at_k,
                'first_relevant_rank': first_relevant_rank,
                'num_relevant_found': relevant_in_results
            }
            results.append(result)

        except Exception as e:
            print(f"Error processing query: {e}")
            results.append({
                'query': query,
                'expected_docs': expected_docs,
                'retrieved_docs': [],
                'hit': False,
                'mrr': 0,
                'precision_at_k': 0,
                'first_relevant_rank': None,
                'num_relevant_found': 0,
                'error': str(e)
            })

    # Calculate aggregate metrics
    hit_rate = sum(r['hit'] for r in results) / len(results) if results else 0
    avg_mrr = sum(r['mrr'] for r in results) / len(results) if results else 0
    avg_precision = sum(r['precision_at_k'] for r in results) / len(results) if results else 0

    evaluation_summary = {
        'timestamp': timestamp.isoformat(),
        'num_queries': len(test_queries),
        'num_results_per_query': num_results,
        'metrics': {
            'hit_rate': hit_rate,
            'mean_reciprocal_rank': avg_mrr,
            'mean_precision_at_k': avg_precision
        },
        'detailed_results': results
    }

    # Log results if requested
    if log_results:
        log_evaluation_results(evaluation_summary)

    return evaluation_summary

def log_evaluation_results(evaluation_data: Dict[str, Any]) -> Path:
    """
    Log evaluation results to a JSON file.
    """
    ts = datetime.fromisoformat(evaluation_data['timestamp'])
    ts_str = ts.strftime("%Y%m%d_%H%M%S")
    rand_hex = secrets.token_hex(3)
    filename = f"search_evaluation_{ts_str}_{rand_hex}.json"
    filepath = LOG_DIR / filename
    with filepath.open("w", encoding="utf-8") as f_out:
        json.dump(evaluation_data, f_out, indent=2, default=str)
    print(f"📝 Evaluation results saved to: {filepath}")
    return filepath

def print_evaluation_report(evaluation_summary: Dict[str, Any]):
    """
    Print a formatted evaluation report.
    """
    print("\n" + "="*60)
    print("📊 EVALUATION REPORT")
    print("="*60)
    print(f"Timestamp: {evaluation_summary['timestamp']}")
    print(f"Number of queries: {evaluation_summary['num_queries']}")
    print(f"Results per query: {evaluation_summary['num_results_per_query']}")

    print("\n📈 AGGREGATE METRICS:")
    metrics = evaluation_summary['metrics']
    print(f"  • Hit Rate: {metrics['hit_rate']:.2%}")
    print(f"  • Mean Reciprocal Rank (MRR): {metrics['mean_reciprocal_rank']:.3f}")
    print(f"  • Mean Precision@{evaluation_summary['num_results_per_query']}: {metrics['mean_precision_at_k']:.2%}")

    print("\n🔍 QUERY-LEVEL RESULTS:")
    for i, result in enumerate(evaluation_summary['detailed_results'][:5], 1):
        print(f"\n  Query {i}: \"{result['query'][:50]}...\"")
        print(f"    • Hit: {'✅' if result['hit'] else '❌'}")
        print(f"    • MRR: {result['mrr']:.3f}")
        print(f"    • Precision: {result['precision_at_k']:.2%}")
        if result['first_relevant_rank']:
            print(f"    • First relevant at rank: {result['first_relevant_rank']}")

    if len(evaluation_summary['detailed_results']) > 5:
        print(f"\n  ... and {len(evaluation_summary['detailed_results']) - 5} more queries")

    print("\n" + "="*60)

In [34]:
test_queries = [
    # Testing and evaluation
    ("What components are required in a test dataset to evaluate AI?", 
     ["docs-main/docs/library/tests.mdx", 
      "docs-main/examples/LLM_regression_testing.mdx"]),

    ("How to run evaluations in Evidently?", 
     ["docs-main/docs/library/evaluations_overview.mdx",
      "docs-main/docs/library/tests.mdx"]),

    # Data drift
    ("Data drift detection methods",
     ["docs-main/metrics/preset_data_drift.mdx",
      "docs-main/metrics/customize_data_drift.mdx"]),

    ("How to customize embedding drift detection?",
     ["docs-main/metrics/customize_embedding_drift.mdx",
      "docs-main/metrics/explainer_drift.mdx"]),

    # Monitoring and dashboards
    ("How to create monitoring dashboards?",
     ["docs-main/docs/platform/dashboard_overview.mdx",
      "docs-main/docs/platform/dashboard_add_panels.mdx"]),

    ("Dashboard panel types and configuration",
     ["docs-main/docs/platform/dashboard_panel_types.mdx",
      "docs-main/docs/platform/dashboard_add_panels_ui.mdx"]),

    # Reports and output formats
    ("How to generate reports in Evidently?",
     ["docs-main/docs/library/report.mdx",
      "docs-main/docs/library/output_formats.mdx"]),

    # Synthetic data
    ("Generate synthetic test data",
     ["docs-main/synthetic-data/introduction.mdx",
      "docs-main/synthetic-data/input_data.mdx",
      "docs-main/docs/library/synthetic_data_api.mdx"]),

    # Alerts and monitoring
    ("Setting up alerts for model monitoring",
     ["docs-main/docs/platform/alerts.mdx",
      "docs-main/docs/platform/dashboard_overview.mdx"]),

    # Data and metrics
    ("Understanding data definition and descriptors",
     ["docs-main/docs/library/data_definition.mdx",
      "docs-main/docs/library/descriptors.mdx"])
]

In [35]:
def compare_search_methods(test_queries):
    """
    Compare text, vector, and hybrid search performance.
    """
    print("\n" + "="*60)
    print(" COMPARING SEARCH METHODS")
    print("="*60)
    
    methods = {
        'text': lambda q: text_search(q),
        'vector': lambda q: vector_search(q),
        'hybrid': lambda q: hybrid_search(q, alpha=0.5)
    }
    
    comparison_results = {}
    
    for method_name, search_fn in methods.items():
        print(f"\n📊 Evaluating {method_name.upper()} search...")
        
        eval_results = evaluate_search_quality(
            search_function=search_fn,
            test_queries=test_queries,
            num_results=5,
            log_results=False  # Don't log intermediate results
        )
        
        metrics = eval_results['metrics']
        comparison_results[method_name] = {
            'hit_rate': metrics['hit_rate'],
            'mrr': metrics['mean_reciprocal_rank'],
            'precision': metrics['mean_precision_at_k'],
            'detailed_results': eval_results['detailed_results']
        }
        
        print(f"  • Hit Rate: {metrics['hit_rate']:.2%}")
        print(f"  • MRR: {metrics['mean_reciprocal_rank']:.3f}")
        print(f"  • Precision@5: {metrics['mean_precision_at_k']:.2%}")
    
    # Find best method for each metric
    print("\n BEST METHODS:")
    print(f"  • Best Hit Rate: {max(comparison_results.items(), key=lambda x: x[1]['hit_rate'])[0].upper()}")
    print(f"  • Best MRR: {max(comparison_results.items(), key=lambda x: x[1]['mrr'])[0].upper()}")
    print(f"  • Best Precision: {max(comparison_results.items(), key=lambda x: x[1]['precision'])[0].upper()}")
    
    # Show improvement from text to hybrid
    if 'text' in comparison_results and 'hybrid' in comparison_results:
        hit_rate_improvement = comparison_results['hybrid']['hit_rate'] - comparison_results['text']['hit_rate']
        mrr_improvement = comparison_results['hybrid']['mrr'] - comparison_results['text']['mrr']
        
        print(f"\n📈 Hybrid vs Text Search Improvement:")
        print(f"  • Hit Rate: {hit_rate_improvement:+.1%}")
        print(f"  • MRR: {mrr_improvement:+.3f}")
    
    return comparison_results

# Run comparison
comparison = compare_search_methods(test_queries)




 COMPARING SEARCH METHODS

📊 Evaluating TEXT search...
Starting evaluation with 10 test queries...
  Query 1/10: 'What components are required in a test dataset to ...'
Text search found 5 results for query: 'What components are required in a test dataset to evaluate AI?'
  Query 2/10: 'How to run evaluations in Evidently?...'
Text search found 5 results for query: 'How to run evaluations in Evidently?'
  Query 3/10: 'Data drift detection methods...'
Text search found 5 results for query: 'Data drift detection methods'
  Query 4/10: 'How to customize embedding drift detection?...'
Text search found 5 results for query: 'How to customize embedding drift detection?'
  Query 5/10: 'How to create monitoring dashboards?...'
Text search found 5 results for query: 'How to create monitoring dashboards?'
  Query 6/10: 'Dashboard panel types and configuration...'
Text search found 5 results for query: 'Dashboard panel types and configuration'
  Query 7/10: 'How to generate reports in Evidently?

In [36]:

def optimize_hybrid_alpha(test_queries, alphas=[0.0, 0.25, 0.5, 0.75, 1.0]):
    """
    Find the optimal alpha value for hybrid search.
    """
    print("\n" + "="*60)
    print(" OPTIMIZING HYBRID SEARCH ALPHA")
    print("="*60)
    
    results = []
    
    for alpha in alphas:
        print(f"\n🔧 Testing alpha={alpha:.2f}")
        
        hybrid_fn = lambda q: hybrid_search(q, alpha=alpha)
        
        eval_results = evaluate_search_quality(
            search_function=hybrid_fn,
            test_queries=test_queries,
            num_results=5,
            log_results=False
        )
        
        metrics = eval_results['metrics']
        results.append({
            'alpha': alpha,
            'hit_rate': metrics['hit_rate'],
            'mrr': metrics['mean_reciprocal_rank'],
            'precision': metrics['mean_precision_at_k']
        })
        
        print(f"  • Hit Rate: {metrics['hit_rate']:.2%}")
        print(f"  • MRR: {metrics['mean_reciprocal_rank']:.3f}")
    
    # Find optimal alpha
    best_by_mrr = max(results, key=lambda x: x['mrr'])
    best_by_hit_rate = max(results, key=lambda x: x['hit_rate'])
    
    print(f"\n🎯 OPTIMAL ALPHA VALUES:")
    print(f"  • Best for MRR: α={best_by_mrr['alpha']:.2f} (MRR={best_by_mrr['mrr']:.3f})")
    print(f"  • Best for Hit Rate: α={best_by_hit_rate['alpha']:.2f} (Hit Rate={best_by_hit_rate['hit_rate']:.2%})")
    
    return results

# Optimize alpha
alpha_results = optimize_hybrid_alpha(test_queries)



 OPTIMIZING HYBRID SEARCH ALPHA

🔧 Testing alpha=0.00
Starting evaluation with 10 test queries...
  Query 1/10: 'What components are required in a test dataset to ...'
Hybrid search found 5 results for query: 'What components are required in a test dataset to evaluate AI?'
  Query 2/10: 'How to run evaluations in Evidently?...'
Hybrid search found 5 results for query: 'How to run evaluations in Evidently?'
  Query 3/10: 'Data drift detection methods...'
Hybrid search found 5 results for query: 'Data drift detection methods'
  Query 4/10: 'How to customize embedding drift detection?...'
Hybrid search found 5 results for query: 'How to customize embedding drift detection?'
  Query 5/10: 'How to create monitoring dashboards?...'
Hybrid search found 5 results for query: 'How to create monitoring dashboards?'
  Query 6/10: 'Dashboard panel types and configuration...'
Hybrid search found 5 results for query: 'Dashboard panel types and configuration'
  Query 7/10: 'How to generate reports in

In [37]:
print("\n" + "="*60)
print("🚀 FINAL EVALUATION WITH OPTIMIZED HYBRID SEARCH")
print("="*60)

# Use the best alpha from optimization (you can adjust based on results)
best_alpha = 0.5  # Adjust based on optimization results

final_search = lambda q: hybrid_search(q, alpha=best_alpha)

final_evaluation = evaluate_search_quality(
    search_function=final_search,
    test_queries=test_queries,
    num_results=5,
    log_results=True
)

print_evaluation_report(final_evaluation)




🚀 FINAL EVALUATION WITH OPTIMIZED HYBRID SEARCH
Starting evaluation with 10 test queries...
  Query 1/10: 'What components are required in a test dataset to ...'
Hybrid search found 5 results for query: 'What components are required in a test dataset to evaluate AI?'
  Query 2/10: 'How to run evaluations in Evidently?...'
Hybrid search found 5 results for query: 'How to run evaluations in Evidently?'
  Query 3/10: 'Data drift detection methods...'
Hybrid search found 5 results for query: 'Data drift detection methods'
  Query 4/10: 'How to customize embedding drift detection?...'
Hybrid search found 5 results for query: 'How to customize embedding drift detection?'
  Query 5/10: 'How to create monitoring dashboards?...'
Hybrid search found 5 results for query: 'How to create monitoring dashboards?'
  Query 6/10: 'Dashboard panel types and configuration...'
Hybrid search found 5 results for query: 'Dashboard panel types and configuration'
  Query 7/10: 'How to generate reports in Evide