In [19]:
from openai import OpenAI
from pydantic import BaseModel
import json
import numpy as np
from tqdm.auto import tqdm
from dotenv import load_dotenv
# Load variables from a local .env file before any client is constructed;
# presumably this is where the OpenAI API key comes from — confirm against your setup.
load_dotenv()

# NOTE(review): `openai_client` is not referenced by any later cell shown in this
# notebook; the Pydantic AI agent further down is configured by model name instead.
openai_client = OpenAI()

## Load and Parse GitHub Data

In [2]:
import docs # script to read, parse, and chunk GitHub data

# Three-stage pipeline from the local `docs` helper module:
# raw GitHub data -> parsed documents -> chunks.
github_data = docs.read_github_data()
# `parsed_data` is kept: it is reused later to build the per-file lookup.
parsed_data = docs.parse_data(github_data)
# `chunks` is what gets indexed for search.
chunks = docs.chunk_documents(parsed_data)

In [4]:
# Sanity check: how many chunks the pipeline produced.
len(chunks)

575

In [6]:
# Peek at a single chunk; each one is a dict (the output starts with 'start' and 'content').
chunks[1]

{'start': 0,
 'content': '<Update label="2025-07-18" description="Evidently v0.7.11">\n  ## **Evidently 0.7.11**\n\n  Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.7.11).\n\nExample notebooks:\n- Synthetic data generation: [code example](https://github.com/evidentlyai/evidently/blob/main/examples/cookbook/datagen.ipynb)\n\n</Update>\n\n<Update label="2025-07-09" description="Evidently v0.7.10">\n  ## **Evidently 0.7.10**\n    Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.7.10).\n  \nNEW: automated prompt optimization. Read the release blog on [prompt optimization for LLM judges](https://www.evidentlyai.com/blog/llm-judge-prompt-optimization).\n\nExample notebooks:\n- Code review binary LLM judge prompt optimization: [code example](https://github.com/evidentlyai/evidently/blob/main/examples/cookbook/prompt_optimization_code_review_example.ipynb)\n- Topic multi-class LLM judge prompt optimization: [code e

## Create the search index

In [7]:
from minsearch import Index

# Build the search index over the chunked documents.
# NOTE(review): assumes chunk dicts carry these four keys — only 'start' and 'content'
# are visible in the sample output; confirm the rest against docs.parse_data.
index = Index(
    text_fields=["content", "filename", "title", "description"],
)

# fit() returns the index itself, which is why this cell echoes an Index repr.
index.fit(chunks)

<minsearch.minsearch.Index at 0x1b1f4cf7890>

### Create agent tools (search and read)

In [8]:
from typing import Any, Dict, List, TypedDict

class SearchResult(TypedDict):
    """Represents a single search result entry.

    Describes the dict shape that `search` is annotated to return.
    """
    # Starting position or offset of the excerpt within the source file.
    start: int
    # Text excerpt or snippet containing the match.
    content: str
    # NOTE(review): presumably the document's title metadata — confirm against docs.parse_data.
    title: str
    # NOTE(review): presumably the document's description metadata — confirm against docs.parse_data.
    description: str
    # Path or name of the source file.
    filename: str


def search(query: str) -> List[SearchResult]:
    """
    Search the index for documents matching the given query.

    Args:
        query (str): The search query string.

    Returns:
        List[SearchResult]: Up to 5 search results. Each result dictionary contains:
            - start (int): The starting position or offset within the source file.
            - content (str): A text excerpt or snippet containing the match.
            - title (str): The title of the source document.
            - description (str): The description of the source document.
            - filename (str): The path or name of the source file.
    """
    # This docstring doubles as the tool description once the function is registered
    # as an agent tool, so it lists every field that SearchResult declares.
    return index.search(
        query=query,
        num_results=5,
    )

In [10]:
# Smoke-test the search tool with a sample query and display the hits.
test_search = search("How to create a custom agent with tools")
test_search

[{'start': 0,
  'content': "You can build fully custom Metrics/Tests to handle any column- or dataset-level evaluations. This lets you implement business metrics, weighted scores, etc.\n\n<Info>\n  There are ways to customize your evals that do not require creating Metrics from scratch:\n\n  * Add a [custom text descriptor](/metrics/customize_descriptor) for row-level evaluations.\n\n  * Use a built-in template to create a custom [LLM-based evaluator](/metrics/customize_llm_judge).\n\n  * Implement a [custom data drift](/metrics/customize_data_drift) detection method reusing existing renders.&#x20;\n</Info>\n\nCreating a custom Metric involves:\n\n* (Required). Implementing the Metric **calculation method**.\n\n* (Optional). Defining the **default Test conditions** that apply when you run Tests for this Metric (with or without Reference) without passing a custom condition.\n\n* (Optional). Creating a **custom visualization** for this Metric using Plotly. If you skip this, the Metric wi

In [11]:
# Each search hit comes back as a plain dict.
type(test_search[0])

dict

In [15]:
# Set up the file index for quick access to complete documents.
# Maps each document's filename to its full parsed record. A comprehension keeps the
# temporary loop variables out of the notebook namespace; on duplicate filenames the
# last document wins, exactly as with item-by-item assignment.
file_index = {doc['filename']: doc for doc in parsed_data}

In [16]:
from typing import Optional

def read_file(filename: str) -> Optional[str]:
    """
    Look up a repository file by name and return its full text.

    Args:
        filename (str): The name or path of the file to read.

    Returns:
        Optional[str]: The content of the file as a string when it is present
        in the file index; None when no such file is known.
    """
    # Guard clause: unknown files yield None rather than raising.
    if filename not in file_index:
        return None
    document = file_index[filename]
    return document['content']

In [17]:
# Spot-check read_file with a known documentation path.
read_file('docs/platform/datasets_workflow.mdx')

'<Tip>\n  You must first connect to [Evidently Cloud](/docs/setup/cloud) and [create a Project](/docs/platform/projects_manage).\n</Tip>\n\n## Upload a Dataset\n\n<Tabs>\n  <Tab title="Python">\n    Prepare your dataset as an Evidently Dataset with the corresponding data definition. To upload a Dataset to the specified Project in workspace `ws`, use the `add_dataset` method:\n\n    ```python\n    eval_data = Dataset.from_pandas(\n        source_df,\n        data_definition=DataDefinition()\n    )\n    ws.add_dataset(\n        dataset = eval_data, \n        name = "dataset_name",\n        project_id = project.id, \n        description = "Optional description")\n    ```\n\n    You must always specify the dataset `name` that you will see in the UI. The description is optional.\n  </Tab>\n  <Tab title="UI">\n    To upload any existing dataset as a CSV file, click on "Add dataset". When you upload the Dataset, you must also add a [**data definition**](/docs/library/data_definition). This le

# Agent using Pydantic AI

In [12]:
# System prompt for the documentation agent.
# It names the two tools (search, read_file), requires a read_file check before any
# code example is produced, and fixes the citation format for source files.
# This is a plain string, not an f-string: `{filename}` is a literal placeholder for the model.
# NOTE(review): the example citation uses a `.md` path while the file read earlier in this
# notebook ends in `.mdx` — confirm the example matches real repository paths.
instructions = """
You are an assistant that helps improve and generate high-quality documentation for the project.

You have access to the following tools:
- search — Use this to explore topics in depth. Make multiple search calls if needed to gather comprehensive information.
- read_file — Use this when code snippets are missing or when you need to retrieve the full content of a file for context.

Critical Rule

Before generating or finalizing any code example or technical explanation, you must always call `read_file`
to cross-check the correctness of the code.
Do not rely solely on search results or assumptions — always verify by reading the actual file content.

If `read_file` cannot be used or the file content is unavailable, clearly state:
> "Unable to verify with read_file."

When answering a question:
1. Provide file references for all source materials.  
   Use this format:  
   [{filename}](https://github.com/evidentlyai/docs/blob/main/{filename})
2. If the topic is covered in multiple documents, cite all relevant sources.
3. Include code examples whenever they clarify or demonstrate the concept.
4. Be concise, accurate, and helpful — focus on clarity and usability for developers.
5. If documentation is missing or unclear, infer from context and note that explicitly.

Example Citation

See the full implementation in [metrics/api_reference.md](https://github.com/evidentlyai/docs/blob/main/metrics/api_reference.md).
""".strip()

In [20]:
from pydantic_ai import Agent

# Plain Python callables registered as tools; their signatures and docstrings
# describe the tools to the model.
agent_tools = [search, read_file]

# Documentation assistant backed by OpenAI's gpt-4o-mini.
# The model id carries an explicit "openai:" provider prefix so provider selection
# does not depend on guessing from the bare model name.
agent = Agent(
    name="docs_agent",
    instructions=instructions,
    tools=agent_tools,
    model='openai:gpt-4o-mini'
)

In [21]:
# Test the agent with a single query.
# Top-level `await` works here because the notebook kernel runs cells inside an event loop.
# This cell makes real model requests, so re-running it costs tokens and may change the answer.
results = await agent.run(user_prompt='how do I run llm as a judge evals?')


In [22]:
# Display the raw AgentRunResult; the answer text is held in its `output` field.
results

AgentRunResult(output='To run an LLM (Language Model) as a judge for evaluations, you can follow a structured approach using the Evidently library in Python. This process leverages LLMs to assess responses against defined criteria or references. Here’s a step-by-step guide based on the tutorial from the Evidently documentation:\n\n### Step 1: Setup\n\n1. **Install Evidently**:\n   ```bash\n   pip install evidently\n   ```\n\n2. **Import Required Modules**:\n   ```python\n   import pandas as pd\n   import numpy as np\n   from evidently import Dataset, DataDefinition, Report, BinaryClassification\n   from evidently.llm.templates import BinaryClassificationPromptTemplate\n   ```\n\n3. **Set Up Your OpenAI API Key**:\n   ```python\n   import os\n   os.environ["OPENAI_API_KEY"] = "YOUR_KEY"\n   ```\n\n### Step 2: Create an Evaluation Dataset\n\nCreate a toy Q&A dataset with questions, target responses, new responses, and their labels.\n\n```python\ndata = [\n    ["Question?", "Target respon

In [24]:
# print() shows the Markdown answer with real line breaks instead of the escaped repr.
print(results.output)

To run an LLM (Language Model) as a judge for evaluations, you can follow a structured approach using the Evidently library in Python. This process leverages LLMs to assess responses against defined criteria or references. Here’s a step-by-step guide based on the tutorial from the Evidently documentation:

### Step 1: Setup

1. **Install Evidently**:
   ```bash
   pip install evidently
   ```

2. **Import Required Modules**:
   ```python
   import pandas as pd
   import numpy as np
   from evidently import Dataset, DataDefinition, Report, BinaryClassification
   from evidently.llm.templates import BinaryClassificationPromptTemplate
   ```

3. **Set Up Your OpenAI API Key**:
   ```python
   import os
   os.environ["OPENAI_API_KEY"] = "YOUR_KEY"
   ```

### Step 2: Create an Evaluation Dataset

Create a toy Q&A dataset with questions, target responses, new responses, and their labels.

```python
data = [
    ["Question?", "Target response.", "New response.", "label", "comment"],
    # Ad

In [26]:
# You can also access the complete message history.
# The list starts with the request that carries the user prompt and the agent instructions.
results.all_messages()

[ModelRequest(parts=[UserPromptPart(content='how do I run llm as a judge evals?', timestamp=datetime.datetime(2025, 10, 27, 1, 56, 47, 551568, tzinfo=datetime.timezone.utc))], instructions='You are an assistant that helps improve and generate high-quality documentation for the project.\n\nYou have access to the following tools:\n- search — Use this to explore topics in depth. Make multiple search calls if needed to gather comprehensive information.\n- read_file — Use this when code snippets are missing or when you need to retrieve the full content of a file for context.\n\nCritical Rule\n\nBefore generating or finalizing any code example or technical explanation, you must always call `read_file`\nto cross-check the correctness of the code.\nDo not rely solely on search results or assumptions — always verify by reading the actual file content.\n\nIf `read_file` cannot be used or the file content is unavailable, clearly state:\n> "Unable to verify with read_file."\n\nWhen answering a que

In [27]:
# Messages from this run only; here the output begins the same way as all_messages().
results.new_messages()

[ModelRequest(parts=[UserPromptPart(content='how do I run llm as a judge evals?', timestamp=datetime.datetime(2025, 10, 27, 1, 56, 47, 551568, tzinfo=datetime.timezone.utc))], instructions='You are an assistant that helps improve and generate high-quality documentation for the project.\n\nYou have access to the following tools:\n- search — Use this to explore topics in depth. Make multiple search calls if needed to gather comprehensive information.\n- read_file — Use this when code snippets are missing or when you need to retrieve the full content of a file for context.\n\nCritical Rule\n\nBefore generating or finalizing any code example or technical explanation, you must always call `read_file`\nto cross-check the correctness of the code.\nDo not rely solely on search results or assumptions — always verify by reading the actual file content.\n\nIf `read_file` cannot be used or the file content is unavailable, clearly state:\n> "Unable to verify with read_file."\n\nWhen answering a que

`results.all_messages()` → the entire conversation history the agent knows about up to and including this run (system/instructions, previous turns, tool calls, current run output — everything).

`results.new_messages()` → only the messages created in the current run (handy for appending just the delta to your store/logs).

In this notebook the two outputs appear identical because the run started with no prior message history.

Source: Pydantic AI documentation on messages and chat history.

In [28]:
# Usage and cost tracking: token counts plus the number of model requests and
# tool calls made during this run.
results.usage()

RunUsage(input_tokens=11570, cache_read_tokens=3072, output_tokens=997, details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, requests=3, tool_calls=2)

In [37]:
# Exploration aid: shows which accessors the result object exposes
# (e.g. all_messages, new_messages, output, usage).
# NOTE(review): leftover introspection — consider dropping it from the final notebook.
print(dir(results))                 # list attributes/methods


['__annotations__', '__class__', '__class_getitem__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__orig_bases__', '__parameters__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_new_message_index', '_output_tool_name', '_set_output_tool_return', '_state', '_traceparent', '_traceparent_value', 'all_messages', 'all_messages_json', 'new_messages', 'new_messages_json', 'output', 'response', 'timestamp', 'usage']


In [47]:
# Summarize the shape of this run: one group per message, with the message kind first,
# then the kind of each part it carries, then a blank separator line.
for msg in results.new_messages():
    part_kinds = [piece.part_kind for piece in msg.parts]
    print(msg.kind, *part_kinds, sep="\n")
    print()

request
user-prompt

response
tool-call

request
tool-return

response
tool-call

request
tool-return

response
text



In [40]:

from toyaikit.chat import IPythonChatInterface
from toyaikit.chat.runners import PydanticAIRunner

In [44]:
# Wire the agent into a chat loop that uses the IPython chat interface.
chat_interface = IPythonChatInterface()
runner = PydanticAIRunner(agent=agent, chat_interface=chat_interface)

In [46]:
# Start the chat session; the trailing `;` suppresses the cell's return value.
# NOTE(review): presumably this waits for user input until the chat is ended, so it
# would stall an unattended "Restart & Run All" — confirm and consider guarding it.
await runner.run();

Chat ended.
