In [1]:
from openai import OpenAI
from pydantic import BaseModel
import json
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from typing import Any, Dict, List, TypedDict
load_dotenv()

openai_client = OpenAI()

## Fetch Web Page Tool

In [33]:
import requests
import json

In [34]:
# Jina Reader API Base
JINA_READER_BASE = "https://r.jina.ai/"

In [55]:
TARGET_URL = "https://en.wikipedia.org/wiki/Rose"  # Target URL to scrape

In [56]:
markdown_url = JINA_READER_BASE + TARGET_URL


In [57]:
markdown_url

'https://r.jina.ai/https://en.wikipedia.org/wiki/Rose'

In [58]:
try:
    response = requests.get(markdown_url)
    response.raise_for_status() # Raises an exception for 4xx or 5xx status codes
    
    clean_markdown = response.text
    print("--- Clean Content (Markdown) ---")
    print(clean_markdown[:500] + "...") # Print the first 500 characters
    
except requests.exceptions.RequestException as e:
    print(f"Error fetching page: {e}")

--- Clean Content (Markdown) ---
Title: Rose

URL Source: https://en.wikipedia.org/wiki/Rose

Published Time: 2002-02-01T12:22:29Z

Markdown Content:
| Rose Temporal range: Eocene–Recent [PreꞒ](https://en.wikipedia.org/wiki/Precambrian "Precambrian") [Ꞓ](https://en.wikipedia.org/wiki/Cambrian "Cambrian") [O](https://en.wikipedia.org/wiki/Ordovician "Ordovician") [S](https://en.wikipedia.org/wiki/Silurian "Silurian") [D](https://en.wikipedia.org/wiki/Devonian "Devonian") [C](https://en.wikipedia.org/wiki/Carboniferous "Carbonife...


In [59]:
print(clean_markdown)

Title: Rose

URL Source: https://en.wikipedia.org/wiki/Rose

Published Time: 2002-02-01T12:22:29Z

Markdown Content:
| Rose Temporal range: Eocene–Recent [PreꞒ](https://en.wikipedia.org/wiki/Precambrian "Precambrian") [Ꞓ](https://en.wikipedia.org/wiki/Cambrian "Cambrian") [O](https://en.wikipedia.org/wiki/Ordovician "Ordovician") [S](https://en.wikipedia.org/wiki/Silurian "Silurian") [D](https://en.wikipedia.org/wiki/Devonian "Devonian") [C](https://en.wikipedia.org/wiki/Carboniferous "Carboniferous") [P](https://en.wikipedia.org/wiki/Permian "Permian") [T](https://en.wikipedia.org/wiki/Triassic "Triassic") [J](https://en.wikipedia.org/wiki/Jurassic "Jurassic") [K](https://en.wikipedia.org/wiki/Cretaceous "Cretaceous") [Pg](https://en.wikipedia.org/wiki/Paleogene "Paleogene") [N](https://en.wikipedia.org/wiki/Neogene "Neogene") |
| --- |
| [![Image 1](https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Rosa_rubiginosa_1.jpg/250px-Rosa_rubiginosa_1.jpg)](https://en.wikipedia.org/w

In [None]:
json_url = JINA_READER_BASE + TARGET_URL
headers = {
    "X-Return-Format": "json" 
    # Optional: Add your API key for higher rate limits
    # "Authorization": "Bearer YOUR_JINA_API_KEY"
}

In [64]:
import requests
import json

json_url = "https://r.jina.ai/https://en.wikipedia.org/wiki/Rose"

# ✅ CORRECT HEADERS: Explicitly set the format to 'json'
headers = {
    "X-Return-Format": "json" 
    # Add your API key here if you want higher rate limits:
    # "Authorization": "Bearer YOUR_JINA_API_KEY" 
}

try:
    response = requests.get(json_url, headers=headers)
    
    # This line checks the status code (400, 404, 500, etc.)
    response.raise_for_status() 
    
    structured_data = response.json()
    
    print("--- Structured Content (JSON) ---")
    print(f"Title: {structured_data.get('data', {}).get('title')}")
    print(f"Extracted Content Length: {len(structured_data.get('data', {}).get('content', ''))} characters")

except requests.exceptions.RequestException as e:
    # This will catch the 400 error if it persists
    print(f"Error fetching JSON: {e}")
except json.JSONDecodeError:
    print("Error: Could not decode JSON response. The server may have returned plain HTML or an error message instead of JSON.")

Error fetching JSON: 400 Client Error: Bad Request for url: https://r.jina.ai/https://en.wikipedia.org/wiki/Rose


In [2]:
import requests
import json
import urllib.parse  # Good practice for encoding URLs
from dataclasses import dataclass
from typing import Dict, Any

@dataclass
class WebPageCotent:
    """
    Container for fetched web page content.

    Attributes:
        wiki_url: Original Wikipedia URL requested.
        title:    Extracted title from the response (or 'N/A' on failure).
        content:  Extracted main content/body (may be empty on failure).
    """
    wiki_url: str
    title: str
    content: str

def fetch_web_content(wiki_url: str) -> WebPageCotent:
    """
    Fetch content for a Wikipedia page via the Jina reader proxy.

    The function URL-encodes the provided Wikipedia URL, requests the Jina reader
    endpoint, and attempts to parse a JSON payload where content is expected under
    a 'data' object (i.e., {"data": {"title": "...", "content": "..."}}).

    If any network/HTTP/JSON parsing error occurs, it returns a WebPageCotent with
    title='N/A' and a short error message in `content`.

    Args:
        wiki_url: Full Wikipedia URL (e.g., "https://en.wikipedia.org/wiki/Rose").

    Returns:
        WebPageCotent: Dataclass containing the requested URL, title, and content.
    """
    # 1. URL-encode the target URL (best practice) and prepend the Jina reader base
    encoded_target_url: str = urllib.parse.quote(wiki_url, safe='')
    json_url: str = f"https://r.jina.ai/{encoded_target_url}"

    # 2. Use the standard 'Accept' header for JSON response
    headers: Dict[str, str] = {
        "Accept": "application/json"
    }

    print(f"Requesting URL: {json_url}")
    print(f"With Headers: {headers}")

    try:
        response = requests.get(json_url, headers=headers)
        response.raise_for_status()  # Check for HTTP errors (like 400, 404)

        # Decode the JSON response
        structured_data: Any = response.json()

        # Jina's JSON structure usually nests content under a 'data' object
        data: Dict[str, Any] = structured_data.get('data', {}) if isinstance(structured_data, dict) else {}

        # Access the clean content and title
        title: str = data.get('title', 'N/A')
        content: str = data.get('content', '')
        wiki_url = wiki_url  # preserve original variable per existing structure

    except (requests.RequestException, json.JSONDecodeError, ValueError) as e:
        title = 'N/A'
        content = f'Error fetching or parsing JSON response: {e}'
        wiki_url = wiki_url

    return WebPageCotent(
        wiki_url=wiki_url,
        title=title,
        content=content
    )


## Append/Search Tool

In [3]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any


summary_storage: List[Dict[str, Any]] = []
class SearchAppendTool:
    """
    Minimal FAISS-backed semantic search helper that lets you append
    text summaries with custom integer IDs and query them locally.
    """

    def __init__(self) -> None:
        
        # Initialize the embedding model
        self.model: SentenceTransformer = SentenceTransformer("all-MiniLM-L6-v2")
        D: int = 384  # Dimension of the 'all-MiniLM-L6-v2' model

        # 1. Create the base index (supports incremental addition)
        base_index = faiss.IndexFlatIP(D)

        # 2. Wrap with IndexIDMap to enable custom integer IDs
        self.index: faiss.IndexIDMap = faiss.IndexIDMap(base_index)

        # Local storage to map IDs back to text/metadata
        self.next_available_id: int = 0

    def append_webpage_summary(self, summary: str, wiki_url: str, title: str) -> None:
        """
        Encode a single summary and append it to the FAISS index with a custom ID.

        Args:
            summary:  The text chunk/summary to store.
            wiki_url: Source URL for reference.
            title:    Title associated with the content.
        """
        # 1. Vectorize the text (normalize=True for cosine similarity with IP)
        vector = self.model.encode([summary], normalize_embeddings=True)

        # 2. Prepare the data for FAISS (float32, 2D array)
        vectors_to_add = vector.astype("float32")

        # FAISS requires a 1D int64 array of IDs
        ids_to_add = np.array([self.next_available_id], dtype=np.int64)

        # 3. Append the vector and custom ID to the index
        self.index.add_with_ids(vectors_to_add, ids_to_add)

        # 4. Update local storage (ID -> item mapping)
        summary_storage.append(
            {"id": self.next_available_id, "summary": summary, "wiki_url": wiki_url, "title": title}
        )

        # Increment ID counter (was a no-op previously)
        self.next_available_id += 1

        # Keep original behavior: no return value
        return

    def search_local(self, query: str) -> List[Dict[str, Any]]:
        """
        Perform a semantic search over locally indexed summaries.

        Args:
            query: The search query string.

        Returns:
            A list of stored items (dicts) that match best, up to top-3,
            each including: {"id", "summary", "wiki_url", "title"}.
        """
        # 1. Vectorize the query
        query_vector = self.model.encode([query], normalize_embeddings=True).astype("float32")

        # 2. Perform the search (top-3)
        distances, indices = self.index.search(query_vector, 3)

        # 3. Retrieve the corresponding summaries
        results: List[Dict[str, Any]] = []
        for idx in indices[0]:
            if idx == -1:
                continue  # Skip empty slots
            # Find the summary in local storage
            for item in summary_storage:
                if item["id"] == int(idx):
                    results.append(item)
                    break

        return results


## Pydantic AI

In [4]:
from pydantic_ai import Agent
from pydantic import BaseModel, Field

from typing import List

In [5]:
from pydantic_ai.messages import FunctionToolCallEvent, FunctionToolResultEvent

class NamedCallback:

    def __init__(self, agent):
        self.agent_name = agent.name

    async def print_function_calls(self, ctx, event):
        # Detect nested streams
        if hasattr(event, "__aiter__"):
            async for sub in event:
                await self.print_function_calls(ctx, sub)
            return

        if isinstance(event, FunctionToolCallEvent):
            print("CALL →", event.part.tool_name, event.part.args_as_dict(), event.tool_call_id)
        elif isinstance(event, FunctionToolResultEvent):
            print("RES  ←", event.result.tool_name, event.tool_call_id, event.result.content)

    async def __call__(self, ctx, event):
        return await self.print_function_calls(ctx, event)

In [6]:
instructions = """
You are an AI assistant that fetches, summarizes, and answers questions about Wikipedia articles.

### Tools
- **fetch_web_content(url)**: Retrieve a Wikipedia article’s content and title.
- **append_webpage_summary(summary, wiki_url, title)**: Store a short summary in the local knowledge base.
- **search_local(query)**: Search stored summaries for relevant information.

### Workflow
1. When a user provides a Wikipedia URL:
   - Fetch the article with `fetch_web_content`.
   - Summarize it in 2–4 sentences (clear, factual, neutral).
   - Store it using `append_webpage_summary` (include URL and title).
2. When a user asks a question:
   - Use `search_local` to find relevant summaries.
   - Combine information from the top results to form a concise, factual answer.
   - **Always include references** — each answer must cite the **Wikipedia page title** and **URL** for every source used.

### Behavior Guidelines
- Be accurate, concise, and neutral.
- Prefer existing stored summaries before fetching new data.
- If unsure, explain reasoning clearly.
- Example reference format: “According to *Rose* (https://en.wikipedia.org/wiki/Rose), ...”
""".strip()


In [7]:
search_tools = SearchAppendTool()

In [8]:
first_agent = Agent(
    name='clarifier',
    instructions=instructions,
    tools=[search_tools.append_webpage_summary, search_tools.search_local, fetch_web_content],
    model='gpt-4o-mini',
)

first_agent_callback = NamedCallback(first_agent)

In [9]:
question = "What is this page about? https://en.wikipedia.org/wiki/Capybara"

In [10]:
clarifier_results1 = await first_agent.run(
    user_prompt=question,
    event_stream_handler=first_agent_callback
)

CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Capybara'} call_vT807v29wZhhNlgVDbBIt40v
Requesting URL: https://r.jina.ai/https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FCapybara
With Headers: {'Accept': 'application/json'}
RES  ← fetch_web_content call_vT807v29wZhhNlgVDbBIt40v WebPageCotent(wiki_url='https://en.wikipedia.org/wiki/Capybara', title='Capybara', content='[![Image 1: This is a good article. Click here for more information.](https://upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/20px-Symbol_support_vote.svg.png)](https://en.wikipedia.org/wiki/Wikipedia:Good_articles* "This is a good article. Click here for more information.")\n\n[![Image 2: Page semi-protected](https://upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png)](https://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi "This article is semi-protected until March 17, 2028 at 19:23 UTC.")\n\nFrom Wikipedia, the free en

In [11]:
print(clarifier_results1.output)

The **capybara** (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, indigenous to South America. It inhabits savannas and dense forests, living near bodies of water, and is known for its social nature, typically found in groups of 10-20, but can form larger herds. Capybaras primarily feed on grasses and aquatic plants, and they are semi-aquatic, displaying excellent swimming abilities. Despite hunting pressures in certain areas, they are not considered endangered and have adapted successfully to some urban environments where they can even be kept as pets (source: *Capybara* - https://en.wikipedia.org/wiki/Capybara).


In [12]:
summary_storage

[{'id': 0,
  'summary': "The capybara (Hydrochoerus hydrochaeris) is the world's largest rodent, native to South America. It thrives in savannas and dense forests near water sources and is known for its social behavior, often living in groups of 10-20 individuals that can swell to larger numbers in certain seasons. Capybaras primarily eat grasses and aquatic plants and are semi-aquatic, being excellent swimmers. They are not considered threatened and have adapted well to some urban environments, where they are sometimes kept as pets.",
  'wiki_url': 'https://en.wikipedia.org/wiki/Capybara',
  'title': 'Capybara'}]

In [13]:
question=""" Index each of this wiki pages to the local vector DB:
— https://en.wikipedia.org/wiki/Lesser_capybara
— https://en.wikipedia.org/wiki/Hydrochoerus

 — https://en.wikipedia.org/wiki/Neochoerus

— https://en.wikipedia.org/wiki/Caviodon

— https://en.wikipedia.org/wiki/Neochoerus_aesopi
"""

In [14]:
clarifier_results1 = await first_agent.run(
    user_prompt=question,
    event_stream_handler=first_agent_callback
)

CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Lesser_capybara'} call_siBHCbK9RhbXljFF3j0a9HmH
CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Hydrochoerus'} call_RTnNeDQsqUGFrAoxJBGrP1iS
CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Neochoerus'} call_goXm2x1ZQhaYgWpiNtLZMMc4
CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Caviodon'} call_ohq7whLv29QgNTlpIlupogyY
CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Neochoerus_aesopi'} call_Ea684bs4Ff2YA5Zxwk6pmihJ
Requesting URL: https://r.jina.ai/https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FLesser_capybara
With Headers: {'Accept': 'application/json'}
Requesting URL: https://r.jina.ai/https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FHydrochoerus
With Headers: {'Accept': 'application/json'}
Requesting URL: https://r.jina.ai/https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FNeochoerus
With Headers: {'Accept': 'application/json'}
Requesting URL: https://r.jina.

In [15]:
summary_storage

[{'id': 0,
  'summary': "The capybara (Hydrochoerus hydrochaeris) is the world's largest rodent, native to South America. It thrives in savannas and dense forests near water sources and is known for its social behavior, often living in groups of 10-20 individuals that can swell to larger numbers in certain seasons. Capybaras primarily eat grasses and aquatic plants and are semi-aquatic, being excellent swimmers. They are not considered threatened and have adapted well to some urban environments, where they are sometimes kept as pets.",
  'wiki_url': 'https://en.wikipedia.org/wiki/Capybara',
  'title': 'Capybara'},
 {'id': 1,
  'summary': 'The genus _Hydrochoerus_ includes two extant species: the Common Capybara (_Hydrochoerus hydrochaeris_) and the Lesser Capybara (_Hydrochoerus isthmius_). These are the largest living rodents, found in semi-aquatic environments across South America, and are also known for their social behavior and dietary habits focused on grasses.',
  'wiki_url': 'ht

In [16]:
question=""" What are threats to capybara populations?"""

In [17]:
clarifier_results1 = await first_agent.run(
    user_prompt=question,
    event_stream_handler=first_agent_callback
)

CALL → search_local {'query': 'capybara threats population'} call_AjwLlZoydSYQ2WvgT9Jfcjam
RES  ← search_local call_AjwLlZoydSYQ2WvgT9Jfcjam [{'id': 0, 'summary': "The capybara (Hydrochoerus hydrochaeris) is the world's largest rodent, native to South America. It thrives in savannas and dense forests near water sources and is known for its social behavior, often living in groups of 10-20 individuals that can swell to larger numbers in certain seasons. Capybaras primarily eat grasses and aquatic plants and are semi-aquatic, being excellent swimmers. They are not considered threatened and have adapted well to some urban environments, where they are sometimes kept as pets.", 'wiki_url': 'https://en.wikipedia.org/wiki/Capybara', 'title': 'Capybara'}, {'id': 2, 'summary': 'The Lesser Capybara (_Hydrochoerus isthmius_) is a large semi-aquatic rodent native to parts of South America, particularly Panama and Colombia. It is smaller than the common capybara and primarily inhabits aquatic enviro

In [18]:
clarifier_results1.output

'While capybaras are currently not considered threatened and have adapted well to urban environments, potential threats to their populations can include habitat destruction, hunting, and competition for food resources with livestock and other species. Urbanization and agricultural expansion can lead to loss of natural habitats, which affects their social structures and feeding patterns. \n\nFor more detailed information, you can visit the [Capybara Wikipedia page](https://en.wikipedia.org/wiki/Capybara).'

In [32]:
clarifier_results1.all_messages()[2].parts[0].content

[{'id': 0,
  'summary': "The capybara (Hydrochoerus hydrochaeris) is the world's largest rodent, native to South America. It thrives in savannas and dense forests near water sources and is known for its social behavior, often living in groups of 10-20 individuals that can swell to larger numbers in certain seasons. Capybaras primarily eat grasses and aquatic plants and are semi-aquatic, being excellent swimmers. They are not considered threatened and have adapted well to some urban environments, where they are sometimes kept as pets.",
  'wiki_url': 'https://en.wikipedia.org/wiki/Capybara',
  'title': 'Capybara'},
 {'id': 2,
  'summary': 'The Lesser Capybara (_Hydrochoerus isthmius_) is a large semi-aquatic rodent native to parts of South America, particularly Panama and Colombia. It is smaller than the common capybara and primarily inhabits aquatic environments, where it grazes on grasses and other plants. This species has been classified as data deficient due to limited information a