In [1]:
from openai import OpenAI
from pydantic import BaseModel
import json
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from typing import Any, Dict, List, TypedDict
load_dotenv()

openai_client = OpenAI()

## Fetch Web Page Tool

In [2]:
import requests
import json
import urllib.parse  # Good practice for encoding URLs
from dataclasses import dataclass
from typing import Dict, Any, List, Optional


@dataclass
class WebPageCotent:
    """
    Container for fetched web page content.

    Attributes:
        wiki_url: Original Wikipedia URL requested.
        title:    Extracted title from the response (or 'N/A' on failure).
        content:  Extracted main content/body (may be empty on failure).
    """
    wiki_url: str
    title: str
    content: str


# Global URL → ID map and in-memory storage for previously processed pages
URL_TO_ID: Dict[str, int] = {}
summary_storage: List[Dict[str, Any]] = []


def fetch_web_content(wiki_url: str) -> WebPageCotent:
    """
    Fetch content for a Wikipedia page via the Jina reader proxy.

    The function URL-encodes the provided Wikipedia URL, requests the Jina reader
    endpoint, and attempts to parse a JSON payload where content is expected under
    a 'data' object (i.e., {"data": {"title": "...", "content": "..."}}).

    If any network/HTTP/JSON parsing error occurs, it returns a WebPageCotent with
    title='N/A' and a short error message in `content`.

    If the URL has already been processed and stored in `summary_storage`, this
    returns that stored title/summary instead of refetching.
    """
    # Reuse from local cache if present
    if wiki_url in URL_TO_ID:
        print("URL already fetched: retrieve from local storage.")
        idx: Optional[int] = URL_TO_ID.get(wiki_url)
        if idx is not None:
            for item in summary_storage:
                if item.get("id") == idx:
                    return WebPageCotent(
                        wiki_url=wiki_url,
                        title=item.get("title", "N/A"),
                        content=item.get("summary", ""),
                    )

    # 1. URL-encode the target URL (best practice) and prepend the Jina reader base
    encoded_target_url: str = urllib.parse.quote(wiki_url, safe="")
    json_url: str = f"https://r.jina.ai/{encoded_target_url}"

    # 2. Use the standard 'Accept' header for JSON response
    headers: Dict[str, str] = {
        "Accept": "application/json"
    }

    print(f"Requesting URL: {json_url}")
    print(f"With Headers: {headers}")

    try:
        response = requests.get(json_url, headers=headers)
        response.raise_for_status()  # Check for HTTP errors (like 400, 404)

        # Decode the JSON response
        structured_data: Any = response.json()

        # Jina's JSON structure usually nests content under a 'data' object
        data: Dict[str, Any] = structured_data.get("data", {}) if isinstance(structured_data, dict) else {}

        # Access the clean content and title
        title: str = data.get("title", "N/A")
        content: str = data.get("content", "")

    except (requests.RequestException, json.JSONDecodeError, ValueError) as e:
        title = "N/A"
        content = f"Error fetching or parsing JSON response: {e}"

    return WebPageCotent(
        wiki_url=wiki_url,
        title=title,
        content=content
    )


## Append/Search Tool

In [3]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any, Optional



class SearchAppendTool:
    """
    Minimal FAISS-backed semantic search helper that lets you append
    text summaries with custom integer IDs and query them locally.
    """

    def __init__(self) -> None:
        
        # Initialize the embedding model
        self.model: SentenceTransformer = SentenceTransformer("all-MiniLM-L6-v2")
        D: int = 384  # Dimension of the 'all-MiniLM-L6-v2' model

        # 1. Create the base index (supports incremental addition)
        base_index = faiss.IndexFlatIP(D)

        # 2. Wrap with IndexIDMap to enable custom integer IDs
        self.index: faiss.IndexIDMap = faiss.IndexIDMap(base_index)

        # Local storage to map IDs back to text/metadata
        self.next_available_id: int = 0


    def append_webpage_summary(self, summary: str, wiki_url: str, title: str) -> None:
        """
        Encode a single summary and append it to the FAISS index with a custom ID.

        Args:
            summary:  The text chunk/summary to store.
            wiki_url: Source URL for reference.
            title:    Title associated with the content.
        """
        # 1. Vectorize the text (normalize=True for cosine similarity with IP)
        vector = self.model.encode([summary], normalize_embeddings=True)

        # 2. Prepare the data for FAISS (float32, 2D array)
        vectors_to_add = vector.astype("float32")

        # FAISS requires a 1D int64 array of IDs
        ids_to_add = np.array([self.next_available_id], dtype=np.int64)

        # 3. Append the vector and custom ID to the index
        self.index.add_with_ids(vectors_to_add, ids_to_add)

        # 4. Update local storage (ID -> item mapping)
        summary_storage.append(
            {"id": self.next_available_id, "summary": summary, "wiki_url": wiki_url, "title": title}
        )

        URL_TO_ID[wiki_url] = self.next_available_id
        # Increment ID counter (was a no-op previously)
        self.next_available_id += 1

        # Keep original behavior: no return value
        return

    def search_local(self, query: str) -> List[Dict[str, Any]]:
        """
        Perform a semantic search over locally indexed summaries.

        Args:
            query: The search query string.

        Returns:
            A list of stored items (dicts) that match best, up to top-3,
            each including: {"id", "summary", "wiki_url", "title"}.
        """
        # 1. Vectorize the query
        query_vector = self.model.encode([query], normalize_embeddings=True).astype("float32")

        # 2. Perform the search (top-3)
        distances, indices = self.index.search(query_vector, 3)

        # 3. Retrieve the corresponding summaries
        results: List[Dict[str, Any]] = []
        for idx in indices[0]:
            if idx == -1:
                continue  # Skip empty slots
            # Find the summary in local storage
            for item in summary_storage:
                if item["id"] == int(idx):
                    results.append(item)
                    break

        return results


## Pydantic AI

In [4]:
from pydantic_ai import Agent
from pydantic import BaseModel, Field

from typing import List

In [5]:
from pydantic_ai.messages import FunctionToolCallEvent, FunctionToolResultEvent

class NamedCallback:

    def __init__(self, agent):
        self.agent_name = agent.name

    async def print_function_calls(self, ctx, event):
        # Detect nested streams
        if hasattr(event, "__aiter__"):
            async for sub in event:
                await self.print_function_calls(ctx, sub)
            return

        if isinstance(event, FunctionToolCallEvent):
            print("CALL →", event.part.tool_name, event.part.args_as_dict(), event.tool_call_id)
        elif isinstance(event, FunctionToolResultEvent):
            print("RES  ←", event.result.tool_name, event.tool_call_id, event.result.content)

    async def __call__(self, ctx, event):
        return await self.print_function_calls(ctx, event)

In [6]:
instructions = """
You are an AI assistant that fetches, summarizes, and answers questions about Wikipedia articles.

### Tools
- **fetch_web_content(url)**: Retrieve a Wikipedia article’s content and title.
- **append_webpage_summary(summary, wiki_url, title)**: Store a short summary in the local knowledge base.
- **search_local(query)**: Search stored summaries for relevant information.

### Workflow
1. When a user provides a Wikipedia URL:
   - Fetch the article with `fetch_web_content`.
   - Summarize it in 2–4 sentences (clear, factual, neutral).
   - Store it using `append_webpage_summary` (include URL and title).
2. When a user asks a question:
   - Use `search_local` to find relevant summaries.
   - Combine information from the top results to form a concise, factual answer.
   - **Always include references** — each answer must cite the **Wikipedia page title** and **URL** for every source used.

### Behavior Guidelines
- Be accurate, concise, and neutral.
- Prefer existing stored summaries before fetching new data.
- If unsure, explain reasoning clearly.
- Example reference format: “According to *Rose* (https://en.wikipedia.org/wiki/Rose), ...”
""".strip()


In [7]:
search_tools = SearchAppendTool()

In [8]:
wiki_agent = Agent(
    name='wikiagent',
    instructions=instructions,
    tools=[search_tools.append_webpage_summary, search_tools.search_local, fetch_web_content],
    model='gpt-4o-mini',
)

wiki_agent_callback = NamedCallback(wiki_agent)

In [9]:
question = "What is this page about? https://en.wikipedia.org/wiki/Capybara"

In [10]:
wiki_agent_results1 = await wiki_agent.run(
    user_prompt=question,
    event_stream_handler=wiki_agent_callback
)

CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Capybara'} call_L5ycgwK5ZFW4ric4vYCPSDOB
Requesting URL: https://r.jina.ai/https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FCapybara
With Headers: {'Accept': 'application/json'}
RES  ← fetch_web_content call_L5ycgwK5ZFW4ric4vYCPSDOB WebPageCotent(wiki_url='https://en.wikipedia.org/wiki/Capybara', title='Capybara', content='[![Image 1: This is a good article. Click here for more information.](https://upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/20px-Symbol_support_vote.svg.png)](https://en.wikipedia.org/wiki/Wikipedia:Good_articles* "This is a good article. Click here for more information.")\n\n[![Image 2: Page semi-protected](https://upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png)](https://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi "This article is semi-protected until March 17, 2028 at 19:23 UTC.")\n\nFrom Wikipedia, the free en

In [11]:
print(wiki_agent_results1.output)

The *capybara* (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, native to South America. It typically inhabits savannas and dense forests near water bodies, living in social groups that can number up to 100 individuals. Capybaras are herbivorous, mainly grazing on grasses and aquatic plants, and are known for their strong swimming abilities. They are not considered threatened, although they are hunted for their meat and hides in some regions (according to *Capybara* [Wikipedia](https://en.wikipedia.org/wiki/Capybara)).


In [12]:
summary_storage

[{'id': 0,
  'summary': 'The capybara (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, native to South America. It typically inhabits savannas and dense forests near water bodies, living in social groups that can number up to 100 individuals. Capybaras are herbivorous, mainly grazing on grasses and aquatic plants, and are known for their strong swimming abilities. They are not considered threatened, although they are hunted for their meat and hides in some regions.',
  'wiki_url': 'https://en.wikipedia.org/wiki/Capybara',
  'title': 'Capybara'}]

In [13]:
##test rerun of the same url 
wiki_agent_results2 = await wiki_agent.run(
    user_prompt=question,
    event_stream_handler=wiki_agent_callback
)

CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Capybara'} call_ShXbbIVQpaZ4HqoCe3F7zOQq
URL already fetched: retrieve from local storage.
RES  ← fetch_web_content call_ShXbbIVQpaZ4HqoCe3F7zOQq WebPageCotent(wiki_url='https://en.wikipedia.org/wiki/Capybara', title='Capybara', content='The capybara (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, native to South America. It typically inhabits savannas and dense forests near water bodies, living in social groups that can number up to 100 individuals. Capybaras are herbivorous, mainly grazing on grasses and aquatic plants, and are known for their strong swimming abilities. They are not considered threatened, although they are hunted for their meat and hides in some regions.')
CALL → append_webpage_summary {'summary': 'The capybara (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, indigenous to South America, where it inhabits savannas and dense forests near water. These social animals 

In [20]:
question=""" Index each of this wiki pages to the local vector DB:
— https://en.wikipedia.org/wiki/Lesser_capybara
— https://en.wikipedia.org/wiki/Hydrochoerus

 — https://en.wikipedia.org/wiki/Neochoerus

— https://en.wikipedia.org/wiki/Caviodon

— https://en.wikipedia.org/wiki/Neochoerus_aesopi
"""

In [21]:
wiki_results3 = await wiki_agent.run(
    user_prompt=question,
    event_stream_handler=wiki_agent_callback
)

CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Lesser_capybara'} call_9fqQN60RKjpCQhheQqnQVUdS
CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Hydrochoerus'} call_EcRpTcEUNZZOUCTw7gwgiQgB
CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Neochoerus'} call_sg3agyAhLVkLUluJHZaRC82i
CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Caviodon'} call_Cs7KNzRMSlPjnKghFC5YSDPC
CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Neochoerus_aesopi'} call_PetFjN8aQiWKG1n0RVR0RAjV
Requesting URL: https://r.jina.ai/https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FLesser_capybara
With Headers: {'Accept': 'application/json'}
Requesting URL: https://r.jina.ai/https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FHydrochoerus
With Headers: {'Accept': 'application/json'}
Requesting URL: https://r.jina.ai/https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FNeochoerus
With Headers: {'Accept': 'application/json'}
Requesting URL: https://r.jina.

In [22]:
summary_storage

[{'id': 0,
  'summary': 'The capybara (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, native to South America. It typically inhabits savannas and dense forests near water bodies, living in social groups that can number up to 100 individuals. Capybaras are herbivorous, mainly grazing on grasses and aquatic plants, and are known for their strong swimming abilities. They are not considered threatened, although they are hunted for their meat and hides in some regions.',
  'wiki_url': 'https://en.wikipedia.org/wiki/Capybara',
  'title': 'Capybara'},
 {'id': 1,
  'summary': 'The capybara (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, indigenous to South America, where it inhabits savannas and dense forests near water. These social animals can live in groups of up to 100 and primarily feed on grasses and aquatic plants. Capybaras are excellent swimmers and are not currently considered threatened, though they are hunted in some areas for meat and hides.',
 

In [23]:
question=""" What are threats to capybara populations?"""

In [24]:
wiki_agent_results4 = await wiki_agent.run(
    user_prompt=question,
    event_stream_handler=wiki_agent_callback
)

CALL → search_local {'query': 'threats to capybara populations'} call_dY22hBHE3EjZT7PQfzX7JF7R
RES  ← search_local call_dY22hBHE3EjZT7PQfzX7JF7R [{'id': 0, 'summary': 'The capybara (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, native to South America. It typically inhabits savannas and dense forests near water bodies, living in social groups that can number up to 100 individuals. Capybaras are herbivorous, mainly grazing on grasses and aquatic plants, and are known for their strong swimming abilities. They are not considered threatened, although they are hunted for their meat and hides in some regions.', 'wiki_url': 'https://en.wikipedia.org/wiki/Capybara', 'title': 'Capybara'}, {'id': 1, 'summary': 'The capybara (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, indigenous to South America, where it inhabits savannas and dense forests near water. These social animals can live in groups of up to 100 and primarily feed on grasses and aquatic plants. Ca

In [25]:
wiki_agent_results4.output

'Capybara populations are generally not considered threatened, but they face some risks, primarily from hunting for their meat and hides in various regions. As the largest rodents in the world, capybaras inhabit savannas and dense forests near water bodies. Their populations are primarily stable, particularly for the common capybara (*Hydrochoerus hydrochaeris*), but localized threats may affect them depending on human activities (e.g., hunting, habitat destruction) and environmental changes.\n\nFor more detailed information, you can refer to the *Capybara* article on Wikipedia: [Capybara](https://en.wikipedia.org/wiki/Capybara).'

In [26]:
wiki_agent_results4.all_messages()[2].parts[0].content

[{'id': 0,
  'summary': 'The capybara (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, native to South America. It typically inhabits savannas and dense forests near water bodies, living in social groups that can number up to 100 individuals. Capybaras are herbivorous, mainly grazing on grasses and aquatic plants, and are known for their strong swimming abilities. They are not considered threatened, although they are hunted for their meat and hides in some regions.',
  'wiki_url': 'https://en.wikipedia.org/wiki/Capybara',
  'title': 'Capybara'},
 {'id': 1,
  'summary': 'The capybara (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, indigenous to South America, where it inhabits savannas and dense forests near water. These social animals can live in groups of up to 100 and primarily feed on grasses and aquatic plants. Capybaras are excellent swimmers and are not currently considered threatened, though they are hunted in some areas for meat and hides.',
 

In [27]:
question = "What is this page about? https://en.wikipedia.org/wiki/Capybara"

In [28]:
wiki_results6 = await wiki_agent.run(
    user_prompt=question,
    event_stream_handler=wiki_agent_callback
)

CALL → fetch_web_content {'wiki_url': 'https://en.wikipedia.org/wiki/Capybara'} call_fUXvGrxxgskSPEVmhOWXDeHF
URL already fetched: retrieve from local storage.
RES  ← fetch_web_content call_fUXvGrxxgskSPEVmhOWXDeHF WebPageCotent(wiki_url='https://en.wikipedia.org/wiki/Capybara', title='Capybara', content='The capybara (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, indigenous to South America, where it inhabits savannas and dense forests near water. These social animals can live in groups of up to 100 and primarily feed on grasses and aquatic plants. Capybaras are excellent swimmers and are not currently considered threatened, though they are hunted in some areas for meat and hides.')
CALL → append_webpage_summary {'summary': 'The capybara (*Hydrochoerus hydrochaeris*) is the largest rodent in the world, native to South America, particularly found in savannas and dense forests near water sources. They are social creatures, often living in groups of up to 100, and prim