# Let's go!

Install Pydantic AI:

`pip install pydantic-ai`

With Pydantic AI we can choose between [multiple LLM providers and models](https://ai.pydantic.dev/models/).

In [None]:
# Colab stuff

import sys, pathlib

if ('google.colab' in sys.modules) and (not pathlib.Path('repo').exists()):
    !git clone https://github.com/marcelpauly/scicar-agents.git
    %cd scicar-agents
    %pip install -q -r requirements.txt

In [None]:
import os
from typing import Literal

import pandas as pd
from pydantic import BaseModel, Field, HttpUrl

from pydantic_ai import Agent, NativeOutput, PromptedOutput, Tool, WebSearchTool
from pydantic_ai.models.openai import OpenAIResponsesModel
from pydantic_ai.models.anthropic import AnthropicModel
from pydantic_ai.models.google import GoogleModel

# Allow async code to run inside Jupyter Notebook's existing event loop.
# The cells below call `agent.run_sync(...)` from within the notebook
# (presumably this is what needs the nested event loop support -- confirm
# against the Pydantic AI documentation).
import nest_asyncio
nest_asyncio.apply()

**Start cheap, then scale up if needed**

Different providers, different models: start with the cheapest model that meets your formal requirements. If, despite a well-designed prompt, the output quality is unsatisfactory, then move up to more capable, more expensive models to see whether they deliver better results.

In [None]:
# Registry of the available models, keyed by a short nickname.
# Replace each 'YOUR_..._API_KEY' placeholder with your own key before running.
# Each key is exported as an environment variable right before the matching
# model object is created (presumably that is where the model looks it up).
models = {}

# --- OpenAI ---
# Generate API key: https://platform.openai.com/settings/organization/api-keys
# Models: https://platform.openai.com/docs/models
# Pricing: https://platform.openai.com/docs/pricing
os.environ.update(OPENAI_API_KEY='YOUR_OPENAI_API_KEY')
models.update(gpt=OpenAIResponsesModel('gpt-5-nano-2025-08-07'))

# --- Anthropic ---
# Generate API key: https://console.anthropic.com/settings/keys
# Models: https://docs.anthropic.com/en/docs/about-claude/models/overview
# Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing
os.environ.update(ANTHROPIC_API_KEY='YOUR_ANTHROPIC_API_KEY')
models.update(claude=AnthropicModel('claude-3-5-haiku-20241022'))

# --- Google ---
# Generate API key: https://aistudio.google.com/apikey
# Models: https://ai.google.dev/gemini-api/docs/models
# Pricing: https://ai.google.dev/gemini-api/docs/pricing
os.environ.update(GOOGLE_API_KEY='YOUR_GOOGLE_API_KEY')
models.update(gemini=GoogleModel('gemini-2.5-flash-lite'))

In [None]:
# Choose a model: use one of the keys of `models` ('gpt', 'claude' or 'gemini').
# The following cells pass this `model` to every agent they create.
model = models['gpt']

## Start with a simple prompt

In [None]:
# A single plain-text question: no system prompt, no output schema
prompt = (
    'List the most important professional conferences in Europe '
    'strongly related to data journalism, computer-assisted reporting '
    'and AI in journalism.'
)

# Create an agent for the chosen model, run it once and print the text answer
agent = Agent(model=model)
result = agent.run_sync(prompt)
print(result.output)

## Get structured answers

### Let's try it using a system prompt

In [None]:
# The system prompt asks for JSON; the user prompt is the question itself
system_prompt = (
    'Respond with a JSON array of objects. '
    'Choose concise, relevant fields and no extra prose.'
)
prompt = (
    'List the most important professional conferences in Europe '
    'strongly related to data journalism, computer-assisted reporting '
    'and AI in journalism.'
)

agent = Agent(
    model,
    model_settings={'temperature': 0},   # more focused, less creative responses
    system_prompt=system_prompt,         # add system prompt
)

# The output is still plain text; nothing here validates it as JSON
result = agent.run_sync(prompt)
print(result.output)

### Let's use a Pydantic model

Different LLM providers take different approaches to delivering structured responses (JSON Mode with Anthropic, Structured Output with OpenAI).

Pydantic AI takes Pydantic models and translates them into the instructions the chosen model uses.

In [None]:
# System prompt: sets the role and asks for the structured output only
system_prompt = 'You are an expert on professional conferences. Return only the structured output requested.'
# User prompt: the same question as in the previous examples
prompt = 'List the most important professional conferences in Europe strongly related to data journalism, computer-assisted reporting and AI in journalism.'

class Conferences(BaseModel):
    # A class in Python defines a 'blueprint' for data.
    # In this case we use a Pydantic model to specify
    # which fields each conference must have.
    # Note: despite the plural name, one instance describes one conference;
    # the list of conferences is requested below via `list[Conferences]`.
    name: str
    organizer: str
    # Free text or one of the two fixed values; the description explains the expected format
    location: str | Literal['online only', 'various'] = Field(description = 'City, Country where the conference is held, or "online only" or "various".')
    topics: list[str]
    # Only the listed values are allowed
    frequency: Literal['monthly', 'semiannual', 'annual', 'biennial', 'one-time', 'irregular', 'other', 'unknown']
    # A URL, or None when no website is given
    website: HttpUrl | None

# Expected result: a list in which every item follows the `Conferences` model
output_schema = list[Conferences]

# Same agent as before, but now with an output type and retry limits
agent = Agent(
    model,
    output_type=output_schema,
    system_prompt=system_prompt,
    model_settings={'temperature': 0},
    output_retries=2,
    retries=2,
)

# The trailing expression displays the returned objects in the notebook
result = agent.run_sync(prompt)
result.output

In [None]:
# Look at the first returned conference via its `model_dump()` representation
result.output[0].model_dump()

In [None]:
# Convert Pydantic objects to a list of dictionaries
def convert_pydantic_objects_to_list_of_dicts(conferences: list[Conferences]) -> list[dict]:
    """
    Call `model_dump()` on every item of `conferences` and return the
    results as a new list, in the same order.
    """
    rows = []
    for conference in conferences:
        rows.append(conference.model_dump())
    return rows

# Convert the agent's output and display the resulting list
conferences = convert_pydantic_objects_to_list_of_dicts(result.output)
conferences

In [None]:
# Make it a DataFrame

# Built from the list of conference dictionaries; the trailing `df` displays it
df = pd.DataFrame(conferences)
df

What about the quality?

In [None]:

# A factual question, asked without giving the model any tools
prompt = 'When will the 2025 federal elections be held in Germany?'

agent = Agent(model)

result = agent.run_sync(prompt)
print(result.output)

In [None]:
# The same question again, this time with web search switched on
prompt = 'When will the 2025 federal elections be held in Germany?'

agent = Agent(
    model,
    builtin_tools=[WebSearchTool()],   # Enable web search
)

result = agent.run_sync(prompt)
print(result.output)

## Improving and enriching the output by building an agent

### Using the LLM's builtin [web search tool](https://ai.pydantic.dev/builtin-tools/#web-search-tool)

(Alternatives: the [DuckDuckGo Search Tool](https://ai.pydantic.dev/common-tools/#duckduckgo-search-tool) and the [Tavily Search Tool](https://ai.pydantic.dev/common-tools/#tavily-search-tool), both part of Pydantic AI's built-in "common tools".)

In [None]:
# System prompt: as before, plus the instruction to use the web search tool
system_prompt = 'You are an expert on professional conferences. Return only the structured output requested. You use the tools provided to you: Use the web search (`WebSearchTool`) to find conferences on the internet.'
# User prompt: the same question as in the previous examples
prompt = 'List the most important professional conferences in Europe strongly related to data journalism, computer-assisted reporting and AI in journalism.'

class Conferences(BaseModel):
    # Fields of one conference entry (one instance describes one conference,
    # despite the plural class name).
    name: str
    organizer: str
    # Free text or one of the two fixed values; the description explains the expected format
    location: str | Literal['online only', 'various'] = Field(description = 'City, Country where the conference is held, or "online only" or "various".')
    topics: list[str]
    # Only the listed values are allowed
    frequency: Literal['monthly', 'semiannual', 'annual', 'biennial', 'one-time', 'irregular', 'other', 'unknown']
    # A URL, or None when no website is given
    website: HttpUrl | None

# Expected result: a list in which every item follows the `Conferences` model
output_schema = list[Conferences]

def get_output_type(model, output_schema):
    """
    Wrap `output_schema` in the output mode that fits the given model.

    An `OpenAIResponsesModel` gets `NativeOutput`; every other model gets
    `PromptedOutput`. The aim is that each model can both use tools and
    provide structured output.
    """
    is_openai = isinstance(model, OpenAIResponsesModel)
    output_mode = NativeOutput if is_openai else PromptedOutput
    return output_mode(output_schema)

# Pick the output mode that matches the chosen model
output_type = get_output_type(model, output_schema)

# Structured output combined with the built-in web search
agent = Agent(
    model,
    output_type=output_type,
    system_prompt=system_prompt,
    builtin_tools=[WebSearchTool()],   # for web search capabilities
    model_settings={'temperature': 0},
    output_retries=3,
    retries=3,
)

# Run once and display the conferences as a list of dictionaries
result = agent.run_sync(prompt)
convert_pydantic_objects_to_list_of_dicts(result.output)

### Building our own tool

**Warning:** This doesn't work with Google Gemini when you request structured output at the same time.

In [None]:
# Countries treated as part of the Schengen area. 'Czech Republic' and
# 'Czechia' are both listed so that either name of the country is recognised.
schengen_countries = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Czech Republic', 'Czechia', 'Denmark',
    'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Italy',
    'Latvia', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Norway',
    'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland'
]

def is_schengen_country(country: str) -> bool:
    """
    Return whether a given country is in the Schengen area.

    The comparison ignores surrounding whitespace, upper/lower case and a
    leading "the" (as in "The Netherlands"), so that small spelling
    variations of a listed name still match.

    Args:
        country: English country name, e.g. 'Greece' or 'the Netherlands'.

    Returns:
        True if the name matches an entry of `schengen_countries`, else False.
    """
    # Normalise the input: trim, case-fold, drop a leading article
    normalized = country.strip().casefold().removeprefix('the ').strip()
    # Compare against the list on every call so that later edits to
    # `schengen_countries` are always taken into account
    return any(normalized == name.casefold() for name in schengen_countries)

In [None]:
# Quick sanity check

# 'Greece' is listed in `schengen_countries`, so this should display True
is_schengen_country('Greece')

In [None]:
# System prompt: names both tools, the web search and our own `is_schengen_country`
system_prompt = 'You are an expert on professional conferences. Return only the structured output requested. You use the tools provided to you: Use the web search (`WebSearchTool`) to find conferences on the internet, and `is_schengen_country` to look up whether a country is in the Schengen area.'
# User prompt: this time the question is about conferences in Germany
prompt = 'List the most important professional conferences in Germany strongly related to data journalism, computer-assisted reporting and AI in journalism.'

class Conferences(BaseModel):
    # Fields of one conference entry (one instance describes one conference,
    # despite the plural class name). Compared with the earlier definitions,
    # this version adds the boolean `is_schengen_country` field.
    name: str
    organizer: str
    # Free text or one of the two fixed values; the description explains the expected format
    location: str | Literal['online only', 'various'] = Field(description = 'City, Country where the conference is held, or "online only" or "various".')
    is_schengen_country: bool = Field(description = 'Whether the location (country) is in the Schengen area.')   # new field
    topics: list[str]
    # Only the listed values are allowed
    frequency: Literal['monthly', 'semiannual', 'annual', 'biennial', 'one-time', 'irregular', 'other', 'unknown']
    # A URL, or None when no website is given
    website: HttpUrl | None

def get_output_type(model, output_schema):
    """
    Wrap `output_schema` in the output mode that fits the given model.

    An `OpenAIResponsesModel` gets `NativeOutput`; every other model gets
    `PromptedOutput`. The aim is that each model can both use tools and
    provide structured output.
    """
    is_openai = isinstance(model, OpenAIResponsesModel)
    output_mode = NativeOutput if is_openai else PromptedOutput
    return output_mode(output_schema)

# Expected result: a list of `Conferences` items, in the mode that fits the model
output_schema = list[Conferences]
output_type = get_output_type(model, output_schema)

agent = Agent(
    model,
    output_type=output_type,
    system_prompt=system_prompt,
    builtin_tools=[WebSearchTool()],   # the model's built-in web search
    tools=[
        # our own Python function, offered to the model as a tool
        Tool(
            is_schengen_country,
            description='Return whether a given country is in the Schengen area.',
        ),
    ],
    model_settings={'temperature': 0},
    output_retries=3,
    retries=3,
)

# Run once and display the conferences as a list of dictionaries
result = agent.run_sync(prompt)
convert_pydantic_objects_to_list_of_dicts(result.output)

**Let’s inspect what happened under the hood**

(internal tools like web search may not be visible here)

In [None]:
# Print every part of every message of the last run, each followed by an empty line
for message in result.all_messages():
    for part in message.parts:
        print(part, end='\n\n')