# Notebook 04: Structured Outputs & JSON Schema

**Objectives:**
- Extract structured data with json_extract.v1 prompt
- Validate against JSON schemas
- Repair malformed JSON
- Log token costs for validation + repair cycles

In [7]:
import sys
import pprint
sys.path.append('..')

from utils.prompts import render
from utils.llm_client import LLMClient
from utils.logging_utils import log_llm_call
from utils.router import pick_model
from utils.json_utils import safe_parse_json, validate_json_schema, create_simple_schema, format_schema_for_prompt
import json

## Part 1: JSON Extraction with Schema

Define a schema and extract structured data.

In [8]:
text = """The CloudSync Pro Business plan costs 20 LKR per user per month.
It includes 10TB storage and is currently available."""

schema = create_simple_schema({
                            "name": "string",
                            "price": "number",
                            "currency": "string",
                            "in_stock": "boolean"
                        }, required=["name", "price", "currency"])
pprint.pprint(schema)

{'properties': {'currency': {'type': 'string'},
                'in_stock': {'type': 'boolean'},
                'name': {'type': 'string'},
                'price': {'type': 'number'}},
 'required': ['name', 'price', 'currency'],
 'type': 'object'}


In [9]:
prompt_text, spec = render("json_extract.v1", schema=schema, text=text)
print(prompt_text)

Extract the requested fields and return ONLY valid JSON matching this schema:
{'type': 'object', 'properties': {'name': {'type': 'string'}, 'price': {'type': 'number'}, 'currency': {'type': 'string'}, 'in_stock': {'type': 'boolean'}}, 'required': ['name', 'price', 'currency']}

Text:
The CloudSync Pro Business plan costs 20 LKR per user per month.
It includes 10TB storage and is currently available.

Return ONLY JSON, no extra text.


In [10]:
model = pick_model('groq', 'general')
client = LLMClient('groq', model)

response = client.json_chat(
    [
        {"role": "user", "content": prompt_text}
    ],
    temperature=0.0
)['text']

Handle malformed JSON with automatic repair.

In [11]:
success, data, error = safe_parse_json(response)
pprint.pprint(data)

{'currency': 'LKR',
 'in_stock': True,
 'name': 'CloudSync Pro Business',
 'price': 20}


## Part 2: Pydantic Models (Recommended Approach)

Use Pydantic for type-safe structured outputs with automatic validation and IDE support.

In [12]:
from pydantic import BaseModel, Field
from utils.json_utils import (
                            format_pydantic_schema_for_prompt,
                            parse_json_with_pydantic
)

class ProductInfo(BaseModel):
    name: str = Field(..., description="The name of the product")
    price: float = Field(..., description="The price of the product")
    currency: str = Field(..., description="The currency of the product")
    in_stock: bool = Field(..., description="Whether the product is in stock")


schema_str = format_pydantic_schema_for_prompt(ProductInfo)
print(schema_str)

{
  "properties": {
    "name": {
      "description": "The name of the product",
      "title": "Name",
      "type": "string"
    },
    "price": {
      "description": "The price of the product",
      "title": "Price",
      "type": "number"
    },
    "currency": {
      "description": "The currency of the product",
      "title": "Currency",
      "type": "string"
    },
    "in_stock": {
      "description": "Whether the product is in stock",
      "title": "In Stock",
      "type": "boolean"
    }
  },
  "required": [
    "name",
    "price",
    "currency",
    "in_stock"
  ],
  "title": "ProductInfo",
  "type": "object"
}


In [None]:
from pandas.io.formats.printing import _pprint_seq


text = """The CloudSync Pro Business plan costs $20 per user per month.
It includes 10TB storage and is currently available."""

prompt_text, spec = render("json_extract.v1", schema=schema_str, text=text)

model = pick_model('groq', 'general')
client = LLMClient('groq', model)

response = client.json_chat(
    [
        {"role": "user", "content": prompt_text}
    ],
    temperature=0.0
)['text']
print(response)

In [None]:
success, data, error = safe_parse_json(response)
pprint.pprint(data)