In [1]:
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Literal

from pydantic import BaseModel, Field

In [2]:
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

In [3]:
from src.gepadantic.scaffold import GepaConfig, run_optimization_pipeline

In [4]:
import requests


url = "https://raw.githubusercontent.com/meta-llama/llama-prompt-ops/refs/heads/main/use-cases/facility-support-analyzer/dataset.json"
dataset = json.loads(requests.get(url).text)

In [5]:
dataset[0]

{'fields': {'input': 'Subject: Urgent Assistance Required for Specialized Cleaning Services\n\nDear ProCare Facility Solutions Support Team,\n\nI hope this message finds you well. My name is [Sender], and my family and I have been availing your services for our home for the past year. We have always appreciated the high standards and professionalism your team brings to maintaining our living environment.\n\nHowever, we are currently facing an urgent issue that requires immediate attention. We recently hosted a large gathering at our home, and despite our best efforts, there are several areas that now require specialized cleaning. Specifically, we need deep cleaning for our carpets and upholstery, as well as thorough window washing. The situation is quite pressing as we have more guests arriving soon, and we want to ensure our home is in pristine condition to welcome them.\n\nWe have tried some basic cleaning ourselves, but the results have not been satisfactory. Given the high standard

In [6]:
dataset[0]['fields']['input']


'Subject: Urgent Assistance Required for Specialized Cleaning Services\n\nDear ProCare Facility Solutions Support Team,\n\nI hope this message finds you well. My name is [Sender], and my family and I have been availing your services for our home for the past year. We have always appreciated the high standards and professionalism your team brings to maintaining our living environment.\n\nHowever, we are currently facing an urgent issue that requires immediate attention. We recently hosted a large gathering at our home, and despite our best efforts, there are several areas that now require specialized cleaning. Specifically, we need deep cleaning for our carpets and upholstery, as well as thorough window washing. The situation is quite pressing as we have more guests arriving soon, and we want to ensure our home is in pristine condition to welcome them.\n\nWe have tried some basic cleaning ourselves, but the results have not been satisfactory. Given the high standards we have come to exp

In [7]:
dataset[0]['answer']

'{"categories": {"routine_maintenance_requests": false, "customer_feedback_and_complaints": false, "training_and_support_requests": false, "quality_and_safety_concerns": false, "sustainability_and_environmental_practices": false, "cleaning_services_scheduling": false, "specialized_cleaning_services": true, "emergency_repair_services": false, "facility_management_issues": false, "general_inquiries": false}, "sentiment": "neutral", "urgency": "high"}'

In [8]:
class EmailInput(BaseModel):
    """Input model for email classification."""
    
    input: str = Field(description="The email content to classify")


class EmailCategories(BaseModel):
    """Categories for email classification."""
    
    routine_maintenance_requests: bool
    customer_feedback_and_complaints: bool
    training_and_support_requests: bool
    quality_and_safety_concerns: bool
    sustainability_and_environmental_practices: bool
    cleaning_services_scheduling: bool
    specialized_cleaning_services: bool
    emergency_repair_services: bool
    facility_management_issues: bool
    general_inquiries: bool


class EmailClassification(BaseModel):
    """Expected output for email classification."""
    
    categories: EmailCategories
    sentiment: str = Field(description="Sentiment: positive, neutral, or negative")
    urgency: str = Field(description="Urgency level: low, medium, or high")

In [14]:
from typing import Callable, Any

from gepadantic.types import InputModelT, DataInstWithInput


def init_dataset(
    input_data: list[dict[str, Any]],
) -> list[DataInstWithInput[InputModelT]]:
    """Load data from a list of dicts and convert to DataInstWithInput instances."""
    
    dataset: list[DataInstWithInput[InputModelT]] = []

    for idx, item in enumerate(input_data):
        if not isinstance(item, dict):
            raise ValueError(f"Item at index {idx} is not a dict: {type(item)}")

        # Convert dict to input model using user-provided mapper
        try:
            input_instance = EmailInput(input=item['fields']['input'])
        except Exception as e:
            raise ValueError(f"Error mapping item {idx} to input model: {e}") from e

        # Extract metadata
        metadata = {
            'expected_output': json.loads(item['answer']),
        }

        case_id = f"item-{idx}"

        # Create DataInstWithInput
        data_inst = DataInstWithInput[InputModelT](
            input=input_instance,
            message_history=None,
            metadata=metadata,
            case_id=case_id,
        )
        dataset.append(data_inst)

    return dataset

In [15]:
prepared_dataset = init_dataset(dataset)

In [18]:
prepared_dataset[0].__dict__

{'input': EmailInput(input='Subject: Urgent Assistance Required for Specialized Cleaning Services\n\nDear ProCare Facility Solutions Support Team,\n\nI hope this message finds you well. My name is [Sender], and my family and I have been availing your services for our home for the past year. We have always appreciated the high standards and professionalism your team brings to maintaining our living environment.\n\nHowever, we are currently facing an urgent issue that requires immediate attention. We recently hosted a large gathering at our home, and despite our best efforts, there are several areas that now require specialized cleaning. Specifically, we need deep cleaning for our carpets and upholstery, as well as thorough window washing. The situation is quite pressing as we have more guests arriving soon, and we want to ensure our home is in pristine condition to welcome them.\n\nWe have tried some basic cleaning ourselves, but the results have not been satisfactory. Given the high st

In [19]:
import random

random.Random(42).shuffle(prepared_dataset)

train_set = prepared_dataset[:int(len(prepared_dataset) * 0.33)]
val_set = prepared_dataset[int(len(prepared_dataset) * 0.33):int(len(prepared_dataset) * 0.66)]
test_set = prepared_dataset[int(len(prepared_dataset) * 0.66):]

In [21]:
print(train_set[0].input.input)

Subject: Inquiry About Your Eco-Friendly Practices

Dear ProCare Facility Solutions Support Team,

I hope this message finds you well. My name is [Sender], and I am a mother of a 10-year-old daughter who has recently started Irish dance. We live in a residential complex where your team provides cleaning and maintenance services.

I am writing to inquire about the eco-friendly practices your company employs. As someone who is conscious about the environment and wants to set a good example for my daughter, I am keen to understand how your services align with sustainable practices. Specifically, I am interested in the types of cleaning products you use and any measures you take to reduce the carbon footprint of your operations.

I have noticed that the cleaning staff is very diligent and thorough, which I appreciate. However, I would like to know more about the environmental impact of the products and methods used. Are there any certifications or standards that your company adheres to in 

In [22]:
class FacilitySupportAnalyzer(BaseModel):
    """Analyzes facility support requests."""
    
    urgency: Literal['low', 'medium', 'high'] = Field(description="The urgency of the request")
    sentiment: Literal['positive', 'neutral', 'negative'] = Field(description="The sentiment of the request")
    categories: list[Literal[
        "emergency_repair_services", 
        "routine_maintenance_requests", 
        "quality_and_safety_concerns", 
        "specialized_cleaning_services", 
        "general_inquiries", 
        "sustainability_and_environmental_practices", 
        "training_and_support_requests", 
        "cleaning_services_scheduling", 
        "customer_feedback_and_complaints", 
        "facility_management_issues"]] = Field(description="The categories of the request")

In [23]:
from gepadantic.lm import get_openai_model
from pydantic_ai import Agent


DEFAULT_INSTRUCTIONS = """
Read the following facility support request and analyze it to determine the urgency, sentiment, and categories.
"""

DEFAULT_MODEL = "gpt-4.1-nano"


test_agent = Agent(
    model=get_openai_model(DEFAULT_MODEL),
    instructions=DEFAULT_INSTRUCTIONS,
    output_type=FacilitySupportAnalyzer,
    retries=5,
)

In [24]:
import nest_asyncio
nest_asyncio.apply()

test_response = test_agent.run_sync(train_set[0].input.input)

In [25]:
test_response.output.model_dump()

{'urgency': 'low',
 'sentiment': 'neutral',
 'categories': ['sustainability_and_environmental_practices']}

In [26]:
train_set[0].metadata['expected_output']

{'categories': {'routine_maintenance_requests': False,
  'customer_feedback_and_complaints': False,
  'training_and_support_requests': False,
  'quality_and_safety_concerns': False,
  'sustainability_and_environmental_practices': True,
  'cleaning_services_scheduling': False,
  'specialized_cleaning_services': True,
  'emergency_repair_services': False,
  'facility_management_issues': False,
  'general_inquiries': True},
 'sentiment': 'neutral',
 'urgency': 'low'}

In [28]:
def score_urgency(gold_urgency, pred_urgency):
    """
    Compute score for the urgency module.
    """
    score = 1.0 if gold_urgency == pred_urgency else 0.0
    return score

def score_sentiment(gold_sentiment, pred_sentiment):
    """
    Compute score for the sentiment module.
    """
    score = 1.0 if gold_sentiment == pred_sentiment else 0.0
    return score

def score_categories(gold_categories, pred_categories):
    """
    Compute score for the categories module.
    Uses the same match/mismatch logic as category accuracy in the score.
    """
    correct = 0
    for k, v in gold_categories.items():
        if v and k in pred_categories:
            correct += 1
        elif not v and k not in pred_categories:
            correct += 1
    score = correct / len(gold_categories)
    return score

In [31]:
from gepadantic.types import DataInstWithInput, RolloutOutput


def metric(
    data_inst: DataInstWithInput[EmailInput],
    output: RolloutOutput[FacilitySupportAnalyzer],
) -> float:
    """Evaluate facility support request analysis.
    
    This metric checks if the predicted urgency, sentiment, and categories match the ground truth.
    
    Args:
        data_inst: Input data instance with metadata containing ground truth.
        output: Agent's output to evaluate.
        
    Returns:
        Float score between 0.0 and 1.0.
    """
    if not output.success:
        return 0.0
    
    # Extract predicted urgency, sentiment, and categories
    pred_urgency = output.result.urgency
    pred_sentiment = output.result.sentiment
    pred_categories = output.result.categories
    
    # Extract ground truth from metadata
    gold_urgency = data_inst.metadata['expected_output']['urgency']
    gold_sentiment = data_inst.metadata['expected_output']['sentiment']
    gold_categories = data_inst.metadata['expected_output']['categories']
    
    # Score each module
    urgency_score = score_urgency(gold_urgency, pred_urgency)
    sentiment_score = score_sentiment(gold_sentiment, pred_sentiment)
    categories_score = score_categories(gold_categories, pred_categories)
    
    # Calculate final score as average of all module scores
    final_score = (urgency_score + sentiment_score + categories_score) / 3.0
    
    return final_score

In [32]:
test_output = RolloutOutput(
    result=test_response.output,
    success=True,
)

metric(train_set[0], test_output)

0.9333333333333332

In [33]:
import numpy as np
import asyncio
import nest_asyncio

nest_asyncio.apply()


async def evaluate_single_instance(i: int, data_inst):
    """Evaluate a single test instance."""
    response = await test_agent.run(data_inst.input.input)
    output = RolloutOutput(
        result=response.output,
        success=True,
    )
    score = metric(data_inst, output)
    return score

async def evaluate_test_set_parallel():
    """Evaluate the test set with parallel execution."""
    # Create all tasks
    tasks = [
        evaluate_single_instance(i, test_set[i]) 
        for i in range(len(test_set))
    ]
    
    # Run all tasks concurrently
    scores = await asyncio.gather(*tasks)
    return scores


baseline_test_scores = await evaluate_test_set_parallel()

In [34]:
print(f"Baseline test score: {np.mean(baseline_test_scores)}")

Baseline test score: 0.7975490196078431


In [35]:
def feedback_urgency(gold_urgency, pred_urgency):
    """
    Generate feedback for the urgency module.
    """
    score = 1.0 if gold_urgency == pred_urgency else 0.0
    if gold_urgency == pred_urgency:
        feedback = f"You correctly classified the urgency of the message as `{gold_urgency}`. This message is indeed of `{gold_urgency}` urgency."
    else:
        feedback = f"You incorrectly classified the urgency of the message as `{pred_urgency}`. The correct urgency is `{gold_urgency}`. Think about how you could have reasoned to get the correct urgency label."
    return feedback, score

def feedback_sentiment(gold_sentiment, pred_sentiment):
    """
    Generate feedback for the sentiment module.
    """
    score = 1.0 if gold_sentiment == pred_sentiment else 0.0
    if gold_sentiment == pred_sentiment:
        feedback = f"You correctly classified the sentiment of the message as `{gold_sentiment}`. This message is indeed `{gold_sentiment}`."
    else:
        feedback = f"You incorrectly classified the sentiment of the message as `{pred_sentiment}`. The correct sentiment is `{gold_sentiment}`. Think about how you could have reasoned to get the correct sentiment label."
    return feedback, score


def feedback_categories(gold_categories, pred_categories):
    """
    Generate feedback for the categories module.
    Uses the same match/mismatch logic as category accuracy in the score.
    """
    correctly_included = [k for k, v in gold_categories.items() if v and k in pred_categories]
    incorrectly_included = [k for k, v in gold_categories.items() if not v and k in pred_categories]
    incorrectly_excluded = [k for k, v in gold_categories.items() if v and k not in pred_categories]
    correctly_excluded = [k for k, v in gold_categories.items() if not v and k not in pred_categories]  # For completeness in accuracy check

    # Recompute category accuracy (matches score logic)
    score = (len(correctly_included) + len(correctly_excluded)) / len(gold_categories)

    if score == 1.0:
        fb_text = f"The category classification is perfect. You correctly identified that the message falls under the following categories: `{repr(correctly_included)}`."
    else:
        fb_text = f"The category classification is not perfect. You correctly identified that the message falls under the following categories: `{repr(correctly_included)}`.\n"
        if incorrectly_included:
            fb_text += f"However, you incorrectly identified that the message falls under the following categories: `{repr(incorrectly_included)}`. The message DOES NOT fall under these categories.\n"
        if incorrectly_excluded:
            prefix = "Additionally, " if incorrectly_included else "However, "
            fb_text += f"{prefix}you didn't identify the following categories that the message actually falls under: `{repr(incorrectly_excluded)}`.\n"
        fb_text += "Think about how you could have reasoned to get the correct category labels."
    return fb_text, score

In [37]:
def metric_with_feedback(
    data_inst: DataInstWithInput[EmailInput],
    output: RolloutOutput[FacilitySupportAnalyzer],
) -> tuple[float, str | None]:
    """Evaluate facility support request analysis with feedback.
    
    This metric checks if the predicted urgency, sentiment, and categories match the ground truth.
    It also provides feedback on the reasoning process for each module.
    
    Args:
        data_inst: Input data instance with metadata containing ground truth.
        output: Agent's output to evaluate.
        
    Returns:
        Tuple of (score, feedback) where score is between 0.0 and 1.0.
    """
    
    # Extract predicted urgency, sentiment, and categories
    pred_urgency = output.result.urgency
    pred_sentiment = output.result.sentiment
    pred_categories = output.result.categories
    
    # Extract ground truth from metadata
    gold_urgency = data_inst.metadata['expected_output']['urgency']
    gold_sentiment = data_inst.metadata['expected_output']['sentiment']
    gold_categories = data_inst.metadata['expected_output']['categories']
    
    fb_urgency, score_urgency = feedback_urgency(gold_urgency, pred_urgency)
    fb_sentiment, score_sentiment = feedback_sentiment(gold_sentiment, pred_sentiment)
    fb_categories, score_categories = feedback_categories(gold_categories, pred_categories)
    
    # Calculate final score as average of all module scores
    final_score = (score_urgency + score_sentiment + score_categories) / 3.0
    
    # Combine feedback into a single string
    feedback = f"Urgency: {fb_urgency}\nSentiment: {fb_sentiment}\nCategories: {fb_categories}"
    
    return final_score, feedback

In [38]:
metric_with_feedback(train_set[0], test_output)

(0.9333333333333332,
 "Urgency: You correctly classified the urgency of the message as `low`. This message is indeed of `low` urgency.\nSentiment: You correctly classified the sentiment of the message as `neutral`. This message is indeed `neutral`.\nCategories: The category classification is not perfect. You correctly identified that the message falls under the following categories: `['sustainability_and_environmental_practices']`.\nHowever, you didn't identify the following categories that the message actually falls under: `['specialized_cleaning_services', 'general_inquiries']`.\nThink about how you could have reasoned to get the correct category labels.")

In [40]:
reflection_model = "gpt-4.1"
agent_model="gpt-4.1-mini"


config = GepaConfig(
    # Agent configuration
    agent_model=agent_model,
    agent_instructions=DEFAULT_INSTRUCTIONS,
    input_type=EmailInput,
    output_type=FacilitySupportAnalyzer,
    
    # Data and evaluation
    trainset=train_set[:10],
    valset=val_set[:10],
    metric=metric_with_feedback,
    # auto="light",
    # max_metric_calls=500,
    max_full_evals=10,
    
    # Optimization parameters
    module_selector="all",
    candidate_selection_strategy="pareto",
    optimize_tools=True,
    use_merge=True,
    
    # LLM for reflection
    reflection_model=reflection_model,
    
    # Display options
    display_progress_bar=True,
    track_best_outputs=True,
    
    # Caching for faster iterations
    enable_cache=True,
    cache_dir=".gepa_cache",
    
    # Output settings
    output_dir="./optimization_results",
    save_result=True,
)

In [41]:
result = run_optimization_pipeline(config)

Dataset: 10 training, 10 validation examples
Starting GEPA optimization...
Running GEPA for approx 200 metric calls of the program. This amounts to 10.00 full evals on the train+val set.


GEPA Optimization:   5%|▌         | 10/200 [00:00<00:02, 66.67rollouts/s]

Iteration 0: Base program full valset score: 0.7833333333333334 over 10 / 10 examples
Iteration 1: Selected program 0 score: 0.7833333333333334
Iteration 1: Proposed new text for instructions: Read the following facility support email and classify it according to facility management urgency, sentiment, and applicable domain-specific categories. Use the company ProCare Facility Solutions as context. 

- Urgency: Classify as 'high', 'medium', or 'low' based on language (e.g., words like 'urgent', 'immediate', ongoing harm or potential safety issues = 'high'; issues that need attention but aren't emergencies = 'medium'; general inquiries or praise = 'low').
- Sentiment: Determine if the email is 'positive', 'neutral', or 'negative' (e.g., clear complaints or dissatisfaction = negative; inquiries or requests = neutral; compliments or expressions of satisfaction = positive).
- Categories: Assign all relevant categories, considering:
  • quality_and_safety_concerns: Requests or complaints ab

GEPA Optimization:   8%|▊         | 16/200 [00:21<05:06,  1.67s/rollouts]

Iteration 1: New subsample score 2.8666666666666667 is not better than old score 2.8666666666666667, skipping
Iteration 2: Selected program 0 score: 0.7833333333333334
Iteration 2: Proposed new text for instructions: Read the following facility support request email. For the message, classify these attributes:
1. Urgency: Is the request low, medium, or high urgency? Consider urgency as follows: 
- Low: Flexible timeline, no immediate action needed, 'in the coming weeks', 'at your convenience', or similar language.
- Medium: Reasonably soon, but not emergency, e.g., impacting comfort, requires follow-up but not critical.
- High: Immediate action required, critical service needed, safety or operational risk present.
2. Sentiment: Classify as positive, neutral, or negative, based on the email's tone toward the support or situation described.
3. Categories: Assign all relevant categories, choosing from:
  - 'cleaning_services_scheduling' (request to book/schedule cleaning)
  - 'specialized

GEPA Optimization:  16%|█▌        | 32/200 [00:45<04:19,  1.55s/rollouts]

Iteration 2: Found a better program on the valset with score 0.8233333333333333.
Iteration 2: Valset score for new program: 0.8233333333333333 (coverage 10 / 10)
Iteration 2: Val aggregate for new program: 0.8233333333333333
Iteration 2: Individual valset scores for new program: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.9666666666666667}
Iteration 2: New valset pareto front scores: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.9666666666666667}
Iteration 2: Valset pareto front aggregate score: 0.8233333333333333
Iteration 2: Updated valset pareto front programs: {0: {0, 1}, 1: {1}, 2: {0, 1}, 3: {1}, 4: {0, 1}, 5: {0, 1}, 6: {0, 1}, 7: {0, 1}, 8: {0, 1}, 9: {1}}
Iteration 2: Best valset aggregate score so far: 0.8233333333333333
Iteration 2: Best program as per aggregate 

GEPA Optimization:  19%|█▉        | 38/200 [01:19<06:58,  2.59s/rollouts]

Iteration 3: New subsample score 1.9333333333333333 is not better than old score 1.9666666666666666, skipping
Iteration 4: Selected program 1 score: 0.8233333333333333
Iteration 4: Proposed new text for instructions: Read the following facility support email message. Analyze its body and subject line to classify the following attributes:

1. Urgency: low, medium, or high. Use these guidelines:
   - Low: The sender expresses flexibility or no pressing need (e.g., requests for information, phrases like "at your convenience", no indication of immediate concern).
   - Medium: Service is needed reasonably soon but is not urgent or critical (e.g., impacting comfort or satisfaction, but no explicit risk).
   - High: Requires immediate action (e.g., explicit requests for urgency, safety or operational risk, situations with escalation or strong wording like "immediate assistance required").
2. Sentiment: Categorize as positive, neutral, or negative. Assess the tone toward the support or service

GEPA Optimization:  27%|██▋       | 54/200 [02:03<06:29,  2.67s/rollouts]

Iteration 4: Valset score for new program: 0.74 (coverage 10 / 10)
Iteration 4: Val aggregate for new program: 0.74
Iteration 4: Individual valset scores for new program: {0: 0.6666666666666666, 1: 0.9666666666666667, 2: 0.6666666666666666, 3: 0.6333333333333333, 4: 0.9, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.6333333333333333}
Iteration 4: New valset pareto front scores: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.9666666666666667}
Iteration 4: Valset pareto front aggregate score: 0.8233333333333333
Iteration 4: Updated valset pareto front programs: {0: {0, 1, 2}, 1: {1}, 2: {0, 1, 2}, 3: {1}, 4: {0, 1}, 5: {0, 1, 2}, 6: {0, 1, 2}, 7: {0, 1, 2}, 8: {0, 1, 2}, 9: {1}}
Iteration 4: Best valset aggregate score so far: 0.8233333333333333
Iteration 4: Best program as per aggregate score on valset: 1
Iteration 4: Best score on valset: 0.82333

GEPA Optimization:  35%|███▌      | 70/200 [02:43<05:39,  2.61s/rollouts]

Iteration 5: Valset score for new program: 0.7933333333333333 (coverage 10 / 10)
Iteration 5: Val aggregate for new program: 0.7933333333333333
Iteration 5: Individual valset scores for new program: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.6666666666666666}
Iteration 5: New valset pareto front scores: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.9666666666666667}
Iteration 5: Valset pareto front aggregate score: 0.8233333333333333
Iteration 5: Updated valset pareto front programs: {0: {0, 1, 2, 3}, 1: {1, 3}, 2: {0, 1, 2, 3}, 3: {1, 3}, 4: {0, 1, 3}, 5: {0, 1, 2, 3}, 6: {0, 1, 2, 3}, 7: {0, 1, 2, 3}, 8: {0, 1, 2, 3}, 9: {1}}
Iteration 5: Best valset aggregate score so far: 0.8233333333333333
Iteration 5: Best program as per aggregate score on valset: 1
Iteration 5: Best

GEPA Optimization:  38%|███▊      | 76/200 [03:24<06:56,  3.36s/rollouts]

Iteration 6: New subsample score 2.233333333333333 is not better than old score 2.6, skipping
Iteration 7: Selected program 1 score: 0.8233333333333333
Iteration 7: Proposed new text for instructions: Read the following facility support email and classify it along these three dimensions:

1. Urgency: Label as 'low', 'medium', or 'high' considering explicit statements and implied need:
- Low: The sender states there is no rush (e.g., 'at your convenience', 'in the next couple of weeks', or similar language indicating flexibility) and there are no operational, safety, comfort, or risk factors in the message.
- Medium: The sender is requesting follow-up in a reasonable timeframe (but not critically urgent); there may be some discomfort or operational impact, or a need for follow-up, but no immediate risk or safety threat. Terms like 'ideally within the next couple of weeks' may still qualify as low urgency unless other content implies operational impact or discomfort.
- High: Immediate ac

GEPA Optimization:  46%|████▌     | 92/200 [04:09<05:39,  3.15s/rollouts]

Iteration 7: Valset score for new program: 0.7866666666666667 (coverage 10 / 10)
Iteration 7: Val aggregate for new program: 0.7866666666666667
Iteration 7: Individual valset scores for new program: {0: 0.6666666666666666, 1: 0.9666666666666667, 2: 0.6333333333333333, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.6666666666666666}
Iteration 7: New valset pareto front scores: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.9666666666666667}
Iteration 7: Valset pareto front aggregate score: 0.8233333333333333
Iteration 7: Updated valset pareto front programs: {0: {0, 1, 2, 3, 4}, 1: {1, 3}, 2: {0, 1, 2, 3}, 3: {1, 3, 4}, 4: {0, 1, 3, 4}, 5: {0, 1, 2, 3, 4}, 6: {0, 1, 2, 3, 4}, 7: {0, 1, 2, 3, 4}, 8: {0, 1, 2, 3, 4}, 9: {1}}
Iteration 7: Best valset aggregate score so far: 0.8233333333333333
Iteration 7: Best program as per aggregate 

GEPA Optimization:  49%|████▉     | 98/200 [04:48<06:23,  3.76s/rollouts]

Iteration 8: New subsample score 2.0 is not better than old score 2.0, skipping
Iteration 9: Selected program 1 score: 0.8233333333333333
Iteration 9: Proposed new text for instructions: You are given a facility support request email. Analyze the full message (including both subject and body) to classify:

1. Urgency ('low', 'medium', 'high'):
   - 'Low': No immediate action needed, flexible timeline, phrases like 'at your convenience', 'in the coming weeks', or inquiries that do not request immediate or soon action.
   - 'Medium': Request should be addressed reasonably soon but is not critical or disruptive; minor impact or could become an issue if ignored (e.g., small leaks, comfort issues).
   - 'High': Requires immediate action, poses safety, operational, or compliance risks, has explicit urgent or critical wording, or describes severe dissatisfaction with unresolved issues.
2. Sentiment ('positive', 'neutral', 'negative'):
   - 'Positive': Expresses gratitude, satisfaction, or app

GEPA Optimization:  57%|█████▋    | 114/200 [05:26<04:33,  3.18s/rollouts]

Iteration 9: Valset score for new program: 0.7500000000000001 (coverage 10 / 10)
Iteration 9: Val aggregate for new program: 0.7500000000000001
Iteration 9: Individual valset scores for new program: {0: 0.6666666666666666, 1: 0.9666666666666667, 2: 0.6333333333333333, 3: 0.6666666666666666, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6333333333333333, 9: 0.6666666666666666}
Iteration 9: New valset pareto front scores: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.9666666666666667}
Iteration 9: Valset pareto front aggregate score: 0.8233333333333333
Iteration 9: Updated valset pareto front programs: {0: {0, 1, 2, 3, 4, 5}, 1: {1, 3}, 2: {0, 1, 2, 3}, 3: {1, 3, 4}, 4: {0, 1, 3, 4, 5}, 5: {0, 1, 2, 3, 4, 5}, 6: {0, 1, 2, 3, 4, 5}, 7: {0, 1, 2, 3, 4, 5}, 8: {0, 1, 2, 3, 4}, 9: {1}}
Iteration 9: Best valset aggregate score so far: 0.8233333333333333
Iteration 9: 

GEPA Optimization:  65%|██████▌   | 130/200 [06:00<03:16,  2.80s/rollouts]

Iteration 10: Valset score for new program: 0.7866666666666667 (coverage 10 / 10)
Iteration 10: Val aggregate for new program: 0.7866666666666667
Iteration 10: Individual valset scores for new program: {0: 0.6666666666666666, 1: 0.9666666666666667, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.6333333333333333}
Iteration 10: New valset pareto front scores: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.9666666666666667}
Iteration 10: Valset pareto front aggregate score: 0.8233333333333333
Iteration 10: Updated valset pareto front programs: {0: {0, 1, 2, 3, 4, 5, 6}, 1: {1, 3}, 2: {0, 1, 2, 3, 6}, 3: {1, 3, 4, 6}, 4: {0, 1, 3, 4, 5, 6}, 5: {0, 1, 2, 3, 4, 5, 6}, 6: {0, 1, 2, 3, 4, 5, 6}, 7: {0, 1, 2, 3, 4, 5, 6}, 8: {0, 1, 2, 3, 4, 6}, 9: {1}}
Iteration 10: Best valset aggregate score so far: 0.82333333333333

GEPA Optimization:  73%|███████▎  | 146/200 [06:42<02:27,  2.73s/rollouts]

Iteration 11: Valset score for new program: 0.78 (coverage 10 / 10)
Iteration 11: Val aggregate for new program: 0.78
Iteration 11: Individual valset scores for new program: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 0.9, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.6333333333333333}
Iteration 11: New valset pareto front scores: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.9666666666666667}
Iteration 11: Valset pareto front aggregate score: 0.8233333333333333
Iteration 11: Updated valset pareto front programs: {0: {0, 1, 2, 3, 4, 5, 6, 7}, 1: {1, 3, 7}, 2: {0, 1, 2, 3, 6, 7}, 3: {1, 3, 4, 6, 7}, 4: {0, 1, 3, 4, 5, 6}, 5: {0, 1, 2, 3, 4, 5, 6, 7}, 6: {0, 1, 2, 3, 4, 5, 6, 7}, 7: {0, 1, 2, 3, 4, 5, 6, 7}, 8: {0, 1, 2, 3, 4, 6, 7}, 9: {1}}
Iteration 11: Best valset aggregate score so far: 0.8233333333333333
Iteration 11: Be

GEPA Optimization:  76%|███████▌  | 152/200 [07:03<02:17,  2.86s/rollouts]

Iteration 12: New subsample score 2.833333333333333 is not better than old score 2.8666666666666663, skipping
Iteration 13: Selected program 1 score: 0.8233333333333333
Iteration 13: Proposed new text for instructions: Read the following facility support email and classify its urgency, sentiment, and applicable categories.

1. Urgency: Is the request low, medium, or high urgency? Use these definitions:
   - Low: The sender allows a flexible timeline, there's no immediate risk or service breakdown, and phrases like 'at your convenience' or 'in the coming weeks' are used.
   - Medium: The issue needs attention in a reasonable time frame (e.g., is causing decreased comfort or performance) but is not business-critical or a safety hazard. Typical of minor service drop-offs or overlooked regular maintenance.
   - High: Immediate or urgent action is required—explicit language like "urgent", threats to safety or ongoing operations, or unresolved issues after previous attempts to fix.

2. Senti

GEPA Optimization:  84%|████████▍ | 168/200 [07:52<01:33,  2.93s/rollouts]

Iteration 13: Valset score for new program: 0.77 (coverage 10 / 10)
Iteration 13: Val aggregate for new program: 0.77
Iteration 13: Individual valset scores for new program: {0: 0.6666666666666666, 1: 0.9666666666666667, 2: 0.6666666666666666, 3: 1.0, 4: 0.9, 5: 0.3333333333333333, 6: 1.0, 7: 0.9, 8: 0.6666666666666666, 9: 0.6}
Iteration 13: New valset pareto front scores: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.9666666666666667}
Iteration 13: Valset pareto front aggregate score: 0.8233333333333333
Iteration 13: Updated valset pareto front programs: {0: {0, 1, 2, 3, 4, 5, 6, 7, 8}, 1: {1, 3, 7}, 2: {0, 1, 2, 3, 6, 7, 8}, 3: {1, 3, 4, 6, 7, 8}, 4: {0, 1, 3, 4, 5, 6}, 5: {0, 1, 2, 3, 4, 5, 6, 7, 8}, 6: {0, 1, 2, 3, 4, 5, 6, 7, 8}, 7: {0, 1, 2, 3, 4, 5, 6, 7}, 8: {0, 1, 2, 3, 4, 6, 7, 8}, 9: {1}}
Iteration 13: Best valset aggregate score so far: 0.8233333333333333
Iteration 13:

GEPA Optimization:  87%|████████▋ | 174/200 [08:23<01:25,  3.30s/rollouts]

Iteration 14: New subsample score 2.1999999999999997 is not better than old score 2.5999999999999996, skipping
Iteration 15: Selected program 1 score: 0.8233333333333333
Iteration 15: Proposed new text for instructions: Read the following facility support request (including both the subject and the body) and classify the following attributes:

1. Urgency: Label as 'low', 'medium', or 'high' based on explicit and implicit cues about timing and severity:
   - Low: No immediate action required; words like 'in the coming weeks', 'no urgency', 'at your convenience', or general inquiry or research requests.
   - Medium: Sooner is better, but not critical or dangerous; impacts comfort or continued service; requests follow-up or assessment but not urgently disruptive.
   - High: Immediate action required for safety, security, operational shutdown, major hazard, or explicit urgency.

2. Sentiment: Classify as 'positive', 'neutral', or 'negative' based on the tone toward support or the situation

GEPA Optimization:  95%|█████████▌| 190/200 [09:06<00:30,  3.06s/rollouts]

Iteration 15: Valset score for new program: 0.79 (coverage 10 / 10)
Iteration 15: Val aggregate for new program: 0.79
Iteration 15: Individual valset scores for new program: {0: 0.6666666666666666, 1: 0.9666666666666667, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.6666666666666666}
Iteration 15: New valset pareto front scores: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.9666666666666667}
Iteration 15: Valset pareto front aggregate score: 0.8233333333333333
Iteration 15: Updated valset pareto front programs: {0: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, 1: {1, 3, 7}, 2: {0, 1, 2, 3, 6, 7, 8, 9}, 3: {1, 3, 4, 6, 7, 8, 9}, 4: {0, 1, 3, 4, 5, 6, 9}, 5: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, 6: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, 7: {0, 1, 2, 3, 4, 5, 6, 7, 9}, 8: {0, 1, 2, 3, 4, 6, 7, 8, 9}, 9: {1}}
Iteration 15: Best valset ag

GEPA Optimization:  95%|█████████▌| 190/200 [09:41<00:30,  3.06s/rollouts]

Iteration 16: Valset score for new program: 0.7766666666666666 (coverage 10 / 10)
Iteration 16: Val aggregate for new program: 0.7766666666666666
Iteration 16: Individual valset scores for new program: {0: 0.6666666666666666, 1: 0.9666666666666667, 2: 0.6666666666666666, 3: 1.0, 4: 0.5666666666666667, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.9666666666666667}
Iteration 16: New valset pareto front scores: {0: 0.6666666666666666, 1: 1.0, 2: 0.6666666666666666, 3: 1.0, 4: 1.0, 5: 0.3333333333333333, 6: 1.0, 7: 0.9333333333333332, 8: 0.6666666666666666, 9: 0.9666666666666667}
Iteration 16: Valset pareto front aggregate score: 0.8233333333333333
Iteration 16: Updated valset pareto front programs: {0: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 1: {1, 3, 7}, 2: {0, 1, 2, 3, 6, 7, 8, 9, 10}, 3: {1, 3, 4, 6, 7, 8, 9, 10}, 4: {0, 1, 3, 4, 5, 6, 9}, 5: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 6: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 7: {0, 1, 2, 3, 4, 5, 6, 7, 9, 10}, 8




In [42]:
from IPython.display import Markdown, display

# Create markdown content for the results
markdown_content = f"""
## Optimization Results

**Best Score:** {result.best_score:.4f}
"""

if result.original_score is not None:
    markdown_content += f"**Original Score:** {result.original_score:.4f}\n"
    improvement = result.improvement_ratio()
    if improvement is not None:
        markdown_content += f"**Improvement:** {improvement:+.2%}\n"

markdown_content += f"""
**Iterations:** {result.num_iterations}
**Metric Calls:** {result.num_metric_calls}
**GEPA Input Tokens:** {result.gepa_usage.input_tokens:,}
**GEPA Output Tokens:** {result.gepa_usage.output_tokens:,}
**GEPA API Calls:** {result.gepa_usage.requests:,}

## Optimized Components

"""

for component_name, component_value in result.best_candidate.items():
    markdown_content += f"### {component_name}\n\n{component_value}\n\n"

display(Markdown(markdown_content))


## Optimization Results

**Best Score:** 0.8233
**Original Score:** 0.7833
**Improvement:** +5.11%

**Iterations:** 11
**Metric Calls:** 206
**GEPA Input Tokens:** 648,378
**GEPA Output Tokens:** 28,060
**GEPA API Calls:** 170

## Optimized Components

### instructions

Read the following facility support request email. For the message, classify these attributes:
1. Urgency: Is the request low, medium, or high urgency? Consider urgency as follows: 
- Low: Flexible timeline, no immediate action needed, 'in the coming weeks', 'at your convenience', or similar language.
- Medium: Reasonably soon, but not emergency, e.g., impacting comfort, requires follow-up but not critical.
- High: Immediate action required, critical service needed, safety or operational risk present.
2. Sentiment: Classify as positive, neutral, or negative, based on the email's tone toward the support or situation described.
3. Categories: Assign all relevant categories, choosing from:
  - 'cleaning_services_scheduling' (request to book/schedule cleaning)
  - 'specialized_cleaning_services' (requests needing deep cleaning, special attention)
  - 'routine_maintenance_requests' (requests for regular or expected maintenance)
  - 'customer_feedback_and_complaints' (feedback including complaints or suggestions for improvement)
  - 'quality_and_safety_concerns' (mentions missed maintenance, safety issues, or quality lapses)
  - 'sustainability_and_environmental_practices' (requests relating specifically to eco-friendly practices, NOT just mentioning eco-friendly in passing)
Assign a category only if it is clearly present in the message. Multiple categories may apply.

--
Provide your result as a JSON object: { "urgency": ..., "sentiment": ..., "categories": [...] }.


### tool:final_result:description

Analyzes facility support requests by classifying urgency (low/medium/high), sentiment (positive/neutral/negative), and all relevant request categories based on the content of the email.

### tool:final_result:param:urgency

The urgency of the request, using:
- 'low': Request is flexible in timing ('at your convenience', 'in the next couple of weeks'), no immediate service needed.
- 'medium': Request should be addressed reasonably soon (discomfort, follow-up needed, but not critical danger).
- 'high': Request needs action right away (safety issue, imminent risk, operational shutdown, explicit urgent tone).
Choose the most appropriate label based on explicit and implied timing cues.

### tool:final_result:param:sentiment

The sentiment toward the support or situation in the email: 'positive', 'neutral', or 'negative'. Assess tone—positive if appreciative or satisfied, neutral if matter-of-fact or balanced, negative if dissatisfied or highlighting significant problems.

### tool:final_result:param:categories

The list of all applicable categories for the request. Only apply a category if the message clearly refers to that area (not just a passing mention or value statement).
Available categories:
- 'cleaning_services_scheduling': Requests to arrange or change a cleaning schedule.
- 'specialized_cleaning_services': Requests for deep/special cleaning beyond regular services.
- 'routine_maintenance_requests': Maintenance or regular service requests.
- 'customer_feedback_and_complaints': Feedback, complaints, or improvement suggestions.
- 'quality_and_safety_concerns': Calls out missed checks, hazards, or performance/safety problems.
- 'sustainability_and_environmental_practices': Message specifically requests eco/environmental actions, not just mentions that the business provides them.

### signature:EmailInput:instructions

Input model for facility support email classification. Provide an email message to receive its urgency, sentiment, and specific facility-related category assignments as output.

### signature:EmailInput:input:desc

The facility support email message content to classify for urgency, sentiment, and specific categories. The body and subject line should both be considered.



In [43]:
print(result.graphviz_dag)

digraph G {
    node [style=filled, shape=circle, fontsize=50];
    0 [label="0\n(0.78)"];
    1 [label="1\n(0.82)", fillcolor=cyan, fontcolor=black];
    2 [label="2\n(0.74)"];
    3 [label="3\n(0.79)"];
    4 [label="4\n(0.79)"];
    5 [label="5\n(0.75)"];
    6 [label="6\n(0.79)"];
    7 [label="7\n(0.78)"];
    8 [label="8\n(0.77)"];
    9 [label="9\n(0.79)"];
    10 [label="10\n(0.78)"];
    0 -> 1;
    1 -> 2;
    1 -> 3;
    1 -> 4;
    1 -> 5;
    1 -> 6;
    1 -> 7;
    1 -> 8;
    1 -> 9;
    1 -> 10;
}


In [27]:
from IPython.display import SVG

SVG('optimization_results/graphviz_titanic.svg')

ExpatError: not well-formed (invalid token): line 1, column 20

In [49]:
result.best_candidate

{'instructions': 'Read the following facility support request email. For the message, classify these attributes:\n1. Urgency: Is the request low, medium, or high urgency? Consider urgency as follows: \n- Low: Flexible timeline, no immediate action needed, \'in the coming weeks\', \'at your convenience\', or similar language.\n- Medium: Reasonably soon, but not emergency, e.g., impacting comfort, requires follow-up but not critical.\n- High: Immediate action required, critical service needed, safety or operational risk present.\n2. Sentiment: Classify as positive, neutral, or negative, based on the email\'s tone toward the support or situation described.\n3. Categories: Assign all relevant categories, choosing from:\n  - \'cleaning_services_scheduling\' (request to book/schedule cleaning)\n  - \'specialized_cleaning_services\' (requests needing deep cleaning, special attention)\n  - \'routine_maintenance_requests\' (requests for regular or expected maintenance)\n  - \'customer_feedback_

In [57]:
from gepadantic import generate_system_instructions, generate_user_content


print(generate_system_instructions(
    train_set[0].input,
    candidate=result.best_candidate,
))

Input model for facility support email classification. Provide an email message to receive its urgency, sentiment, and specific facility-related category assignments as output.

Inputs

- `<input>` (str): The facility support email message content to classify for urgency, sentiment, and specific categories. The body and subject line should both be considered.


In [56]:
print(generate_user_content(
    train_set[0].input,
)[0])

<input>Subject: Inquiry About Your Eco-Friendly Practices

Dear ProCare Facility Solutions Support Team,

I hope this message finds you well. My name is [Sender], and I am a mother of a 10-year-old daughter who has recently started Irish dance. We live in a residential complex where your team provides cleaning and maintenance services.

I am writing to inquire about the eco-friendly practices your company employs. As someone who is conscious about the environment and wants to set a good example for my daughter, I am keen to understand how your services align with sustainable practices. Specifically, I am interested in the types of cleaning products you use and any measures you take to reduce the carbon footprint of your operations.

I have noticed that the cleaning staff is very diligent and thorough, which I appreciate. However, I would like to know more about the environmental impact of the products and methods used. Are there any certifications or standards that your company adheres

In [80]:
from src.gepadantic.signature_agent import SignatureAgent

# Create and configure agent
test_agent = Agent(
    model=get_openai_model(config.agent_model),
    instructions=config.agent_instructions,
    output_type=FacilitySupportAnalyzer,
)

test_signature_agent = SignatureAgent(
    test_agent,
    input_type=EmailInput,
    optimize_tools=config.optimize_tools,
)

In [None]:
nest_asyncio.apply()


optimized_test_scores: list[int] = []

for i in range(len(test_set)):
    
    test_input = test_set[i].input
    test_result = test_signature_agent.run_signature_sync(test_input, candidate=result.best_candidate)
    rollout_output = RolloutOutput(
        result=test_result.output,
        success=True,
    )
    score, feedback = metric_with_feedback(test_set[i], rollout_output)
    optimized_test_scores.append(score)

In [82]:
print(f"Optimized test score: {np.mean(optimized_test_scores)}")

Optimized test score: 0.803921568627451


In [85]:
prepared_user_prompt, instructions_override = test_signature_agent._prepare_run_arguments(
            test_set[0].input,
            candidate=result.best_candidate,
            message_history=None,
            user_prompt=None,
        )

In [86]:
print(instructions_override[0])

Read the following facility support request email. For the message, classify these attributes:
1. Urgency: Is the request low, medium, or high urgency? Consider urgency as follows: 
- Low: Flexible timeline, no immediate action needed, 'in the coming weeks', 'at your convenience', or similar language.
- Medium: Reasonably soon, but not emergency, e.g., impacting comfort, requires follow-up but not critical.
- High: Immediate action required, critical service needed, safety or operational risk present.
2. Sentiment: Classify as positive, neutral, or negative, based on the email's tone toward the support or situation described.
3. Categories: Assign all relevant categories, choosing from:
  - 'cleaning_services_scheduling' (request to book/schedule cleaning)
  - 'specialized_cleaning_services' (requests needing deep cleaning, special attention)
  - 'routine_maintenance_requests' (requests for regular or expected maintenance)
  - 'customer_feedback_and_complaints' (feedback including comp

In [87]:
normalized_user_prompt = test_signature_agent._normalize_user_prompt(prepared_user_prompt)

In [88]:
print(normalized_user_prompt[0])

<input>Subject: Urgent Concerns Regarding Sustainability Practices

Dear ProCare Support Team,

I hope this message finds you well, though I must admit, my current sentiment is far from positive. My name is [Sender], and I have been utilizing your services for my residential property for the past year. As someone who deeply values the intricate layers of environmental consciousness, much like the conceptual depth found in Megan Cutler's novels, I am disheartened by recent observations regarding your sustainability practices.

Despite your claims of prioritizing environmentally friendly methods, I have noticed several inconsistencies that suggest otherwise. For instance, the cleaning products used in my home do not appear to be eco-friendly, as they emit strong chemical odors and lack any certification labels. Additionally, the waste management practices seem haphazard, with recyclables often mixed with general waste.

I have attempted to address these issues by speaking with your on-si

In [89]:
from gepadantic.tool_components import get_or_create_tool_optimizer


tool_optimizer = get_or_create_tool_optimizer(test_signature_agent)

In [90]:
tool_optimizer.get_seed_components()

{'tool:final_result:description': 'Analyzes facility support requests.',
 'tool:final_result:param:urgency': 'The urgency of the request',
 'tool:final_result:param:sentiment': 'The sentiment of the request',
 'tool:final_result:param:categories': 'The categories of the request'}