In [None]:
from typing import Optional, Literal

import jiwer
import nest_asyncio
from pydantic import BaseModel, Field, model_validator
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIModel
from pydantic_ai.providers.openai import OpenAIProvider

# Patch asyncio so an already-running event loop can be re-entered; presumably
# needed so the synchronous agent call below works inside the notebook's loop.
nest_asyncio.apply()

In [None]:
class TextEval(BaseModel):
    """Pair a ground-truth transcript with an automated one and score them.

    The metric methods delegate to the corresponding ``jiwer`` functions,
    always passing ``reference`` first and ``hypothesis`` second.
    ``explain`` asks a language model for a rubric-based rating instead.
    """

    # Ground-truth text that the hypothesis is compared against.
    reference: str
    # Text being evaluated (described as ASR output in the LLM prompt).
    hypothesis: str
    # Optional label; not read by any method of this class.
    title: Optional[str] = None

    def wer(self) -> float:
        """Return the Word Error Rate (WER) as computed by ``jiwer.wer``."""
        return jiwer.wer(self.reference, self.hypothesis)

    def mer(self) -> float:
        """Return the Match Error Rate (MER) as computed by ``jiwer.mer``."""
        return jiwer.mer(self.reference, self.hypothesis)

    def wil(self) -> float:
        """Return the Word Information Lost (WIL) as computed by ``jiwer.wil``."""
        return jiwer.wil(self.reference, self.hypothesis)

    def wip(self) -> float:
        """Return the Word Information Preserved (WIP) as computed by ``jiwer.wip``."""
        return jiwer.wip(self.reference, self.hypothesis)

    def cer(self) -> float:
        """Return the Character Error Rate (CER) as computed by ``jiwer.cer``."""
        return jiwer.cer(self.reference, self.hypothesis)

    def explain(self) -> dict:
        """Ask a language model to rate the hypothesis against the reference.

        Builds a pydantic-ai ``Agent`` on every call, backed by the
        ``qwen3:30b`` model served through an OpenAI-compatible endpoint at
        ``http://localhost:11434/v1``, and runs it synchronously.

        Returns:
            A dict with the keys ``accuracy``, ``completeness``,
            ``preservation_of_meaning``, ``readability`` and
            ``style_and_tone_retention`` (each an int from 1 to 5), plus
            ``explanation`` (free text written by the model).
        """
        # The prompt names the two inputs exactly as they are labelled in the
        # user message below, so the model is told to judge the hypothesis
        # against the *reference* (both texts are transcripts).
        system_prompt = """
        You are a helpful and concise assistant specializing in comparing and rating transcripts.
        You will receive two texts: reference, the quality-controlled ground truth, and hypothesis, an automated transcript created using ASR.
        You will evaluate the hypothesis against the reference using the following quality criteria:
            * Accuracy, how precisely the transcription matches the actual spoken words.
            * Completeness, Whether all relevant parts of the spoken content are included.
            * Preservation of meaning, How well the original intent and ideas are retained.
            * Readability, How easy the transcription is to read and understand.
            * Style and tone retention, How well the speaker’s personal tone, expression, and manner of speaking are maintained.
        
        You will also include an explanation of the evaluation.
        """

        class QualityCriteria(BaseModel):
            # Structured output schema the agent must fill in; every score is
            # restricted to the integers 1 through 5.
            accuracy: Literal[1, 2, 3, 4, 5]  # How precisely the transcription matches the actual spoken words.
            completeness: Literal[1, 2, 3, 4, 5]  # Whether all relevant parts of the spoken content are included.
            preservation_of_meaning: Literal[1, 2, 3, 4, 5]  # How well the original intent and ideas are retained.
            readability: Literal[1, 2, 3, 4, 5]  # How easy the transcription is to read and understand.
            style_and_tone_retention: Literal[1, 2, 3, 4, 5]  # How well the speaker’s personal tone, expression, and manner of speaking are maintained.
            explanation: str

        ollama_model = OpenAIModel(
            model_name='qwen3:30b',
            provider=OpenAIProvider(base_url='http://localhost:11434/v1'),
        )
        agent = Agent(
            ollama_model,
            system_prompt=system_prompt,
            output_type=QualityCriteria,
            instrument=True,
        )

        result = agent.run_sync(f"reference: {self.reference}, hypothesis: {self.hypothesis}")
        # Convert the validated pydantic output into a plain dict for callers.
        return result.output.model_dump()
        