# Facility Synthetic Dataset Evaluation

This notebook provides a comprehensive evaluation framework for testing and benchmarking LLM performance on the facility management synthetic dataset. It focuses on assessing how well language models can extract structured information from customer service messages in the facility management domain.

In [None]:
from typing import Any, Dict
import json
import re

def parse_json(input_string: str):
    """
    Parse a string as JSON, falling back to fenced code blocks.

    The whole string is parsed first. If that fails, every code block
    formatted as:
        ```json
        ... JSON content ...
        ```
    is tried in order of appearance, followed by every code block delimited
    by bare triple backticks. The first block that parses successfully wins.

    Parameters:
        input_string (str): The input string which may contain JSON.

    Returns:
        The parsed JSON object.

    Raises:
        json.JSONDecodeError: (a ValueError subclass) the error from parsing
            the whole string, re-raised when no code block yields valid JSON.
    """
    # Try to parse the string directly.
    try:
        return json.loads(input_string)
    except json.JSONDecodeError as err:
        # Keep a reference: the name bound by ``except`` is cleared when the
        # clause ends, and this error is re-raised if every fallback fails.
        error = err

    # Patterns for candidate JSON snippets, most specific first.
    patterns = [
        re.compile(r"```json\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE),  # code block with "json" label
        re.compile(r"```(.*?)```", re.DOTALL)  # any code block delimited by triple backticks
    ]

    # Try every block matched by each pattern, not only the first one, so a
    # non-JSON block earlier in the text does not hide a valid one after it.
    for pattern in patterns:
        for match in pattern.finditer(input_string):
            json_candidate = match.group(1).strip()
            try:
                return json.loads(json_candidate)
            except json.JSONDecodeError:
                # This block was not valid JSON; move on to the next candidate.
                continue

    # If all attempts fail, surface the original parsing error.
    raise error


def evaluate(ground_truth: Any, predictions: Any, strict_json: bool = True) -> Dict[str, Any]:
    """
    Score a prediction against its ground truth.

    Parameters:
        ground_truth: The reference answer, either a dict or a JSON string
            with "categories" (a dict), "sentiment" and "urgency" entries.
        predictions: The model output, either a dict or a string to decode.
        strict_json (bool): When True, strings must be pure JSON
            (``json.loads``); when False, ``parse_json`` is used instead.

    Returns:
        A dict with:
            "is_valid_json": True when both inputs could be decoded.
            "correct_categories": fraction of ground-truth categories whose
                predicted value equals the reference value.
            "correct_sentiment" / "correct_urgency": exact-match booleans.
            "total": mean of the three "correct_*" scores.

    A prediction that decodes but is not a JSON object, or that lacks a
    field, scores zero for the affected fields instead of raising. A
    malformed ground truth (missing keys) still raises, since that points to
    a dataset problem rather than a model error.
    """
    result: Dict[str, Any] = {
        "is_valid_json": False,
        "correct_categories": 0.,
        "correct_sentiment": False,
        "correct_urgency": False,
    }

    def _load(payload: Any) -> Any:
        # Dicts are taken as already decoded; anything else is parsed.
        if isinstance(payload, dict):
            return payload
        return json.loads(payload) if strict_json else parse_json(payload)

    try:
        ground_truth = _load(ground_truth)
        predictions = _load(predictions)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers non-string payloads such as None; like malformed
        # JSON they leave every score at its zero default.
        pass
    else:
        result["is_valid_json"] = True
        if isinstance(predictions, dict):
            # Sentinel that never equals a real value, so an absent field is
            # always counted as a mismatch.
            missing = object()
            expected = ground_truth["categories"]
            predicted = predictions.get("categories")
            if not isinstance(predicted, dict):
                predicted = {}
            if expected:
                hits = sum(predicted.get(k, missing) == v for k, v in expected.items())
                result["correct_categories"] = hits / len(expected)
            else:
                # No reference categories: nothing can be wrong, and this
                # avoids dividing by zero.
                result["correct_categories"] = 1.0
            result["correct_sentiment"] = predictions.get("sentiment", missing) == ground_truth["sentiment"]
            result["correct_urgency"] = predictions.get("urgency", missing) == ground_truth["urgency"]
    scores = [float(v) for k, v in result.items() if k.startswith("correct_")]
    result["total"] = sum(scores) / len(scores)
    return result

In [4]:
import yaml

# Load the evaluation records and the prompt templates. The encoding is
# pinned so the read does not depend on the platform's locale default.
with open('dataset.json', encoding='utf-8') as stream:
    dataset = json.load(stream)

with open('facility_prompt.yaml', encoding='utf-8') as stream:
    prompt = yaml.safe_load(stream)

In [5]:
# Keep the trailing 30% of the records as the held-out test split.
dataset_test = dataset[int(0.7 * len(dataset)):]
len(dataset_test)  # last expression of the cell, shown as its output

60

In [None]:
import asyncio
from tqdm.auto import tqdm
from openai import AsyncOpenAI
import os

# Async OpenAI-compatible client pointed at the OpenRouter endpoint.
# The key is read from the OPENROUTER_API_KEY environment variable, which
# must be set before this cell runs.
client = AsyncOpenAI(
    api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
)

async def process_entry(entry, model="meta-llama/llama-3.3-70b-instruct"):
    """
    Query the model for one dataset entry and score its answer.

    Parameters:
        entry: A dataset record; entry["fields"] fills the user prompt
            template and entry["answer"] is the ground truth.
        model (str): Model identifier sent with the request.

    Returns:
        The dict produced by ``evaluate`` on success, or {"error": <message>}
        when the request or the scoring raised.
    """
    try:
        output = await client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": prompt["system"]},
                {"role": "user", "content": prompt["user"].format(**entry["fields"])},
            ],
            temperature=0.
        )
        # The API types message content as optional; an absent completion is
        # scored as an invalid answer rather than reported as an error.
        prediction = output.choices[0].message.content or ""
        return evaluate(entry["answer"], prediction)
    except Exception as e:
        # Best effort: one failing request must not abort the whole batch.
        print(f"Error processing entry: {e}")
        return {"error": str(e)}

async def process_batch(entries, batch_size=10, delay=0.1):
    """
    Process entries in concurrent batches to avoid rate limits.

    Parameters:
        entries: Sequence of dataset records passed to ``process_entry``.
        batch_size (int): Number of entries requested concurrently.
        delay (float): Seconds to pause between consecutive batches.

    Returns:
        A list with one result per entry, in the same order as ``entries``.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make the range below empty and silently
        # return no results; zero is rejected by range() itself.
        raise ValueError("batch_size must be a positive integer")

    results = []

    for i in tqdm(range(0, len(entries), batch_size), desc="Processing batches"):
        batch = entries[i:i + batch_size]
        # gather returns results in the order of the awaitables passed in.
        batch_results = await asyncio.gather(*[process_entry(entry) for entry in batch])
        results.extend(batch_results)

        # Pause between batches (not after the last one) to be respectful to the API.
        if i + batch_size < len(entries):
            await asyncio.sleep(delay)

    return results

# Run the evaluation over the held-out split. Top-level ``await`` relies on
# the notebook's running event loop. ``result`` holds one dict per entry.
result = await process_batch(dataset_test, batch_size=24)  # Adjust batch_size as needed

    

  from .autonotebook import tqdm as notebook_tqdm
Processing batches:  92%|█████████▏| 11/12 [00:58<00:05,  5.46s/it]

In [None]:
# Failed requests are recorded by process_entry as {"error": ...} and carry no
# metrics, so they are left out of the averages instead of raising KeyError.
scored = [e for e in result if "error" not in e]
if len(scored) < len(result):
    print(f"Excluded {len(result) - len(scored)} failed entries from the averages")
# Average every numeric/boolean metric; an empty run yields an empty dict.
float_keys = [k for k, v in scored[0].items() if isinstance(v, (int, float, bool))] if scored else []
{k: sum(e[k] for e in scored) / len(scored) for k in float_keys}