In [None]:
import json
import re
from typing import Any, Dict

def parse_json(input_string: str):
    """
    Parse a string as JSON, falling back to fenced code blocks.

    The string is first parsed as-is. If that fails, the content of every
    code block formatted as:
        ```json
        ... JSON content ...
        ```
    is tried in order of appearance, followed by the content of every code
    block delimited by bare triple backticks. The first candidate that is
    valid JSON is returned.

    Parameters:
        input_string (str): The input string which may contain JSON.

    Returns:
        The parsed JSON value.

    Raises:
        json.JSONDecodeError: (a ValueError subclass) if neither the raw
            string nor any extracted code block is valid JSON. The error
            raised is the one produced by the direct parsing attempt.
    """
    # Try to parse the string directly.
    try:
        return json.loads(input_string)
    except json.JSONDecodeError as err:
        # Keep the original failure so it can be re-raised if no fallback works.
        error = err

    # Patterns are ordered from most to least specific.
    patterns = (
        # Code block with a "json" label (case-insensitive).
        re.compile(r"```json\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE),
        # Any code block delimited by triple backticks.
        re.compile(r"```(.*?)```", re.DOTALL),
    )

    # Try every block matched by each pattern, not only the first one, so a
    # non-JSON block appearing earlier in the text does not hide a valid one.
    for pattern in patterns:
        for match in pattern.finditer(input_string):
            json_candidate = match.group(1).strip()
            try:
                return json.loads(json_candidate)
            except json.JSONDecodeError:
                continue

    # If all attempts fail, surface the error from the direct parse.
    raise error


def evaluate(ground_truth: Any, predictions: Any, strict_json: bool = True) -> Dict[str, Any]:
    """
    Score a prediction against its ground truth.

    Parameters:
        ground_truth: A dict, or a JSON string decoding to a dict, with the
            keys "categories" (a dict), "sentiment" and "urgency".
        predictions: A dict or a string holding the model output in the same
            layout. Missing keys and non-object JSON are scored as incorrect
            instead of raising.
        strict_json: When True, strings must be pure JSON (json.loads). When
            False, parse_json is used, which also accepts JSON wrapped in a
            fenced code block.

    Returns:
        A dict with:
            "is_valid_json": True when both inputs could be parsed.
            "correct_categories": fraction of ground-truth category keys whose
                predicted value equals the ground-truth value (0.0 when the
                ground truth has no categories).
            "correct_sentiment" / "correct_urgency": exact-match booleans.
            "total": mean of the three "correct_*" scores.

    Raises:
        KeyError / TypeError: if the ground truth itself lacks the expected
            keys or is not a dict after parsing.
    """
    result = {
        "is_valid_json": False,
        "correct_categories": 0.,
        "correct_sentiment": False,
        "correct_urgency": False,
    }
    try:
        if not isinstance(ground_truth, dict):
            ground_truth = json.loads(ground_truth) if strict_json else parse_json(ground_truth)
        if not isinstance(predictions, dict):
            predictions = json.loads(predictions) if strict_json else parse_json(predictions)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers non-string input such as a None model response;
        # either way the scores stay at their zero defaults.
        pass
    else:
        result["is_valid_json"] = True
        # Valid JSON that is not an object (list, number, null, ...) carries
        # none of the expected fields, so every score stays at zero.
        if isinstance(predictions, dict):
            expected_categories = ground_truth["categories"]
            predicted_categories = predictions.get("categories")
            if not isinstance(predicted_categories, dict):
                predicted_categories = {}
            # A category missing from the prediction counts as wrong; the
            # membership test keeps a missing key from matching a None label.
            matches = sum(
                key in predicted_categories and predicted_categories[key] == value
                for key, value in expected_categories.items()
            )
            if expected_categories:
                result["correct_categories"] = matches / len(expected_categories)
            for field in ("sentiment", "urgency"):
                result[f"correct_{field}"] = (
                    field in predictions and predictions[field] == ground_truth[field]
                )
    scores = [float(v) for k, v in result.items() if k.startswith("correct_")]
    result["total"] = sum(scores) / len(scores)
    return result

In [None]:
import yaml

# Load the labelled examples. The encoding is given explicitly so decoding
# does not depend on the platform's default locale.
with open('facility_v2_train.json', encoding='utf-8') as stream:
    dataset = json.load(stream)

# Load the prompt templates; safe_load only builds plain Python objects.
with open('facility_prompt.yaml', encoding='utf-8') as stream:
    prompt = yaml.safe_load(stream)

In [None]:
# Hold out everything past the first 70% of the dataset as the test split,
# then display how many entries that leaves.
dataset_test = dataset[int(0.7 * len(dataset)):]
len(dataset_test)

In [None]:
# from openai import OpenAI
from gen_ai_hub.proxy.native.openai import OpenAI

In [None]:
# Create the client used for the chat-completion requests in the evaluation loop.
# NOTE(review): no arguments are passed, so endpoint and credentials are
# presumably resolved from the environment by gen_ai_hub; confirm.
client = OpenAI()

In [None]:
from tqdm.auto import tqdm

# Per-entry evaluation records, filled in dataset order.
result = []

for entry in tqdm(dataset_test):
    # Build the two-message conversation: the fixed system prompt plus the
    # user template filled in with this entry's fields.
    messages = [
        {"role": "system", "content": prompt["system"]},
        {"role": "user", "content": prompt["user"].format(**entry["fields"])},
    ]
    output = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0.0,
    )
    # Score the text of the first returned choice against the reference answer.
    prediction = output.choices[0].message.content
    result.append(evaluate(entry["answer"], prediction))

    

In [None]:
# Pick the numeric (int, float or bool) fields from the first record, then
# display the mean of each of them across all evaluated entries.
float_keys = [
    name
    for name, sample in result[0].items()
    if isinstance(sample, (int, float, bool))
]
{name: sum(row[name] for row in result) / len(result) for name in float_keys}

In [None]:
# gpt-4o -> {'is_valid_json': 0.967, 'correct_categories': 0.895, 'correct_sentiment': 0.517, 'correct_urgency': 0.767, 'total': 0.726}