# HiddenOracle3 with Result Saving

This notebook extends the functionality of the previous version by saving the evaluation results to a JSON file. That way, you keep a permanent record of your local model's answers, factuality, and explanations.

In [None]:
import json
import os
from typing import Any, Dict, List, Optional

import openai
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# ---------------------------------------
# Configuration Variables
# ---------------------------------------

# (1) OpenAI API Key
# SECURITY: never commit a real key to a notebook or source file. The key is
# read from the OPENAI_API_KEY environment variable; when the variable is not
# set, an empty string is used. The key that was previously hard-coded on this
# line must be treated as compromised and revoked.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

# (2) Dataset path and local model
# Make sure DATASET_PATH is a valid path in your Google Drive.
DATASET_PATH = "/content/drive/My Drive/Colab Notebooks/HiddenOracle3/data.json"  # Example path
# Model identifier handed to `from_pretrained` when the local model is loaded.
LOCAL_MODEL_NAME = "meta-llama/Llama-2-7b-hf"  # Replace if needed

# (3) Path to where we'll write the JSON results.
OUTPUT_JSON_PATH = "/content/drive/My Drive/Colab Notebooks/HiddenOracle3/evaluation_results.json"

# ---------------------------------------
# OpenAI Setup
# ---------------------------------------
def configure_openai() -> None:
    """
    Register the module-level ``OPENAI_API_KEY`` with the ``openai`` package.

    Run this one time, ahead of any code that goes through ``openai``.
    It takes no arguments and returns nothing; its only effect is assigning
    ``openai.api_key``.
    """
    openai.api_key = OPENAI_API_KEY


In [None]:
class OpenAI:
    """A simple stand-in for OpenAI's ChatCompletion API calls.

    The class mimics the ``client.chat.completions.create(...)`` call chain,
    but :meth:`create` never contacts a remote service: it fabricates a
    tool-call response locally from the text of the last message.
    """

    def __init__(self, api_key: str):
        """Store ``api_key`` on the instance and set it on the ``openai`` module."""
        self.api_key = api_key
        openai.api_key = self.api_key

    @property
    def chat(self):
        """Return ``self`` so that ``client.chat`` resolves."""
        return self

    @property
    def completions(self):
        """Return ``self`` so that ``client.chat.completions`` resolves."""
        return self

    def create(
        self,
        model: str,
        messages: List[Dict[str, Any]],
        tools: Optional[List[Dict[str, Any]]] = None,
        tool_choice: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> Any:
        """
        Mocked method to illustrate function-calling usage. In a real scenario, you would
        call `openai.ChatCompletion.create(...)` with the appropriate function definitions.

        Args:
            model: Accepted for signature compatibility; not used by the mock.
            messages: Chat messages; only the last entry's ``"content"`` is inspected.
            tools: Accepted for signature compatibility; not used by the mock.
            tool_choice: Accepted for signature compatibility; not used by the mock.
            **kwargs: Ignored.

        Returns:
            A dict shaped like a chat completion whose single choice carries one
            ``evaluate_answer`` tool call. The call's ``arguments`` is a JSON string
            with the keys ``is_factual`` and ``explanation``.
        """
        # An empty conversation, or a last message whose content is missing or
        # not a string, is treated as empty text instead of raising.
        last_content = messages[-1].get("content") if messages else None
        user_message = last_content.lower() if isinstance(last_content, str) else ""

        # Simple example logic to simulate an API response: the verdict depends
        # solely on whether a marker keyword appears in the last message.
        if "example_correct_keyword" in user_message:
            is_factual = True
            explanation = "The local LLM's answer matches the known correct answer."
        else:
            is_factual = False
            explanation = "Local LLM answer does not match known correct answers."

        # Mock a function call structure (simulating GPT-4 function calling)
        response = {
            "choices": [
                {
                    "message": {
                        "tool_calls": [
                            {
                                "function": {
                                    "name": "evaluate_answer",
                                    "arguments": json.dumps({
                                        "is_factual": is_factual,
                                        "explanation": explanation
                                    })
                                }
                            }
                        ]
                    }
                }
            ]
        }

        return response


## Load Dataset and Local Model Generation

In [None]:
def load_dataset(dataset_path: str) -> List[Dict[str, Any]]:
    """
    Parse the JSON file at ``dataset_path`` and return its content.

    The file is opened in text mode as UTF-8 and decoded with ``json.load``;
    whatever that yields is returned unchanged.
    """
    with open(dataset_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def get_local_llm_answer(
    question: str, generation_pipeline, max_new_tokens: int = 25
) -> str:
    """
    Uses a text-generation pipeline to get an answer from the local LLM.

    Args:
        question: Prompt passed as the first positional argument to the pipeline.
        generation_pipeline: Callable invoked as
            ``generation_pipeline(question, max_new_tokens=..., num_return_sequences=1,
            do_sample=False)``. Its first result must expose a ``"generated_text"`` entry.
        max_new_tokens: Generation budget forwarded to the pipeline. Defaults to 25,
            the value that used to be hard-coded.

    Returns:
        The generated text with surrounding whitespace stripped and, when the
        output begins with ``question``, with that leading prompt removed.
    """
    results = generation_pipeline(
        question,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        do_sample=False,
    )
    generated_text = results[0]["generated_text"]

    # Trim out the prompt if the pipeline echoed it at the start of the output.
    if generated_text.startswith(question):
        generated_text = generated_text[len(question):]

    return generated_text.strip()


## Evaluate Correctness with the New Function-Calling Approach

In [None]:
def evaluate_with_openai_api(
    question: str, local_llm_answer: str, correct_answers: List[str]
) -> Dict[str, Any]:
    """
    Ask the judge, through the ``OpenAI`` wrapper class, to grade the local answer.

    A single ``evaluate_answer`` function tool is declared and forced via
    ``tool_choice``. The question, the local answer and the known correct
    answers are sent together in one user message.

    Returns:
        The decoded JSON arguments of the first tool call in the response.
        When the response carries no tool call, a fallback dict with
        ``is_factual`` set to False is returned instead.
    """
    judge = OpenAI(api_key=OPENAI_API_KEY)

    # JSON schema of the one function the judge may call.
    verdict_schema = {
        "type": "object",
        "properties": {
            "is_factual": {
                "type": "boolean",
                "description": "True if the local LLM answer is correct.",
            },
            "explanation": {
                "type": "string",
                "description": "Explanation of why the answer is correct or incorrect.",
            },
        },
        "required": ["is_factual", "explanation"],
        "additionalProperties": False,
    }
    judge_function = {
        "name": "evaluate_answer",
        "description": "Evaluate the correctness of a local LLM's answer.",
        "parameters": verdict_schema,
        "strict": True,
    }

    system_prompt = (
        "You are a judge. Compare the local LLM answer with known correct answers.\n"
        "Use the evaluate_answer function to return factuality and reasoning."
    )
    user_prompt = (
        f"Question: {question}\n"
        f"Local LLM Answer: {local_llm_answer}\n"
        f"Known Correct Answers: {correct_answers}"
    )

    response = judge.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        tools=[{"type": "function", "function": judge_function}],
        tool_choice={"type": "function", "function": {"name": "evaluate_answer"}},
    )

    reply = response["choices"][0]["message"]
    issued_calls = reply.get("tool_calls")

    # Guard clause: nothing to decode when the judge made no tool call.
    if not issued_calls:
        return {
            "is_factual": False,
            "explanation": "No tool call was made."
        }

    return json.loads(issued_calls[0]["function"]["arguments"])


## Main Evaluation Routine
Here, in addition to printing the results, we will accumulate them in a list (`evaluation_results`) and **write** them out to `OUTPUT_JSON_PATH` at the end.


In [None]:
def _matches_known_answer(local_answer: str, correct_answers: List[str]) -> bool:
    """Return True if any non-blank known answer occurs, case-insensitively, in ``local_answer``."""
    # Lower-case the model output once instead of once per candidate.
    haystack = local_answer.lower()
    # Blank candidates are skipped: an empty string is a substring of every
    # string, so it would mark every answer as correct.
    return any(
        bool(ans.strip()) and ans.lower() in haystack for ans in correct_answers
    )


def evaluate_model(dataset_path: str, local_model_name: str, output_json_path: str) -> None:
    """
    Evaluate a local LLM on a question/answer dataset and save the verdicts.

    1. Loads the dataset.
    2. Initializes the local model.
    3. Generates an answer with the local LLM.
    4. Uses a simple string match first and falls back to
       `evaluate_with_openai_api` only when no known answer matches.
    5. Prints the results and saves them to a JSON file.

    Args:
        dataset_path: JSON file passed to `load_dataset`; every item is read
            through its ``"question"`` and ``"answers"`` keys.
        local_model_name: Model identifier passed to ``from_pretrained`` for
            both the tokenizer and the causal language model.
        output_json_path: File the accumulated results are written to. It is
            written once, after every dataset item has been processed.
    """
    # Load dataset
    dataset = load_dataset(dataset_path)

    # Initialize local model pipeline
    tokenizer = AutoTokenizer.from_pretrained(local_model_name)
    model = AutoModelForCausalLM.from_pretrained(
        local_model_name,
        device_map="auto",
        low_cpu_mem_usage=True,
    )
    generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

    # List to store results for writing to file
    evaluation_results = []

    # Evaluate each dataset item
    for item in dataset:
        question = item["question"]
        correct_answers = item["answers"]

        # 1) Generate answer from local LLM
        local_answer = get_local_llm_answer(question, generation_pipeline)

        # 2) Simple string match check; on a match, skip the OpenAI API and
        #    record success.
        if _matches_known_answer(local_answer, correct_answers):
            result = {
                "is_factual": True,
                "explanation": "String match with known correct answers; no OpenAI call needed.",
            }
        else:
            # 3) If no match, call the OpenAI function-calling API
            result = evaluate_with_openai_api(
                question=question,
                local_llm_answer=local_answer,
                correct_answers=correct_answers,
            )

        # 4) Print results
        print("--------------------------------------------------")
        print(f"Question: {question}")
        print(f"Local LLM Answer: {local_answer}")
        print(f"Is Factual: {result['is_factual']}")
        print(f"Explanation: {result['explanation']}")
        print("--------------------------------------------------")

        # 5) Accumulate results for saving
        evaluation_results.append({
            "question": question,
            "answers": correct_answers,
            "local_llm_answer": local_answer,
            "is_factual": result['is_factual'],
            "explanation": result['explanation']
        })

    # 6) Write results to a JSON file (e.g., in Google Drive)
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(evaluation_results, f, ensure_ascii=False, indent=2)

    print(f"\nAll results saved to: {output_json_path}")


## Running the Evaluation

In **Colab**, mount your Google Drive:
```python
from google.colab import drive
drive.mount('/content/drive')
```
Make sure the notebook and data are in valid paths under `/content/drive/...`. Then you can run the evaluation cell below.

In [None]:
if __name__ == "__main__":
    # Register the API key before anything that relies on the OpenAI setup runs.
    configure_openai()

    # Run the whole evaluation with the paths and model configured above.
    run_settings = {
        "dataset_path": DATASET_PATH,
        "local_model_name": LOCAL_MODEL_NAME,
        "output_json_path": OUTPUT_JSON_PATH,
    }
    evaluate_model(**run_settings)
