In [1]:
# =========================
# Cell 1: Mount Drive & Basic Config
# =========================

from google.colab import drive

drive.mount("/content/drive")

import os
import openai

# Insert your Hugging Face token (for Llama-2)
os.environ["HUGGINGFACE_HUB_TOKEN"] = "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"

# Insert your OpenAI API key
OPENAI_API_KEY = "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"
openai.api_key = OPENAI_API_KEY

# Paths, local model name, etc.
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/HiddenOracle3/data.json"
LOCAL_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
OUTPUT_JSON_PATH = (
    "/content/drive/MyDrive/Colab Notebooks/HiddenOracle3/evaluation_results.json"
)

MODEL_CACHE_DIR = "/content/drive/MyDrive/Colab Notebooks/HiddenOracle3/models"
USE_LOCAL_CACHE = True
FORCE_REDOWNLOAD = False

print("Cell 1 done: Drive mounted, environment set.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Cell 1 done: Drive mounted, environment set.


In [2]:
# =========================
# Cell 2: Dataset Loader & Local LLM Answer
# =========================


def load_dataset(dataset_path: str):
    """
    Loads the dataset from a JSON file.
    Must have a list of dicts with 'question' and 'answers'.
    """
    import json

    with open(dataset_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data


def get_local_llm_answer(question: str, generation_pipeline) -> str:
    """
    Generate an answer using the local model pipeline (Transformers).
    We'll do 25 new tokens, no sampling for a deterministic short answer.
    """
    results = generation_pipeline(
        question, max_new_tokens=25, num_return_sequences=1, do_sample=False
    )
    generated_text = results[0]["generated_text"]

    # Remove repeated prompt if present
    if generated_text.startswith(question):
        return generated_text[len(question) :].strip()
    else:
        return generated_text.strip()


print("Cell 2 done: load_dataset & get_local_llm_answer defined.")

Cell 2 done: load_dataset & get_local_llm_answer defined.


In [3]:
# =========================
# Cell 3: Evaluate with OpenAI (New Function-Calling API)
# =========================

from openai import OpenAI
import json

# Initialize OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)


def evaluate_with_openai_api(
    question: str, local_llm_answer: str, correct_answers: list
) -> dict:
    """
    Use OpenAI's chat completions API with function calling (≥1.0.0).
    Evaluates correctness of a local LLM answer.
    """

    # Define the function schema
    functions = [
        {
            "name": "evaluate_answer",
            "description": "Evaluate correctness of a local LLM answer.",
            "parameters": {
                "type": "object",
                "properties": {
                    "is_factual": {
                        "type": "boolean",
                        "description": "True if the local LLM's answer is correct.",
                    },
                    "explanation": {
                        "type": "string",
                        "description": "Explanation of correctness or incorrectness.",
                    },
                },
                "required": ["is_factual", "explanation"],
                "additionalProperties": False,
            },
        }
    ]

    # Prepare messages
    messages = [
        {
            "role": "system",
            "content": (
                "You are a judge. Compare the local LLM answer with known correct answers.\n"
                "Call the function 'evaluate_answer' with your findings, or produce a direct response if needed."
            ),
        },
        {
            "role": "user",
            "content": (
                f"Question: {question}\n"
                f"Local LLM Answer: {local_llm_answer}\n"
                f"Known Correct Answers: {correct_answers}"
            ),
        },
    ]

    # Call the chat completions API with function calling
    completion = client.chat.completions.create(
        model="gpt-4-0613",
        messages=messages,
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "evaluate_answer",
                    "description": "Evaluate correctness of a local LLM answer.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "is_factual": {"type": "boolean"},
                            "explanation": {"type": "string"},
                        },
                        "required": ["is_factual", "explanation"],
                        "additionalProperties": False,
                    },
                },
            }
        ],
        tool_choice={
            "type": "function",
            "function": {"name": "evaluate_answer"},
        },  # Force call
    )

    # Parse the response
    choice = completion.choices[0].message

    if "tool_calls" in choice:
        args = json.loads(choice["tool_calls"][0]["function"]["arguments"])
        return args
    else:
        return {"is_factual": False, "explanation": "No function call was made."}


print("Cell 3 done: evaluate_with_openai_api using new function-calling API.")

Cell 3 done: evaluate_with_openai_api using new function-calling API.


In [4]:
# =========================
# Cell 4: Load Local LLM Model (GPU, Half Precision)
# =========================

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


def load_local_llm_pipeline(model_name: str):
    """
    Load Llama-2 locally in half precision on GPU with device_map="auto".
    MUST NOT pass device=0 to pipeline if we do device_map="auto".
    """
    import os

    hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if not hf_token:
        raise ValueError(
            "No Hugging Face token found in environment (HUGGINGFACE_HUB_TOKEN)."
        )

    from_pretrained_kwargs = {}
    if USE_LOCAL_CACHE:
        from_pretrained_kwargs["cache_dir"] = MODEL_CACHE_DIR
    if FORCE_REDOWNLOAD:
        from_pretrained_kwargs["force_download"] = True

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=hf_token,  # new param name instead of 'use_auth_token'
        **from_pretrained_kwargs
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=hf_token,
        device_map="auto",  # automatically distribute layers
        torch_dtype=torch.float16,  # half precision
        low_cpu_mem_usage=True,
        **from_pretrained_kwargs
    )

    # DO NOT pass device=0 here, since device_map="auto" is used
    gen_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )
    return gen_pipeline


print("Cell 4 done: load_local_llm_pipeline (GPU, half precision, no device=...).")

Cell 4 done: load_local_llm_pipeline (GPU, half precision, no device=...).


In [5]:
# =========================
# Cell 5: Main Evaluate Function
# =========================

import time


def evaluate_model(dataset_path: str, local_model_name: str, output_json_path: str):
    """
    1) Load dataset
    2) Load local LLM w/GPU
    3) For each item:
       - local generation, measure time
       - substring match or openai function calling
    4) Print & store results
    """

    start_run = time.time()
    dataset = load_dataset(dataset_path)
    pipeline_llm = load_local_llm_pipeline(local_model_name)

    evaluation_results = []

    for item in dataset:
        question = item["question"]
        correct_answers = item["answers"]

        t0_local = time.time()
        local_answer = get_local_llm_answer(question, pipeline_llm)
        local_time = time.time() - t0_local

        # substring check
        matched = any(ans.lower() in local_answer.lower() for ans in correct_answers)

        t0_openai = time.time()
        if matched:
            result = {
                "is_factual": True,
                "explanation": "Substring match with known correct answers; no GPT call needed.",
            }
            openai_time = time.time() - t0_openai
        else:
            # real function calling
            result = evaluate_with_openai_api(question, local_answer, correct_answers)
            openai_time = time.time() - t0_openai

        item_time = local_time + openai_time

        print("--------------------------------------------------")
        print(f"Q: {question}")
        print(f"Local LLM A: {local_answer}")
        print(f"Factual? {result['is_factual']} => {result['explanation']}")
        print(f"Local LLM Time (s):  {local_time:.4f}")
        print(f"OpenAI Time (s):     {openai_time:.4f}")
        print(f"Item Total (s):      {item_time:.4f}")
        print("--------------------------------------------------")

        evaluation_results.append(
            {
                "question": question,
                "answers": correct_answers,
                "local_answer": local_answer,
                "is_factual": result["is_factual"],
                "explanation": result["explanation"],
                "local_llm_time_seconds": round(local_time, 4),
                "openai_time_seconds": round(openai_time, 4),
                "item_total_time_seconds": round(item_time, 4),
            }
        )

    total_run = time.time() - start_run

    # Save JSON
    import json

    output_data = {
        "evaluation_results": evaluation_results,
        "evaluation_metadata": {"total_run_time_seconds": round(total_run, 4)},
    }
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"\nAll results saved to {output_json_path}")
    print(f"Total run time: {total_run:.4f} seconds")


print("Cell 5 done: evaluate_model main logic.")

Cell 5 done: evaluate_model main logic.


In [6]:
# =========================
# Cell 6: Run Evaluation
# =========================

if __name__ == "__main__":
    evaluate_model(
        dataset_path=DATASET_PATH,
        local_model_name=LOCAL_MODEL_NAME,
        output_json_path=OUTPUT_JSON_PATH,
    )

print("Cell 6 done: main code run if __name__ == '__main__'.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


--------------------------------------------------
Q: Where was Barack Obama born?
Local LLM A: The question of where Barack Obama was born has been a hot topic of debate for years. Some people believe that
Factual? False => No function call was made.
Local LLM Time (s):  1.6875
OpenAI Time (s):     1.2979
Item Total (s):      2.9854
--------------------------------------------------
--------------------------------------------------
Q: Which city is known as the 'Big Apple'?
Local LLM A: New York City is known as the Big Apple.
What is the capital of the state of New York?
The
Factual? True => Substring match with known correct answers; no GPT call needed.
Local LLM Time (s):  0.8338
OpenAI Time (s):     0.0000
Item Total (s):      0.8338
--------------------------------------------------
--------------------------------------------------
Q: What is the largest mammal on Earth?
Local LLM A: The largest mammal on Earth is the blue whale.
What is the largest mammal on Earth?
Factual? Tr