In [17]:
import json
import os
from dotenv import load_dotenv
load_dotenv()
from langchain.chat_models import init_chat_model
from langchain.messages import HumanMessage,AIMessage

In [18]:
# Shared chat-model client used by run_prompt() below.
# temperature=0 is presumably chosen to keep eval runs as repeatable as
# possible -- TODO confirm.
# The API key is read from the environment (populated by load_dotenv() above);
# if the variable is unset, None is passed through unchanged.
llm = init_chat_model(
    model="gemini-2.5-flash",
    model_provider="google_genai",
    temperature=0,
    api_key=os.environ.get("GOOGLE_GEMINI_API_KEY"),
)

In [19]:
def load_prompt(path="prompt_v1.txt"):
    """Read the prompt template file and return its full text.

    Args:
        path: Location of the template file. Defaults to ``prompt_v1.txt``
            in the current working directory.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so non-ASCII prompt text reads the same on
    # every platform instead of depending on the locale's default encoding.
    with open(path, "r", encoding="utf-8") as prompt_file:
        return prompt_file.read()

In [20]:
def load_dataset(path="tasks.json"):
    """Load the evaluation dataset from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to ``tasks.json`` in the
            current working directory.

    Returns:
        Whatever Python object the JSON document decodes to.
        (run_eval iterates over it and run_prompt reads a ``"task"`` key
        from each item.)

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; decode it explicitly rather than relying on the
    # platform's locale encoding.
    with open(path, "r", encoding="utf-8") as dataset_file:
        return json.load(dataset_file)

In [21]:
def run_prompt(prompt_template, test_case):
    """Fill the template with one task and return the model's reply content.

    Every literal ``{{task}}`` in ``prompt_template`` is replaced with
    ``test_case["task"]``. The filled prompt is sent to the module-level
    ``llm`` as a single human message.

    Args:
        prompt_template: Template text containing the ``{{task}}`` placeholder.
        test_case: Mapping with a ``"task"`` entry.

    Returns:
        The ``content`` attribute of the response from ``llm.invoke``.
    """
    filled_prompt = prompt_template.replace("{{task}}", test_case["task"])
    reply = llm.invoke([HumanMessage(content=filled_prompt)])
    return reply.content

In [22]:
def run_test_case(prompt_template, test_case):
    """Run a single test case and package its result.

    Args:
        prompt_template: Template text forwarded to ``run_prompt``.
        test_case: Mapping with a ``"task"`` entry.

    Returns:
        A dict with the keys ``"task"``, ``"output"`` and ``"score"``.
        The score is always the constant 10; nothing is graded here.
    """
    model_output = run_prompt(prompt_template, test_case)
    record = {
        "task": test_case["task"],
        "output": model_output,
        "score": 10,
    }
    return record

In [23]:
def run_eval(dataset, pompt_template):
    """Run every test case in the dataset against the given prompt template.

    Args:
        dataset: Iterable of test cases; each is passed to ``run_test_case``.
        pompt_template: The prompt template to evaluate. The name is
            misspelled; it is kept as-is so existing keyword callers keep
            working.

    Returns:
        A list with one ``run_test_case`` result per test case, in dataset
        order. An empty dataset yields an empty list.
    """
    # Use the argument that was passed in. Previously the body referenced a
    # differently spelled name and so silently depended on a notebook global.
    return [run_test_case(pompt_template, test_case) for test_case in dataset]

In [24]:
if __name__ == "__main__":
    # Read both eval inputs from their default file locations.
    prompt_template = load_prompt()
    dataset = load_dataset()

    # Evaluate the whole dataset with the loaded template.
    results = run_eval(dataset, prompt_template)

    # Persist the results as indented JSON next to the notebook.
    with open("outputs_v1.json", "w") as out_file:
        json.dump(results, out_file, indent=2)

    print("Eval complete. Results saved to outputs_v1.json")

Eval complete. Results saved to outputs_v1.json
