In [4]:
import json
import os
import json
from fvalues import F
from ice.recipe import recipe
import pandas as pd
from ice.recipes.primer.subquestions import ask_subquestions
from ice.utils import map_async
from amplification import answer_by_amplification
from prompt_generator import make_scenario_prompt, make_evaluation_prompt

Question = str
Answer = str
Subs = list[tuple[Question, Answer]]


def read_jsonl(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data

def render_background(subs: Subs) -> str:
    subs_text = "\n\n".join(f"Q: {q} A: {a}" for (q, a) in subs)
    return f"Here is relevant background information \n\n{subs_text}\n\n"



class RecipeModel():
    def __init__(self, agent_name):
        self.agent_name = agent_name

    def generate_text(self, inputs, max_length=250):
        response = recipe.agent(agent_name=self.agent_name).complete(prompt=inputs, stop='"', max_tokens=max_length)
        return response

async def get_standard_answer(question: str, agent_name: str) -> str:
    """
    Generate an answer using the standard approach
    """
    model = RecipeModel(agent_name=agent_name)
    prompt = make_scenario_prompt(question)
    answer = await model.generate_text(prompt)
    return answer

async def get_factored_answer(question: str, agent_name: str) -> str:
    """
    Generate an answer using the factored cognition approach
    """
    prompt = make_scenario_prompt(question)
    factored_answer, subs = await answer_by_amplification(prompt, engine=agent_name)
    return factored_answer, subs

async def evaluate_tasks(task_name, agent_name, task_count):
    task_path = os.path.join('/Users/bensturgeon/werk/ice/benchmarks/', task_name)
    task_data = read_jsonl(task_path)

    total_questions = 0
    evaluations = []
    result = {}

    
    result["count_factored"] = 0
    result["count_baseline"] = 0
    result["count_total"] = 0
    result["count_undecided"] = 0


    for index, row in enumerate(task_data):
        if index %5 == 0:
            print(f"{index} / {task_count}")
        scenario = row['text']

        # Generate answers using both approaches
        standard_answer = await get_standard_answer(scenario, agent_name)
        factored_answer, subs = await get_factored_answer(scenario, agent_name)

        # Evaluate the answers
        evaluation_model = RecipeModel(agent_name="gpt-4")
        evaluation_prompt= make_evaluation_prompt(scenario, standard_answer, factored_answer)
        evaluation_result = await evaluation_model.generate_text(evaluation_prompt)
        
        if evaluation_result == "2":
            result["count_factored"] +=1
        elif evaluation_result == "1":
            result["count_baseline"] +=1
        elif evaluation_result == "0":
            result["count_undecided"] +=1
        else:
            
            print("Invalid output from evaluator.")
        # Store the results
        evaluations.append({
            'sample_number': index+1,
            'input': scenario,
            'standard_answer': standard_answer,
            'factored_answer': factored_answer,
            'trace': subs,
            'evaluation_result': evaluation_result
        })

        total_questions += 1



        result['evaluations'] = evaluations

        result["factored_percentage"] = result["count_factored"]/(index +1)
        result["baseline_percentage"] = result["count_baseline"]/(index +1)



        # Save the results to a file
        task_name_no_extension = task_name.split(".csv")[0]
        with open(f'benchmarks/results/{task_name_no_extension}_{agent_name}_evaluations.json', 'w') as f:
            json.dump(result, f, indent=4)

        if index >= task_count:
            break
    

    del evaluations


if __name__ == "__main__":
    agent_name = "chatgpt"
    model = RecipeModel(agent_name=agent_name)
    # accuracy, answers = await evaluate_task('ethics_suite.jsonl', model, agent_name, 300)
    await evaluate_tasks("ethics_suite.jsonl",agent_name, 15)


    




0 / 15
5 / 15
10 / 15
15 / 15


In [2]:
model_answer = await model.generate_text("What is the sky?")