In [12]:
%load_ext autoreload
%autoreload 2

import sys 
import os
sys.path.append("../")

import pandas as pd
import litellm
import random
import base64
import hashlib
import json
from typing import List, Dict, Optional
from pathlib import Path

from tqdm.notebook import tqdm

from mcp_agents.tool_interface.base import *
from mcp_agents.tool_interface.mcp_tools import *
from mcp_agents.client import *
from mcp_agents.agent_interface import *
from mcp_agents.evaluation_utils.utils import *

# !playwright install #to run the crawl4ai tool

# Credentials must not be stored in the notebook. Use whatever the kernel's
# environment already provides and prompt (without echo) only for a missing key.
# NOTE(review): a Serper API key was previously committed in this cell; treat
# it as exposed and rotate it.
import getpass

for _credential_name in ("OPENAI_API_KEY", "SERPER_API_KEY"):
    if not os.environ.get(_credential_name):
        os.environ[_credential_name] = getpass.getpass(f"{_credential_name}: ")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
!pip install playwright



### Build a search tool

In [14]:
# SerperSearchTool is expected to come from the wildcard import of
# mcp_agents.tool_interface.mcp_tools (per the original author's note).
# The tag pairs below are passed to the tool as its call/result delimiters.
_search_tags = {
    "tool_start_tag": "<query>",
    "tool_end_tag": "</query>",
    "result_start_tag": "<snippet>",
    "result_end_tag": "</snippet>",
}
search_tool = SerperSearchTool(number_documents_to_search=2, timeout=60, **_search_tags)

# Client wired to the search tool. The model/tokenizer names were marked as
# dummy (placeholder) values in the original notebook.
client = LLMToolClient(
    tools=[search_tool],
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://api.openai.com/v1",
    model_name="openai/gpt-4o-mini",
    tokenizer_name="openai/gpt-4o-mini",
)

In [15]:
output = await search_tool("<query>Advisors offering Inference Algorithms for Language Modeling classes</query>")
print(search_tool.format_result(output))

Title: 11-664/763: Inference Algorithms for Language Modeling
URL: https://www.phontron.com/class/lminference-fall2025/
Snippet: In this class, we survey the wide space of inference-time techniques with a particular focus on the implementation and practical use cases of such methods.

Title: CMU LLM Inference (1): Introduction to Language Models and ...
URL: https://www.youtube.com/watch?v=F-mduXzNcRQ
Snippet: This lecture (by Graham Neubig) for CMU CS 11-763, Advanced NLP (Fall 2025) covers: What is a language model? What is an inference algorithm ...


### A basic ReAct agent

In [16]:
from react_agent import *

default_config_path = "./react_agent.yaml"

workflow = ReActWorkflow(configuration=default_config_path)

# print the config

output = await workflow(
    # question="Who are the target audience for CMU 11-763 Inference Algorithms for Language Modeling classes?",
    question="Who are the staff members for CMU 11-763 Inference Algorithms for Language Modeling classes?",
    max_tokens=2048,
    temperature=0.7,
    verbose=True,
)

final_answer, results, conversation_history, searched_queries = output


🤔 Starting ReAct workflow for: Who are the staff members for CMU 11-763 Inference Algorithms for Language Modeling classes?
📊 Will perform 3 think-search cycles

🔄 CYCLE 1/3
--------------------------------------------------
💭 THINKING...
HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nWho are the staff members for CMU 11-763 Inference Algorithms for Language Modeling classes?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to searc

In [7]:
!playwright install

Downloading Chromium 140.0.7339.16 (playwright build v1187)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-mac-arm64.zip[22m
Chromium 140.0.7339.16 (playwright build v1187) downloaded to /Users/niketjain/Library/Caches/ms-playwright/chromium-1187
Downloading Chromium Headless Shell 140.0.7339.16 (playwright build v1187)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-headless-shell-mac-arm64.zip[22m
Chromium Headless Shell 140.0.7339.16 (playwright build v1187) downloaded to /Users/niketjain/Library/Caches/ms-playwright/chromium_headless_shell-1187
Downloading Firefox 141.0 (playwright build v1490)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/firefox/1490/firefox-mac-arm64.zip[22m
Firefox 141.0 (playwright build v1490) downloaded to /Users/niketjain/Library/Caches/ms-playwright/firefox-1490
Downloading Webkit 26.0 (playwright build v2203)[2m from https://cdn.playwr

In [17]:
# Unpack the workflow output again so this cell does not rely on the unpacking
# done in the cell that produced `output`.
final_answer, results, conversation_history, searched_queries = output

print("Model answer:", final_answer, "----------------------", "Tool calls:", sep="\n")
# Raw generated text of the tool call stored at index 1.
print(dict(results)["tool_calls"][1]["generated_text"])


Model answer:
The staff members for the Carnegie Mellon University (CMU) course 11-763, titled "Inference Algorithms for Language Modeling," for the Fall 2025 semester are:

1. **Graham Neubig** - Primary Instructor
   - Email: [gneubig@cs.cmu.edu](mailto:gneubig@cs.cmu.edu)
   - Graham Neubig is an associate professor at the Language Technologies Institute, specializing in natural language processing.

2. **Amanda Bertsch**
----------------------
Tool calls:
<query>CMU 11-763 Inference Algorithms for Language Modeling course staff instructors teaching assistants</query><snippet id=69d673a1>
Title: 11-664/763: Inference Algorithms for Language Modeling
URL: https://www.phontron.com/class/lminference-fall2025/
Snippet: Students will understand the different ways to implement and compare inference-time techniques, learn the theory behind different strategies for inference-time ...

Title: CMU LLM Inference (1): Introduction to Language Models and ...
URL: https://www.youtube.com/watch?v=

### Your Task: the pipeline for Short-form Tasks

In this section, you will apply the agent you just built to the graph and MMLU problems you explored in HW1.

In [18]:
# Build a simple agent for the graph problem
from graph.graph_path_finder import *
# YOUR_TASK_2.1, fix the GraphPathEvaluationTool (3 tasks, 6 lines of code)
from mcp_agents.tool_interface.mcp_tools import GraphPathEvaluationTool

correct_paths = [
    {"path": [0, 1, 3, 7], "weight": 25},
    {"path": [0, 2, 5, 7], "weight": 30}
]

eval_tool = GraphPathEvaluationTool(
    correct_paths=correct_paths,
    expected_count=2,
    tool_start_tag="<predicted_paths>",
    tool_end_tag="</predicted_paths>",
    result_start_tag="<evaluation>",
    result_end_tag="</evaluation>",
    timeout=30
)

print("=== Correct Prediction ===\n")

input1 = """<predicted_paths>
{
    "paths": [[0, 1, 3, 7], [0, 2, 5, 7]],
    "weights": [25, 30]
}
</predicted_paths>"""

output1 = await eval_tool(input1)
print(output1.keys())
# print(json.dumps(output1, indent=2))

=== Correct Prediction ===

dict_keys(['score', 'matches', 'expected', 'predicted_count', 'correct_paths_found', 'incorrect_paths', 'missing_paths', 'message'])


In [19]:
print("=== Graph Path Finding Example ===\n")

# Step 1: sample a small random instance.
# NOTE(review): no seed is set before this call, so the sampled graph
# presumably differs between runs — confirm whether that is intended.
print("1. Creating a random graph...")
edges, params = create_random_graph(N=5, M=2, W=50, P=1)

print(f"Graph parameters: N={params['N']}, M={params['M']}, W={params['W']}, P={params['P']}")
print("Edges:")
for edge in edges:
    src, dst, weight = edge
    print(f"  {src} -> {dst} (weight: {weight})")

# Step 2: compute the reference solution for this instance.
print("\n2. Finding shortest path with dynamic programming...")
n_param, p_param = params["N"], params["P"]
solution = find_top_p_paths(edges, n_param, p_param)

=== Graph Path Finding Example ===

1. Creating a random graph...
Graph parameters: N=5, M=2, W=50, P=1
Edges:
  0 -> 1 (weight: 21)
  0 -> 2 (weight: 11)
  1 -> 2 (weight: 30)
  1 -> 3 (weight: 24)
  2 -> 3 (weight: 42)
  2 -> 4 (weight: 18)
  3 -> 4 (weight: 26)
  3 -> 0 (weight: 39)
  4 -> 0 (weight: 28)
  4 -> 1 (weight: 45)

2. Finding shortest path with dynamic programming...


In [20]:
# Build the problem prompt, query the LLM, and parse its reply into a solution.
# NOTE(review): convert_llm_response_to_solution is imported again from
# inference.inference in a later cell and called there with two arguments;
# re-running this cell after that import may bind to that version — confirm.
graph_model_name = "gpt-4o"
prompt = generate_problem_prompt(edges, params["N"], params["P"])

llm_response = query_llm_with_function_call(
    prompt,
    graph_model_name,
    os.environ.get("OPENAI_API_KEY"),
)

predicted_solution = convert_llm_response_to_solution(llm_response)

In [21]:
def solution_to_dict_list(solution: GraphPathSolution) -> List[Dict[str, Any]]:
    """
    Flatten a GraphPathSolution into plain dictionaries.

    Args:
        solution: object whose ``paths`` attribute is iterated; every item must
            expose ``path`` and ``weight`` attributes.

    Returns:
        One ``{"path": ..., "weight": ...}`` dict per item of ``solution.paths``,
        in iteration order.
    """
    path_dicts = []
    for entry in solution.paths:
        path_dicts.append({"path": entry.path, "weight": entry.weight})
    return path_dicts


eval_tool = GraphPathEvaluationTool(
    correct_paths=solution_to_dict_list(solution),
    expected_count=len(solution_to_dict_list(solution)),
    tool_start_tag="<predicted_paths>",
    tool_end_tag="</predicted_paths>",
    result_start_tag="<evaluation>",
    result_end_tag="</evaluation>",
    timeout=30
)

print("=== Model Prediction ===\n")

input2 = f"""<predicted_paths>
{json.dumps(solution_to_dict_list(predicted_solution), indent=2)}
</predicted_paths>"""

output2 = await eval_tool(input2)
print(output2.keys())
print(json.dumps(output2, indent=2))

=== Model Prediction ===

dict_keys(['score', 'matches', 'expected', 'predicted_count', 'correct_paths_found', 'incorrect_paths', 'missing_paths', 'message'])
{
  "score": 1.0,
  "matches": 1,
  "expected": 1,
  "predicted_count": 1,
  "correct_paths_found": [
    {
      "path": [
        0,
        2,
        4
      ],
      "weight": 29
    }
  ],
  "incorrect_paths": [],
  "missing_paths": [],
  "message": "Found 1/1 correct paths (100.0%)"
}


In [22]:
# Build the inference pipeline for MMLU
from inference.inference import load_custom_dataset, convert_llm_response_to_solution, format_example, format_subject

# Load the "MMLU-preview" dataset through the project helper and report its size.
examples = load_custom_dataset("MMLU-preview")
num_loaded = len(examples)

print(f"Dataset loaded: {num_loaded} examples")

Generating test split: 100%|██████████| 173/173 [00:00<00:00, 6544.85 examples/s]
Generating validation split: 100%|██████████| 22/22 [00:00<00:00, 15297.53 examples/s]
Generating dev split: 100%|██████████| 5/5 [00:00<00:00, 4622.33 examples/s]

Dataset loaded: 173 examples





In [23]:
# Separate generation and evaluation functions
async def udated_mmlu_pipeline(examples, default_config_path):
    """Run the ReAct workflow over MMLU examples and score every prediction.

    For each example the answer agent's prompt is rebuilt from the workflow's
    initial prompt plus an answer-format instruction and the formatted example,
    the workflow is awaited, an answer is extracted from the final response,
    and that answer is compared with the gold letter.

    Args:
        examples: sized sequence of records, each indexed with the keys
            "question", "answer" (an index into A/B/C/D) and "subject".
        default_config_path: configuration handed to ``ReActWorkflow``.

    Returns:
        dict with:
            "config": the workflow configuration without any entry whose key
                contains "api_key";
            "average_score": mean per-example score (0.0 for empty input);
            "total_examples": ``len(examples)``;
            "results": one dict per example with "example_id" (1-based),
                "question", "correct_answer", "predicted_solution", "score"
                (1.0 if the prediction equals the gold letter, else 0.0),
                "final_answer", "generation", "conversation_history" and
                "searched_queries".
    """
    num_examples = len(examples)
    print("Generating responses for", num_examples, "MMLU examples")

    workflow = ReActWorkflow(configuration=default_config_path)
    # Keep the pristine prompt: each example appends to it, so it must not accumulate.
    base_agent_prompt = workflow.answer_agent.prompt

    choices = ["A", "B", "C", "D"]

    results = []
    total_score = 0.0

    for i, example in tqdm(enumerate(examples, 1), total=num_examples, desc="Evaluating examples"):
        # The workflow gets the bare question; the answer options only reach
        # the answer agent through its prompt below.
        question = example["question"]
        correct_answer = choices[example["answer"]]

        # Name the subject and require a fixed final sentence for answer extraction.
        additional_instructions = f"The following is a multiple choice question (with answers) about {format_subject(example['subject'])}.  Output the answer in the format of \"The answer is (X)\" at the end.\n\n"

        workflow.answer_agent.prompt = base_agent_prompt + "\n" + additional_instructions + format_example(example, include_answer=False)

        output = await workflow(
            question=question,
            max_tokens=4096,
            temperature=0.7,
            verbose=False,
        )

        final_answer, answer_result, conversation_history, searched_queries = output

        predicted_solution = convert_llm_response_to_solution(final_answer, "MMLU")

        # Exact match on the option letter. The score is stored per example so
        # the saved results can be audited, not only the aggregate.
        score = 1.0 if correct_answer == predicted_solution else 0.0
        total_score += score

        results.append({
            "example_id": i,
            "question": question,
            "correct_answer": correct_answer,
            "predicted_solution": predicted_solution,
            "score": score,
            "final_answer": final_answer,
            "generation": answer_result.model_dump(),
            "conversation_history": conversation_history,
            "searched_queries": searched_queries,
        })

    average_score = total_score / num_examples if num_examples else 0.0

    print(f"Average score: {average_score:.2f}")

    # Never return credentials with the results: drop every api_key-like entry.
    output_config = {k: v for k, v in dict(workflow.configuration).items() if "api_key" not in k}

    return {
        "config": output_config,
        "average_score": average_score,
        "total_examples": num_examples,
        "results": results,
    }


default_config_path = "./react_agent_mmlu.yaml"

# YOUR_TASK_2.2: Run the inference for 30 examples after fixing the function:
# save the answers using the code at the next block
# report the acc. at the home write-up
output = await udated_mmlu_pipeline(examples[:30], default_config_path)

# Inspect or save generated responses here if needed
print(f"Generated {len(output['results'])} responses")
print("Sample response:")
print(output["results"][0]["final_answer"])
print("-" * 50)


Generating responses for 30 MMLU examples


Evaluating examples:   0%|          | 0/30 [00:00<?, ?it/s]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nA police officer carries out hundreds of traffic stops every year. When his supervisor is reviewing the officer‚Äôs records for the past year, he notices that the officer is equally likely to stop people of various genders, ages, and races. However, he is significantly more likely to write tickets for middle-aged white males with dark hair and eyes. When confronted with this fact, the officer truthfully states that he has no idea why that is, and that it must simply be a coincidence. Unbeknownst to the officer, this behavior is tied to the fact that these men look like his father, with whom he had an abusive relationship as a child. What psychological framework would directly address the unconscious bias in his behavior? \n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and d

Evaluating examples:   3%|‚ñé         | 1/30 [00:41<20:16, 41.94s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nWho set the world record for the mile race in 1886?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 3938, 'stop': ['</think>'], 'api_key': 'sk-placeholder', 'api_base

Evaluating examples:   7%|‚ñã         | 2/30 [02:00<29:30, 63.23s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nWhich of the following statements identifies a chemically based sensory system?\nI. Gustatory system\nII. Auditory system\nIII. Olfactory system\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, '

Evaluating examples:  10%|‚ñà         | 3/30 [02:48<25:23, 56.42s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nThe complete resynthesis of phosphocreatine after very high intensity exercise normally takes:\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 3935, 'stop': ['</think

Evaluating examples:  13%|‚ñà‚ñé        | 4/30 [03:32<22:24, 51.71s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nA race car attempting to jump a series of 8 buses is set up on a flat track with a ramp at the end. Engineers assigned to the project have determined that, in order to jump the buses, the car must reach a velocity of 130 km/h. If the distance of the track is 50m, at what rate must the car accelerate to reach this velocity?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How t

Evaluating examples:  17%|‚ñà‚ñã        | 5/30 [04:21<21:03, 50.54s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nFatty acids are transported into the mitochondria bound to:\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 3940, 'stop': ['</think>'], 'api_key': 'sk-placeholder', '

Evaluating examples:  20%|‚ñà‚ñà        | 6/30 [05:01<18:50, 47.09s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': 'You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nSauna use, sometimes referred to as "sauna bathing," is characterized by short-term passive exposure to extreme heat. This exposure elicits mild hyperthermia ‚Äì an increase in the body\'s core temperature ‚Äì that induces a thermoregulatory response involving neuroendocrine, cardiovascular, and cytoprotective mechanisms that work together to restore homeostasis and condition the body for future heat stressors‚Ä¶ In recent decades, sauna bathing has emerged as a means to increase lifespan and improve overall health, based on compelling data from observational, interventional, and mechanistic studies. Of particular interest are the findings from studies of participants in the Kuopio Ischemic Heart Disease Risk Factor (KIHD) Study, an ongoing prospective population-based cohort study of health outcomes 

Evaluating examples:  23%|‚ñà‚ñà‚ñé       | 7/30 [05:36<16:33, 43.19s/it]

HERE
INSIDE
HERE
INSIDE
1st answer extract failed

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nDiisopropylfluorophosphate (DFP) binds to the active site of acetylcholinesterase (ACE) in the synapses of neurons. When DFP binds to ACE, the ACE enzyme is rendered permanently inactive. This makes DFP a potent toxin, with lethal amounts at less than 100 mg. The interaction between DFP and ACE can best be characterized as:\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- Wh

Evaluating examples:  27%|‚ñà‚ñà‚ñã       | 8/30 [06:14<15:12, 41.46s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nThe process of translation requires the presence of:\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 3943, 'stop': ['</think>'], 'api_key': 'sk-placeholder', 'api_bas

Evaluating examples:  30%|‚ñà‚ñà‚ñà       | 9/30 [06:52<14:09, 40.46s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nPerformance enhancing synthetic steroids are based on the structure of the hormone:\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 3939, 'stop': ['</think>'], 'api_k

Evaluating examples:  33%|‚ñà‚ñà‚ñà‚ñé      | 10/30 [07:29<13:05, 39.27s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nPerchloric acid (HClO4) is considered one of the stronger acids in existence. Which of the following statements corresponds most accurately with strong acids?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temper

Evaluating examples:  37%|‚ñà‚ñà‚ñà‚ñã      | 11/30 [08:09<12:29, 39.47s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nA descript amount of 2-bromobutane is placed into a strong solution of ethanol and allowed to complete a reaction. The result of this reaction produces a major product of 2-butene and a minor product of 1-butene. Which of the following descriptions of the starting compound explains why 2-butene is the major product?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this sea

Evaluating examples:  40%|‚ñà‚ñà‚ñà‚ñà      | 12/30 [09:00<12:55, 43.10s/it]

1st answer extract failed
The reaction of 2-bromobutane in a strong solution of ethanol leads to the formation of 2-butene as the major product and 1-butene as a minor product. This outcome can be explained by the principles of elimination reactions, specifically the E2 mechanism, and Zaitsev's rule.

In an E2 elimination reaction, the leaving group (in this case, the bromine atom) is expelled while a proton is abstracted from a Œ≤-carbon, leading to the formation of a double bond. The key factors that influence the production of alkenes in such reactions include the stability of the resulting alkenes. According to Zaitsev's rule, when multiple alkene products are possible, the more substituted alkene is favored because it is more stable. 

In this scenario, 2-butene is more substituted than 1-butene, meaning it has a double bond between two carbons that each have more alkyl substituents. This structural feature provides 2-butene with greater stability due to hyperconjugation and the a

Evaluating examples:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 13/30 [09:53<13:00, 45.93s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nThe maximum sustainable power:\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 3947, 'stop': ['</think>'], 'api_key': 'sk-placeholder', 'api_base': 'https://api.opena

Evaluating examples:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 14/30 [10:37<12:04, 45.29s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nA patient comes into the ER looking extremely agitated. They are acting aggressive, and claiming they need medication or ‚Äúbad things will happen‚Äù. What is the likely state of this patient‚Äôs dopamine system?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about w

Evaluating examples:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 15/30 [11:15<10:46, 43.09s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nA scientist carrying out experiments on hearing aids fits 30 mice that were genetically modified to lose their hearing with the latest technology and were tested to press a lever when they heard a bell. This was set to varying levels of power. At 80% power, 20 mice pressed the lever. At 70% power, 15 mice pressed the lever. At 60% power, 10 mice pressed the lever. Which of the following power levels corresponds to the absolute threshold for hearing the decibels produced by the bell?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for n

Evaluating examples:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 16/30 [12:00<10:12, 43.71s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nThe transcription of DNA to a molecule of messenger RNA occurs:\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 3940, 'stop': ['</think>'], 'api_key': 'sk-placeholder

Evaluating examples:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 17/30 [12:51<09:56, 45.87s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nA new enzyme is found in a transgenic mice that participates in synthesis of an unknown product using two reactants. When using radiolabeled compounds to study the enzyme, it is found that the enzyme catalyzes a process that switches a nitrogen group on one reactant to the other reactant. Which of the following categories would this new enzyme fall under?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most he

Evaluating examples:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 18/30 [13:37<09:11, 45.94s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nFor a very weak base, the pKb of a solution would likely be:\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 3935, 'stop': ['</think>'], 'api_key': 'sk-placeholder', 

Evaluating examples:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 19/30 [14:18<08:08, 44.45s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nThe genome is:\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 3948, 'stop': ['</think>'], 'api_key': 'sk-placeholder', 'api_base': 'https://api.openai.com/v1'}
HERE


Evaluating examples:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 20/30 [15:01<07:19, 43.99s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nWhich of the following are steroid-based molecules?\nI. Testosterone\nII. Triglycerides\nIII. Progesterone\nIV. DNA\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 39

Evaluating examples:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 21/30 [15:38<06:18, 42.06s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nMost of the free fatty acids are transported in the blood:\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 3940, 'stop': ['</think>'], 'api_key': 'sk-placeholder', 'a

Evaluating examples:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 22/30 [16:17<05:28, 41.05s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nWhich of the following factors can affect enzyme activity?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 3942, 'stop': ['</think>'], 'api_key': 'sk-placeholder', 'a

Evaluating examples:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 23/30 [16:59<04:50, 41.44s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nLiving cells require constant interaction with the outside environment in order to attain the materials they need for survival, as well as to rid themselves of waste. Of the following processes, which uses only the gradient of material to control the direction in which the material moves across the cell membrane?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search

Evaluating examples:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 24/30 [17:41<04:08, 41.46s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nEmbedded in the inner membrane of the mitochondrion are:\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning about what information you need to find.\n<think>"}], 'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 3940, 'stop': ['</think>'], 'api_key': 'sk-placeholder', 'api

Evaluating examples:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 25/30 [18:18<03:21, 40.32s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nA segment of DNA from a lab mouse is determined to be 5‚Äô ‚Äì GGATCCTCATG ‚Äì 3‚Äô. Which of the following DNA segments would be the result of this original DNA sequence experiencing both a point mutation and a deletion?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasonin

Evaluating examples:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 26/30 [19:01<02:43, 40.98s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nWhile working on a scene for an action movie, a sound technician is given the task of changing the frequency of a gunshot to more accurately reflect the normal speed of sound. The gunshot came from an actor inside a car traveling 108 km/h, and it was recorded by a camera on a platform 200 meters away traveling at 72 km/h in the same direction. If the frequency of the gunshot is normally 800Hz, what is the perceived frequency which the camera picks up the gunshot at?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n-

Evaluating examples:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 27/30 [20:01<02:20, 46.75s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nAn object rests on a plane, with an angle of incline, ?, an acceleration due to gravity, g, and a coefficient of friction ¬µ between the object and the plane. Which of the following gives the acceleration of the object?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your reasoning 

Evaluating examples:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 28/30 [20:55<01:37, 48.86s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nDavid is a nationally ranked cellist who recently accepted a scholarship to a major college to perform in the orchestra. Over the summer, he has been given a packet of sheet music to be proficient in by fall semester. David is a perfectionist when it comes to his craft. He always compares himself to better players, and is very hard on himself when he cannot master a section of one of his pieces. Which of the following answers best describes David?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already k

Evaluating examples:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 29/30 [21:33<00:45, 45.53s/it]

HERE
INSIDE
I AM HERE
{'model': 'openai/gpt-4o-mini', 'messages': [{'role': 'user', 'content': "You are a research assistant that reasons systematically about questions.\n\nCURRENT TASK:\nA common heart defect in humans is a ventricular septal defect, in which there is a hole in the septum between the right and left ventricles. If a patient were to have this defect, which of the following statements is correct?\n\nCONVERSATION HISTORY:\n\n\nINSTRUCTIONS:\nAnalyze the current situation and decide what specific information you need to search for next.\n\nYour response should be in the following format:\n<think>[Your thorough reasoning about what you know so far, what's missing, and what specific information you need to search for next. Consider:\n- What you already know from previous searches\n- What key information is still missing  \n- What would be most helpful to search for next\n- How this search will help answer the original question]</think>\n\nBe specific and detailed in your rea

Evaluating examples: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [22:29<00:00, 44.97s/it]

Average score: 0.83
Generated 30 responses
Sample response:
The scenario presented involves a police officer whose ticketing behavior is influenced by unconscious bias linked to his childhood experiences with his father. In this context, the psychological framework that would directly address the unconscious bias in his behavior is the **Psychoanalytic** approach. This framework examines how unresolved conflicts and emotional experiences from childhood, such as the officer's abusive relationship with his father, can manifest in adult behaviors and attitudes, often outside of conscious awareness.

Psychoanalytic theory posits that unconscious motivations and past experiences significantly shape an individual's behavior and decision-making processes. The officer's tendency to disproportionately ticket middle-aged white males mirrors the traits of his father, suggesting an unconscious association influenced by his past trauma.

While other approaches, such as Cognitive Behavioral or Human




In [26]:
# Save the evaluation results to disk and preview the first two responses.
# The output filename combines the model's short name, the number of evaluated
# examples, and the base name of the config file.
model = output["config"]["react_agent_model_name"]
model_display = model.rsplit("/", 1)[-1]  # keep only the part after the last "/"
len_examples = output["total_examples"]
display_config = default_config_path.rsplit("/", 1)[-1].replace(".yaml", "")
print(model_display)

results_filename = f"results_{model_display}_{len_examples}_{display_config}.json"
# NOTE(review): `results` is not defined in this cell, while everything else here
# reads from `output` -- confirm it refers to the object you intend to save.
with open(results_filename, "w") as f:
    json.dump(dict(results), f, indent=2)

print(output["average_score"])

# Preview: question, gold answer, model answer, and issued search queries.
for response in output["results"][:2]:
    print("## Question: " + response["question"])
    print("## Correct Answer: " + response["correct_answer"])
    print("## Final Answer:\n" + response["final_answer"])
    print("## Searched Queries: " + str(response["searched_queries"]))
    print("----" * 50)

gpt-4o-mini
0.8333333333333334
## Question: A police officer carries out hundreds of traffic stops every year. When his supervisor is reviewing the officer‚Äôs records for the past year, he notices that the officer is equally likely to stop people of various genders, ages, and races. However, he is significantly more likely to write tickets for middle-aged white males with dark hair and eyes. When confronted with this fact, the officer truthfully states that he has no idea why that is, and that it must simply be a coincidence. Unbeknownst to the officer, this behavior is tied to the fact that these men look like his father, with whom he had an abusive relationship as a child. What psychological framework would directly address the unconscious bias in his behavior? 
## Correct Answer: B
## Final Answer:
The scenario presented involves a police officer whose ticketing behavior is influenced by unconscious bias linked to his childhood experiences with his father. In this context, the psyc

### Simple analysis

In [None]:
def extract_thoughts(text: str) -> str:
    """Return the content of the first ``<think>...</think>`` pair in ``text``.

    The captured content may span multiple lines and is stripped of surrounding
    whitespace. When no complete pair is present, ``text`` is returned unchanged.
    """
    found = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    return text if found is None else found.group(1).strip()

def extract_search_query(text: str) -> str:
    """Return the content of the first ``<query>...</query>`` pair in ``text``.

    The captured content may span multiple lines and is stripped of surrounding
    whitespace. When no complete pair is present, ``text`` is returned unchanged.
    """
    found = re.search(r"<query>(.*?)</query>", text, re.DOTALL)
    if found is None:
        # No complete tag pair: hand the input back untouched.
        return text
    inner = found.group(1)
    return inner.strip()

def parse_xml_snippets(text):
    """Parse ``<snippet id=...>`` blocks out of ``text``.

    Each block is expected to have the form::

        <snippet id=ID>
        Title: ...
        URL: ...          (this line may be absent)
        Snippet: ...
        </snippet>

    Args:
        text (str): Text containing zero or more snippet blocks.

    Returns:
        list[dict]: One ``{"Title": ..., "URL": ..., "Snippet": ...}`` dict per
        matched block, in order of appearance, with every value stripped of
        surrounding whitespace. ``"URL"`` is ``""`` when the block has no
        ``URL:`` line. The snippet id is matched but not returned.
    """
    # YOUR_TASK_3.1: the Title / URL / Snippet lines of the pattern below.
    pattern = re.compile(
        r"<snippet id=(?P<id>[^>]+)>"         # the snippet id (matched, not returned)
        r"\s*Title:\s*(?P<title>.*?)\n"       # the title line
        # The URL line is genuinely optional. If it were required, a block
        # without it would either be dropped or (because of DOTALL) have its
        # lazy title run on into the next block, merging two snippets.
        r"(?:URL:\s*(?P<url>.*?)\n)?"
        r"Snippet:\s*(?P<snippet>.*?)"        # the snippet body (may span lines)
        r"\s*</snippet>",                     # the closing tag
        re.DOTALL,
    )

    results = []
    for found in pattern.finditer(text):
        results.append({
            "Title": found.group("title").strip(),
            # group("url") is None when the optional URL line did not match.
            "URL": (found.group("url") or "").strip(),
            "Snippet": found.group("snippet").strip(),
        })

    return results


def count_tokens(text, model="openai/gpt-4o"):
    """
    Count the tokens in a piece of text using LiteLLM's token counting utility.

    The text is wrapped as the content of a single user message before being
    handed to ``litellm.token_counter``.
    NOTE(review): the count may therefore include per-message overhead on top
    of the raw text tokens -- confirm against the LiteLLM documentation.

    Args:
        text (str): The text to count tokens for.
        model (str): The model name passed to LiteLLM (default: "openai/gpt-4o").
    Returns:
        The value returned by ``litellm.token_counter`` for that message
        (expected to be an int token count).
    """
    user_message = {"role": "user", "content": text}
    return litellm.token_counter(model=model, messages=[user_message])


def count_tokens_in_results(results):
    """Report the average token count per category across evaluation results.

    For every result, tokens are counted (via ``count_tokens``) for the
    question, the concatenated thoughts, the concatenated search queries, the
    concatenated snippet titles, the concatenated snippet bodies, and the final
    answer.

    Args:
        results: Iterable of dicts, each providing ``"question"``,
            ``"final_answer"`` and ``"conversation_history"``. The history is a
            list of dicts with ``"type"`` and ``"content"`` keys; entries of
            type ``"think"`` supply the thoughts, and entries of type
            ``"query"`` supply both the search query and the retrieved
            ``<snippet ...>`` blocks.

    Returns:
        dict: Category name -> mean token count over all results, rounded to
        2 decimal places. Keys are ``"question"``, ``"thinking"``, ``"query"``,
        ``"snippets_titles"``, ``"snippets_snippets"`` and ``"final_answer"``.
        An empty dict is returned when ``results`` is empty (there is nothing
        to average).
    """
    report_results = []
    for result in results:
        history = result["conversation_history"]

        cleaned_thoughts = [
            extract_thoughts(turn["content"]) for turn in history if turn["type"] == "think"
        ]

        # A "query" turn carries the query tag and the snippets it retrieved.
        query_turns = [turn["content"] for turn in history if turn["type"] == "query"]
        cleaned_queries = [extract_search_query(content) for content in query_turns]
        parsed_snippets = [
            snippet for content in query_turns for snippet in parse_xml_snippets(content)
        ]

        report_results.append({
            "question": count_tokens(result["question"]),
            "thinking": count_tokens(" ".join(cleaned_thoughts)),
            "query": count_tokens(" ".join(cleaned_queries)),
            "snippets_titles": count_tokens(" ".join(s["Title"] for s in parsed_snippets)),
            "snippets_snippets": count_tokens(" ".join(s["Snippet"] for s in parsed_snippets)),
            "final_answer": count_tokens(result["final_answer"]),
        })

    # Guard against empty input: the average is undefined, and indexing
    # report_results[0] below would raise IndexError.
    if not report_results:
        return {}

    # For each category, report the average rounded to 2 decimal places.
    n_results = len(report_results)
    return {
        key: round(sum(report[key] for report in report_results) / n_results, 2)
        for key in report_results[0]
    }

# YOUR_TASK_3.2: calculate the token counts for each category for each variant and report in the homework write-up
# Computes the per-category token counts for the evaluation results held in
# `output["results"]` (one run / variant at a time).
count_tokens_in_results(output["results"])