In [9]:
%load_ext autoreload
%autoreload 2

import sys 
import os
sys.path.append("../")

import pandas as pd
import litellm
import random
import base64
import hashlib
import json
from typing import List, Dict, Optional
from pathlib import Path

from tqdm.notebook import tqdm

from mcp_agents.tool_interface.base import *
from mcp_agents.tool_interface.mcp_tools import *
from mcp_agents.client import *
from mcp_agents.agent_interface import *
from mcp_agents.evaluation_utils.utils import *

# !playwright install #to run the crawl4ai tool

os.environ["OPENAI_API_KEY"] = "api key"
os.environ["SERPER_API_KEY"] = "api key"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Build a search tool

In [10]:
# imported from mcp_agents.tool_interface.mcp_tools

search_tool = SerperSearchTool(
    tool_start_tag="<query>",
    tool_end_tag="</query>",
    result_start_tag="<snippet>",
    result_end_tag="</snippet>",
    number_documents_to_search=2,
    timeout=60,
)

client = LLMToolClient(
    model_name="openai/gpt-4o",  # Dummy model name
    tokenizer_name="openai/gpt-4o",  # Dummy model name
    base_url="https://api.openai.com/v1",
    api_key=os.getenv("OPENAI_API_KEY"),
    tools=[search_tool],
)

In [11]:
output = await search_tool("<query>Advisors offering Inference Algorithms for Language Modeling classes</query>")
print(search_tool.format_result(output))

Title: 11-664/763: Inference Algorithms for Language Modeling
URL: https://www.phontron.com/class/lminference-fall2025/
Snippet: In this class, we survey the wide space of inference-time techniques with a particular focus on the implementation and practical use cases of such methods.

Title: CMU LLM Inference (1): Introduction to Language Models and ...
URL: https://www.youtube.com/watch?v=F-mduXzNcRQ
Snippet: This lecture (by Graham Neubig) for CMU CS 11-763, Advanced NLP (Fall 2025) covers: What is a language model? What is an inference algorithm ...


### A basic ReAct agent

In [12]:
from react_agent import *

default_config_path = "./react_agent.yaml"

workflow = ReActWorkflow(configuration=default_config_path)

# print the config

output = await workflow(
    # question="Who are the target audience for CMU 11-763 Inference Algorithms for Language Modeling classes?",
    question="Who are the staff members for CMU 11-763 Inference Algorithms for Language Modeling classes?",
    max_tokens=2048,
    temperature=0.7,
    verbose=True,
)

final_answer, results, conversation_history, searched_queries = output


ü§î Starting ReAct workflow for: Who are the staff members for CMU 11-763 Inference Algorithms for Language Modeling classes?
üìä Will perform 3 think-search cycles

üîÑ CYCLE 1/3
--------------------------------------------------
üí≠ THINKING...
üí° Reasoning: <think>From the conversation history, it appears that there is no previous information provided about the staff members for the CMU 11-763 Inference Algorithms for Language Modeling class. Thus, we ar...
üîç GENERATING SEARCH...

üîÑ CYCLE 2/3
--------------------------------------------------
üí≠ THINKING...
üí° Reasoning: 
üîç GENERATING SEARCH...

üîÑ CYCLE 3/3
--------------------------------------------------
üí≠ THINKING...
üí° Reasoning: 
üîç GENERATING SEARCH...

‚úÖ GENERATING FINAL ANSWER...
--------------------------------------------------


In [13]:
final_answer, results, conversation_history, searched_queries = output
print("Model answer:")
print(final_answer)
print("----------------------")
print("Tool calls:")
print(dict(results)["tool_calls"][1]["generated_text"])


Model answer:
The staff members for the CMU 11-763 Inference Algorithms for Language Modeling class in Fall 2025 are as follows:

1. **Instructors:**
   - **Graham Neubig**: An associate professor at the Language Technologies Institute of Carnegie Mellon University, Graham Neubig is involved in teaching the course.
   - **Amanda Bertsch**: A PhD student at the Language Technologies Institute at Carnegie Mellon University, advised by Matt Gormley and Graham Neubig, Amanda
----------------------
Tool calls:
<query>CMU 11-763 Inference Algorithms for Language Modeling course staff members 2023</query><snippet id=6f8d1b72>
Title: 11-664/763: Inference Algorithms for Language Modeling
URL: https://www.phontron.com/class/lminference-fall2025/
Snippet: In this class, we survey the wide space of inference-time techniques with a particular focus on the implementation and practical use cases of such methods.

Title: CMU LLM Inference (1): Introduction to Language Models and ...
URL: https://www.

### Your Task: the pipeline for Short-form Tasks

You will work on applying the agent you just built to the graph and MMLU problems you explored in HW1

In [14]:
# Build a simple agent for the graph problem
from graph.graph_path_finder import *
# YOUR_TASK_2.1, fix the GraphPathEvaluationTool (3 tasks, 6 lines of code)
from mcp_agents.tool_interface.mcp_tools import GraphPathEvaluationTool

correct_paths = [
    {"path": [0, 1, 3, 7], "weight": 25},
    {"path": [0, 2, 5, 7], "weight": 30}
]

eval_tool = GraphPathEvaluationTool(
    correct_paths=correct_paths,
    expected_count=2,
    tool_start_tag="<predicted_paths>",
    tool_end_tag="</predicted_paths>",
    result_start_tag="<evaluation>",
    result_end_tag="</evaluation>",
    timeout=30
)

print("=== Correct Prediction ===\n")

input1 = """<predicted_paths>
{
    "paths": [[0, 1, 3, 7], [0, 2, 5, 7]],
    "weights": [25, 30]
}
</predicted_paths>"""

output1 = await eval_tool(input1)
print(output1.keys())
# print(json.dumps(output1, indent=2))

=== Correct Prediction ===

dict_keys(['score', 'matches', 'expected', 'predicted_count', 'correct_paths_found', 'incorrect_paths', 'missing_paths', 'message'])


In [15]:
print("=== Graph Path Finding Example ===\n")

# Create a simple example graph
print("1. Creating a random graph...")
edges, params = create_random_graph(N=5, M=2, W=50, P=1)

print(f"Graph parameters: N={params['N']}, M={params['M']}, W={params['W']}, P={params['P']}")
print("Edges:")
for src, dst, weight in edges:
    print(f"  {src} -> {dst} (weight: {weight})")

# Find the correct solution
print("\n2. Finding shortest path with dynamic programming...")
solution = find_top_p_paths(edges, params["N"], params["P"])

=== Graph Path Finding Example ===

1. Creating a random graph...
Graph parameters: N=5, M=2, W=50, P=1
Edges:
  0 -> 1 (weight: 49)
  0 -> 2 (weight: 5)
  1 -> 2 (weight: 43)
  1 -> 3 (weight: 36)
  2 -> 3 (weight: 10)
  2 -> 4 (weight: 25)
  3 -> 4 (weight: 29)
  3 -> 0 (weight: 20)
  4 -> 0 (weight: 4)
  4 -> 1 (weight: 46)

2. Finding shortest path with dynamic programming...


In [16]:
prompt = generate_problem_prompt(edges, params["N"], params["P"])

llm_response = query_llm_with_function_call(prompt, "gpt-4o", os.getenv("OPENAI_API_KEY"))

predicted_solution = convert_llm_response_to_solution(llm_response)

In [17]:
def solution_to_dict_list(solution: GraphPathSolution) -> List[Dict[str, Any]]:
    """
    Convert GraphPathSolution to list of dict format.
    
    Args:
        solution: GraphPathSolution object
    
    Returns:
        List of dicts with 'path' and 'weight' keys
    """
    return [
        {"path": path_info.path, "weight": path_info.weight}
        for path_info in solution.paths
    ]


eval_tool = GraphPathEvaluationTool(
    correct_paths=solution_to_dict_list(solution),
    expected_count=len(solution_to_dict_list(solution)),
    tool_start_tag="<predicted_paths>",
    tool_end_tag="</predicted_paths>",
    result_start_tag="<evaluation>",
    result_end_tag="</evaluation>",
    timeout=30
)

print("=== Model Prediction ===\n")

input2 = f"""<predicted_paths>
{json.dumps(solution_to_dict_list(predicted_solution), indent=2)}
</predicted_paths>"""

output2 = await eval_tool(input2)
print(output2.keys())
print(json.dumps(output2, indent=2))

=== Model Prediction ===

dict_keys(['score', 'matches', 'expected', 'predicted_count', 'correct_paths_found', 'incorrect_paths', 'missing_paths', 'message'])
{
  "score": 0.0,
  "matches": 0,
  "expected": 1,
  "predicted_count": 1,
  "correct_paths_found": [],
  "incorrect_paths": [
    {
      "path": [
        0,
        2,
        3,
        4
      ],
      "weight": 44
    }
  ],
  "missing_paths": [
    {
      "path": [
        0,
        2,
        4
      ],
      "weight": 30
    }
  ],
  "message": "Found 0/1 correct paths (0.0%)"
}


In [19]:
# Build the inference pipeline for MMLU
from inference.inference import load_custom_dataset, convert_llm_response_to_solution, format_example, format_subject

examples = load_custom_dataset("MMLU-preview")

print(f"Dataset loaded: {len(examples)} examples")

README.md: 0.00B [00:00, ?B/s]

dataset_infos.json: 0.00B [00:00, ?B/s]

college_medicine/test-00000-of-00001.par(‚Ä¶):   0%|          | 0.00/42.5k [00:00<?, ?B/s]

college_medicine/validation-00000-of-000(‚Ä¶):   0%|          | 0.00/8.99k [00:00<?, ?B/s]

college_medicine/dev-00000-of-00001.parq(‚Ä¶):   0%|          | 0.00/4.84k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/173 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset loaded: 173 examples


In [27]:
# Separate generation and evaluation functions
async def udated_mmlu_pipeline(examples, default_config_path):
    """Generate responses for all examples without evaluation"""
    
    print("Generating responses for", len(examples), "MMLU examples")

    workflow = ReActWorkflow(configuration=default_config_path)
    base_agent_prompt = workflow.answer_agent.prompt

    choices = ["A", "B", "C", "D"]


    results = []
    total_score = 0.0

    for i, example in tqdm(enumerate(examples, 1), total=len(examples), desc="Evaluating examples"):
        question = example["question"] # format_example(example, include_answer=False)
        correct_answer = choices[example["answer"]]

        # YOUR_TASK_2.2: what is the additional instructions here?
        # Hint: check the MMLU inference pipeline to understand what to specify; 1 line of code
        additional_instructions = f"The following is a multiple choice question (with answers) about {format_subject(example['subject'])}. Output the answer in the format of \"The answer is (X)\" at the end.\n\n"

        workflow.answer_agent.prompt = base_agent_prompt + "\n" + additional_instructions + format_example(example, include_answer=False)

        output = await workflow(
            question=question,
            max_tokens=4096,
            temperature=0.7,
            verbose=False,
        )

        final_answer, answer_result, conversation_history, searched_queries = output

        predicted_solution = convert_llm_response_to_solution(final_answer, "MMLU")

        score = (choices[example["answer"]] == predicted_solution)
        total_score += score

        results.append({
            "example_id": i,    
            "question": question,
            "correct_answer": correct_answer,
            "predicted_solution": predicted_solution,
            "final_answer": final_answer,
            "generation": answer_result.model_dump(),
            "conversation_history": conversation_history,
            "searched_queries": searched_queries,
        })

    average_score = total_score / len(examples) if examples else 0.0

    print(f"Average score: {average_score:.2f}")
    
    output_config = {k:v for k,v in dict(workflow.configuration).items() if "api_key" not in k}

    return {
        "config": output_config,
        "average_score": average_score,
        "total_examples": len(examples),
        "results": results
    }


default_config_path = "./react_agent_mmlu.yaml"

# YOUR_TASK_2.2: Run the inference for 30 examples after fixing the function:
# save the answers using the code at the next block
# report the acc. at the home write-up
output = await udated_mmlu_pipeline(examples[:30], default_config_path)

# Inspect or save generated responses here if needed
print(f"Generated {len(output['results'])} responses")
print("Sample response:")
print(output["results"][0]["final_answer"])
print("-" * 50)


Generating responses for 30 MMLU examples


Evaluating examples:  23%|‚ñà‚ñà‚ñé       | 7/30 [09:33<31:53, 83.19s/it]

1st answer extract failed



Evaluating examples:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 25/30 [40:33<08:20, 100.16s/it]

Error: MCP call timed out: Timed out while waiting for response to ClientRequest. Waited 60.0 seconds.
Error: MCP call timed out: Timed out while waiting for response to ClientRequest. Waited 60.0 seconds.
Error: MCP call timed out: Timed out while waiting for response to ClientRequest. Waited 60.0 seconds.
Error: MCP call timed out: Timed out while waiting for response to ClientRequest. Waited 60.0 seconds.
Error: MCP call timed out: Timed out while waiting for response to ClientRequest. Waited 60.0 seconds.
Error: MCP call timed out: Timed out while waiting for response to ClientRequest. Waited 60.0 seconds.


Evaluating examples:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 29/30 [53:09<02:18, 138.54s/it]Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7d16f75c0620>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7d16df6b2f60>
Evaluating examples: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [54:34<00:00, 109.14s/it]

Average score: 0.87
Generated 30 responses
Sample response:
The scenario presented highlights the influence of unconscious bias in the police officer's behavior, particularly how his past experiences, specifically childhood trauma, shape his current actions. The fact that he is more likely to ticket middle-aged white males with dark hair and eyes‚Äîwho resemble his father‚Äîsuggests a deeper psychological association that he is not consciously aware of.

To understand this unconscious bias, the psychoanalytic framework is most relevant. This framework, rooted in the ideas of Sigmund Freud and further developed by others, emphasizes the role of unconscious processes and early childhood experiences in influencing behavior. In this case, the officer's abusive relationship with his father likely instilled unconscious associations that affect his perceptions and actions towards individuals who resemble that figure from his past.

While other frameworks, such as cognitive behavioral approach




In [28]:
model = output["config"]["react_agent_model_name"]
model_display = model.split("/")[-1]
len_examples = output["total_examples"]
display_config = default_config_path.split("/")[-1].replace(".yaml", "")
print(model_display)

with open(f"results_{model_display}_{len_examples}_{display_config}.json", "w") as f:
    json.dump(output, f, indent=2)

print(output["average_score"])

for response in output["results"][:2]:
    print("## Question: " + response["question"])
    print("## Correct Answer: " + response["correct_answer"])
    print("## Final Answer:\n" + response["final_answer"])
    print("## Searched Queries: " + str(response["searched_queries"]))
    print("----" * 50)

gpt-4o-mini
0.8666666666666667
## Question: A police officer carries out hundreds of traffic stops every year. When his supervisor is reviewing the officer‚Äôs records for the past year, he notices that the officer is equally likely to stop people of various genders, ages, and races. However, he is significantly more likely to write tickets for middle-aged white males with dark hair and eyes. When confronted with this fact, the officer truthfully states that he has no idea why that is, and that it must simply be a coincidence. Unbeknownst to the officer, this behavior is tied to the fact that these men look like his father, with whom he had an abusive relationship as a child. What psychological framework would directly address the unconscious bias in his behavior? 
## Correct Answer: B
## Final Answer:
The scenario presented highlights the influence of unconscious bias in the police officer's behavior, particularly how his past experiences, specifically childhood trauma, shape his curr

### Simple analysis

In [29]:
def extract_thoughts(text: str) -> str:
    match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return text

def extract_search_query(text: str) -> str:
    match = re.search(r"<query>(.*?)</query>", text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return text

def parse_xml_snippets(text):

    pattern = re.compile(
        r"<snippet id=([^>]+)>"  # Capture Group 1: The ID
        # YOUR_TASK_3.1: three lines of code here to process the retrieved snippets
        r"Title: (.+?)\n"         # YOUR_TASK_3.1: Capture Group 2: The Title
        r"(?:URL: (.+?)\n)?"      # Optional Group with Capture Group 3: The URL
        r"Snippet: (.+?)\n"       # Capture Group 4: The Snippet
        r"\s*</snippet>",          # The closing tag
        re.DOTALL
        )

    matches = pattern.findall(text)

    results = []
    for match in matches:
        # match is a tuple: (id, title, url, snippet)
        # If the optional URL group did not match, match[2] will be None.
        snippet_id = match[0].strip()
        title = match[1].strip()
        url = match[2].strip() if match[2] else ""  # Handle None case for missing URL
        snippet = match[3].strip()

        results.append({
            "Title": title,
            "URL": url,
            "Snippet": snippet
        })

    return results


def count_tokens(text, model="openai/gpt-4o"):
    """
    Counts the number of tokens in a prompt using LiteLLM's token counting utility.
    Args:
        prompt (str): The input prompt string.
        model (str): The model name for which to count tokens (default: "gpt-3.5-turbo").
    Returns:
        int: The number of tokens in the prompt.
    """
    return litellm.token_counter(model=model, messages=[{"role": "user", "content": text}])


def count_tokens_in_results(results):
    # report the numbers of tokens used for question,thinking, query, snippets, and final answer
    report_results = []
    for result in results:

        thoughts = [one_round["content"] for one_round in result["conversation_history"] if one_round["type"] == "think"]
        cleaned_thoughts = [extract_thoughts(thought) for thought in thoughts]
        
        query_snippets = [one_round["content"] for one_round in result["conversation_history"] if one_round["type"] == "query"]
        cleaned_queries = [extract_search_query(query) for query in query_snippets]
        # print(parse_xml_snippets(query_snippets[0]))
        parsed_snippets = []
        for query in query_snippets:
            parsed_snippets.extend(parse_xml_snippets(query))
        cleaned_snippets_titles = [snippet["Title"] for snippet in parsed_snippets]
        cleaned_snippets_snippets = [snippet["Snippet"] for snippet in parsed_snippets]

        report_results.append({
            "question": count_tokens(result["question"]),
            "thinking": count_tokens(" ".join(cleaned_thoughts)),
            "query": count_tokens(" ".join(cleaned_queries)),   
            "snippets_titles": count_tokens(" ".join(cleaned_snippets_titles)),
            "snippets_snippets": count_tokens(" ".join(cleaned_snippets_snippets)),
            "final_answer": count_tokens(result["final_answer"]),
        })

    # for each key, report the average, round to 2 decimal places
    return {key: round(sum(result[key] for result in report_results) / len(report_results), 2) for key in report_results[0].keys()}

# YOUR_TASK_3.2: calculate the token counts for each category for each variant and report in the homework write-up
count_tokens_in_results(output["results"])

{'question': 80.2,
 'thinking': 693.47,
 'query': 24.1,
 'snippets_titles': 7.0,
 'snippets_snippets': 7.0,
 'final_answer': 317.53}