In [18]:
from tqdm.auto import tqdm
from call_model import call_chat_once
import asyncio
import httpx
from uuid import uuid4
import json
import datasets

from typing import Optional
import os
import csv
import nest_asyncio
nest_asyncio.apply()

In [19]:
from datasets import load_dataset

# Read the evaluation set from a local CSV file and keep its "train" split.
eval_dataset = load_dataset("csv", data_files="data/eval.csv")["train"]

In [20]:
# Show the dataset summary (column names and number of rows).
print(eval_dataset)

Dataset({
    features: ['context', 'question', 'answer', 'source_doc', 'groundedness_score', 'groundedness_eval', 'relevance_score', 'relevance_eval', 'standalone_score', 'standalone_eval'],
    num_rows: 52
})


In [21]:
# CSV file that run_rag_tests appends results to and reloads on start-up in
# order to skip questions that were already answered.
# NOTE(review): the k/fetch/top values in the file name presumably mirror the
# retriever settings of the service under test — confirm before comparing runs.
OUTPUT_FILE = "data/eval_rag_results_k=20_fetch=40_top=3.csv"

def load_existing_results(file_path):
    """Load results saved by an earlier run so they can be skipped and reused.

    Args:
        file_path: Path of the results CSV. The file must contain the
            "user_input" and "retrieved_contexts" columns.

    Returns:
        A ``(questions, rows)`` tuple. ``questions`` is the set of
        "user_input" values found in the file; ``rows`` is the list of row
        dicts in file order, with "retrieved_contexts" decoded from its JSON
        text. Both are empty when the file does not exist.
    """
    seen_questions = set()
    records = []

    # Nothing has been saved yet: hand back the empty containers.
    if not os.path.exists(file_path):
        return seen_questions, records

    with open(file_path, newline='', encoding='utf-8') as csv_file:
        for record in csv.DictReader(csv_file):
            seen_questions.add(record["user_input"])
            # Parse retrieved_contexts back from its JSON text.
            record["retrieved_contexts"] = json.loads(record["retrieved_contexts"])
            records.append(record)

    return seen_questions, records


def append_result_to_csv(file_path, row, fieldnames):
    """Append one result row to a CSV file, adding the header when needed.

    Args:
        file_path: Path of the CSV file to append to.
        row: Mapping of column name to value for the row being written.
        fieldnames: Column names, in order, used for the header and the row.

    The header is written when the file does not exist yet or exists but is
    empty. Without the empty-file check, a pre-created or truncated file would
    receive data rows only, and a later ``csv.DictReader`` pass would treat
    the first data row as the header.
    """
    # A zero-byte file still needs a header row, not just a missing one.
    needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    with open(file_path, "a", newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)

def run_rag_tests(
    eval_dataset: datasets.Dataset,
    verbose: bool = False,
    output_file: Optional[str] = None,
):
    """Send every question of ``eval_dataset`` to the chat service and save the results.

    Results are appended to the output CSV one row at a time, so an
    interrupted run can be resumed: questions already present in the file are
    skipped on the next call.

    Args:
        eval_dataset: Examples providing a "question" and an "answer" field.
        verbose: When true, print the question, the generated answer and the
            reference answer for each processed example.
        output_file: CSV path used to load earlier results and append new
            ones. Defaults to the module-level ``OUTPUT_FILE``.

    Returns:
        A list of row dicts (previously saved rows first, then the new ones)
        with the keys "user_input", "retrieved_contexts", "response" and
        "reference".

    Raises:
        Any exception raised by ``call_chat_once`` propagates and stops the
        run; rows written before the failure stay in the output file.
    """
    if output_file is None:
        output_file = OUTPUT_FILE
    fieldnames = ["user_input", "retrieved_contexts", "response", "reference"]

    processed_questions, dataset = load_existing_results(output_file)
    print(f"Đã có {len(processed_questions)} câu hỏi xử lý trước đó, sẽ bỏ qua chúng.")

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in processed_questions:
            continue

        # Fresh session and empty history: every question is asked in isolation.
        payload = {
            "question": question,
            "session_id": str(uuid4()),
            "chat_history": []
        }

        relevant_docs, answer = asyncio.run(call_chat_once(payload))

        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')

        # Build the context list once; the CSV stores it as JSON text while
        # the in-memory row keeps the list itself.
        contexts = [doc['kwargs'].get('page_content') for doc in relevant_docs]

        append_result_to_csv(
            output_file,
            {
                "user_input": question,
                "retrieved_contexts": json.dumps(contexts, ensure_ascii=False),
                "response": answer,
                "reference": example['answer'],
            },
            fieldnames,
        )
        dataset.append({
            "user_input": question,
            "retrieved_contexts": contexts,
            "response": answer,
            "reference": example['answer']
        })
        # Remember the question so a duplicate later in the dataset is skipped
        # within this run, matching what a resumed run would do.
        processed_questions.add(question)
    return dataset

In [22]:
# Run the evaluation over the whole dataset; rows are appended to OUTPUT_FILE
# as they are produced and the collected rows are returned as a list of dicts.
data = run_rag_tests(eval_dataset=eval_dataset)


Đã có 0 câu hỏi xử lý trước đó, sẽ bỏ qua chúng.


  0%|          | 0/52 [00:00<?, ?it/s]

Who proposed the hybrid Vision Transformer model?


2025-08-13 16:23:44,540 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
  2%|▏         | 1/52 [00:13<11:14, 13.23s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'Who proposed the hybrid Vision Transformer model?', 'type': 'human', 'id': '40f94256-06a6-402b-adaf-046b5f17a068'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_CvccDBKC1xpAy19cAKKPFZgj', 'function': {'arguments': '{"query":"hybrid Vision Transformer model proposal"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--5aeac50c-783e-411e-b99c-264b2ad4ca2c-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'hybrid Vision Transformer model proposal'}, 'id': 'call_CvccDBKC1xpAy19cAKKPFZgj', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 356, 'output_tok

2025-08-13 16:23:56,736 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
  4%|▍         | 2/52 [00:25<10:30, 12.62s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'Who are the authors of the Longformer paper?', 'type': 'human', 'id': 'a7d580d6-c4e4-4783-ab25-4fd2a72b13ea'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_eiwSMGlL12E37x34aGBuD93r', 'function': {'arguments': '{"query":"Longformer paper authors"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--e07a4555-0344-458b-8486-50641b352053-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Longformer paper authors'}, 'id': 'call_eiwSMGlL12E37x34aGBuD93r', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 358, 'output_tokens': 19, 'total_tokens': 377, 'input

2025-08-13 16:24:09,367 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
  6%|▌         | 3/52 [00:38<10:18, 12.63s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'How many layers does the distilbert-base-uncased model have?', 'type': 'human', 'id': 'fbe5659c-b8df-406a-aa58-e1408b19c486'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_V7ZDneLF7asFPGTfL6tnD7iz', 'function': {'arguments': '{"query":"distilbert-base-uncased number of layers"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--662874fa-abac-4d1f-a661-0defa879aa66-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'distilbert-base-uncased number of layers'}, 'id': 'call_V7ZDneLF7asFPGTfL6tnD7iz', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 363, 

2025-08-13 16:24:21,933 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
  8%|▊         | 4/52 [00:50<10:04, 12.60s/it]

What is the default label inferred by Gradio for the input parameter in the GUI?


2025-08-13 16:24:33,729 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 10%|▉         | 5/52 [01:02<09:38, 12.31s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the default label inferred by Gradio for the input parameter in the GUI?', 'type': 'human', 'id': 'c2f93885-48fa-4efc-8212-693aa5ebcd56'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_o7KjrASgLfutdqmiOVubIET5', 'function': {'arguments': '{"query":"default label inferred by Gradio for the input parameter in the GUI"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--ff0a3b2d-803f-4ff8-8d63-323b3fd12614-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'default label inferred by Gradio for the input parameter in the GUI'}, 'id': 'call_o7KjrASgLfutdqmi

2025-08-13 16:24:46,444 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 12%|█▏        | 6/52 [01:15<09:32, 12.45s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': "What does the 'beta_start' parameter value mean in DDPMScheduler?", 'type': 'human', 'id': 'ebfc25a4-d2b5-4e8c-ba4e-707d3e9b7a1c'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_4e5S7euvgaX2eqGdRxBtj4Ak', 'function': {'arguments': '{"query":"DDPMScheduler beta_start parameter"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--9ed903c5-7439-48b0-8f56-ce0b144d72ce-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'DDPMScheduler beta_start parameter'}, 'id': 'call_4e5S7euvgaX2eqGdRxBtj4Ak', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 364, 'output

2025-08-13 16:25:01,174 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 13%|█▎        | 7/52 [01:29<09:53, 13.19s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'How do you register a custom Resnet model to the auto classes in Transformers?', 'type': 'human', 'id': '1fd0a7c9-26b3-4533-9570-00e468e00e54'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_VM0bewHSWZYWxX9NA9l2BENK', 'function': {'arguments': '{"query":"register custom Resnet model auto classes Transformers"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--d76b30fa-39ac-41d2-945d-6ac189cbd58d-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'register custom Resnet model auto classes Transformers'}, 'id': 'call_VM0bewHSWZYWxX9NA9l2BENK', 'type': 'tool_cal

2025-08-13 16:25:14,957 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 15%|█▌        | 8/52 [01:43<09:48, 13.38s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': "Who is the author of the paper 'Learning Transferable Visual Models From Natural Language Supervision'?", 'type': 'human', 'id': '08a576d3-c00b-4733-8f0e-57cc9cc3a274'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_RIWR0OvemNTzUM0VQ9adFNH0', 'function': {'arguments': '{"query":"Learning Transferable Visual Models From Natural Language Supervision author"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--06fa8cc6-7e1b-48f2-a99d-e225d6132189-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Learning Transferable Visual Models From Natural Language Supervisi

2025-08-13 16:25:32,138 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 17%|█▋        | 9/52 [02:00<10:26, 14.57s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the purpose of local attention in Longformer?', 'type': 'human', 'id': '0b433a24-6726-4ee0-926d-955cc151573b'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_dSDxdJYVGJdyXe2Ll2zPFYC5', 'function': {'arguments': '{"query":"local attention in Longformer"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--b7848817-53c5-45ea-bc19-ce203ec885a5-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'local attention in Longformer'}, 'id': 'call_dSDxdJYVGJdyXe2Ll2zPFYC5', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 359, 'output_tokens': 20, 'total_t

2025-08-13 16:25:44,039 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 19%|█▉        | 10/52 [02:12<09:37, 13.75s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What library is used to import the text encoder?', 'type': 'human', 'id': '1d1c3eaf-ec5c-46f4-986f-e0d1cf44109f'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_CE5tyM7IgKyeIbdFUXoGV373', 'function': {'arguments': '{"query":"text encoder library import"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--33ff3fe4-3816-41c3-9dae-bef147dffafa-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'text encoder library import'}, 'id': 'call_CE5tyM7IgKyeIbdFUXoGV373', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 358, 'output_tokens': 19, 'total_tokens': 3

2025-08-13 16:25:55,607 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 21%|██        | 11/52 [02:24<08:56, 13.08s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What license is used for TheBloke/Chronohermes-Grad-L2-13B-GPTQ?', 'type': 'human', 'id': 'a0252b32-9de8-4195-b45e-558c031a93b2'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_ORMfbnUFfdbn08joOTxqVOkS', 'function': {'arguments': '{"query":"TheBloke/Chronohermes-Grad-L2-13B-GPTQ license"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--b806f202-ecb3-4ed5-9ad6-832d7c199152-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'TheBloke/Chronohermes-Grad-L2-13B-GPTQ license'}, 'id': 'call_ORMfbnUFfdbn08joOTxqVOkS', 'type': 'tool_call'}], 'usage_metadata': {'inpu

2025-08-13 16:26:07,378 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 23%|██▎       | 12/52 [02:36<08:27, 12.68s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is a potential legal issue with using outputs from language models?', 'type': 'human', 'id': '8051e44c-cac4-42ef-9641-f66c24addb03'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_JIb75WLGCMLDt7lakqC8Eoml', 'function': {'arguments': '{"query":"legal issues with using outputs from language models"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--03a332a5-b55d-45e5-9bd9-b3671897b729-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'legal issues with using outputs from language models'}, 'id': 'call_JIb75WLGCMLDt7lakqC8Eoml', 'type': 'tool_call'}], 'usa

2025-08-13 16:26:20,080 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 25%|██▌       | 13/52 [02:48<08:14, 12.69s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the filename used when downloading the model from the Hugging Face Hub?', 'type': 'human', 'id': '30f3e4d1-08fb-4e32-85ae-ea1bab5f5bd9'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_M6qiPBc9Ojx3IKcC5o6O6OFX', 'function': {'arguments': '{"query":"filename used when downloading the model from the Hugging Face Hub"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--7eb8743e-1f2d-4628-b31a-9547cb04bd1f-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'filename used when downloading the model from the Hugging Face Hub'}, 'id': 'call_M6qiPBc9Ojx3IKcC5o6

2025-08-13 16:26:32,334 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 27%|██▋       | 14/52 [03:01<07:57, 12.56s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What method is used by ViTHybridImageProcessor?', 'type': 'human', 'id': '1995d4b0-84bf-4a84-82bb-4e103b06dbb2'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_KqcEVVPyr9KhW8PVbJRLINm3', 'function': {'arguments': '{"query":"ViTHybridImageProcessor method"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--4db29714-8385-4ff2-8d3d-239de1bc3f63-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'ViTHybridImageProcessor method'}, 'id': 'call_KqcEVVPyr9KhW8PVbJRLINm3', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 359, 'output_tokens': 21, 'total_token

2025-08-13 16:26:44,480 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 29%|██▉       | 15/52 [03:13<07:39, 12.43s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'Who contributed the code component for syntax highlighting in Gradio?', 'type': 'human', 'id': '3ac7a1e5-a8fa-4a0f-862f-72158cf35965'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_eUy4tyJChkyMuksA7dXrvPV4', 'function': {'arguments': '{"query":"syntax highlighting code component contributor Gradio"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--d7d65280-df2f-4b68-8e85-9874ffe8b9ce-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'syntax highlighting code component contributor Gradio'}, 'id': 'call_eUy4tyJChkyMuksA7dXrvPV4', 'type': 'tool_call'}], 'usag

2025-08-13 16:26:56,934 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 31%|███       | 16/52 [03:25<07:27, 12.44s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'Where can you find the Apache License, Version 2.0?', 'type': 'human', 'id': 'a087d50d-1ffe-414f-9474-30cd27e5c941'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_RAiagEINZXEVy4n02XJro9i4', 'function': {'arguments': '{"query":"Apache License, Version 2.0"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--622a63cd-cb5c-473e-b6df-d78b1297c1b9-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Apache License, Version 2.0'}, 'id': 'call_RAiagEINZXEVy4n02XJro9i4', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 362, 'output_tokens': 23, 'total_tokens'

2025-08-13 16:27:10,925 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 33%|███▎      | 17/52 [03:39<07:31, 12.91s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What tool can be used for sentiment analysis in NLP?', 'type': 'human', 'id': '00a2c8bd-35a0-4b15-9d43-60caa4b1decf'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_WxFSHpgEPkKfzxVjdcZCqse8', 'function': {'arguments': '{"query":"sentiment analysis tool NLP"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--f51189d8-05f4-4299-8175-4c20af52593c-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'sentiment analysis tool NLP'}, 'id': 'call_WxFSHpgEPkKfzxVjdcZCqse8', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 359, 'output_tokens': 20, 'total_tokens

2025-08-13 16:27:24,157 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 35%|███▍      | 18/52 [03:52<07:22, 13.00s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'Who are the authors of the paper associated with Graphormer?', 'type': 'human', 'id': '4bd1df19-9434-44ed-9612-e0b4bbe93c2d'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_GkkYNkALpGrUG36C08S2szgQ', 'function': {'arguments': '{"query":"Graphormer paper authors"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--0e6ea8f3-a1f6-472c-a13a-94cf868a0e18-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Graphormer paper authors'}, 'id': 'call_GkkYNkALpGrUG36C08S2szgQ', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 361, 'output_tokens': 20, 'total_toke

2025-08-13 16:27:36,312 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 37%|███▋      | 19/52 [04:05<07:00, 12.75s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What function does Accelerate provide to determine a device map?', 'type': 'human', 'id': '86edce06-2efc-4977-9f89-9fadc251c67f'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_nIfht2BtrtXUdmHVFwKpb4pO', 'function': {'arguments': '{"query":"Accelerate device map function"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--88d995ff-12ac-4231-9236-6a850cac38c0-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Accelerate device map function'}, 'id': 'call_nIfht2BtrtXUdmHVFwKpb4pO', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 360, 'output_tokens':

2025-08-13 16:27:48,664 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 38%|███▊      | 20/52 [04:17<06:44, 12.63s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'Why is the __call__ method decorated with torch.no_grad?', 'type': 'human', 'id': '75a0ef05-9bf2-4995-b29e-b2bd7090ea6a'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_n41EYFY988QlsdOzjCa1hUht', 'function': {'arguments': '{"query":"__call__ method decorated with torch.no_grad"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--847a8829-baec-408a-9a05-e4066a35afe2-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': '__call__ method decorated with torch.no_grad'}, 'id': 'call_n41EYFY988QlsdOzjCa1hUht', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 3

2025-08-13 16:28:01,630 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 40%|████      | 21/52 [04:30<06:34, 12.73s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the latency in milliseconds with fp16 optimization?', 'type': 'human', 'id': '163bca9f-7f82-414b-9e0f-ee46df293c69'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_zEulvi2sVbsjE3mDz77o8FQ3', 'function': {'arguments': '{"query":"latency in milliseconds with fp16 optimization"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--2fbb08fd-92c3-4f76-b911-281fe70daae1-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'latency in milliseconds with fp16 optimization'}, 'id': 'call_zEulvi2sVbsjE3mDz77o8FQ3', 'type': 'tool_call'}], 'usage_metadata': {'input_tok

2025-08-13 16:28:17,021 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 42%|████▏     | 22/52 [04:45<06:45, 13.53s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'How can you share a Gradio demo publicly?', 'type': 'human', 'id': 'a6cbc214-cc8c-436d-845c-4a047f4dc0a8'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_ip6ycxX6zE7rQKqebHr6TokK', 'function': {'arguments': '{"query":"share Gradio demo publicly"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--b9d4fb1c-5eda-42b4-848f-d6b83a4279a8-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'share Gradio demo publicly'}, 'id': 'call_ip6ycxX6zE7rQKqebHr6TokK', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 358, 'output_tokens': 20, 'total_tokens': 378, 'inpu

2025-08-13 16:28:32,616 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 44%|████▍     | 23/52 [05:01<06:50, 14.15s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the Top 1 Accuracy of swsl_resnet18 on ImageNet?', 'type': 'human', 'id': 'c643f5f3-de51-452e-a140-96d8370d3e5f'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_ENQJLD1YChgobdu3dm91Xflu', 'function': {'arguments': '{"query":"Top 1 Accuracy of swsl_resnet18 on ImageNet"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--f2e88886-18b8-47bb-ae7d-5a6789f54a83-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Top 1 Accuracy of swsl_resnet18 on ImageNet'}, 'id': 'call_ENQJLD1YChgobdu3dm91Xflu', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 365

2025-08-13 16:28:48,697 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 46%|████▌     | 24/52 [05:17<06:52, 14.73s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the license for the weights of the SWSL ResNet models?', 'type': 'human', 'id': '001fb30c-d00f-4034-99f9-fb8c85531107'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_FjOZQZgXCMu4BxJbBv8bld3F', 'function': {'arguments': '{"query":"SWSL ResNet models license"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--c7760f34-c3b2-48e0-bff7-15e0b5ac6be0-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'SWSL ResNet models license'}, 'id': 'call_FjOZQZgXCMu4BxJbBv8bld3F', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 364, 'output_tokens': 22, 'tota

2025-08-13 16:28:59,541 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 48%|████▊     | 25/52 [05:28<06:06, 13.56s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'Who proposed Consistency Models?', 'type': 'human', 'id': 'f224a721-bb34-48bd-9c25-a2d84f777bb2'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_MzOorfZE00TYomq9pPMI0hrg', 'function': {'arguments': '{"query":"Consistency Models proposal"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--870448c0-c0ef-4543-b588-4150988424d9-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Consistency Models proposal'}, 'id': 'call_MzOorfZE00TYomq9pPMI0hrg', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 354, 'output_tokens': 18, 'total_tokens': 372, 'input_token

2025-08-13 16:29:10,985 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 50%|█████     | 26/52 [05:39<05:36, 12.93s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the Top 1 Accuracy of tv_resnet152 on ImageNet?', 'type': 'human', 'id': '719cab7a-1814-4430-9444-bb53275ecbad'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_FwAx34gqGmbSRch9CPgVu77W', 'function': {'arguments': '{"query":"Top 1 Accuracy of tv_resnet152 on ImageNet"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--71b8615a-fdd3-4968-8a2d-98eb843adac8-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Top 1 Accuracy of tv_resnet152 on ImageNet'}, 'id': 'call_FwAx34gqGmbSRch9CPgVu77W', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 364, '

2025-08-13 16:29:22,620 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 52%|█████▏    | 27/52 [05:51<05:13, 12.54s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'Under which license is the HuggingFace Transformers library released?', 'type': 'human', 'id': '8005d034-efa6-4314-95b4-4796711b5744'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_jvqFBh87YFIzpjHBUfRlOu3U', 'function': {'arguments': '{"query":"HuggingFace Transformers library license"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--b76e9c6a-6de3-446a-8a15-6c1378ea00e8-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'HuggingFace Transformers library license'}, 'id': 'call_jvqFBh87YFIzpjHBUfRlOu3U', 'type': 'tool_call'}], 'usage_metadata': {'input_token

2025-08-13 16:29:36,255 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 54%|█████▍    | 28/52 [06:04<05:08, 12.87s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the model structure of GPTSAN?', 'type': 'human', 'id': 'fa65cdf5-06c3-47c3-9c80-3efe5baac1bd'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_zKScK5QSUctQnWbn56Joia8O', 'function': {'arguments': '{"query":"GPTSAN model structure"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--3f586238-28c3-4d74-8456-3b5c6fc257d4-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'GPTSAN model structure'}, 'id': 'call_zKScK5QSUctQnWbn56Joia8O', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 357, 'output_tokens': 19, 'total_tokens': 376, 'input_token_det

2025-08-13 16:29:49,715 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 56%|█████▌    | 29/52 [06:18<05:00, 13.05s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'Which LCM LoRA model is derived from the SDXL 1.0 base?', 'type': 'human', 'id': 'b0f6a2b6-ba6a-4ac3-bc94-ec4a2d223f8a'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_bZTquKxP6Cd3ZKWDaDAgndJp', 'function': {'arguments': '{"query":"LCM LoRA model derived from SDXL 1.0 base"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--613b8ace-8465-403b-a1e6-dd7e258cacac-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'LCM LoRA model derived from SDXL 1.0 base'}, 'id': 'call_bZTquKxP6Cd3ZKWDaDAgndJp', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 366, 'ou

2025-08-13 16:30:04,139 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 58%|█████▊    | 30/52 [06:32<04:56, 13.46s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the Top 1 Accuracy for tf_efficientnet_b6 on ImageNet?', 'type': 'human', 'id': 'cbe0c944-3408-4225-b0b3-a0d6b128c517'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_n3QCkaGyAmIfPGYtMK26PqCE', 'function': {'arguments': '{"query":"Top 1 Accuracy for tf_efficientnet_b6 on ImageNet"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--f3f40ca7-d292-4463-8a21-9686a999ed3e-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Top 1 Accuracy for tf_efficientnet_b6 on ImageNet'}, 'id': 'call_n3QCkaGyAmIfPGYtMK26PqCE', 'type': 'tool_call'}], 'usage_metadata': {'

2025-08-13 16:30:16,640 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 60%|█████▉    | 31/52 [06:45<04:36, 13.17s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What utility allows starting multi-gpu training in a Jupyter Notebook?', 'type': 'human', 'id': '0efaa2e7-d558-43d1-b5aa-b9769430edc5'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_48CqXVxGQ4QE2cGMgeByBIMq', 'function': {'arguments': '{"query":"multi-gpu training Jupyter Notebook utility"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--9095adf3-8394-448c-94e5-2a21f370588f-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'multi-gpu training Jupyter Notebook utility'}, 'id': 'call_48CqXVxGQ4QE2cGMgeByBIMq', 'type': 'tool_call'}], 'usage_metadata': {'inpu

2025-08-13 16:30:29,611 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 62%|██████▏   | 32/52 [06:58<04:22, 13.11s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': "Who are the authors of 'Contrastive Search Is What You Need For Neural Text Generation'?", 'type': 'human', 'id': 'fe4c1314-2335-4170-893a-074a9645c6b0'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_ouxnaBQasC4RuyyKnn8zPW4C', 'function': {'arguments': '{"query":"Contrastive Search Is What You Need For Neural Text Generation authors"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--75f27832-5592-4cb8-887b-1689a2c8c4c3-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Contrastive Search Is What You Need For Neural Text Generation authors'}, 'id': 'call_ou

2025-08-13 16:30:41,756 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 63%|██████▎   | 33/52 [07:10<04:03, 12.82s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': "Where can you find BERT's conversion script for porting from TensorFlow to PyTorch?", 'type': 'human', 'id': '7224f428-a128-4f8f-bfda-1809770b1556'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_jpL043QmQNyKYoTy4oSgeBWd', 'function': {'arguments': '{"query":"BERT conversion script TensorFlow to PyTorch"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--bc72e85b-d08d-4caa-8a76-8eee980de75c-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'BERT conversion script TensorFlow to PyTorch'}, 'id': 'call_jpL043QmQNyKYoTy4oSgeBWd', 'type': 'tool_call'}], 'usage_me

2025-08-13 16:30:57,560 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 65%|██████▌   | 34/52 [07:26<04:06, 13.72s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What are the advantages of streaming mode over downloading a dataset?', 'type': 'human', 'id': '9506f159-4196-42b5-8dcc-a85605f17297'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_ZCTEYxQUHpfP4IxmBHgW8TbL', 'function': {'arguments': '{"query":"advantages of streaming mode over downloading a dataset"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--d831dc37-a774-427e-af8c-c42276adb4c6-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'advantages of streaming mode over downloading a dataset'}, 'id': 'call_ZCTEYxQUHpfP4IxmBHgW8TbL', 'type': 'tool_call'}], '

2025-08-13 16:31:12,785 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 67%|██████▋   | 35/52 [07:41<04:00, 14.17s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What teacher model is used with frugalscore_tiny_bert-base_mover-score?', 'type': 'human', 'id': '9bf26aa3-4c5b-4e1d-a609-92a12810e90c'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_ZHMcLGClip6mKSoYQ7COYI1t', 'function': {'arguments': '{"query":"frugalscore_tiny_bert-base_mover-score teacher model"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--f8c693e9-1769-433a-9ca2-a754f3f6ba95-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'frugalscore_tiny_bert-base_mover-score teacher model'}, 'id': 'call_ZHMcLGClip6mKSoYQ7COYI1t', 'type': 'tool_call'}], 'usag

2025-08-13 16:31:25,315 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 69%|██████▉   | 36/52 [07:54<03:38, 13.68s/it]

What is the default activation function used in DistilBertConfig?


2025-08-13 16:31:38,335 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 71%|███████   | 37/52 [08:07<03:22, 13.48s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the default activation function used in DistilBertConfig?', 'type': 'human', 'id': 'fff6bcca-7554-460c-8812-0d3f61875c76'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_BhimV4LyQ0OBiq9CIwzYtyhv', 'function': {'arguments': '{"query":"default activation function in DistilBertConfig"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--eacea43c-918a-422d-b335-f1130c39ef6b-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'default activation function in DistilBertConfig'}, 'id': 'call_BhimV4LyQ0OBiq9CIwzYtyhv', 'type': 'tool_call'}], 'usage_metadata': {'i

2025-08-13 16:31:51,693 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 73%|███████▎  | 38/52 [08:20<03:08, 13.44s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the top 1 accuracy of mobilenetv3_large_100 on ImageNet?', 'type': 'human', 'id': '3fe7eff1-f085-4bbb-b793-15d56362a77f'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_mOnbYqqNIvcIrURENjhjfCvJ', 'function': {'arguments': '{"query":"mobilenetv3_large_100 top 1 accuracy ImageNet"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--7889c160-d1f6-4e61-b8f4-50ef8b88fc71-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'mobilenetv3_large_100 top 1 accuracy ImageNet'}, 'id': 'call_mOnbYqqNIvcIrURENjhjfCvJ', 'type': 'tool_call'}], 'usage_metadata': {'input_

2025-08-13 16:32:04,986 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 75%|███████▌  | 39/52 [08:33<02:54, 13.40s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What version of Python is required to contribute to 🤗 Transformers?', 'type': 'human', 'id': '0efbc24d-70ff-4890-b666-de7039c2fe19'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_XpZ5apNtUnrwWhO3PcA3IgzJ', 'function': {'arguments': '{"query":"Python version required to contribute to Transformers"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--0481e2db-64c0-4175-915b-fd260cb8758b-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Python version required to contribute to Transformers'}, 'id': 'call_XpZ5apNtUnrwWhO3PcA3IgzJ', 'type': 'tool_call'}], 'usage_

2025-08-13 16:32:18,369 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 77%|███████▋  | 40/52 [08:47<02:40, 13.39s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the default value of HF_INFERENCE_ENDPOINT?', 'type': 'human', 'id': '335a6bbc-9631-41fb-bfab-e7100b044e47'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_iiwSCusjTohpClo65CcfVn5n', 'function': {'arguments': '{"query":"default value of HF_INFERENCE_ENDPOINT"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--101d33ea-9cb1-4a7e-960e-89a6151d79ff-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'default value of HF_INFERENCE_ENDPOINT'}, 'id': 'call_iiwSCusjTohpClo65CcfVn5n', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 359, 'output_token

2025-08-13 16:32:30,981 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 79%|███████▉  | 41/52 [08:59<02:24, 13.16s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the name of the ResNet variant that uses squeeze-and-excitation blocks?', 'type': 'human', 'id': '22fd44f4-49a7-4155-9e01-4141e092c62a'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_8yp2V7qGz7TbEKQY1zl7ndOD', 'function': {'arguments': '{"query":"ResNet variant squeeze-and-excitation blocks"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--becd9b06-106a-4076-8282-1f1412ad6961-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'ResNet variant squeeze-and-excitation blocks'}, 'id': 'call_8yp2V7qGz7TbEKQY1zl7ndOD', 'type': 'tool_call'}], 'usage_metada

2025-08-13 16:32:43,073 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 81%|████████  | 42/52 [09:11<02:08, 12.84s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What Python package needs to be installed for the Gradio demo?', 'type': 'human', 'id': '4ffe3ae2-9bf5-4d84-9ff6-62b6ac75cf70'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_yjWheZFEH5krJBdHeo3yzLIn', 'function': {'arguments': '{"query":"Python package for Gradio demo"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--8f851c3b-b099-4494-baab-5d91d1801b73-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Python package for Gradio demo'}, 'id': 'call_yjWheZFEH5krJBdHeo3yzLIn', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 361, 'output_tokens': 2

2025-08-13 16:32:57,106 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 83%|████████▎ | 43/52 [09:25<01:58, 13.20s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What does ZeRO-powered data parallelism store on each GPU?', 'type': 'human', 'id': '2f9b230f-4899-4046-bf51-1e4bddc160a5'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_qoBmqAASsE3WNcLoBwi9knpX', 'function': {'arguments': '{"query":"ZeRO-powered data parallelism store on each GPU"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--64b04945-a877-40c0-ab92-5cc39eced2ed-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'ZeRO-powered data parallelism store on each GPU'}, 'id': 'call_qoBmqAASsE3WNcLoBwi9knpX', 'type': 'tool_call'}], 'usage_metadata': {'input_to

2025-08-13 16:33:10,161 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 85%|████████▍ | 44/52 [09:38<01:45, 13.16s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the license URL for TheBloke/Llama2-22B-Daydreamer-v3-GGUF?', 'type': 'human', 'id': '65e7bd28-17e5-4c23-b140-e9c95f11d8e1'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_t6CJAB67P1hpneZ3o1AhOB19', 'function': {'arguments': '{"query":"TheBloke/Llama2-22B-Daydreamer-v3-GGUF license URL"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--32bd2372-fab1-403d-b3b3-96f9929d9ba7-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'TheBloke/Llama2-22B-Daydreamer-v3-GGUF license URL'}, 'id': 'call_t6CJAB67P1hpneZ3o1AhOB19', 'type': 'tool_call'}], 'usage_metada

2025-08-13 16:33:23,139 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 87%|████████▋ | 45/52 [09:51<01:31, 13.10s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the recommended scheduler for the IP-Adapter face model?', 'type': 'human', 'id': 'ed5b7bfc-0c5b-4818-a1a8-7dadfb82057e'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_P7ZQGqHJiNet3W0GAWjBC4vr', 'function': {'arguments': '{"query":"recommended scheduler for IP-Adapter face model"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--1aefa06c-225e-4b09-9109-69528033df26-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'recommended scheduler for IP-Adapter face model'}, 'id': 'call_P7ZQGqHJiNet3W0GAWjBC4vr', 'type': 'tool_call'}], 'usage_metadata': {'in

2025-08-13 16:33:35,637 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 88%|████████▊ | 46/52 [10:04<01:17, 12.92s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is one advantage of using a generator of lists of texts in `train_new_from_iterator()`?', 'type': 'human', 'id': 'e3c2ab28-00f7-43e0-a94f-4a7aeab04e55'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_8vPrUW6LdBIVoSMNqnjxMVH9', 'function': {'arguments': '{"query":"train_new_from_iterator() advantage of using a generator of lists of texts"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--9f152751-1fd7-49db-a534-4e3104445565-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'train_new_from_iterator() advantage of using a generator of lists of texts'}, 'i

2025-08-13 16:33:50,588 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 90%|█████████ | 47/52 [10:19<01:07, 13.53s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What does the stride argument control in the tokenizer?', 'type': 'human', 'id': 'ff64eeba-547a-449b-9904-bed1d3445551'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_lJ90fbIVfrD4RhIsUQmD2DGv', 'function': {'arguments': '{"query":"stride argument tokenizer"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--46c4d83e-6493-4499-92fe-42f796788d8e-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'stride argument tokenizer'}, 'id': 'call_lJ90fbIVfrD4RhIsUQmD2DGv', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 358, 'output_tokens': 18, 'total_tokens'

2025-08-13 16:34:05,836 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 92%|█████████▏| 48/52 [10:34<00:56, 14.04s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the role of machine learning in predictive healthcare networks?', 'type': 'human', 'id': '52e496c7-7783-4e0f-8b9d-6b8dee5cd391'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_CWZh3jF12P6PrFH8gmjVFvca', 'function': {'arguments': '{"query":"machine learning in predictive healthcare networks"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--32b1f2f5-630c-4ff6-9828-7cbca39e5f0c-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'machine learning in predictive healthcare networks'}, 'id': 'call_CWZh3jF12P6PrFH8gmjVFvca', 'type': 'tool_call'}], 'usage_me

2025-08-13 16:34:17,887 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 94%|█████████▍| 49/52 [10:46<00:40, 13.45s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is the size of the Parquet file for the train split of the rotten_tomatoes dataset?', 'type': 'human', 'id': 'c456583c-1dea-42be-b7b8-f9829ca41b6b'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_ZgE7MEYbxsodM9ufbVi1EpuD', 'function': {'arguments': '{"query":"size of the Parquet file for the train split of the rotten_tomatoes dataset"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--82973fb1-4275-474c-95f8-cc158117dd85-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'size of the Parquet file for the train split of the rotten_tomatoes dataset'}, 'id'

2025-08-13 16:34:30,576 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 96%|█████████▌| 50/52 [10:59<00:26, 13.22s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What are the two types of guides for contributing to Gradio?', 'type': 'human', 'id': '3ec821a0-1a8a-419c-88ae-64f3a8091fb6'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_SpOGJTwAUc9w5MPoMDmVVOt0', 'function': {'arguments': '{"query":"types of guides for contributing to Gradio"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--8c78d6a7-025a-4900-9742-d773e7167250-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'types of guides for contributing to Gradio'}, 'id': 'call_SpOGJTwAUc9w5MPoMDmVVOt0', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 3

2025-08-13 16:34:45,653 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 98%|█████████▊| 51/52 [11:14<00:13, 13.78s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What method is called to compute gradients in the training pipeline?', 'type': 'human', 'id': '7e9128a1-39b4-4d19-be03-30c47712c550'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_lOlAamGiIAySxGd2SG7hPwDH', 'function': {'arguments': '{"query":"compute gradients training pipeline"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--52f43dcd-4c56-4e77-8477-cc5270721381-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'compute gradients training pipeline'}, 'id': 'call_lOlAamGiIAySxGd2SG7hPwDH', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 360, 'o

2025-08-13 16:34:57,528 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
100%|██████████| 52/52 [11:26<00:00, 13.20s/it]

{'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'Who created the Falcon language models?', 'type': 'human', 'id': 'adb7875a-ba48-4da5-a263-5959e578bb3a'}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': '', 'additional_kwargs': {'tool_calls': [{'index': 0, 'id': 'call_RVY3EkNc0x7Bq6s0pcnePtHJ', 'function': {'arguments': '{"query":"Falcon language models creator"}', 'name': 'data_retriever'}, 'type': 'function'}]}, 'response_metadata': {'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c'}, 'type': 'ai', 'id': 'run--de5940c0-062b-48f5-907e-36399524eb43-0', 'tool_calls': [{'name': 'data_retriever', 'args': {'query': 'Falcon language models creator'}, 'id': 'call_RVY3EkNc0x7Bq6s0pcnePtHJ', 'type': 'tool_call'}], 'usage_metadata': {'input_tokens': 355, 'output_tokens': 20, 'total_tokens': 375,




In [23]:
from datasets import Dataset
from ragas import EvaluationDataset, evaluate
from ragas.metrics import (
    FactualCorrectness,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)

# Build the ragas evaluation dataset from `data`, which is not defined in this
# cell and must already exist in the kernel from an earlier cell.
evaluation_dataset = EvaluationDataset.from_list(data)

# Metrics to compute during evaluation.
metrics = [faithfulness, answer_relevancy, context_precision, context_recall]
# Alternative metric set (the reason FactualCorrectness is imported above):
# metrics = [FactualCorrectness()]

In [24]:
# Sanity check: report how many samples the evaluation dataset holds.
n_eval_samples = len(evaluation_dataset)
print(n_eval_samples)

52


In [25]:
from app.config import configs

import os

# evaluate() below is called without an explicit LLM or embedding client, so
# the OpenAI key is presumably picked up from this environment variable
# (NOTE(review): confirm against the installed ragas version).
# Fail fast on a missing/empty key: assigning None to os.environ raises an
# opaque TypeError, and an empty string would only surface once the API
# requests start being rejected.
if not configs.OPENAI_API_KEY:
    raise RuntimeError(
        "configs.OPENAI_API_KEY is not set; cannot run the ragas evaluation."
    )
os.environ['OPENAI_API_KEY'] = configs.OPENAI_API_KEY

# Run the ragas evaluation over `evaluation_dataset` using the metrics chosen
# in `metrics` (both defined in earlier cells).
result = evaluate(
    evaluation_dataset,
    metrics=metrics,
)

print(result)

Evaluating:   0%|          | 0/208 [00:00<?, ?it/s]2025-08-13 16:34:59,747 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 16:34:59,926 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 16:34:59,974 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 16:35:00,026 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 16:35:00,047 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 16:35:00,085 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 16:35:00,098 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 16:35:00,674 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-08-13 16:35:01,101 - INFO - HTTP Request: POST

{'faithfulness': 0.6232, 'answer_relevancy': 0.7539, 'context_precision': 0.4679, 'context_recall': 0.4904}
