In [69]:
from tqdm.auto import tqdm
from call_model import call_chat_once
import asyncio
import httpx
from uuid import uuid4
import json
import datasets

from typing import Optional
import os
import csv
import nest_asyncio
nest_asyncio.apply()

In [70]:
from datasets import load_dataset

eval_dataset = load_dataset("csv", data_files="data/eval.csv")["train"]

In [71]:
print(eval_dataset)

Dataset({
    features: ['context', 'question', 'answer', 'source_doc', 'groundedness_score', 'groundedness_eval', 'relevance_score', 'relevance_eval', 'standalone_score', 'standalone_eval'],
    num_rows: 52
})


In [72]:
OUTPUT_FILE = "data/eval_rag_results.csv"

def load_existing_results(file_path):
    dataset = []
    existing_questions = set()
    if os.path.exists(file_path):
        with open(file_path, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                existing_questions.add(row["user_input"])
                # parse retrieved_contexts lại từ JSON
                row["retrieved_contexts"] = json.loads(row["retrieved_contexts"])
                dataset.append(row)
    return existing_questions, dataset


def append_result_to_csv(file_path, row, fieldnames):
    file_exists = os.path.exists(file_path)
    with open(file_path, "a", newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        writer.writerow(row)

def run_rag_tests(
    eval_dataset: datasets.Dataset,
    verbose: Optional[bool] = False
):
    """Runs RAG tests on the given dataset and saves the results to the given output file."""
    fieldnames = ["user_input", "retrieved_contexts", "response", "reference"]

    processed_questions, dataset = load_existing_results(OUTPUT_FILE)
    print(f"Đã có {len(processed_questions)} câu hỏi xử lý trước đó, sẽ bỏ qua chúng.")


    for i, example in enumerate(tqdm(eval_dataset)):
        question = example["question"]
        print(question)
        if question in processed_questions:
            continue
        
        payload = {
            "question": question,
            "session_id": str(uuid4()),
            "chat_history": []
        }

        # try:
        relevant_docs, answer = asyncio.run(call_chat_once(payload))
        # except Exception as e:
        #     print(f"Error from question: '{question}': {e}")

        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')

        row = {
            "user_input": question,
            "retrieved_contexts": json.dumps(
                [doc.get('page_content') for doc in relevant_docs],
                ensure_ascii=False
            ),
            "response": answer,
            "reference": example['answer']
        }
        append_result_to_csv(OUTPUT_FILE, row, fieldnames)
        dataset.append({
            "user_input": question,
            "retrieved_contexts": json.loads(row["retrieved_contexts"]),
            "response": answer,
            "reference": example['answer']
        })
    return dataset

In [None]:
data = run_rag_tests(eval_dataset=eval_dataset)


Đã có 5 câu hỏi xử lý trước đó, sẽ bỏ qua chúng.


  0%|          | 0/52 [00:00<?, ?it/s]

Who proposed the hybrid Vision Transformer model?
Who are the authors of the Longformer paper?
How many layers does the distilbert-base-uncased model have?
What is the style of TPU access when using Google Colab?
What is the default label inferred by Gradio for the input parameter in the GUI?
What does the 'beta_start' parameter value mean in DDPMScheduler?


2025-08-13 11:46:30,077 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 12%|█▏        | 6/52 [00:08<01:01,  1.33s/it]

How do you register a custom Resnet model to the auto classes in Transformers?


2025-08-13 11:46:36,260 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 13%|█▎        | 7/52 [00:14<01:41,  2.27s/it]

Who is the author of the paper 'Learning Transferable Visual Models From Natural Language Supervision'?


2025-08-13 11:46:39,864 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 15%|█▌        | 8/52 [00:17<01:52,  2.55s/it]

What is the purpose of local attention in Longformer?


2025-08-13 11:46:44,486 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 17%|█▋        | 9/52 [00:22<02:10,  3.04s/it]

What library is used to import the text encoder?


2025-08-13 11:46:48,595 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 19%|█▉        | 10/52 [00:26<02:19,  3.31s/it]

What license is used for TheBloke/Chronohermes-Grad-L2-13B-GPTQ?


2025-08-13 11:46:52,788 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 21%|██        | 11/52 [00:30<02:25,  3.55s/it]

What is a potential legal issue with using outputs from language models?


2025-08-13 11:46:56,392 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 23%|██▎       | 12/52 [00:34<02:22,  3.56s/it]

What is the filename used when downloading the model from the Hugging Face Hub?


2025-08-13 11:46:59,381 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 25%|██▌       | 13/52 [00:37<02:12,  3.40s/it]

What method is used by ViTHybridImageProcessor?


2025-08-13 11:47:03,147 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 27%|██▋       | 14/52 [00:41<02:13,  3.50s/it]

Who contributed the code component for syntax highlighting in Gradio?


2025-08-13 11:47:08,425 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 29%|██▉       | 15/52 [00:46<02:28,  4.02s/it]

Where can you find the Apache License, Version 2.0?


2025-08-13 11:47:11,847 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 31%|███       | 16/52 [00:49<02:18,  3.84s/it]

What tool can be used for sentiment analysis in NLP?


2025-08-13 11:47:17,666 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 33%|███▎      | 17/52 [00:55<02:34,  4.43s/it]

Who are the authors of the paper associated with Graphormer?


2025-08-13 11:47:21,339 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 35%|███▍      | 18/52 [00:59<02:22,  4.20s/it]

What function does Accelerate provide to determine a device map?


2025-08-13 11:47:25,040 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 37%|███▋      | 19/52 [01:02<02:13,  4.05s/it]

Why is the __call__ method decorated with torch.no_grad?


2025-08-13 11:47:29,130 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 38%|███▊      | 20/52 [01:07<02:10,  4.06s/it]

What is the latency in milliseconds with fp16 optimization?


2025-08-13 11:47:32,316 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 40%|████      | 21/52 [01:10<01:57,  3.80s/it]

How can you share a Gradio demo publicly?


2025-08-13 11:47:36,284 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 42%|████▏     | 22/52 [01:14<01:55,  3.85s/it]

What is the Top 1 Accuracy of swsl_resnet18 on ImageNet?


2025-08-13 11:47:39,192 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 44%|████▍     | 23/52 [01:17<01:43,  3.57s/it]

What is the license for the weights of the SWSL ResNet models?


2025-08-13 11:47:42,385 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 46%|████▌     | 24/52 [01:20<01:36,  3.46s/it]

Who proposed Consistency Models?


2025-08-13 11:47:45,227 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 48%|████▊     | 25/52 [01:23<01:28,  3.27s/it]

What is the Top 1 Accuracy of tv_resnet152 on ImageNet?


2025-08-13 11:47:48,670 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 50%|█████     | 26/52 [01:26<01:26,  3.32s/it]

Under which license is the HuggingFace Transformers library released?


2025-08-13 11:47:51,302 - INFO - HTTP Request: POST http://localhost:8001/api/v1/chat-evaluation "HTTP/1.1 200 OK"
 52%|█████▏    | 27/52 [01:29<01:17,  3.11s/it]

What is the model structure of GPTSAN?


In [None]:
from ragas import EvaluationDataset
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    FactualCorrectness
)
evaluation_dataset = EvaluationDataset.from_list(data)

# Khởi tạo các metric
metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
]
# metrics = [FactualCorrectness()]

In [None]:
print(len(evaluation_dataset))

5


In [None]:
from app.config import configs

import os
os.environ['OPENAI_API_KEY'] = configs.OPENAI_API_KEY

result = evaluate(
    evaluation_dataset,
    metrics=metrics,
)

print(result) 

Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]2025-08-13 11:44:14,131 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 11:44:14,278 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 11:44:14,389 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 11:44:14,407 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 11:44:14,437 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 11:44:14,453 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 11:44:14,462 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 11:44:14,477 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-13 11:44:14,503 - INFO - HTTP Request:

{'faithfulness': 0.8000, 'answer_relevancy': 0.9630, 'context_precision': 0.7500, 'context_recall': 0.8000}
