In [1]:
# !pip install pandas numpy nltk rouge-score bert-score sentence-transformers scikit-learn pycocoevalcap torch transformers

In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from collections import Counter

# Sentence-embedding model shared by context_relevance() and answer_relevance();
# it is instantiated once at module level so both functions reuse the same object.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


def word_f1(pred, gt):
    """Word-level F1 between a prediction and a reference answer.

    Both strings are lower-cased, split on whitespace and reduced to their
    sets of distinct words, so a repeated word counts only once.

    Args:
        pred: Predicted answer text.
        gt: Ground-truth answer text.

    Returns:
        The harmonic mean of word precision and recall, or 0.0 when either
        side has no words or the two sides share no word.
    """
    predicted_words = set(pred.lower().split())
    reference_words = set(gt.lower().split())

    # Guard first: an empty side would make precision or recall undefined.
    if not predicted_words or not reference_words:
        return 0.0

    overlap = len(predicted_words & reference_words)
    # No shared word means precision + recall == 0, so F1 is defined as 0.
    if overlap == 0:
        return 0.0

    precision = overlap / len(predicted_words)
    recall = overlap / len(reference_words)
    return 2 * (precision * recall) / (precision + recall)


def _is_missing_text(text):
    """Return True for values that carry no text: None, "" or a float NaN.

    pandas represents an empty CSV cell as float NaN, which is truthy and
    would slip past a plain ``not text`` check.
    """
    return not text or (isinstance(text, float) and text != text)


def _embedding_similarity(text_a, text_b):
    """Cosine similarity between the embeddings of two texts, as a Python float.

    Returns 0.0 without touching the model when either text is missing.
    """
    if _is_missing_text(text_a) or _is_missing_text(text_b):
        return 0.0
    emb_a = model.encode(text_a)
    emb_b = model.encode(text_b)
    # cosine_similarity expects 2-D inputs and yields a 1x1 matrix here;
    # float() keeps the return type consistent with the 0.0 guard path.
    return float(cosine_similarity([emb_a], [emb_b])[0][0])


def context_relevance(context, answer):
    """Semantic similarity between a retrieved context and the answer.

    Args:
        context: Retrieved context text.
        answer: Generated answer text.

    Returns:
        0.0 when either input is missing (None, "" or NaN); otherwise the
        cosine similarity of the two sentence embeddings as a float. Cosine
        similarity is nominally in [-1, 1], so the score is not strictly
        confined to 0-1.
    """
    return _embedding_similarity(context, answer)


def answer_relevance(question, answer):
    """Semantic similarity between the question and the answer.

    Args:
        question: Question text.
        answer: Generated answer text.

    Returns:
        0.0 when either input is missing (None, "" or NaN); otherwise the
        cosine similarity of the two sentence embeddings as a float. Cosine
        similarity is nominally in [-1, 1], so the score is not strictly
        confined to 0-1.
    """
    return _embedding_similarity(question, answer)


def diversity_score(text):
    """Lexical diversity of a text: distinct words divided by total words.

    The text is lower-cased and split on whitespace before counting.

    Args:
        text: Text to measure.

    Returns:
        The ratio of unique words to all words, or 0.0 when the text
        contains no words.
    """
    words = text.lower().split()
    total = len(words)
    return len(set(words)) / total if total else 0.0


def _text_or_empty(value):
    """Map a missing value (None or float NaN) to ""; return anything else unchanged."""
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return value


def evaluate_rag_simple(data):
    """Score every RAG sample and return one row of metrics per sample.

    Args:
        data: Iterable of dicts with the optional keys "question", "answer",
            "ground_truth" and "contexts" (a sequence of retrieved passages).
            Missing keys, None/NaN values and an empty "contexts" sequence
            are all treated as empty text.

    Returns:
        pandas.DataFrame with the columns "Question", "Context Relevance",
        "Answer Relevance", "Diversity", "Answer Length" and
        "Ground Truth Length", one row per input item in input order.
    """
    rows = []

    for item in data:
        pred = _text_or_empty(item.get("answer"))
        ref = _text_or_empty(item.get("ground_truth"))
        question = _text_or_empty(item.get("question"))

        # Only the first retrieved passage is scored. An explicit None or an
        # empty sequence would otherwise raise on the [0] lookup.
        contexts = item.get("contexts")
        has_context = contexts is not None and len(contexts) > 0
        context = _text_or_empty(contexts[0]) if has_context else ""

        rows.append({
            "Question": question,
            # "Word F1": word_f1(pred, ref),  # disabled in the original notebook
            "Context Relevance": context_relevance(context, pred),
            "Answer Relevance": answer_relevance(question, pred),
            "Diversity": diversity_score(pred),
            "Answer Length": len(pred.split()),
            "Ground Truth Length": len(ref.split()),
        })

    return pd.DataFrame(rows)


def _cell_to_text(value):
    """Return a CSV cell as a string, mapping missing values (NaN/None) to ""."""
    return "" if pd.isna(value) else str(value)


def load_data(filepath):
    """Load evaluation samples from a CSV file.

    The CSV must provide the columns "question", "context_retriever_llama",
    "response_llama" and "answer".

    Args:
        filepath: Path (or any object accepted by ``pandas.read_csv``) of the
            CSV file to read.

    Returns:
        A list with one dict per CSV row, holding the keys "question",
        "contexts" (a one-element list), "answer" (the model response) and
        "ground_truth" (the reference answer). Every value is a string;
        empty cells become "" instead of float NaN so that downstream
        ``.lower()`` / ``.split()`` calls do not fail.

    Raises:
        KeyError: If the file has rows but one of the required columns is
            missing.
    """
    df = pd.read_csv(filepath)
    return [
        {
            "question": _cell_to_text(row["question"]),
            "contexts": [_cell_to_text(row["context_retriever_llama"])],
            "answer": _cell_to_text(row["response_llama"]),
            "ground_truth": _cell_to_text(row["answer"]),
        }
        for row in df.to_dict("records")
    ]


if __name__ == "__main__":
    # Read the evaluation samples from the predictions CSV.
    filepath = "E:/source_code/nlp/med_studio/data_sample_50_predicted.csv"
    data = load_data(filepath)
    evaluation_data = data

    # Score every sample, average the numeric metric columns, then persist
    # the per-sample table next to the notebook.
    results_df = evaluate_rag_simple(evaluation_data)
    avg_scores = results_df.mean(numeric_only=True)
    results_df.to_csv("eval_v2_llama.csv", index=False)

    # Each print emits the heading, a newline, then the object's text form.
    print("\nChi tiết đánh giá:", results_df, sep="\n")
    print("\nĐiểm trung bình các metrics:", avg_scores, sep="\n")


Chi tiết đánh giá:
                                             Question  Context Relevance  \
0                       What is (are) Breast Cancer ?           0.652565   
1       What are the symptoms of Genochondromatosis ?           0.825076   
2                         What causes Pars planitis ?           0.840998   
3   What are the genetic changes related to genito...           0.710824   
4   What are the treatments for Pseudopelade of Br...           0.816476   
5                  What is (are) Preauricular sinus ?           0.801746   
6   How many people are affected by capillary malf...           0.678636   
7                      What is (are) Drug Reactions ?           0.565405   
8                      What is (are) Ollier disease ?           0.856656   
9                                  What causes ARDS ?           0.886695   
10  What is (are) carnitine-acylcarnitine transloc...           0.820138   
11  How many people are affected by spondylocostal...           0.77

In [3]:
# Plain-text dump of the per-question metrics table (heading, newline, table).
print("\nChi tiết đánh giá:", results_df, sep="\n")


Chi tiết đánh giá:
                                             Question  Context Relevance  \
0                       What is (are) Breast Cancer ?           0.652565   
1       What are the symptoms of Genochondromatosis ?           0.825076   
2                         What causes Pars planitis ?           0.840998   
3   What are the genetic changes related to genito...           0.710824   
4   What are the treatments for Pseudopelade of Br...           0.816476   
5                  What is (are) Preauricular sinus ?           0.801746   
6   How many people are affected by capillary malf...           0.678636   
7                      What is (are) Drug Reactions ?           0.565405   
8                      What is (are) Ollier disease ?           0.856656   
9                                  What causes ARDS ?           0.886695   
10  What is (are) carnitine-acylcarnitine transloc...           0.820138   
11  How many people are affected by spondylocostal...           0.77

In [4]:
# Bare last expression: lets the notebook render the metrics table with its rich display.
results_df

Unnamed: 0,Question,Context Relevance,Answer Relevance,Diversity,Answer Length,Ground Truth Length
0,What is (are) Breast Cancer ?,0.652565,0.590611,0.552106,451,68
1,What are the symptoms of Genochondromatosis ?,0.825076,0.848552,0.756098,41,244
2,What causes Pars planitis ?,0.840998,0.840961,0.85,80,107
3,What are the genetic changes related to genito...,0.710824,0.752809,0.607362,163,157
4,What are the treatments for Pseudopelade of Br...,0.816476,0.821685,0.818182,44,102
5,What is (are) Preauricular sinus ?,0.801746,0.709372,0.734375,64,121
6,How many people are affected by capillary malf...,0.678636,0.740368,1.0,15,26
7,What is (are) Drug Reactions ?,0.565405,0.619543,0.703125,64,205
8,What is (are) Ollier disease ?,0.856656,0.84149,0.759259,54,90
9,What causes ARDS ?,0.886695,0.619919,0.763158,76,101


In [5]:
# Plain-text dump of the per-metric averages (heading, newline, Series).
print("\nĐiểm trung bình các metrics:", avg_scores, sep="\n")


Điểm trung bình các metrics:
Context Relevance        0.777956
Answer Relevance         0.775119
Diversity                0.735691
Answer Length          114.400000
Ground Truth Length    212.860000
dtype: float64


In [6]:
# Bare last expression: shows the averaged metrics as the cell's output.
avg_scores

Context Relevance        0.777956
Answer Relevance         0.775119
Diversity                0.735691
Answer Length          114.400000
Ground Truth Length    212.860000
dtype: float64

In [7]:
# Marker printed when execution reaches the end of the notebook.
print("doneee ...")

doneee ...
