In [1]:
# !pip install pandas numpy nltk rouge-score bert-score sentence-transformers scikit-learn pycocoevalcap torch transformers

In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from collections import Counter
import numpy as np

# Initialize the Sentence Transformer model once at module level; the relevance
# metrics in this cell reuse it through `model.encode(...)`.
model = SentenceTransformer('all-MiniLM-L6-v2')

def safe_convert_to_string(value):
    """Safely convert an arbitrary input value to a string.

    Args:
        value: Any object, e.g. a cell read from a DataFrame.

    Returns:
        str: ``""`` when ``value`` is ``None`` or a scalar missing value (as
        judged by ``pd.isna``); an integer-looking string for floats without a
        fractional part (``3.0`` -> ``"3"``); ``str(value)`` otherwise.
        List-like inputs are never treated as missing and are rendered with
        ``str``.
    """
    # Cheapest check first; no need to involve pandas for None.
    if value is None:
        return ""

    # pd.isna returns an element-wise array for list-likes, and using such an
    # array in a boolean context raises ValueError. Only a 0-dimensional result
    # answers the question "is this single value missing?".
    missing = pd.isna(value)
    if np.ndim(missing) == 0 and bool(missing):
        return ""

    # Render whole-number floats without the trailing ".0".
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)

def exact_match(pred, gt):
    """Return 1 when prediction and ground truth match exactly, else 0.

    Both inputs go through ``safe_convert_to_string`` and are then compared
    after stripping surrounding whitespace and lowercasing.
    """
    normalized = [
        safe_convert_to_string(text).strip().lower() for text in (pred, gt)
    ]
    return 1 if normalized[0] == normalized[1] else 0

def word_f1(pred, gt):
    """Compute a word-level F1 score between prediction and ground truth.

    Each text is lowercased, split on whitespace and reduced to its set of
    distinct tokens, so a repeated word counts only once.

    Returns:
        float: harmonic mean of precision and recall over the two token sets;
        0.0 when either set is empty or the sets share no token.
    """
    predicted_words = set(safe_convert_to_string(pred).lower().split())
    reference_words = set(safe_convert_to_string(gt).lower().split())

    # Guard against division by zero for empty inputs.
    if not predicted_words or not reference_words:
        return 0.0

    overlap = len(predicted_words & reference_words)
    precision = overlap / len(predicted_words)
    recall = overlap / len(reference_words)

    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * (precision * recall) / total

def context_relevance(context, answer):
    """Score how related a retrieved context and an answer are.

    Both texts are converted with ``safe_convert_to_string``, embedded with the
    module-level ``model`` and compared by cosine similarity.

    Args:
        context: Retrieved context text (any value convertible to string).
        answer: Generated answer text (any value convertible to string).

    Returns:
        float: cosine similarity of the two embeddings, or 0.0 when either
        text is empty or the embedding/similarity step fails.
    """
    import warnings

    context = safe_convert_to_string(context)
    answer = safe_convert_to_string(answer)

    if not context or not answer:
        return 0.0

    try:
        context_emb = model.encode(context)
        answer_emb = model.encode(answer)
        return float(cosine_similarity([context_emb], [answer_emb])[0][0])
    except Exception as exc:
        # Best-effort metric: keep the 0.0 fallback, but surface the failure
        # instead of hiding it, and let KeyboardInterrupt/SystemExit propagate.
        warnings.warn(
            f"context_relevance failed, returning 0.0: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return 0.0

def answer_relevance(question, answer):
    """Score how related a question and an answer are.

    Both texts are converted with ``safe_convert_to_string``, embedded with the
    module-level ``model`` and compared by cosine similarity.

    Args:
        question: Question text (any value convertible to string).
        answer: Generated answer text (any value convertible to string).

    Returns:
        float: cosine similarity of the two embeddings, or 0.0 when either
        text is empty or the embedding/similarity step fails.
    """
    import warnings

    question = safe_convert_to_string(question)
    answer = safe_convert_to_string(answer)

    if not question or not answer:
        return 0.0

    try:
        q_emb = model.encode(question)
        a_emb = model.encode(answer)
        return float(cosine_similarity([q_emb], [a_emb])[0][0])
    except Exception as exc:
        # Best-effort metric: keep the 0.0 fallback, but surface the failure
        # instead of hiding it, and let KeyboardInterrupt/SystemExit propagate.
        warnings.warn(
            f"answer_relevance failed, returning 0.0: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return 0.0

def diversity_score(text):
    """Compute the lexical diversity (type/token ratio) of a text.

    The text is converted with ``safe_convert_to_string``, lowercased and split
    on whitespace.

    Returns:
        float: number of distinct tokens divided by the total number of
        tokens; 0.0 when the text contains no tokens.
    """
    # The converted value is always a str, so lower()/split() cannot fail and
    # no exception handler is needed here.
    tokens = safe_convert_to_string(text).lower().split()
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)

def evaluate_rag_simple(data):
    """Evaluate RAG outputs and return one row of metrics per item.

    Args:
        data: Iterable of dicts with the optional keys ``"question"``,
            ``"contexts"`` (list of context strings), ``"answer"`` and
            ``"ground_truth"``. Missing keys default to empty values.

    Returns:
        pd.DataFrame: one row per item with the columns ``Question``,
        ``Context Relevance``, ``Answer Relevance``, ``Diversity``,
        ``Answer Length`` and ``Ground Truth Length``.
    """
    results = []

    for item in data:
        pred = item.get("answer", "")
        ref = item.get("ground_truth", "")
        question = item.get("question", "")

        # Only the first context is scored. An empty list (nothing retrieved)
        # or a non-list value falls back to "" instead of raising IndexError.
        contexts = item.get("contexts", [""])
        context = contexts[0] if isinstance(contexts, list) and contexts else ""

        metrics = {
            "Question": question,
            # Reference-based metrics are currently disabled:
            # "Exact Match": exact_match(pred, ref),
            # "Word F1": word_f1(pred, ref),
            "Context Relevance": context_relevance(context, pred),
            "Answer Relevance": answer_relevance(question, pred),
            "Diversity": diversity_score(pred),
            # Lengths are measured in whitespace-separated tokens.
            "Answer Length": len(safe_convert_to_string(pred).split()),
            "Ground Truth Length": len(safe_convert_to_string(ref).split()),
        }

        results.append(metrics)

    return pd.DataFrame(results)


def load_data(
    filepath,
    question_col="question",
    context_col="context_retriever_cohere",
    answer_col="response_cohere",
    ground_truth_col="answer",
):
    """Load evaluation records from a CSV file.

    Args:
        filepath: Path of the CSV file to read.
        question_col: Name of the column holding the question.
        context_col: Name of the column holding the retrieved context.
        answer_col: Name of the column holding the generated answer.
        ground_truth_col: Name of the column holding the reference answer.

    The column defaults match the original hard-coded names, so existing
    ``load_data(filepath)`` calls behave as before; pass other names to load
    the outputs of a different retriever/generator from the same file.

    Returns:
        list[dict]: one dict per CSV row with the keys ``"question"``,
        ``"contexts"`` (a single-element list), ``"answer"`` and
        ``"ground_truth"``.

    Raises:
        KeyError: if a requested column is missing from a row.
    """
    df = pd.read_csv(filepath)
    return [
        {
            "question": row[question_col],
            # Wrapped in a list: downstream code expects a list of contexts.
            "contexts": [row[context_col]],
            "answer": row[answer_col],
            "ground_truth": row[ground_truth_col],
        }
        for _, row in df.iterrows()
    ]


if __name__ == "__main__":
    # End-to-end run: load the predictions, score them, persist the per-item
    # table and print both the table and the column averages.
    # `results_df` and `avg_scores` are reused by the cells that follow.
    filepath = "E:/source_code/nlp/med_studio/data_sample_50_predicted.csv"
    data = load_data(filepath)
    evaluation_data = data

    results_df = evaluate_rag_simple(evaluation_data)
    # Average only the numeric metric columns (the question text is skipped).
    avg_scores = results_df.mean(numeric_only=True)
    results_df.to_csv("eval_v2_cohere.csv", index=False)

    print("\nChi tiết đánh giá:", results_df, sep="\n")
    print("\nĐiểm trung bình các metrics:", avg_scores, sep="\n")


Chi tiết đánh giá:
                                             Question  Context Relevance  \
0                       What is (are) Breast Cancer ?           0.725674   
1       What are the symptoms of Genochondromatosis ?           0.816604   
2                         What causes Pars planitis ?           0.868330   
3   What are the genetic changes related to genito...           0.859980   
4   What are the treatments for Pseudopelade of Br...           0.805402   
5                  What is (are) Preauricular sinus ?           0.910615   
6   How many people are affected by capillary malf...           0.760703   
7                      What is (are) Drug Reactions ?           0.811457   
8                      What is (are) Ollier disease ?           0.853164   
9                                  What causes ARDS ?           0.856890   
10  What is (are) carnitine-acylcarnitine transloc...           0.931598   
11  How many people are affected by spondylocostal...           0.77

In [3]:
# Re-print the per-question results table produced by the evaluation cell.
print("\nChi tiết đánh giá:", results_df, sep="\n")


Chi tiết đánh giá:
                                             Question  Context Relevance  \
0                       What is (are) Breast Cancer ?           0.725674   
1       What are the symptoms of Genochondromatosis ?           0.816604   
2                         What causes Pars planitis ?           0.868330   
3   What are the genetic changes related to genito...           0.859980   
4   What are the treatments for Pseudopelade of Br...           0.805402   
5                  What is (are) Preauricular sinus ?           0.910615   
6   How many people are affected by capillary malf...           0.760703   
7                      What is (are) Drug Reactions ?           0.811457   
8                      What is (are) Ollier disease ?           0.853164   
9                                  What causes ARDS ?           0.856890   
10  What is (are) carnitine-acylcarnitine transloc...           0.931598   
11  How many people are affected by spondylocostal...           0.77

In [4]:
# Show the per-question results table as this cell's output.
results_df

Unnamed: 0,Question,Context Relevance,Answer Relevance,Diversity,Answer Length,Ground Truth Length
0,What is (are) Breast Cancer ?,0.725674,0.707059,0.553299,197,68
1,What are the symptoms of Genochondromatosis ?,0.816604,0.81279,0.74359,39,244
2,What causes Pars planitis ?,0.86833,0.83276,0.842105,76,107
3,What are the genetic changes related to genito...,0.85998,0.672646,0.84127,63,157
4,What are the treatments for Pseudopelade of Br...,0.805402,0.859622,0.676471,68,102
5,What is (are) Preauricular sinus ?,0.910615,0.631521,0.752137,117,121
6,How many people are affected by capillary malf...,0.760703,0.899012,1.0,14,26
7,What is (are) Drug Reactions ?,0.811457,0.531261,0.810127,79,205
8,What is (are) Ollier disease ?,0.853164,0.82684,0.75,84,90
9,What causes ARDS ?,0.85689,0.778448,0.677083,96,101


In [5]:
# Re-print the average of each numeric metric computed by the evaluation cell.
print("\nĐiểm trung bình các metrics:", avg_scores, sep="\n")


Điểm trung bình các metrics:
Context Relevance        0.780344
Answer Relevance         0.730061
Diversity                0.729163
Answer Length          103.140000
Ground Truth Length    212.860000
dtype: float64


In [6]:
# Show the averaged metrics as this cell's output.
avg_scores

Context Relevance        0.780344
Answer Relevance         0.730061
Diversity                0.729163
Answer Length          103.140000
Ground Truth Length    212.860000
dtype: float64

In [7]:
# Completion marker printed once the preceding cells have run.
print("doneee ...")

doneee ...
