# Evaluate RAG

In [1]:
import sys
sys.path.append("../..")
from datasets import load_dataset
from src.service.provider import ProviderService

provider = ProviderService()
# provider.config.enable_tracing(project="EVALUATE_RAG")

In [2]:
QA_REPO = "BroDeadlines/QA.TDT.FQA_tu_van_hoc_duong"
QA_SPLIT = "INDEX.medium_index_TDT"
qa_dataset = load_dataset(QA_REPO, split=QA_SPLIT)
qa_dataset

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'metadata'],
    num_rows: 144
})

# Run RAG

In [14]:
# donwload corpora dataset
# DATA_REPO = "BroDeadlines/TEST.UEH.ueh_copora_data"
DATA_REPO = "BroDeadlines/TEST.UEH.ueh_copora_data"
# DATA_SPLIT = "INDEX.medium_index_TDT_clean"
SUBSET = "default"
# dataset = load_dataset(DATA_REPO, split=DATA_SPLIT)
dataset = load_dataset(DATA_REPO, SUBSET)
# dataset = dataset['train']
dataset

DatasetDict({
    train: Dataset({
        features: ['url', 'content', 'doc_id', 'metadata', 'split', 'shards', 'propositions', 'proposition_list', '__index_level_0__'],
        num_rows: 196
    })
})

In [3]:
from src.rag.hg_parent_retriever import HugFaceParentRAG
from src.rag.hyde_rag import HydeRAG, HydeHybridSearchRAG, RAG
from src.utils.type_utils import get_default_config

config = get_default_config()

config['vec_index'] = "vec-raptor-medium_index_tdt_vi"
config['txt_index'] = "TEX-raptor-ueh-data-tree-unique"

config['total_k'] = 8
config['llm'] = "gemini-1.0-pro"
config

{'llm': 'gemini-1.0-pro',
 'total_k': 8,
 'txt_weight': 0.5,
 'vec_weight': 0.5,
 'vec_index': 'vec-raptor-medium_index_tdt_vi',
 'txt_index': 'TEX-raptor-ueh-data-tree-unique'}

In [4]:
UNI = "Đại học Kinh tế TP. Hồ Chí Minh"
# UNI = "Tôn Đức Thắng"
dataset = None
# rag = HugFaceParentRAG(provider=provider, config=config, text_corpora=dataset, uni = UNI)
rag = RAG(provider=provider, config=config, uni=UNI)
# rag = HydeHybridSearchRAG(provider=provider, config=config, uni=UNI)

In [6]:
# q = "sinh viên tôn đức thắng"
# q = "Đại học Tôn Đức Thắng có những phương thức tuyển sinh nào?"
# q = "Cho em hỏi là nếu em học chất lượg cao thì em sẽ học ở chi nhánh nào , sẽ học khác với đại trà chỗ nào và cơ sở vật chất ra làm sao ạ . Em cảm ơn"
q = "Câu 2: Học phí và học bổng đào tạo tại Vĩnh Long có gì khác so với đào tại Thành phố Hồ Chí Minh"
a = rag.answer(q)

a['answer']

'Dữ liệu cung cấp không đề cập đến sự khác biệt về học phí và học bổng đào tạo tại Vĩnh Long so với Thành phố Hồ Chí Minh.'

## Run

In [5]:
def eval_no_score1(row):
    resp = rag.answer(row['question'])
    row['answer'] = resp['answer']
    row['exc_second'] = resp['exc_second']
    docs = resp['retrieved_docs']
    result = [{'content': d.page_content, "doc_id": d.metadata['doc_id'], "score": -1.0} for d in docs]
    row['evaluation'] = result
    return row

In [6]:
eval_dataset = qa_dataset.map(eval_no_score1)



Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._co

## Push

In [7]:
eval_dataset

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'metadata', 'exc_second', 'evaluation'],
    num_rows: 144
})

In [8]:
PUSH_REPO = "BroDeadlines/EVAL.NEW.raptor.IR_evaluation"
PUSH_SUBSET = "raptor_no_hyde_k8"

In [9]:
eval_dataset.push_to_hub(PUSH_REPO, PUSH_SUBSET)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/7.25k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/BroDeadlines/EVAL.NEW.raptor.IR_evaluation/commit/3490dd05b628672619d828007f8b4bed09ca1434', commit_message='Upload dataset', commit_description='', oid='3490dd05b628672619d828007f8b4bed09ca1434', pr_url=None, pr_revision=None, pr_num=None)

## Update shards

### proposition

In [12]:


pros = {}

def map_pros(row):
    pros[row['doc_id']] = row['proposition_list']
    return row

dataset.map(map_pros)

AttributeError: 'NoneType' object has no attribute 'map'

In [16]:
import json
def parent(row):
    row['metadata'] = json.dumps({"shards": 1})
    return row
    
def no_parent(row):
    shards = pros[row['doc_id']]
    row['metadata'] = json.dumps({"shards": len(shards)})
    return row
    

In [17]:
eval_dataset = eval_dataset.map(no_parent)

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

### RAPTOR

In [22]:
CLUSTER_REPO = "BroDeadlines/TEST.NEW.PART_CLUSTER.raptor.edu_tdt_data"
SUMMERIZE_REPO = "BroDeadlines/TEST.NEW.PART_SUMMERIZE.raptor.edu_tdt_data"
SPLIT = "TEST.medium_tdt_raptor_vi"
# 
# CLUSTER_REPO = "BroDeadlines/TEST.PART_CLUSTER.UEH.raptor.edu_data"
# SUMMERIZE_REPO = "BroDeadlines/TEST.PART_SUMMERIZE.UEH.raptor.edu_tdt_data"
# SUBSET = 'unique'

cluster_dataset =  load_dataset(CLUSTER_REPO, split=SPLIT)
summerize_dataset =  load_dataset(SUMMERIZE_REPO, split=SPLIT)

In [23]:
easy_shards = {}
hard_shards = {}

def get_shards_cluster5(row):
    d_id = row['doc_ids']
    if d_id not in hard_shards:
        hard_shards[d_id] = row['hard_shards']
        easy_shards[d_id] = row['easy_shards']
    return row

# cluster_dataset.filter(lambda row: row['level_id'] == "tree_1").map(get_shards_cluster_1)
# summerize_dataset.map(get_shards_cluster_1)
len(easy_shards)

0

In [24]:
cluster_dataset.filter(lambda row: row['level_id'] == "tree_1").map(get_shards_cluster5)
len(easy_shards)

Map:   0%|          | 0/1254 [00:00<?, ? examples/s]

344

In [25]:
summerize_dataset.map(get_shards_cluster5)
len(easy_shards)

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

613

In [20]:
195 + 283

478

In [26]:
import json
def update_shards(row):
    id = row['doc_id']
    return {**row, "metadata": json.dumps({"easy_shards": easy_shards[id], "hard_shards": hard_shards[id]})}

test_ds_update = eval_dataset.map(update_shards)
test_ds_update

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'metadata', 'exc_second', 'evaluation'],
    num_rows: 144
})

In [27]:
test_ds_update.filter(lambda e: len(e['metadata']) > 0)

Filter:   0%|          | 0/144 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'metadata', 'exc_second', 'evaluation'],
    num_rows: 144
})

In [28]:
print(PUSH_REPO)
PUSH_SUBSET

BroDeadlines/EVAL.NEW.raptor.IR_evaluation


'raptor_no_hyde_k8'

In [29]:
test_ds_update.push_to_hub(PUSH_REPO, PUSH_SUBSET)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/7.96k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/BroDeadlines/EVAL.NEW.raptor.IR_evaluation/commit/c08adf3637470946a786734ba1f0bd3df427cbf9', commit_message='Upload dataset', commit_description='', oid='c08adf3637470946a786734ba1f0bd3df427cbf9', pr_url=None, pr_revision=None, pr_num=None)

# Report

In [3]:
EVAL_REPO = "BroDeadlines/EVAL.RAG.UEH.evaluation"
SUBSET = "raptor"
eval_dataset = load_dataset(EVAL_REPO, SUBSET)

Downloading readme:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading data: 100%|███████████████████████████████████████████| 130k/130k [00:00<00:00, 133kB/s]


Generating train split:   0%|          | 0/54 [00:00<?, ? examples/s]

In [4]:
eval_dataset = eval_dataset['train']

In [12]:
a = set([4,5])
len(a)

2

In [13]:
import numpy as np

count_docs = []
for ques in eval_dataset['evaluation']:
    docs = [q['doc_id'] for q in ques]
    doc_set = set(docs)
    count_docs.append(len(doc_set))

In [13]:
from src.utils.eval_utils import evaluate_IR, calculate_BLEU, calculate_ROUGE,evaluate_IR_RAPTOR

preds = eval_dataset['answer']
grounds = qa_dataset['answer']
len(preds) == len(grounds)

bleu = calculate_BLEU(grounds=grounds, preds=preds)
# bleu
rouge = calculate_ROUGE(grounds=grounds, preds=preds)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


### RAPTOR

In [14]:
data = {}
t_from = 6
t_to = 8

for i in range(t_from, t_to+1):
    res = evaluate_IR_RAPTOR(eval_dataset=eval_dataset, limit_k=i, shard_key='hard_shards')
    del res['relevant']
    data[f'k_{i}'] = res
data

KeyError: 'hard_shards'

### Normal

In [16]:
data = {}
t_from = 6
t_to = 8

for i in range(t_from, t_to+1):
    res = evaluate_IR(eval_dataset=eval_dataset, limit_k=i, shard_key='shards')
    del res['relevant']
    data[f'k_{i}'] = res
data

{'k_6': {'precision': 0.5425531914893617,
  'recall': 0.9444444444444444,
  'map_score': 0.2328703703703704,
  'relevant_retrieved': 51,
  'num_retrieved': 94},
 'k_7': {'precision': 0.5555555555555556,
  'recall': 1.0185185185185186,
  'map_score': 0.23323360208280844,
  'relevant_retrieved': 55,
  'num_retrieved': 99},
 'k_8': {'precision': 0.5544554455445545,
  'recall': 1.037037037037037,
  'map_score': 0.2319475938523558,
  'relevant_retrieved': 56,
  'num_retrieved': 101}}

In [17]:
import numpy as np
time = eval_dataset['exc_second']
time_np = np.array(time)

In [13]:
es = provider.get_elasticsearch_store(index=config['vec_index'])

NameError: name 'config' is not defined

In [14]:
es_connect = provider.config.load_elasticsearch_connection()

es_connect

FileNotFoundError: [Errno 2] No such file or directory: '/home/h4438/Desktop/graduate/WebQA/core/experiments/evaluation/../../src/service/../.keys/elastic.nodes'

In [18]:
import json

ALGO = ['proposition', 'parent retriever', 'hybrid search']
ES_SIZE = 883

a = {
    "QA": {"repo": QA_REPO, "split": QA_SPLIT,
           "size": qa_dataset.num_rows, 
           "total_time(minute)": round(time_np.sum() / 60, 3), 
           "median_time(second)": round(np.median(time_np), 3),
           "avg_time(second)": round(time_np.mean(), 3)},
    "RAG": {"algo": ALGO, **config, "es_size": ES_SIZE},
    "IR": data,
    "BLEU": bleu,
    "ROUGE-L": rouge
}
 
json_formatted_str = json.dumps(a, indent=2)
print(json_formatted_str)

{
  "QA": {
    "repo": "BroDeadlines/QA.UEH.QA_tu_van_tuyen_sinh",
    "split": "train",
    "size": 54,
    "total_time(minute)": 21.167,
    "median_time(second)": 23.754,
    "avg_time(second)": 23.519
  },
  "RAG": {
    "algo": [
      "proposition",
      "parent retriever",
      "hybrid search"
    ],
    "llm": "gemini-1.0-pro",
    "total_k": 8,
    "txt_weight": 0.5,
    "vec_weight": 0.5,
    "vec_index": "vec-sentence-ueh-unique",
    "txt_index": "text-sentence-ueh-unique",
    "es_size": 883
  },
  "IR": {
    "k_6": {
      "precision": 0.5425531914893617,
      "recall": 0.9444444444444444,
      "map_score": 0.2328703703703704,
      "relevant_retrieved": 51,
      "num_retrieved": 94
    },
    "k_7": {
      "precision": 0.5555555555555556,
      "recall": 1.0185185185185186,
      "map_score": 0.23323360208280844,
      "relevant_retrieved": 55,
      "num_retrieved": 99
    },
    "k_8": {
      "precision": 0.5544554455445545,
      "recall": 1.037037037037037,
