# GET Evaluation

In [11]:
import sys

# The notebook imports project code from the `src` package; the relative
# path added here is resolved against the kernel's working directory.
# NOTE(review): presumably "../.." is the repository root -- confirm.
PROJECT_ROOT = "../.."
sys.path.append(PROJECT_ROOT)

from datasets import load_dataset
from src.service.provider import ProviderService

# Project service object, constructed with its default arguments.
provider = ProviderService()

In [12]:
# Hub repository and split holding the question/answer pairs used as
# ground truth for the evaluation below.
QA_REPO = "BroDeadlines/QA.TDT.FQA_tu_van_hoc_duong"
QA_SPLIT = "INDEX.medium_index_TDT"

qa_dataset = load_dataset(path=QA_REPO, split=QA_SPLIT)
qa_dataset

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'metadata'],
    num_rows: 144
})

In [13]:
# Hub repository with stored evaluation runs; SUBSET selects which
# configuration of that repository to load.
EVAL_REPO = "BroDeadlines/EVAL.NEW.raptor.IR_evaluation"
SUBSET = "raptor_no_hyde_k8"

# No `split` is passed here, so the result is keyed by split name and the
# wanted split is picked in a later cell.
eval_dataset = load_dataset(EVAL_REPO, name=SUBSET)

Downloading readme:   0%|          | 0.00/7.96k [00:00<?, ?B/s]

Downloading data: 100%|████████████████████████████████████████| 49.2k/49.2k [00:00<00:00, 58.7kB/s]


Generating INDEX.medium_index_TDT split:   0%|          | 0/144 [00:00<?, ? examples/s]

In [15]:
# Narrow the loaded evaluation data down to the single split under study.
# The guard keeps this cell idempotent: after the first run `eval_dataset`
# is already the selected split (no longer a dict of splits), and indexing
# it by split name again would be treated as a column lookup and fail.
EVAL_SPLIT = 'INDEX.medium_index_TDT'
if isinstance(eval_dataset, dict):
    eval_dataset = eval_dataset[EVAL_SPLIT]

In [16]:
from src.utils.type_utils import get_default_config

# Start from the project defaults and override only the settings that are
# specific to this run: the two search indexes, the retrieval depth and
# the LLM name.
config = get_default_config()
config.update({
    'vec_index': "vec-raptor-medium_index_tdt_vi",
    'txt_index': "text-raptor-medium_index_tdt_vi",
    'total_k': 8,
    'llm': "gemini-1.0-pro",
})
config

{'llm': 'gemini-1.0-pro',
 'total_k': 8,
 'txt_weight': 0.5,
 'vec_weight': 0.5,
 'vec_index': 'vec-raptor-medium_index_tdt_vi',
 'txt_index': 'text-raptor-medium_index_tdt_vi'}

In [17]:
import numpy as np

# For every row of the 'evaluation' column, count how many distinct
# `doc_id` values appear among its entries.
count_docs = [
    len({entry['doc_id'] for entry in row})
    for row in eval_dataset['evaluation']
]

In [18]:
from src.utils.eval_utils import evaluate_IR, calculate_BLEU, calculate_ROUGE, evaluate_IR_RAPTOR

# Generated answers (from the evaluation run) and reference answers (from
# the QA dataset).
# NOTE(review): the two lists are paired by position -- this assumes both
# datasets keep their rows in the same order; confirm.
preds = eval_dataset['answer']
grounds = qa_dataset['answer']

# The original bare comparison was evaluated and thrown away, so a length
# mismatch went unnoticed. Fail loudly before computing any score.
assert len(preds) == len(grounds), (
    f"prediction/reference count mismatch: {len(preds)} != {len(grounds)}"
)

# Text-overlap scores between reference and generated answers.
bleu = calculate_BLEU(grounds=grounds, preds=preds)
rouge = calculate_ROUGE(grounds=grounds, preds=preds)

## RAPTOR

In [19]:
# Evaluate RAPTOR retrieval for every cut-off k in [t_from, t_to] and
# collect the metrics under keys 'k_<k>'. These results are what the
# report cell later embeds as its "IR" section.
t_from = 6
t_to = 8
data = {}

for k in range(t_from, t_to + 1):
    metrics = evaluate_IR_RAPTOR(
        eval_dataset=eval_dataset,
        limit_k=k,
        shard_key='hard_shards',
    )
    # The 'relevant' entry is dropped so it does not end up in the report.
    metrics.pop('relevant')
    data[f'k_{k}'] = metrics
data

{'k_6': {'precision': 0.269,
  'recall': 0.034,
  'map_score': 0.081,
  'relevant_retrieved': 21,
  'num_retrieved': 78},
 'k_7': {'precision': 0.31,
  'recall': 0.043,
  'map_score': 0.085,
  'relevant_retrieved': 26,
  'num_retrieved': 84},
 'k_8': {'precision': 0.33,
  'recall': 0.048,
  'map_score': 0.087,
  'relevant_retrieved': 29,
  'num_retrieved': 88}}

## Normal

In [21]:
# Same k sweep as the RAPTOR section, but scored with `evaluate_IR` on the
# 'shards' key.
#
# The results go into their own variable instead of `data`: the report
# cell further down reads `data`, and overwriting it here would make a
# top-to-bottom run report these numbers in place of the RAPTOR ones.
data_normal = {}
t_from = 6
t_to = 8

for i in range(t_from, t_to+1):
    res = evaluate_IR(eval_dataset=eval_dataset, limit_k=i, shard_key='shards')
    # The 'relevant' entry is dropped before the metrics are stored.
    del res['relevant']
    data_normal[f'k_{i}'] = res
data_normal

{'k_6': {'precision': 0.5483870967741935,
  'recall': 0.1186046511627907,
  'map_score': 0.27515432098765435,
  'relevant_retrieved': 51,
  'num_retrieved': 93},
 'k_7': {'precision': 0.5612244897959183,
  'recall': 0.12790697674418605,
  'map_score': 0.2694622910892753,
  'relevant_retrieved': 55,
  'num_retrieved': 98},
 'k_8': {'precision': 0.5656565656565656,
  'recall': 0.13023255813953488,
  'map_score': 0.2665301923238432,
  'relevant_retrieved': 56,
  'num_retrieved': 99}}

In [20]:
import numpy as np

# Values of the 'exc_second' column as a NumPy array, so the report cell
# can take sum / median / mean of them.
# NOTE(review): presumably per-question execution time in seconds -- confirm.
time = eval_dataset['exc_second']
time_np = np.asarray(time)

In [21]:
import json

# Descriptive metadata recorded alongside the metrics.
# NOTE(review): ALGO and ES_SIZE are hand-entered values; verify that they
# describe the run being reported.
ALGO = ['proposition', 'parent retriever', 'hybrid search']
ES_SIZE = 479

# Dataset identity plus timing statistics derived from `time_np`.
qa_section = {
    "repo": QA_REPO,
    "split": QA_SPLIT,
    "size": qa_dataset.num_rows,
    "total_time(minute)": round(time_np.sum() / 60, 3),
    "median_time(second)": round(np.median(time_np), 3),
    "avg_time(second)": round(time_np.mean(), 3),
}

# Retrieval/generation settings: the run config framed by the metadata.
rag_section = {"algo": ALGO, **config, "es_size": ES_SIZE}

# Full report: dataset info, RAG settings, IR metrics and text scores.
a = {
    "QA": qa_section,
    "RAG": rag_section,
    "IR": data,
    "BLEU": bleu,
    "ROUGE-L": rouge,
}

json_formatted_str = json.dumps(a, indent=2)
print(json_formatted_str)

{
  "QA": {
    "repo": "BroDeadlines/QA.TDT.FQA_tu_van_hoc_duong",
    "split": "INDEX.medium_index_TDT",
    "size": 144,
    "total_time(minute)": 8.854,
    "median_time(second)": 3.376,
    "avg_time(second)": 3.689
  },
  "RAG": {
    "algo": [
      "proposition",
      "parent retriever",
      "hybrid search"
    ],
    "llm": "gemini-1.0-pro",
    "total_k": 8,
    "txt_weight": 0.5,
    "vec_weight": 0.5,
    "vec_index": "vec-raptor-medium_index_tdt_vi",
    "txt_index": "text-raptor-medium_index_tdt_vi",
    "es_size": 479
  },
  "IR": {
    "k_6": {
      "precision": 0.269,
      "recall": 0.034,
      "map_score": 0.081,
      "relevant_retrieved": 21,
      "num_retrieved": 78
    },
    "k_7": {
      "precision": 0.31,
      "recall": 0.043,
      "map_score": 0.085,
      "relevant_retrieved": 26,
      "num_retrieved": 84
    },
    "k_8": {
      "precision": 0.33,
      "recall": 0.048,
      "map_score": 0.087,
      "relevant_retrieved": 29,
      "num_retrieve