# Evaluate IR (Information Retrieval)

In [1]:
import sys
# Extend the module search path two directories up before importing from `src`
# (presumably the repository root — confirm against where this notebook lives).
sys.path.append("../..")
from datasets import load_dataset
from src.service.provider import ProviderService

# Shared provider instance; later cells pass it to the RAG object and ask it for the model.
provider = ProviderService()

In [2]:
# Hub repository and split holding the question/answer rows used for evaluation.
QA_REPO = "BroDeadlines/QA.FQA_tu_van_hoc_duong"
SPLIT = "INDEX.medium_index_TDT"
qa_dataset = load_dataset(QA_REPO, split=SPLIT)

In [3]:
qa_dataset

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'metadata'],
    num_rows: 144
})

## Run IR

In [2]:
# NOTE(review): this cell is an exact repeat of the load performed under the notebook title;
# one of the two can be removed.
QA_REPO = "BroDeadlines/QA.FQA_tu_van_hoc_duong"
SPLIT = "INDEX.medium_index_TDT"
qa_dataset = load_dataset(QA_REPO, split=SPLIT)

In [3]:
qa_dataset

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'metadata'],
    num_rows: 144
})

In [4]:
qa_dataset[0]['metadata']

'{"shards": 1}'

In [4]:

# Indexed documents for the same split name; its `proposition_list` column is what
# build_shards measures further down.
DATA_REPO = "BroDeadlines/TEST.edu_tdt_proposition_data"
index_dataset = load_dataset(DATA_REPO, split="INDEX.medium_index_TDT")

Downloading readme:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Downloading data: 100%|████████████████████████████████████████| 5.02M/5.02M [00:00<00:00, 5.34MB/s]
Downloading data: 100%|████████████████████████████████████████| 5.02M/5.02M [00:00<00:00, 5.35MB/s]


Generating propositon_medium_edu_tdt split:   0%|          | 0/344 [00:00<?, ? examples/s]

Generating INDEX.medium_index_TDT split:   0%|          | 0/344 [00:00<?, ? examples/s]

In [5]:
index_dataset

Dataset({
    features: ['content', 'url', 'doc_id', 'shards', 'splits', 'split', 'propositions', 'proposition_list'],
    num_rows: 344
})

## Update shards

In [7]:
ids = {}


def build_ids(row):
    """Register the row's ``doc_id`` as a key of the module-level ``ids`` dict.

    Side-effect-only helper meant for ``Dataset.map``. The stored value (1)
    is a placeholder, so ``ids`` effectively acts as a set of document ids.
    Returns ``None``.
    """
    ids.update({row['doc_id']: 1})
    
# Run build_ids over every row for its side effect on `ids`; the mapped dataset itself is discarded.
# NOTE(review): no visible cell above this one assigns `eval_dataset`, so a top-to-bottom run
# on a fresh kernel would fail here.
eval_dataset.map(build_ids)
len(ids)

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

144

In [8]:
eval_dataset

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'evaluation', 'metadata'],
    num_rows: 144
})

In [12]:
shards = {}


def build_shards(row):
    """Record the proposition count of an index row in the module-level ``shards`` dict.

    Rows whose ``doc_id`` is not a key of ``ids`` are ignored. For the rest,
    ``shards[doc_id]`` becomes ``len(row['proposition_list'])``. Returns
    ``None``; intended for ``Dataset.map`` side effects only.
    """
    doc_id = row['doc_id']
    if doc_id in ids:
        # Disabled alternative kept for reference: shards[doc_id] = row['shards']
        shards[doc_id] = len(row['proposition_list'])
    
# Populate `shards` as a side effect; the length shows how many documents received a count.
index_dataset.map(build_shards)
len(shards)

Map:   0%|          | 0/344 [00:00<?, ? examples/s]

144

In [13]:
import json

# Replace each row's `metadata` with a JSON string of the form {"shards": <count>},
# where the count is looked up in `shards` by the row's doc_id.
eval_dataset = eval_dataset.map(lambda e: {**e, "metadata": json.dumps({'shards': shards[e['doc_id']]})})

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

In [15]:
# Upload the evaluation rows to the Hub under the chosen split name
# (the commented-out line is an alternative split name).
EVAL_REPO = "BroDeadlines/EVAL.IR_evaluation"
split = "INDEX.medium_index_TDT.proposition.sentence.hybrid"
# split = "INDEX.medium_index_TDT.proposition.hybrid"
eval_dataset.push_to_hub(EVAL_REPO, split=split)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/BroDeadlines/EVAL.IR_evaluation/commit/78086836d46c649e532b5ea084e80f4b518d12a6', commit_message='Upload dataset', commit_description='', oid='78086836d46c649e532b5ea084e80f4b518d12a6', pr_url=None, pr_revision=None, pr_num=None)

In [33]:
metas = {}


def build_meta(row):
    """Remember the row's ``metadata`` value under its ``doc_id`` in ``metas``.

    Side-effect-only helper for ``Dataset.map``; returns ``None``. A later
    row with the same ``doc_id`` replaces the earlier entry.
    """
    meta_value = row['metadata']
    metas[row['doc_id']] = meta_value

# Fill `metas` as a side effect of the map; the mapped dataset is discarded.
# NOTE(review): `qa_dataset_updated` is not assigned in any visible cell — confirm where it comes from.
qa_dataset_updated.map(build_meta)
len(metas)

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

144

In [32]:

# Uploads to the QA repo under SPLIT, using whatever value SPLIT holds when the cell runs.
# NOTE(review): `qa_dataset_updated` is not assigned in any visible cell.
qa_dataset_updated.push_to_hub(QA_REPO, split=SPLIT)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/785 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/BroDeadlines/QA.FQA_tu_van_hoc_duong/commit/293b6e18f98f99f46239da08961939357d5f88c1', commit_message='Upload dataset', commit_description='', oid='293b6e18f98f99f46239da08961939357d5f88c1', pr_url=None, pr_revision=None, pr_num=None)

In [31]:
# Set `metadata` to an empty string on every row of the 'train' split.
# NOTE(review): string-keyed split access implies `qa_dataset` held several splits when this ran,
# unlike the single-split load earlier in the notebook — confirm.
qa_dataset['train'] = qa_dataset['train'].map(lambda e: {**e, "metadata": ""})

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

In [24]:
# Drop the 'shards' column from one named split.
qa_dataset['TEST.basic_test_tdt_dataset'] = qa_dataset['TEST.basic_test_tdt_dataset'].remove_columns(['shards'])
# dir(a)

In [32]:
# Upload `qa_dataset` to the QA repository; the repo id is written out literally rather than read from QA_REPO.
qa_dataset.push_to_hub("BroDeadlines/QA.FQA_tu_van_hoc_duong")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/529 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/BroDeadlines/QA.FQA_tu_van_hoc_duong/commit/c2e94ef7977fb20e5b69ae47e42a4ac2f811a40c', commit_message='Upload dataset', commit_description='', oid='c2e94ef7977fb20e5b69ae47e42a4ac2f811a40c', pr_url=None, pr_revision=None, pr_num=None)

## Run RAG

In [7]:
from src.rag.hyde_rag import HydeRAG, HydeHybridSearchRAG
from src.utils.config_utils import get_gemini_hyde_config

# rag = HydeRAG(provider=provider, index="test-basic_test_tdt_dataset")
# Names of the text and vector indexes the hybrid retriever will query.
text_idx = "text-raptor-medium_index_tdt"
vec_idx = "vec-raptor-medium_index_tdt"

# Start from the project's Gemini/HyDE configuration and point it at the indexes above.
config = get_gemini_hyde_config()
config.vector_index = vec_idx
config.text_index = text_idx

# Alternative index pairs; uncomment one pair to evaluate a different index.
# config.vector_index = "vec-raptor-basic_index_tdt_clean"
# config.text_index = "text-raptor-basic_index_tdt_clean"

# config.vector_index = "vec-sentence-propositon_medium_edu_tdt"
# config.text_index = 'text-sentence-propositon_medium_edu_tdt'

# rag = HydeRAG(provider=provider, index=config.vector_index)
# k=4 is presumably the number of documents requested per search — confirm in HydeHybridSearchRAG.
rag = HydeHybridSearchRAG(provider=provider,config=config, k=4)

### Simple RAG

In [8]:
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
# Retriever exposed by the RAG object built in the previous cell.
retriever = rag.ensemble_retriever
model = provider.get_simple_gemini_pro()
# Prompt
prompt = hub.pull("rlm/rag-prompt")
# Filled by store_docs: one entry (the retrieved document list) per retrieval, in call order.
RETRIEVE_DOCS = []

# Post-processing
def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

def store_docs(docs):
    """Append ``docs`` to the module-level ``RETRIEVE_DOCS`` log and pass it through.

    Placed between the retriever and ``format_docs`` in the chain so the
    documents of the most recent retrieval can be read back from
    ``RETRIEVE_DOCS[-1]``. Returns the very same object it received.
    """
    RETRIEVE_DOCS.extend([docs])
    return docs

# Chain
# The "context" branch retrieves documents, records them through store_docs, then joins
# their text with format_docs; the "question" branch forwards the chain input as-is.
# The resulting dict feeds the prompt, then the model, and the output is parsed to a string.
rag_chain = (
    {"context": retriever | store_docs | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)


In [9]:
rag.ensemble_retriever

EnsembleRetriever(retrievers=[MyElasticSearchBM25Retriever(client=<Elasticsearch(['https://158.178.243.160:9201', 'https://158.178.243.160:9203', 'https://158.178.243.160:9202'])>, index_name='text-raptor-medium_index_tdt'), VectorStoreRetriever(tags=['ElasticsearchStore', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.elasticsearch.ElasticsearchStore object at 0x7ffa9c0ffac0>, search_kwargs={'k': 4, 'fetch_k': 10})], weights=[0.5, 0.5])

In [10]:
def eval(row):
    """Retrieve scored documents for ``row['question']`` and attach them to the row.

    ``rag.search`` is expected to return ``None`` or an iterable of
    ``(document, score)`` pairs. Each pair becomes a dict with the keys
    ``content``, ``doc_id`` and ``score`` under ``row['evaluation']``; a
    ``None`` result yields an empty list. The row is modified in place and
    returned, which is the shape ``Dataset.map`` expects.

    NOTE(review): the name shadows the builtin ``eval``; it is kept so that
    existing references keep working.
    """
    hits = rag.search(question=row['question'])
    # Identity check: `== None` would dispatch to the result's __eq__, which
    # container-like types are free to override.
    if hits is None:
        row['evaluation'] = []
        return row
    row['evaluation'] = [
        {'content': hit[0].page_content, "doc_id": hit[0].metadata['doc_id'], "score": hit[1]}
        for hit in hits
    ]
    return row

def eval_no_score(row):
    """Retrieve documents through the ensemble retriever and attach them to the row.

    The retriever result carries no scores, so every entry under
    ``row['evaluation']`` gets ``score`` 0 next to ``content`` and
    ``doc_id``. A ``None`` result yields an empty list. The row is modified
    in place and returned.
    """
    found = rag.ensemble_retriever.invoke(row['question'])
    # Identity check: `== None` would dispatch to the result's __eq__, which
    # container-like types are free to override.
    if found is None:
        row['evaluation'] = []
        return row
    row['evaluation'] = [
        {'content': doc.page_content, "doc_id": doc.metadata['doc_id'], "score": 0}
        for doc in found
    ]
    return row

def eval_no_score_answer(row):
    """Run the full RAG chain on ``row['question']`` and record answer plus sources.

    The chain's output replaces ``row['answer']``. The documents read back
    from the tail of ``RETRIEVE_DOCS`` are stored in ``row['evaluation']``
    with a fixed placeholder score of ``-1.0``. The row is modified in place
    and returned.
    """
    generated = rag_chain.invoke(row['question'])
    # Read the most recent entry only after the chain has run.
    latest_docs = RETRIEVE_DOCS[-1]
    evaluation = []
    for doc in latest_docs:
        evaluation.append(
            {'content': doc.page_content, "doc_id": doc.metadata['doc_id'], "score": -1.0}
        )
    row['evaluation'] = evaluation
    row['answer'] = generated
    return row

# Runs the whole RAG chain once per question (a retrieval plus a model call each),
# so this cell is slow and can hit the provider's rate limit.
test_ds = qa_dataset.map(eval_no_score_answer)



Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


In [14]:
# Sanity check: keep rows whose `evaluation` list has more than one entry. The result is only displayed, not stored.
test_ds.filter(lambda row: len(row['evaluation']) > 1 )

Filter:   0%|          | 0/144 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'metadata', 'evaluation'],
    num_rows: 144
})

In [37]:
def update_meta(row):
    """Copy the ``metas`` entry for the row's ``doc_id`` into ``row['shards_']``.

    The row is modified in place and returned. A ``doc_id`` missing from
    ``metas`` raises ``KeyError``.
    """
    doc_meta = metas[row['doc_id']]
    row['shards_'] = doc_meta
    return row
    
# Adds a `shards_` column taken from `metas`; requires the cell that fills `metas` to have run first.
test_ds_updated = test_ds.map(update_meta)

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

In [8]:
test_ds

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'metadata', 'evaluation'],
    num_rows: 144
})

In [15]:
def update_score(row):
    """Overwrite the ``score`` of every entry in ``row['evaluation']`` with ``-1.1``.

    Each entry is copied before the score is set, so the original entry
    dicts are left untouched. The row itself is updated in place and returned.
    """
    rescored = []
    for entry in row['evaluation']:
        rescored.append({**entry, "score": -1.1})
    row['evaluation'] = rescored
    return row

# Produces a copy of test_ds in which every evaluation entry has its score replaced by update_score.
test_ds_updated_one = test_ds.map(update_score)

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

In [10]:
SPLIT

'INDEX.medium_index_TDT'

### RAPTOR: Update Shards

In [15]:
# RAPTOR cluster and summary datasets, loaded from the same split name.
CLUSTER_REPO = "BroDeadlines/TEST.NEW.PART_CLUSTER.raptor.edu_tdt_data"
SUMMERIZE_REPO = "BroDeadlines/TEST.NEW.PART_SUMMERIZE.raptor.edu_tdt_data"
# NOTE(review): this rebinds the global SPLIT, so every later cell that reads SPLIT
# (including the push_to_hub cells) uses this value from here on.
SPLIT = "TEST.medium_tdt_raptor"

cluster_dataset =  load_dataset(CLUSTER_REPO, split=SPLIT)
summerize_dataset =  load_dataset(SUMMERIZE_REPO, split=SPLIT)

Downloading readme:   0%|          | 0.00/556 [00:00<?, ?B/s]

Downloading data: 100%|████████████████████████████████████████| 13.5M/13.5M [00:03<00:00, 3.46MB/s]


Generating TEST.medium_tdt_raptor split:   0%|          | 0/1925 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/696 [00:00<?, ?B/s]

Downloading data: 100%|███████████████████████████████████████████| 510k/510k [00:01<00:00, 506kB/s]


Generating TEST.medium_tdt_raptor split:   0%|          | 0/332 [00:00<?, ? examples/s]

In [16]:
easy_shards = {}
hard_shards = {}


def get_shards_cluster(row):
    """Record a row's shard counts the first time its ``doc_ids`` value is seen.

    ``row['hard_shards']`` and ``row['easy_shards']`` are stored in the
    module-level dicts of the same names, keyed by ``row['doc_ids']``. Rows
    whose key is already present in ``hard_shards`` change nothing. The row
    is returned as-is so the function can be used with ``Dataset.map``.
    """
    key = row['doc_ids']
    if key in hard_shards:
        return row
    hard_shards[key] = row['hard_shards']
    easy_shards[key] = row['easy_shards']
    return row

# Collect shard counts from the rows whose level_id is "tree_1"; the mapped dataset is only displayed.
cluster_dataset.filter(lambda row: row['level_id'] == "tree_1").map(get_shards_cluster)

Filter:   0%|          | 0/1925 [00:00<?, ? examples/s]

Map:   0%|          | 0/1603 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'embd', 'cluster', 'doc_ids', 'level_id', 'easy_shards', 'hard_shards'],
    num_rows: 1603
})

In [18]:
# Same collection over the summary rows; keys already recorded from the cluster rows are left unchanged.
summerize_dataset.map(get_shards_cluster)

Map:   0%|          | 0/332 [00:00<?, ? examples/s]

Dataset({
    features: ['summaries', 'level', 'cluster', 'doc_ids', 'level_id', 'easy_shards', 'hard_shards'],
    num_rows: 332
})

In [19]:
sum(easy_shards.values())

1935

In [20]:
def update_shards(row):
    """Return a copy of ``row`` extended with its easy and hard shard counts.

    Both counts are looked up by ``row['doc_id']`` in the module-level
    ``easy_shards`` and ``hard_shards`` dicts; a missing id raises
    ``KeyError``. The input row is not modified.
    """
    doc_id = row['doc_id']
    enriched = {**row}
    enriched['easy_shards'] = easy_shards[doc_id]
    enriched['hard_shards'] = hard_shards[doc_id]
    return enriched

# Attach the easy/hard shard counts to every evaluated row, then display the resulting schema.
test_ds_update = test_ds.map(update_shards)
test_ds_update

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'metadata', 'evaluation', 'easy_shards', 'hard_shards'],
    num_rows: 144
})

In [22]:
# Upload the shard-enriched rows; the split name is whatever SPLIT holds when this cell runs.
test_ds_update.push_to_hub("BroDeadlines/EVAL.NEW.raptor.IR_evaluation", split=SPLIT)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/BroDeadlines/EVAL.NEW.raptor.IR_evaluation/commit/601ebca477c46bccaf4f3d67d3cf91cdcbfc0aac', commit_message='Upload dataset', commit_description='', oid='601ebca477c46bccaf4f3d67d3cf91cdcbfc0aac', pr_url=None, pr_revision=None, pr_num=None)

### Upload

In [21]:
SPLIT

'TEST.medium_tdt_raptor'

In [12]:
# Upload `test_ds` (the rows without the easy/hard shard columns) to the EVAL.raptor repo,
# which is a different repo from the one that receives test_ds_update.
test_ds.push_to_hub("BroDeadlines/EVAL.raptor.IR_evaluation", split=SPLIT)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/BroDeadlines/EVAL.raptor.IR_evaluation/commit/21278415a3c0621c3a00b606e2ce01c69415fa3c', commit_message='Upload dataset', commit_description='', oid='21278415a3c0621c3a00b606e2ce01c69415fa3c', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
test_ds

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'metadata', 'evaluation'],
    num_rows: 144
})

In [19]:
test_ds[6]['metadata']

''

# IR Evaluation

In [2]:
EVAL_REPO = "BroDeadlines/EVAL.IR_evaluation"
# Split names used in other runs (swap into `split` as needed):
#   "INDEX.medium_index_TDT.proposition.sentence.hybrid"
#   "INDEX.medium_index_TDT.proposition.hybrid"
split = "INDEX.medium_index_TDT.fulltext.clean.8.proposition.sentence.hybrid"
# No `split` argument is passed, so every split of the repo is loaded and individual
# splits are picked by name afterwards.
# To load just one: load_dataset(EVAL_REPO, split=split)
eval_dataset = load_dataset(EVAL_REPO)

In [3]:
# SPLIT is printed for reference only; the load above does not use it.
# With all splits loaded, the length is presumably the number of splits in the repo — confirm.
print(SPLIT)
len(eval_dataset)

INDEX.medium_index_TDT


8

In [16]:
# Split to evaluate. Other split names used in earlier runs:
#   'INDEX.medium_index_TDT.fulltext.clean.proposition.sentence.hybrid'
#   'INDEX.medium_index_TDT.fulltext.clean.8.proposition.sentence.hybrid'
KEY = 'INDEX.medium_index_TDT.fulltext.clean.2.proposition.sentence.hybrid'
# NOTE(review): `eval` shadows both the builtin and the `eval(row)` helper defined earlier;
# the name is kept because the following cells read it.
eval = eval_dataset[KEY]

In [17]:
# Spot check: number of evaluation entries stored for the first row of the selected split.
a = eval[0]['evaluation']
len(a)

4

In [7]:
import json

shards = []


def count_shards(row):
    """Append the ``shards`` value found in the row's JSON ``metadata`` to ``shards``.

    ``row['metadata']`` must be a JSON object string containing a
    ``"shards"`` key. The row is returned unchanged so the function can be
    passed to ``Dataset.map``.
    """
    shards.append(json.loads(row['metadata'])['shards'])
    return row

# Fill `shards` as a side effect of the map, then total the collected values.
eval_dataset.map(count_shards)
sum(shards)

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

144

In [18]:
from src.utils.eval_utils import evaluate_IR

# `eval` here is the split selected via KEY in an earlier cell, not the builtin function.
res = evaluate_IR(eval_dataset=eval, limit_k=4)
res

{'relevant': 0.4652777777777778,
 'precision': 0.46206896551724136,
 'recall': 0.4652777777777778,
 'map_score': 0.2957175925925926,
 'relevant_retrieved': 67,
 'num_retrieved': 145}

## RAPTOR

In [3]:
# NOTE(review): QA_REPO and SPLIT are rebound here to the RAPTOR evaluation repo and split,
# replacing the values set at the top of the notebook.
QA_REPO = "BroDeadlines/EVAL.NEW.raptor.IR_evaluation"
SPLIT = "TEST.medium_tdt_raptor"
test_ds_update = load_dataset(QA_REPO, split=SPLIT)
test_ds_update

Dataset({
    features: ['question', 'answer', 'url', 'group', 'doc_id', 'metadata', 'evaluation', 'easy_shards', 'hard_shards'],
    num_rows: 144
})

In [7]:
from src.utils.eval_utils import evaluate_IR_RAPTOR

# Positional arguments: the dataset, 4 (presumably the top-k cutoff, mirroring limit_k=4 used with
# evaluate_IR — confirm), and the name of the shard-count column to evaluate against.
res = evaluate_IR_RAPTOR(test_ds_update, 4, 'hard_shards')
res

{'relevant': 0.5,
 'precision': 0.47058823529411764,
 'recall': 0.11556982343499198,
 'map_score': 0.3084490740740741,
 'relevant_retrieved': 72,
 'num_retrieved': 153}