In [74]:
import nest_asyncio

nest_asyncio.apply()

import os
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import Settings

from llama_index.llms.llama_api import LlamaAPI
from llama_index.core import QueryBundle
from llama_index.core.indices.query.schema import QueryType

os.environ["OPENAI_API_KEY"] = ""
os.environ["COHERE_API_KEY"] = ""
os.environ["LLAMA_API_KEY"] = ""

llm = OpenAI(model="gpt-4o-mini", temperature=0.0)
filter_llm = LlamaAPI(model="llama3.3-70b", api_key=os.environ["LLAMA_API_KEY"])
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

documents = SimpleDirectoryReader("data").load_data()
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20)
nodes = splitter.get_nodes_from_documents(documents)

In [75]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

In [76]:
from llama_index.core.evaluation import generate_question_context_pairs

qa_dataset = generate_question_context_pairs(
    nodes,
    llm=llm,
    num_questions_per_chunk=1
)

100%|██████████| 584/584 [08:58<00:00,  1.09it/s]


In [77]:
from llama_index.core import PromptTemplate

template = (
    "You are an intelligent assistant. Analyze the following knowledge strip for the query:\n"
    "Query: {query}\n"
    "Knowledge Strip: {strip}\n"

    "Tasks:\n"
    "Evaluate the following knowledge strip and determine if it is pertinent to the query:\n\n"

    "Respond with 'relevant' or 'irrelevant'\n"
)

assess_relevance_template = PromptTemplate(template)

In [80]:
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever
from llama_index.core.schema import NodeWithScore
from typing import List

class CustomRetriever(BaseRetriever):
    def __init__(self, retriever: VectorIndexRetriever) -> None:
        self._retriever = retriever
        super().__init__()

    def _split_into_strips(self, text: str, max_length: int = 100) -> List[str]:
        words = text.split()
        return [" ".join(words[i:i + max_length]) for i in range(0, len(words), max_length)]

    def _assess_relevance(self, query: str, strips: List[str]) -> List[str]:
        relevant_strips = []

        for strip in strips:
            prompt = assess_relevance_template.format(query=query, strip=strip)
            response = llm.complete(prompt)
            print(response)

            if response.text.lower() == "relevant":
                relevant_strips.append(strip)
        return relevant_strips

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        retrieved_nodes = self._retriever.retrieve(query_bundle)
        refined_nodes = []
        query = query_bundle.query_str

        for node_with_score in retrieved_nodes:
            content = node_with_score.node.get_content()
            strips = self._split_into_strips(content)
            relevant_strips = self._assess_relevance(query, strips)

            if relevant_strips:
                refined_content = " ".join(relevant_strips)
                node_with_score.node.set_content(refined_content)
                refined_nodes.append(node_with_score)

        return retrieved_nodes

    async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        return self._retrieve(query_bundle)

    async def aretrieve(self, str_or_query_bundle: QueryType) -> List[NodeWithScore]:
        if isinstance(str_or_query_bundle, str):
            str_or_query_bundle = QueryBundle(str_or_query_bundle)

        return await self._aretrieve(str_or_query_bundle)

In [81]:
vector_index = VectorStoreIndex(nodes)
vector_retriever = VectorIndexRetriever(
    index=vector_index,
    similarity_top_k=2
)

custom_retriever = CustomRetriever(vector_retriever)

from llama_index.core.evaluation import RetrieverEvaluator

metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]
evaluator = RetrieverEvaluator.from_metric_names(
    metrics, retriever=custom_retriever
)
results = await evaluator.aevaluate_dataset(qa_dataset)

relevant
relevant
irrelevant
irrelevant
irrelevant
irrelevant
relevant
irrelevant
irrelevant
irrelevant
relevant
irrelevant
irrelevant
irrelevant
irrelevant
irrelevant
relevant
irrelevant
irrelevant
relevant
relevant
relevant
relevant
irrelevant
relevant
relevant
relevant
relevant
relevant
relevant
relevant
irrelevant
relevant
irrelevant
irrelevant
relevant
irrelevant
relevant
Relevant
relevant
irrelevant
relevant
irrelevant
irrelevant
irrelevant
relevant
relevant
Irrelevant
relevant
relevant
irrelevant
irrelevant
relevant
relevant
irrelevant
Relevant
relevant
relevant
relevant
relevant
relevant
relevant
irrelevant
irrelevant
irrelevant
irrelevant
relevant
irrelevant
relevant
relevant
irrelevant
irrelevant
irrelevant
irrelevant
irrelevant
irrelevant
Relevant
irrelevant
irrelevant
irrelevant
irrelevant
irrelevant
relevant
irrelevant
irrelevant
irrelevant
irrelevant
irrelevant
relevant
relevant
relevant
irrelevant
irrelevant
irrelevant
relevant
irrelevant
irrelevant
relevant
irrelevant
i

In [82]:
import pandas as pd

def display_results(name, eval_results):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    columns = {
        "retrievers": [name],
        **{k: [full_df[k].mean()] for k in metrics},
    }

    metric_df = pd.DataFrame(columns)

    return metric_df

In [83]:
display_results("filter", results)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,filter,0.806507,0.731164,0.403253,0.806507,0.731164,0.750894
