# Orderfox RAG
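This is the official notebook of team "Bruteforcers" for the Datathon 2025 Orderfox Challenge. It builds a retrieval-augmented generation (RAG) pipeline that combines vector retrieval with BM25 keyword search, reranks the merged results with an LLM, and answers natural-language questions over the indexed documents.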

## Imports

In [52]:
import os
import pickle
from typing import List

import openai
from llama_index.core import QueryBundle
from llama_index.core import Settings, get_response_synthesizer
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.postprocessor import SentenceTransformerRerank, LLMRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever
from llama_index.core.schema import NodeWithScore
from llama_index.core.schema import TextNode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

from keyword_dbsqlite import search_documents_bm25

## Hyperparameters

Set the hyperparameters for the RAG pipeline.

In [53]:
# --- Retrieval ---
RETRIEVAL_TOP_K = 10            # Number of documents to retrieve from the vector database
KEYWORD_TOP_K = 2               # Number of documents to retrieve from the keyword search
RERANK_TOP_K = 5                # Number of documents to keep after reranking

# --- Generation ---
MAX_TOKEN_GENERATION = 1000     # Max tokens for the LLM generation

MODEL = 'gpt-4o-mini'           # OpenAI model name passed to the LLM client

# Read the OpenAI key from the environment instead of writing it into the notebook,
# so the secret is never saved or shared together with the cells and their outputs.
# The value is None when OPENAI_API_KEY is unset; the OpenAI clients are then expected
# to fall back to their own credential lookup (see the client documentation).
API_KEY = os.environ.get("OPENAI_API_KEY")

## General Setup

In [54]:
# Chat model: temperature 0, output capped at MAX_TOKEN_GENERATION tokens.
# It is also registered as the library-wide default LLM.
llm = OpenAI(
    api_key=API_KEY,
    model=MODEL,
    engine=MODEL,
    temperature=0,
    max_tokens=MAX_TOKEN_GENERATION,
)
Settings.llm = llm

# Embedding model, registered as the library-wide default as well.
embed_model = OpenAIEmbedding(
    api_key=API_KEY,
    model="text-embedding-3-small",
    embed_batch_size=10,
)
Settings.embed_model = embed_model

# LLM-based reranker: judges candidates in batches of 5 and keeps the best RERANK_TOP_K.
reranker = LLMRerank(llm=llm, top_n=RERANK_TOP_K, choice_batch_size=5)

## Custom Classes

In [55]:
def prompt_to_keywordsearch(prompt, api_key, model: str = "gpt-4o-mini") -> List[str]:
    """Ask an OpenAI model to condense a prompt into a handful of search keywords.

    Args:
        prompt: The text to extract keywords from. It is interpolated into the
            instruction via an f-string, so its ``str()`` form is what the model sees.
        api_key: API key handed to ``openai.OpenAI``.
        model: Name of the OpenAI model to query. The default is the model name
            this function used before the parameter existed.

    Returns:
        The keywords as a list of non-empty strings. An empty list is returned
        (after printing a notice) when the model produced no text.
    """
    input_content = f"""Extract the most specific, relevant, and meaningful keywords
    from the following text: '{prompt}'. Exclude stop words and return the keywords as
    a space-separated string. Less is more, max 5 words."""

    client = openai.OpenAI(api_key=api_key)
    response = client.responses.create(
        model=model,
        input=[{"role": "user", "content": input_content}],
    )

    # Tolerate both a missing response object and a missing/empty text payload.
    output_text = getattr(response, "output_text", None) or ""

    # split() without an argument splits on any whitespace run (spaces, tabs,
    # newlines) and never yields empty strings, unlike split(" ").
    keywords = output_text.split()
    if not keywords:
        print("No response from LLM")
        return []
    return keywords

In [56]:
class CustomRetriever(BaseRetriever):
    """Hybrid retriever: vector search plus BM25 keyword search, optionally reranked.

    For each query the nodes returned by the vector retriever are combined with
    the documents found by ``search_documents_bm25`` for LLM-extracted keywords.
    If a reranker was supplied, the combined list is passed through its
    ``postprocess_nodes`` method before being returned.
    """

    def __init__(self, vector_retriever: VectorIndexRetriever, llm, api_key, my_reranker: SentenceTransformerRerank) -> None:
        """Store the collaborators used during retrieval.

        Args:
            vector_retriever: Retriever queried first for every query.
            llm: Stored on the instance; not used by ``_retrieve`` itself.
            api_key: OpenAI API key forwarded to ``prompt_to_keywordsearch``.
            my_reranker: Postprocessor whose ``postprocess_nodes(nodes=...,
                query_bundle=...)`` is called on the merged nodes. A falsy
                value (e.g. ``None``) disables reranking.
        """
        self.vector_retriever = vector_retriever
        self.my_reranker = my_reranker
        self.llm = llm
        self.api_key = api_key

        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes for a query from vector and keyword search, then rerank.

        Args:
            query_bundle: The query; its ``query_str`` drives keyword extraction.

        Returns:
            The merged (and, if a reranker is set, reranked) list of nodes.
        """
        query_str = query_bundle.query_str

        # vector search
        print(f"Vector Retrieval for following query: {query_str}")
        # Copy the result so appending keyword hits never mutates a list that
        # the vector retriever handed out.
        nodes = list(self.vector_retriever.retrieve(query_bundle))

        # keyword search -- pass the plain query string rather than the bundle
        # object, and drop blank tokens so the search only sees real keywords.
        keywords = [
            keyword
            for keyword in prompt_to_keywordsearch(query_str, self.api_key)
            if keyword.strip()
        ]
        print(f"Retrieving documents for keywords: {keywords}")
        if keywords:
            for _doc_id, content, _bm25_score in search_documents_bm25(keywords, top_k=KEYWORD_TOP_K):
                # The BM25 score is not carried over; every keyword hit gets
                # the same fixed placeholder score.
                nodes.append(NodeWithScore(node=TextNode(text=content), score=0.5))

        # rerank
        if self.my_reranker:
            print(f"BEFORE reranking we have following {len(nodes)} retrieved nodes -----------------------")
            nodes = self.my_reranker.postprocess_nodes(nodes=nodes, query_bundle=query_bundle)
            print(f"AFTER reranking we have following {len(nodes)} retrieved nodes -----------------------")

        print(f"Returning {len(nodes)} nodes")
        return nodes

## Setting up

In [57]:
# Alternative loader for an index persisted with llama_index's storage API:
# index = load_index_from_storage(StorageContext.from_defaults(persist_dir=index_folder_name))

# NOTE(security): unpickling can execute arbitrary code contained in the file --
# only load pickle files you created yourself or otherwise fully trust.
# The context manager guarantees the file handle is closed after loading.
with open("TestDBBaseUrlOnly.pkl", "rb") as index_file:
    index = pickle.load(index_file)

# Vector retriever over the loaded index, wrapped by the hybrid retriever.
retriever = VectorIndexRetriever(index=index, similarity_top_k=RETRIEVAL_TOP_K)
custom_retriever = CustomRetriever(retriever, llm, API_KEY, reranker)

# Query engine that retrieves with the hybrid retriever and synthesizes a response.
retriever_query_engine = RetrieverQueryEngine(
    retriever=custom_retriever,
    response_synthesizer=get_response_synthesizer()
)

## Inference

In [58]:
# Example natural-language question to run through the pipeline.
question = "Which companies in Italy produce Aluminum auto components?"

# Runs retrieval through the custom retriever (its progress messages are printed
# as cell output) and stores the engine's response for the following cells.
response = retriever_query_engine.query(question)

Vector Retrieval for following query: Which companies in Italy produce Aluminum auto components?
Retrieving documents for keywords: ['Italy', 'Aluminum', 'auto', 'components', 'companies']
BEFORE reranking we have following 10 retrieved nodes -----------------------
AFTER reranking we have following 5 retrieved nodes -----------------------
Returning 5 nodes


In [59]:
# Print the response as text (the generated answer shown in the cell output).
print(response)

In Italy, Flexitech is a company that produces aluminum auto components. They have a facility in Pinerolo, which was acquired in 2021 and specializes in OEM brake hose and shock absorber hose spare parts.


In [60]:
# Display the source nodes attached to the response (a list of NodeWithScore objects).
response.source_nodes

[NodeWithScore(node=TextNode(id_='037d95c6-bb99-4209-b3b3-ab711b7fca34', embedding=None, metadata={'website_id': '7854b6cc-90ae-59b3-b4e6-5b5a6ca7bd29', 'website_url': 'https://laminazionesottile.com/', 'pageID': 'page_0', 'url': 'https://laminazionesottile.com/', 'total_content_length': 2892}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='cc514b29-1237-463a-bd74-5ad965ca7e38', node_type='4', metadata={'website_id': '7854b6cc-90ae-59b3-b4e6-5b5a6ca7bd29', 'website_url': 'https://laminazionesottile.com/', 'pageID': 'page_0', 'url': 'https://laminazionesottile.com/', 'total_content_length': 2892}, hash='a1cd539ffac843c6f782edf278622969c8b9d4d432298f03f9cf5bd795317031'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='89e00784-7077-48d5-adad-fa171f09db88', node_type='1', metadata={}, hash='570686f6c4a74e134a41136abb8db7d70cb4d0c1e061afc925008197e30a43a2')}, metadata_template='{key}: {value}', 