# Hands-on 1: Ingestion and Chunking

## Problem

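DEFINITION: Retrieve context for a question with two different retrievers — a vector (embedding) retriever and a BM25 keyword retriever — and merge their results into a single ranked list with `QueryFusionRetriever`.

Reference: https://docs.llamaindex.ai/en/stable/examples/retrievers/relative_score_dist_fusion/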

## Code

In [1]:
# If running on Colab, uncomment the following lines.
# Version specifiers are quoted so the shell does not treat `>` as an output redirect.
# %pip install "llama-index>=0.12.2"
# %pip install "llama-index-retrievers-bm25>=0.5.0"
# !mkdir -p 'data/paul_graham/'
# !wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'

In [2]:
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
# load_dotenv() is left as the last expression so its return value is displayed.
from dotenv import load_dotenv
load_dotenv()

# Alternative without a .env file: set the key in-process (never commit a real key).
# import os
# os.environ["OPENAI_API_KEY"] = "sk-proj-..."

True

In [3]:
from llama_index.core import SimpleDirectoryReader
import nest_asyncio

# Allow nested asyncio event loops: the notebook kernel already runs one, and
# the fusion retriever further down is created with use_async=True.
nest_asyncio.apply()

# Read the files under ./data/paul_graham into LlamaIndex documents.
documents = SimpleDirectoryReader("./data/paul_graham").load_data()

In [None]:

from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from rich import print


  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 1/1 [00:00<00:00,  6.05it/s]
Generating embeddings: 100%|██████████| 587/587 [00:09<00:00, 60.10it/s]


In [None]:
# Sentence-aware splitter; chunk_size=256 keeps the chunks small for retrieval.
splitter = SentenceSplitter(chunk_size=256)

# Chunk the documents with the splitter, embed each chunk, and build the index.
index = VectorStoreIndex.from_documents(
    documents,
    transformations=[splitter],
    show_progress=True,  # display progress bars while parsing and embedding
)

In [None]:
# Query shared by every retriever below. Spelling matters here: BM25 matches on
# exact terms, so a misspelled keyword cannot match the essay text.
QUESTION = "Why was the author in Florence?"

### Vector retrieval

In [None]:

# Dense retriever over the vector index, limited to the 3 most similar chunks.
vector_retriever = index.as_retriever(similarity_top_k=3)

# Retrieve for QUESTION and show the text of each returned node.
vector_retrieve = vector_retriever.retrieve(QUESTION)
for node in vector_retrieve:
    print(f"{node.text}")

### Set up the BM25 retriever

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever

# Keyword (BM25) retriever built from the index's docstore, top 3 hits.
bm25_retriever = BM25Retriever.from_defaults(
    docstore=index.docstore,
    similarity_top_k=3,
)

# Run the same question through BM25 and show the text of each hit.
bm25_retrieve = bm25_retriever.retrieve(QUESTION)
for node in bm25_retrieve:
    print(f"{node.text}")

### Merge the results of the two retrievers

In [None]:
from llama_index.core.retrievers import QueryFusionRetriever

# Combine the vector and BM25 retrievers and fuse their results into one ranking.
retriever = QueryFusionRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    mode="reciprocal_rerank",
    similarity_top_k=3,
    num_queries=1,  # set this to 1 to disable query generation
    # query_gen_prompt="...",  # we could override the query generation prompt here
    use_async=True,
    verbose=True,
)

# Retrieve with the fused retriever and show the text of each returned node.
nodes = retriever.retrieve(QUESTION)
for node in nodes:
    print(f"{node.text}")