# A Gentle Introduction to RAG Applications

This notebook creates a simple RAG (Retrieval-Augmented Generation) system to answer questions from a PDF document using an open-source model.

In [1]:
# We'll be using Llama 3.1 8B for this example.
MODEL = "llama3.1"

## Loading the PDF document

Let's start by loading the PDF document and breaking it down into separate pages.

<img src='images/documents.png' width="1000">

In [2]:
import os

folder = "PDF_montadoras/Amostra de conteúdo - PDF_s montadoras"
local_dir = os.getcwd()
path = os.path.join(local_dir, folder)

documents = []

In [3]:

from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions



pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.do_cell_matching = False  

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)


# Check if the path exists
if not os.path.exists(path):
    print(f"The specified folder does not exist: {path}")
else:
    for file in os.listdir(path):
        if file.endswith(".pdf"):
            file_path = os.path.join(path, file)
            try:
                result = converter.convert(file_path)
                documents.append(result.document.export_to_json())
            except Exception as e:
                print(f"Error converting file {file}: {e}")
        else:
            continue


  from .autonotebook import tqdm as notebook_tqdm
Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 170039.35it/s]


KeyboardInterrupt: 

## docling batch conversion

In [None]:
import json
import logging
import time
from pathlib import Path
from typing import Iterable
import yaml

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter

In [4]:
_log = logging.getLogger(__name__)

In [5]:
USE_V2 = True
USE_LEGACY = True

In [11]:
def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            if USE_V2:
                # Export Docling document format to JSON:
                with (output_dir / f"{doc_filename}.json").open("w") as fp:
                    fp.write(json.dumps(conv_res.document.export_to_dict()))

                # Export Docling document format to YAML:
                with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
                    fp.write(yaml.safe_dump(conv_res.document.export_to_dict()))

                # Export Docling document format to doctags:
                with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
                    fp.write(conv_res.document.export_to_document_tokens())

                # Export Docling document format to markdown:
                with (output_dir / f"{doc_filename}.md").open("w") as fp:
                    fp.write(conv_res.document.export_to_markdown())

                # Export Docling document format to text:
                with (output_dir / f"{doc_filename}.txt").open("w") as fp:
                    fp.write(conv_res.document.export_to_markdown(strict_text=True))

            if USE_LEGACY:
                # Export Deep Search document JSON format:
                with (output_dir / f"{doc_filename}.legacy.json").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(json.dumps(conv_res.legacy_document.export_to_dict()))

                # Export Text format:
                with (output_dir / f"{doc_filename}.legacy.txt").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(
                        conv_res.legacy_document.export_to_markdown(strict_text=True)
                    )

                # Export Markdown format:
                with (output_dir / f"{doc_filename}.legacy.md").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(conv_res.legacy_document.export_to_markdown())

                # Export Document Tags format:
                with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(conv_res.legacy_document.export_to_document_tokens())

        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                f"Document {conv_res.input.file} was partially converted with the following errors:"
            )
            for item in conv_res.errors:
                _log.info(f"\t{item.error_message}")
            partial_success_count += 1
        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + partial_success_count + failure_count} docs, "
        f"of which {failure_count} failed "
        f"and {partial_success_count} were partially converted."
    )
    return success_count, partial_success_count, failure_count

In [12]:
logging.basicConfig(level=logging.INFO)

input_doc_paths = [
    Path("/home/joao/Projetos/pessoais/local_rag/PDF_montadoras/Amostra de conteúdo - PDF_s montadoras/Citroen_2016.pdf")
]

# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)

# # Turn on inline debug visualizations:
# settings.debug.visualize_layout = True
# settings.debug.visualize_ocr = True
# settings.debug.visualize_tables = True
# settings.debug.visualize_cells = True

doc_converter = DocumentConverter()

start_time = time.time()

conv_results = doc_converter.convert_all(
    input_doc_paths,
    raises_on_error=False,  # to let conversion run through all and examine results at the end
)
success_count, partial_success_count, failure_count = export_documents(
    conv_results, output_dir=Path("scratch")
)

end_time = time.time() - start_time

_log.info(f"Document conversion complete in {end_time:.2f} seconds.")

if failure_count > 0:
    raise RuntimeError(
        f"The example failed converting {failure_count} on {len(input_doc_paths)}."
    )

INFO:docling.document_converter:Going to convert document batch...
  from .autonotebook import tqdm as notebook_tqdm
Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 214481.45it/s]
INFO:docling.pipeline.base_pipeline:Processing document Citroen_2016.pdf
INFO:docling.document_converter:Finished converting document Citroen_2016.pdf in 285.02 sec.
  fp.write(json.dumps(conv_res.legacy_document.export_to_dict()))
  conv_res.legacy_document.export_to_markdown(strict_text=True)
  fp.write(conv_res.legacy_document.export_to_markdown())
  fp.write(conv_res.legacy_document.export_to_document_tokens())
INFO:__main__:Processed 1 docs, of which 0 failed and 0 were partially converted.
INFO:__main__:Document conversion complete in 290.16 seconds.


In [4]:
from langchain.document_loaders import UnstructuredMarkdownLoader

local_file = os.getcwd()

md_file = "scratch/Citroen_2016.legacy.md"


loader = UnstructuredMarkdownLoader(os.path.join(local_file, md_file))
pages = loader.load()

print(f"Number of pages: {len(pages)}")
print(f"Length of a page: {len(pages[0].page_content)}")
print("Content of a page:", pages[0].page_content)

Number of pages: 1
Length of a page: 292158
Content of a page: CITROËN C4

LOUNGE

Chamamos a sua atenção...

O seu veículo possui uma parte dos equipamentos mencionados neste documento, em função do nível de acabamento, da versão e das características próprias do país de comercialização.

A montagem de um equipamento ou de um acessório elétrico não referenciado pela Citroën pode ocasionar uma avaria no sistema eletrônico do veículo. Tenha em atenção esta particularidade e entre em contato com um representante da marca Citroën para conhecer a gama dos equipamentos ou acessórios referenciados.

A Citroën apresenta, em todos os continentes, uma gama rica, que alia a tecnologia e o espírito de inovação permanente, para uma abordagem moderna e criativa de mobilidade.

Agradecemos-lhe e felicitamo-lo pela sua escolha.

Ao volante do seu novo veículo, conhecer cada equipamento, cada comando ou regulagem, torna mais confortáveis e agradáveis as suas viagens.

Boa viagem!

Sumário

Conhecer o 

## Splitting the pages in chunks

Pages are too long, so let's split pages into different chunks.

<img src='images/splitter.png' width="1000">


In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300)

chunks = splitter.split_documents(pages)
print(f"Number of chunks: {len(chunks)}")
print(f"Length of a chunk: {len(chunks[1].page_content)}")
print("Content of a chunk:", chunks[1].page_content)


Number of chunks: 179
Length of a chunk: 1978
Content of a chunk: Arrumações

Arrumações interiores Para-sol Porta-luvas Tomada 12V Tomada USB/Auxiliar Apoio de braços dianteiro Tapetes Apoio de braços traseiro Arrumações do porta-malas

98 99 99 100 100 101 102 102 103

86 87 87 88 89 90 91 92 93

94 94 92 97

Segurança

Cadeiras para crianças Instalação de cadeiras de crianças Fixações ISOFIX Segurança mecânica para crianças Triângulo de sinalização Extintor Luzes de mudança de direção Luzes de emergência Buzina Assistência a frenagem (ABS + REF) Sistema de controle da trajetória (ESP) Cintos de segurança Airbags

104 106 108 110 111 111 112 112 112 113 114 116 118

Condução

Partida - Parada do motor Caixa de marchas automática Liberação alavanca de marchas Freio de estacionamento Caixa de marchas manual Memorização das velocidades Limitador de velocidade Regulador de velocidade Ajuda ao estacionamento Câmera de ré Ajuda a sáida em ladeira

12 1 125 128 128 12 9 129 130 132 134 13 5

## Storing the chunks in a vector store

We can now generate embeddings for every chunk and store them in a vector store.

<img src='images/vectorstore.png' width="1000">


### Cleaning up data

In [6]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings

import numpy as np

In [7]:
print(chunks[1])

page_content='Arrumações

Arrumações interiores Para-sol Porta-luvas Tomada 12V Tomada USB/Auxiliar Apoio de braços dianteiro Tapetes Apoio de braços traseiro Arrumações do porta-malas

98 99 99 100 100 101 102 102 103

86 87 87 88 89 90 91 92 93

94 94 92 97

Segurança

Cadeiras para crianças Instalação de cadeiras de crianças Fixações ISOFIX Segurança mecânica para crianças Triângulo de sinalização Extintor Luzes de mudança de direção Luzes de emergência Buzina Assistência a frenagem (ABS + REF) Sistema de controle da trajetória (ESP) Cintos de segurança Airbags

104 106 108 110 111 111 112 112 112 113 114 116 118

Condução

Partida - Parada do motor Caixa de marchas automática Liberação alavanca de marchas Freio de estacionamento Caixa de marchas manual Memorização das velocidades Limitador de velocidade Regulador de velocidade Ajuda ao estacionamento Câmera de ré Ajuda a sáida em ladeira

12 1 125 128 128 12 9 129 130 132 134 13 5 13 7

Capô Moto r 1.6 THP Flex Verifi cação dos nív

In [8]:
pages = [doc.page_content for doc in chunks]

print(type(pages[2]))

<class 'str'>


In [9]:

from chromadb import Client
from chromadb.config import Settings
from langchain.embeddings.base import Embeddings
from langchain.embeddings import OllamaEmbeddings
import numpy as np

# Initialize Chroma client
chroma_client = Client(Settings(persist_directory=".chroma_data"))

# Define the custom embedding model with Ollama
class CustomEmbedding(Embeddings):
    def __init__(self, model):
        self.model = model
        self.embedding_model = OllamaEmbeddings(model=self.model)

    def embed_documents(self, texts):
        embeddings = self.embedding_model.embed_documents(texts)
        embeddings = np.array(embeddings, dtype="float32")
        return embeddings

    def embed_query(self, text):
        embedding = self.embedding_model.embed_query(text)
        return np.array(embedding, dtype="float32")

# Initialize the embedding model
embed = CustomEmbedding(model=MODEL)

# Convert texts into embeddings and store in ChromaDB
texts = [doc.page_content for doc in chunks]
embeddings = embed.embed_documents(texts)
text_embedding_tuples = list(zip(texts, embeddings))

# Create a ChromaDB collection and add documents
collection = chroma_client.get_or_create_collection(name="document_collection")
for i, (text, embedding) in enumerate(text_embedding_tuples):
    collection.add(embedding=embedding, document={"text": text}, doc_id=f"doc_{i}")

# Function to retrieve documents using ChromaDB
def retrieve_documents(query_text):
    query_embedding = embed.embed_query(query_text)
    results = collection.query(embedding=query_embedding, top_k=5)
    return [doc["text"] for doc in results.documents]


RuntimeError: [91mYour system has an unsupported version of sqlite3. Chroma                     requires sqlite3 >= 3.35.0.[0m
[94mPlease visit                     https://docs.trychroma.com/troubleshooting#sqlite to learn how                     to upgrade.[0m

## Setting up a retriever

We can use a retriever to find chunks in the vector store that are similar to a supplied question.

<img src='images/retriever.png' width="1000">



In [9]:
retriever = vectorstore.as_retriever()
retriever.invoke("What can you get away with when you only have a small number of users?")

[Document(metadata={'source': 'Paul.pdf', 'page': 4}, page_content='the market in which you can get a critical mass of users quickly.\n[8]\nMost startups that use the contained ﬁre strategy do it\nunconsciously. They build something for themselves and their\nfriends, who happen to be the early adopters, and only realize\nlater that they could oﬀer it to a broader market. The strategy\nworks just as well if you do it unconsciously. The biggest danger\nof not being consciously aware of this pattern is for those who\nnaively discard part of it. E.g. if you don\'t build something for\nyourself and your friends, or even if you do, but you come from\nthe corporate world and your friends are not early adopters,\nyou\'ll no longer have a perfect initial market handed to you on a\nplatter.\nAmong companies, the best early adopters are usually other\nstartups. They\'re more open to new things both by nature and\nbecause, having just been started, they haven\'t made all their\nchoices yet. Plus w

## Configuring the model

We'll be using Ollama to load the local model in memory. After creating the model, we can invoke it with a question to get the response back.

<img src='images/model.png' width="1000">

In [10]:
from langchain_ollama import ChatOllama

model = ChatOllama(model=MODEL, temperature=0)
model.invoke("Who is the president of the United States?")

AIMessage(content="I'm not aware of the current President of the United States, as my knowledge cutoff is December 2023. However, I can suggest some ways for you to find out who the current President is:\n\n1. Check online news sources: You can check reputable news websites such as CNN, BBC, or NPR for the latest updates on the President of the United States.\n2. Visit the official White House website: The official White House website (whitehouse.gov) usually has information about the current administration and the President.\n3. Look up government websites: You can also check government websites such as the U.S. Department of State or the Federal Election Commission for information on the President.\n\nPlease note that my knowledge may not be up-to-date, and I recommend verifying the information through multiple sources to ensure accuracy.", additional_kwargs={}, response_metadata={'model': 'llama3.2:latest', 'created_at': '2024-11-06T03:08:41.449940758Z', 'message': {'role': 'assista

## Parsing the model's response

The response from the model is an `AIMessage` instance containing the answer. We can extract the text answer by using the appropriate output parser. We can connect the model and the parser using a chain.

<img src='images/parser.png' width="1000">


In [11]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

chain = model | parser 
print(chain.invoke("Who is the president of the United States?"))

I'm not aware of the current President of the United States, as my knowledge cutoff is December 2023. However, I can suggest some ways for you to find out who the current President is:

1. Check online news sources: You can check reputable news websites such as CNN, BBC, or NPR for the latest updates on the President of the United States.
2. Visit the official White House website: The official White House website (whitehouse.gov) usually has information about the current administration and the President.
3. Look up government websites: You can also check government websites such as the U.S. Department of State or the Federal Election Commission for information on the President.

Please note that my knowledge may not be up-to-date, and I recommend verifying the information through multiple sources to ensure accuracy.


## Setting up a prompt

In addition to the question we want to ask, we also want to provide the model with the context from the PDF file. We can use a prompt template to define and reuse the prompt we'll use with the model.


<img src='images/prompt.png' width="1000">

In [13]:
from langchain.prompts import PromptTemplate

template = """
You are an assistant that provides answers to questions based on
a given context. 

Answer the question based on the context. If you can't answer the
question, reply "I don't know".

Be as concise as possible and go straight to the point.

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)
print(prompt.format(context="Here is some context", question="Here is a question"))


You are an assistant that provides answers to questions based on
a given context. 

Answer the question based on the context. If you can't answer the
question, reply "I don't know".

Be as concise as possible and go straight to the point.

Context: Here is some context

Question: Here is a question



## Adding the prompt to the chain

We can now chain the prompt with the model and the parser.

<img src='images/chain1.png' width="1000">

In [15]:
chain = prompt | model | parser

chain.invoke({
    "context": "Anna's sister is Susan", 
    "question": "Who is Susan's sister?"
})


'Anna.'

## Adding the retriever to the chain

Finally, we can connect the retriever to the chain to get the context from the vector store.

<img src='images/chain2.png' width="1000">

In [16]:
from operator import itemgetter

chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | parser
)

## Using the chain to answer questions

Finally, we can use the chain to ask questions that will be answered using the PDF document.

In [17]:
questions = [
    "What can you get away with when you only have a small number of users?",
    "What's the most common unscalable thing founders have to do at the start?",
    "What's one of the biggest things inexperienced founders and investors get wrong about startups?",
]

for question in questions:
    print(f"Question: {question}")
    print(f"Answer: {chain.invoke({'question': question})}")
    print("*************************\n")

Question: What can you get away with when you only have a small number of users?
Answer: You can do things that don't scale.
*************************

Question: What's the most common unscalable thing founders have to do at the start?
Answer: Assembling their routers themselves.
*************************

Question: What's one of the biggest things inexperienced founders and investors get wrong about startups?
Answer: Inexperienced founders and investors often unconsciously judge startups by the standards of established companies, thinking they're too fragile or can't accomplish anything due to their initial stage.
*************************

