In [1]:
import glob
import re,os

import pdfplumber
import pytesseract
from tqdm import tqdm
from dotenv import find_dotenv, load_dotenv
from langchain import hub
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma, FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from pdf2image import convert_from_path

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Locate a .env file and load its variables (e.g. API keys) into the environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

# Simple RAG script

Welcome to this notebook, where we've been building a foundational Retrieval-Augmented Generation (RAG) system. Our journey began by extracting text from PDF documents using OCR, turning unstructured data into usable text. Recognizing that Large Language Models (LLMs) have limited context windows, we then employed a recursive text splitter to meticulously break down these large texts into smaller, manageable chunks, ensuring a crucial overlap between them to preserve semantic continuity.

Following this, we transformed these text chunks into numerical representations called embeddings using OpenAI's embedding models. These embeddings are vital as they allow computers to understand and compare text based on its meaning, rather than just keywords. Finally, we've taken these embeddings and indexed them within a vector database, specifically utilizing FAISS or ChromaDB. This vector store is the backbone of our RAG system, enabling fast and efficient semantic searches to retrieve the most relevant information. This entire pipeline allows us to effectively leverage external knowledge bases to provide LLMs with precise, context-rich information, ultimately leading to more accurate and informed generations.

# Select model

Prepare the OpenAI `gpt-4.1-nano` chat model for use in subsequent tasks, such as generating conversational responses, summaries, or any other language-processing task that requires a chat model with a large context window.

In [25]:
# Chat model used by the question-answering chain at the end of the notebook.
llm = ChatOpenAI(model_name="gpt-4.1-nano")

# Load data

## Load webpage

Let's try with a web page

In [4]:
# Download the Wikipedia article and load it as a list of documents.
wiki_url = "https://en.wikipedia.org/wiki/History_of_artificial_intelligence"
loader = WebBaseLoader(wiki_url)
docs = loader.load()

In [5]:
# Work on a short sample of the scraped page to illustrate whitespace cleanup.
text = docs[0].page_content[:500]

# `\s+` matches newlines as well as spaces and tabs, so one substitution
# collapses every whitespace run (line breaks included) into a single space.
new_text = re.sub(r'\s+', ' ', text)

# Drop the leading/trailing space left over from the collapsed runs.
final_text = new_text.strip()

# repr() makes the raw whitespace characters visible in the comparison.
print("Original Text:", repr(text), "\nClean text:", repr(final_text), sep="\n")

Original Text:
'\n\n\n\nHistory of artificial intelligence - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate C'

Clean text:
'History of artificial intelligence - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us Contribute HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages Search Search Appearance Don

## Load PDF

### By using OCR

In [6]:
book = "ARAGOG_Advanced_RAG_Output_Grading"
pdf_path = f"{book}.pdf"

# Directory that receives one text file per page (no error if it already exists).
os.makedirs(book, exist_ok=True)

# Base name for the output files, e.g. '<book>/<base>_page000.txt'.
# splitext removes the extension whatever its length, instead of assuming
# a four-character '.pdf' suffix.
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]

# Convert each page of the PDF into an image.
# The second argument is the DPI (dots per inch); a high value such as 500
# favours OCR accuracy at the cost of time and memory.
pages = convert_from_path(pdf_path, 500)

# page_index is zero-based; page_image is the rendered page.
for page_index, page_image in enumerate(tqdm(pages, desc=f"Extracting text from '{pdf_path}'")):
    # Run OCR on the page image; lang='eng' selects the English model.
    text = pytesseract.image_to_string(page_image, lang='eng')

    # Zero-pad the index to three digits (000, 001, ...) so that sorting the
    # file names alphabetically (as done when the pages are merged) keeps page order.
    output_file_path = os.path.join(book, f'{base_file_name}_page{page_index:03d}.txt')

    # Save the extracted text as UTF-8.
    with open(output_file_path, 'w', encoding='utf-8') as the_file:
        the_file.write(text)

    # tqdm.write prints the message without corrupting the progress bar,
    # unlike a bare print() inside a tqdm loop.
    tqdm.write(f"Extracted text from {pdf_path} - Page {page_index + 1} to {output_file_path}")

print("Text extraction complete for all PDFs.")

Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':   7%|▋         | 1/14 [00:05<01:10,  5.45s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 1 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page000.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':  14%|█▍        | 2/14 [00:09<00:53,  4.48s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 2 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page001.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':  21%|██▏       | 3/14 [00:14<00:52,  4.78s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 3 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page002.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':  29%|██▊       | 4/14 [00:18<00:46,  4.62s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 4 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page003.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':  36%|███▌      | 5/14 [00:23<00:41,  4.65s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 5 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page004.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':  43%|████▎     | 6/14 [00:28<00:38,  4.75s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 6 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page005.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':  50%|█████     | 7/14 [00:33<00:34,  4.92s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 7 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page006.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':  57%|█████▋    | 8/14 [00:38<00:29,  4.98s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 8 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page007.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':  64%|██████▍   | 9/14 [00:43<00:23,  4.79s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 9 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page008.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':  71%|██████▍  | 10/14 [00:48<00:20,  5.02s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 10 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page009.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':  79%|███████  | 11/14 [00:53<00:15,  5.10s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 11 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page010.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':  86%|███████▋ | 12/14 [00:59<00:10,  5.28s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 12 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page011.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf':  93%|████████▎| 13/14 [01:05<00:05,  5.36s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 13 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page012.txt


Extracting text from 'ARAGOG_Advanced_RAG_Output_Grading.pdf': 100%|█████████| 14/14 [01:09<00:00,  4.99s/it]

Extracted text from ARAGOG_Advanced_RAG_Output_Grading.pdf - Page 14 to ARAGOG_Advanced_RAG_Output_Grading/ARAGOG_Advanced_RAG_Output_Grading_page013.txt
Text extraction complete for all PDFs.





In [7]:
# Collect the per-page OCR files; their zero-padded names make sorted()
# return them in page order.
files = sorted(glob.glob(f"{book}/*"))

# Read every page file. The OCR step writes them as UTF-8, so decode them as
# UTF-8 explicitly rather than relying on the platform default encoding.
page_texts = []
for fl in files:
    try:
        with open(fl, 'r', encoding='utf-8') as file_:
            page_texts.append(file_.read())
    except FileNotFoundError:
        # Best effort: report the file that disappeared and keep going.
        # (The original message referenced an undefined name and would have
        # raised NameError here.)
        print(f"An error in {fl} ")

# Terminate each page with a newline, join once (avoids repeated string
# concatenation), then collapse every whitespace run into a single space.
text = "".join(f"{page_text}\n" for page_text in page_texts)
text = re.sub(r'\s+', ' ', text)

# Persist the merged text next to the PDF as '<book>.txt', encoded as UTF-8
# so characters outside the platform default code page cannot fail to encode.
with open("{}.txt".format(book), 'w', encoding='utf-8') as file_:
    file_.write(text)

In [8]:
# Load the merged OCR text file and split it into documents.
ocr_text_path = f"{book}.txt"
loader = TextLoader(ocr_text_path)
pages_ocr = loader.load_and_split()

In [9]:
# Preview the first 50 characters of the first OCR-derived document.
first_ocr_doc = pages_ocr[0]
first_ocr_doc.page_content[:50]

'2404.01037v1 [cs.CL] 1 Apr 2024 ar X1V ARAGOG:;: A'

### By using PyPDFLoader

In [10]:
# Load the same PDF with PyPDFLoader and split it into documents.
pypdf_source = "{}.pdf".format(book)
loader = PyPDFLoader(pypdf_source)
pages_pydfloader = loader.load_and_split()

In [11]:
# Preview the first 50 characters of the first PyPDFLoader document.
first_pypdf_doc = pages_pydfloader[0]
first_pypdf_doc.page_content[:50]

'ARAGOG: Advanced RAG Output Grading\nMatouˇ s Eibic'

### PDFplumber

In [12]:
# Extract the text of every page with pdfplumber, one string per page.
# NOTE(review): on some pdfplumber versions extract_text() returns None for a
# page without extractable text; falling back to "" keeps every entry a string
# so later slicing cannot fail. Building the list directly also avoids
# overwriting the global `text` that holds the merged OCR output.
with pdfplumber.open(f"{book}.pdf") as pdf:
    pages_plumber = [page.extract_text() or "" for page in pdf.pages]

In [13]:
# Preview the first 50 characters of the first pdfplumber page.
first_plumber_page = pages_plumber[0]
first_plumber_page[:50]

'ARAGOG: Advanced RAG Output Grading\nMatouˇs Eibich'

# Split text

The `RecursiveCharacterTextSplitter` code efficiently breaks down large PDF texts into smaller segments, or chunks, with a specified `chunk_size` and `chunk_overlap`. This is crucial because Large Language Models (LLMs) have limited input capacities, which prevents them from processing entire documents at once. By segmenting the text, the code ensures that the data fits within an LLM's context window. The overlap between chunks is vital for maintaining semantic continuity, preventing the loss of meaning that might occur if a concept spans a split. This process is especially important for **Retrieval-Augmented Generation (RAG)** systems, as it enables the retrieval of highly relevant, manageable text snippets, leading to more accurate LLM responses and optimizing processing time and costs.

In [14]:
# Chunk the PyPDFLoader documents: chunk_size=512 with a 100 overlap between
# consecutive chunks so content that straddles a boundary is not lost.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=512,
)
splits = text_splitter.split_documents(pages_pydfloader)

In [15]:
# Inspect the metadata attached to the first chunk.
first_split = splits[0]
first_split.metadata

{'producer': 'pdfTeX-1.40.25',
 'creator': 'LaTeX with hyperref',
 'creationdate': '2024-04-02T04:16:02+00:00',
 'author': '',
 'keywords': '',
 'moddate': '2024-04-02T04:16:02+00:00',
 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5',
 'subject': '',
 'title': '',
 'trapped': '/False',
 'source': 'ARAGOG_Advanced_RAG_Output_Grading.pdf',
 'total_pages': 14,
 'page': 0,
 'page_label': '1'}

In [16]:
# Preview the first 50 characters of the first chunk.
first_split_text = splits[0].page_content
first_split_text[:50]

'ARAGOG: Advanced RAG Output Grading\nMatouˇ s Eibic'

# Vector Database

## FAISS

The code `db = FAISS.from_documents(splits, OpenAIEmbeddings())` converts text chunks into numerical embeddings using **OpenAIEmbeddings**, then stores and indexes these semantic representations in a **FAISS database**. This process is vital as it enables computers to understand and compare text by meaning, facilitates **efficient similarity searches** across large datasets, and forms the core of Retrieval-Augmented Generation (RAG) systems for feeding LLMs relevant information.

In [17]:
# Embed every chunk with OpenAI's text-embedding-3-small model and index the
# resulting vectors in a FAISS store.
faiss_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
db = FAISS.from_documents(splits, faiss_embeddings)

In [18]:
# Ask the FAISS index for the chunks most similar to the question.
# Note: `docs` is rebound here; it no longer refers to the web page documents.
query = "what does the document is about?"
docs = db.similarity_search(query=query)

In [19]:
# Print each retrieved chunk followed by a separator line.
for hit in docs:
    print(hit.page_content, "***********************", sep="\n")

2.2 Document summary index
The Document Summary Index method enhances RAG systems by indexing document summaries for
efficient retrieval, while providing LLMs with full text documents for response generation (Liu, 2023a).
This decoupling strategy optimizes retrieval speed and accuracy through summary-based indexing and
supports comprehensive response synthesis by utilizing the original text.
2.3 HyDE
The Hypothetical Document Embedding (Gao et al., 2022) technique enhances the document retrieval
***********************
The Hypothetical Document Embedding (Gao et al., 2022) technique enhances the document retrieval
by leveraging LLMs to generate a hypothetical answer to a query. HyDE capitalizes on the ability of
LLMs to produce context-rich answers, which, once embedded, serve as a powerful tool to refine and
focus document retrieval efforts. See Figure 2 for overview of HyDE RAG system workflow.
Figure 2: The process flow of Hypothetical Document Embedding (HyDE) technique within a Re

## Chroma

The following code creates and saves a persistent vector database using ChromaDB. It takes your text splits (document chunks) and converts them into embeddings via OpenAIEmbeddings (using the text-embedding-3-small model). These embeddings, along with the original text, are then stored in a specified local directory ({}_embeddings). Finally, vectorstore.persist() ensures this database is saved to disk, allowing you to reload it later without re-embedding everything.

In [20]:
# Embed the chunks and store them in a Chroma database kept on disk under
# '<book>_embeddings'.
chroma_dir = f"{book}_embeddings"
chroma_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=chroma_embeddings,
    persist_directory=chroma_dir,
)

# NOTE(review): recent langchain_community releases appear to flag persist()
# as deprecated because Chroma >= 0.4 persists automatically — confirm against
# the installed versions and drop this call if so.
vectorstore.persist()


  vectorstore.persist()


In [21]:
# Expose the Chroma store as a retriever that returns the 5 most similar chunks.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 5},
    search_type="similarity",
)

In [23]:
# Run the retriever on the question and print each returned chunk followed
# by a separator line.
retrieved_docs = retriever.invoke("what does the document is about?")
for doc in retrieved_docs:
    print(doc.page_content, "***********************", sep="\n")

2.2 Document summary index
The Document Summary Index method enhances RAG systems by indexing document summaries for
efficient retrieval, while providing LLMs with full text documents for response generation (Liu, 2023a).
This decoupling strategy optimizes retrieval speed and accuracy through summary-based indexing and
supports comprehensive response synthesis by utilizing the original text.
2.3 HyDE
The Hypothetical Document Embedding (Gao et al., 2022) technique enhances the document retrieval
***********************
The Hypothetical Document Embedding (Gao et al., 2022) technique enhances the document retrieval
by leveraging LLMs to generate a hypothetical answer to a query. HyDE capitalizes on the ability of
LLMs to produce context-rich answers, which, once embedded, serve as a powerful tool to refine and
focus document retrieval efforts. See Figure 2 for overview of HyDE RAG system workflow.
Figure 2: The process flow of Hypothetical Document Embedding (HyDE) technique within a Re

In [26]:
# Build a question-answering chain: the retriever supplies the relevant chunks
# and the chat model writes the answer from them. Source documents are not
# included in the result.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=False
)
query = "what does the document is about?"

# Use .invoke() rather than calling the chain object directly: the direct
# call form is deprecated in LangChain, and the retriever above is already
# driven through .invoke().
result = chain.invoke({"query": query})
result

{'query': 'what does the document is about?',
 'result': 'The document discusses various techniques and strategies to improve retrieval-augmented generation (RAG) systems. It covers methods such as the Document Summary Index, Hypothetical Document Embedding (HyDE), and Multi-query techniques, highlighting their roles in enhancing document retrieval accuracy and response generation efficiency. The document also compares the performance of these methods, indicating that the Document Summary Index and other enhanced techniques have potential for superior retrieval precision compared to traditional approaches.'}