### Agentic RAG (built by following a tutorial)
- Tutorial link: https://medium.com/the-ai-forum/build-an-agentic-rag-using-huggingface-transformer-agent-ec741f09ddcc
- Date: 20 Jan 2025

In [53]:
# ## Installation into the agentic_env environment
# !pip install pandas
# # !pip uninstall "git+https://github.com/huggingface/transformers.git#egg=transformers[agents]" # Not used
# !pip install langchain 
# !pip install langchain-community 
# !pip install sentence-transformers 
# !pip install faiss-cpu 
# !pip install groq
# !pip install -qU langchain-groq
# !pip install unstructured
# !pip install "unstructured[pdf]"
# !pip install -U langchain-huggingface
# !pip install transformers
# !brew install poppler
# !brew install tesseract
# !pip install nltk
# !pip install PyPDF2 pdfplumber

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [101]:
## Import required dependencies
# arch -arm64 brew install poppler
import pandas as pd
import nltk
nltk.download('punkt')
import datasets
from transformers import AutoTokenizer
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from tqdm import tqdm
from transformers.agents import Tool, HfApiEngine, ReactJsonAgent #replaces HfEngine
from huggingface_hub import InferenceClient
from langchain_community.document_loaders import DirectoryLoader, UnstructuredPDFLoader, PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
import os
import glob


[nltk_data] Downloading package punkt to /Users/nkeeley/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [75]:
## Load every PDF under the data directory so it can be vectorized for the RAG
directory_path = './rag_data'
# Collect all PDF files (including subdirectories). glob's ordering depends on the
# file system, so sort to make the load order -- and therefore all_docs[0] -- the
# same on every run.
pdf_files = sorted(glob.glob(f'{directory_path}/**/*.pdf', recursive=True))
# Fail here with a clear message instead of letting later cells break on an empty list
if not pdf_files:
    raise FileNotFoundError(f"No PDF files found under {directory_path!r}")
# Flat list of everything the loaders return, across all files
all_docs = []
# Load documents from each PDF
for file_path in pdf_files:
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    all_docs.extend(docs)
# Print a short summary rather than dumping the full text of every document into the output
print(f"Loaded {len(all_docs)} documents from {len(pdf_files)} PDF files")

## Some of the original tutorial code that didn't work
# os.mkdir("rag_data")
# os.getcwd()
#https://arxiv.org/pdf/2306.05033 # replaced a researchgate article
# loader = DirectoryLoader('./rag_data', glob="**/*.pdf", show_progress=True)
# loader=UnstructuredPDFLoader("/Users/nkeeley/github_projects/agent-experimentation/rag_data/language_models_few_shot_learners.pdf", show_progress=True)
# loader=PyPDFLoader(
#     "./rag_data/language_models_few_shot_learners.pdf",
#     "./rag_data/cash_credibility.pdf",
# )
# docs = loader.load()



In [79]:
## Function for extracting the title of a document from its file path
def extract_titles(file_path: str) -> str:
    """Derive a human-readable title from a document's file path.

    Args:
        file_path: Path to the file, e.g. "./rag_data/cash_credibility.pdf".

    Returns:
        The file name without its directory or final extension, with underscores
        turned into spaces and each word capitalized, e.g. "Cash Credibility".
    """
    # os.path handles the running platform's separators instead of assuming '/'
    base_name = os.path.basename(file_path)
    # splitext removes only the LAST extension, so dots inside the name
    # (e.g. "gpt.3_paper.pdf") no longer truncate the title
    name_without_ext = os.path.splitext(base_name)[0]
    # Skip empty pieces so repeated underscores do not produce double spaces
    words = (word for word in name_without_ext.split('_') if word)
    title = ' '.join(word.capitalize() for word in words)
    return title

## Tag every loaded document with a readable title derived from its source path
for page in all_docs:
    page.metadata['filename'] = extract_titles(page.metadata['source'])
# Spot-check the first entry to confirm the new field is present
print(all_docs[0].metadata)

{'source': './rag_data/cash_credibility.pdf', 'page': 0, 'filename': 'Cash Credibility'}


In [80]:
## Build the text splitter from the "thenlper/gte-small" tokenizer
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
# Splitter settings gathered in one place; separators are tried in the listed order
splitter_options = {
    "chunk_size": 200,
    "chunk_overlap": 20,
    "add_start_index": True,
    "strip_whitespace": True,
    "separators": ["\n\n", "\n", ".", " ", ""],
}
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, **splitter_options
)

In [93]:
## Chunk every document and keep only the first occurrence of each distinct chunk text
# Note: each entry in all_docs is a single page of a file, not a whole file
docs_processed = []
unique_texts = {}
for page in tqdm(all_docs):
    for chunk in text_splitter.split_documents([page]):
        # Skip chunks whose exact text has already been kept
        if chunk.page_content in unique_texts:
            continue
        unique_texts[chunk.page_content] = True
        docs_processed.append(chunk)

100%|██████████| 109/109 [00:01<00:00, 74.73it/s]


In [102]:
## Initialize the embedding model
model_name = "thenlper/gte-small"
model_kwargs = {'device': 'cpu'}
# The vector index in this notebook is created with DistanceStrategy.COSINE, which
# expects unit-length vectors, so normalize the embeddings at encode time
# (this was previously False).
encode_kwargs = {'normalize_embeddings': True}
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [104]:
## Build the FAISS vector store from the de-duplicated chunks and the embedding model
vectordb = FAISS.from_documents(
    docs_processed,
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

In [110]:
## Creating retriever tool that combines and returns most relevant chunks
class RetrieverTool(Tool):
    """Agent tool that returns the knowledge-base chunks most similar to a query.

    The class attributes below (name, description, inputs, output_type) are the
    tool's declared interface; `forward` performs the actual lookup.
    """
    name = "retriever"
    description = "Using semantic similarity, retrieves some documents from the knowledge base that have the closest embeddings to the input query."
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, vectordb, k=7, **kwargs):
        """Store the vector store to search and how many chunks to return.

        Args:
            vectordb: Object exposing `similarity_search(query, k=...)`.
            k: Number of chunks to return per query. Defaults to 7, the value
                that was previously hard-coded in `forward`.
            **kwargs: Forwarded unchanged to `Tool.__init__`.
        """
        super().__init__(**kwargs)
        self.vectordb = vectordb
        self.k = k

    def forward(self, query: str) -> str:
        """Return the text of the `k` chunks closest to `query` as one string.

        Args:
            query: The search text.

        Returns:
            A header line followed by each retrieved chunk's `page_content`,
            every chunk preceded by a "===== Document i =====" banner.

        Raises:
            AssertionError: If `query` is not a string.
        """
        assert isinstance(query, str), "Your search query must be a string"

        docs = self.vectordb.similarity_search(
            query,
            k=self.k,
        )

        # Join with a newline so the end of one chunk does not run straight into
        # the next chunk's banner (the chunks were previously joined with "").
        return "\nRetrieved documents:\n" + "\n".join(
            f"===== Document {i} =====\n{doc.page_content}" for i, doc in enumerate(docs)
        )

## Instantiate the retriever tool on top of the vector store built earlier in the notebook
retriever_tool = RetrieverTool(vectordb=vectordb)