In [104]:
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel, RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import faiss
import os

Indexing

In [110]:
class Indexing:
    """Turn a directory of PDF files into a FAISS vector store.

    The pipeline has three stages, each exposed as its own method so that
    intermediate results can be inspected in the notebook:

    1. ``load_pdfs_as_single_document`` - read each PDF and merge its pages.
    2. ``split_pdf_documents`` - cut the merged documents into chunks.
    3. ``create_vector_store`` - embed the chunks into a FAISS index.
    """

    def __init__(self, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2", chunk_size=1000, chunk_overlap=200):
        """Create the embedding function and the text splitter.

        Args:
            embedding_model_name: Model name passed to ``HuggingFaceEmbeddings``.
            chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
            chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.
        """
        self.embedding_function = HuggingFaceEmbeddings(model_name=embedding_model_name)
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    def load_pdfs_as_single_document(self, directory_path):
        """Load every PDF in ``directory_path`` as one merged ``Document``.

        Args:
            directory_path: Directory that is scanned (non-recursively) for
                files whose name ends in ``.pdf`` (case-insensitive).

        Returns:
            dict mapping each PDF filename to a ``Document`` whose
            ``page_content`` is all page texts joined by newlines and whose
            metadata is ``{"source": filename}``. Entries are inserted in
            sorted filename order.

        Raises:
            OSError: propagated from ``os.listdir`` when the directory cannot
                be read (for example, it does not exist).
        """
        pdf_docs = {}

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # chunk order (and therefore the index contents) stable across runs.
        for filename in sorted(os.listdir(directory_path)):
            if not filename.lower().endswith(".pdf"):
                continue

            file_path = os.path.join(directory_path, filename)
            # A directory that happens to be named "*.pdf" is not loadable.
            if not os.path.isfile(file_path):
                continue

            pages = PyPDFLoader(file_path).load()

            # Merge the pages so the splitter can form chunks that cross
            # page boundaries.
            full_text = "\n".join(page.page_content for page in pages)

            pdf_docs[filename] = Document(
                page_content=full_text,
                metadata={"source": filename},
            )

        return pdf_docs

    def split_pdf_documents(self, pdf_documents):
        """Split merged PDF documents into chunks tagged with their source.

        Args:
            pdf_documents: Mapping of filename to document, as returned by
                ``load_pdfs_as_single_document``.

        Returns:
            A flat list of chunks. Each chunk has ``metadata["source"]`` set
            to its filename and its text prefixed with ``"Source: <filename>"``
            on a separate first line.
        """
        all_chunks = []

        for filename, document in pdf_documents.items():
            chunks = self.splitter.split_documents([document])

            for chunk in chunks:
                chunk.metadata["source"] = filename
                # The filename is also written into the text itself, so it is
                # part of what gets embedded and what the LLM sees as context.
                chunk.page_content = f"Source: {filename}\n" + chunk.page_content

            all_chunks.extend(chunks)

        return all_chunks

    def create_vector_store(self, chunks):
        """Embed ``chunks`` and return a FAISS vector store holding them.

        Args:
            chunks: Documents to index, typically the output of
                ``split_pdf_documents``.

        Returns:
            The store built by ``FAISS.from_documents`` with this instance's
            embedding function.

        Raises:
            ValueError: if ``chunks`` is empty. Failing here gives an
                actionable message instead of an opaque error from inside
                the index construction.
        """
        if not chunks:
            raise ValueError(
                "No chunks to index: check that the PDF directory contains "
                "readable PDF files with extractable text."
            )
        return FAISS.from_documents(chunks, self.embedding_function)

In [111]:
# Directory holding the source PDFs. Set the UBER_PDF_DIR environment
# variable to run the notebook on another machine; the fallback is the
# original local path so existing runs behave the same.
PDF_DIR = os.environ.get(
    "UBER_PDF_DIR",
    "/Users/mukki11/Documents/UberChatbot/Resources for Uber Rider Help Chatbot/Pdf's",
)

indexer = Indexing()

# Step 1: Load PDFs (one merged document per file)
pdf_documents = indexer.load_pdfs_as_single_document(PDF_DIR)

# Step 2: Split into chunks
chunks = indexer.split_pdf_documents(pdf_documents)

# Step 3: Create FAISS vector store
vector_store = indexer.create_vector_store(chunks)

Ignoring wrong pointing object 64 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 43 0 (offset 0)
Ignoring wrong pointing object 44 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 39 0 (offset 0)
Ignoring wrong pointing object 64 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)
Ignoring wrong pointing object 42 0 (offset 0)
Ignoring wrong pointing object 43 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)


Retriever

In [112]:
# Expose the FAISS store as a retriever: plain similarity search that
# returns the 3 best-matching chunks for each query.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x14d5e73a0>, search_kwargs={'k': 3})

In [113]:
def format_docs(retrieved_docs):
    """Merge retrieved documents into a single context string.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content``
            string attribute.

    Returns:
        The ``page_content`` values joined by one blank line, in the order
        given; an empty string when there are no documents.
    """
    page_texts = [document.page_content for document in retrieved_docs]
    return "\n\n".join(page_texts)

Prompt Formation

In [114]:
# Chat model used to generate the final answer.
# NOTE(review): no API key is passed here - presumably it is picked up from
# the environment; confirm before running on a fresh machine.
llm = ChatGroq(model="gemma2-9b-it")

# Instruction text sent to the model. {context} receives the retrieved
# chunks and {question} the user's query.
# NOTE(review): the wording ("assisstant", "dont") is kept exactly as-is
# because changing it changes what the model receives.
prompt_text = """
    You are a helpful AI assisstant
    Answer only from the provided context
    If you dont know answer to the query asked, just say I dont know

    Context: {context}
    Question: {question}
    """

prompt = PromptTemplate(
    template=prompt_text,
    input_variables=["context", "question"],
)

Chaining all the components

In [115]:
# Branch 1: look up chunks for the query and flatten them into one string.
context_branch = retriever | RunnableLambda(format_docs)

# Branch 2: forward the incoming query unchanged.
question_branch = RunnablePassthrough()

# Both branches receive the same input; their results are collected under
# the keys the prompt template expects.
parallel_chain = RunnableParallel({
    "context": context_branch,
    "question": question_branch,
})

In [None]:
# End-to-end pipeline: gather context and question -> fill the prompt ->
# call the chat model. The result is the model's message object (no output
# parser is applied).
chain = (
    parallel_chain
    | prompt
    | llm
)

In [123]:
# Run one sample rider complaint through the full chain and display the
# model's reply.
sample_query = "My fare for the recent trip was very high"
response = chain.invoke(sample_query)
response.pretty_print()


You may be charged a different amount than your upfront fare in case the route deviates from the predicted route.

We are happy to review these cases if you feel you were overcharged.


In [109]:
# # Example usage (scratch snippets; the helpers are methods of `Indexing`, so call them on `indexer`)
# directory = "/Users/mukki11/Documents/UberChatbot/Resources for Uber Rider Help Chatbot/Pdf's"
# pdf_documents = indexer.load_pdfs_as_single_document(directory)

## Example output
# for pdf_name, doc in pdf_documents.items():
#     print(f"{pdf_name}: {len(doc.page_content)} characters")

# chunks = indexer.split_pdf_documents(pdf_documents)
# print(chunks[0].page_content), len(chunks)

# vector_store.index_to_docstore_id
# vector_store.get_by_ids(["f431b990-b5a9-42c5-ad12-e6f57e8c8118"])

# retriever.invoke('I lost my items in an uber')

# query = 'I lost my items in an uber'
# retrieved_docs = retriever.invoke(query)

# context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
# context_text

# final_prompt = prompt.invoke({'context': context_text, 'question': query})
# final_prompt

# response = llm.invoke(final_prompt)
# response