In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma 
# import chromadb
# from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from tqdm.notebook import tqdm

In [None]:
# Path to the PDF that will be indexed; relative to the notebook's working directory.
local_path = "../pdf/BILLS-119hr1eh.pdf"

# Fail fast when no path is configured. Merely printing a hint would leave
# `data` undefined and make the next cell crash with an unrelated NameError.
if not local_path:
    raise ValueError("Upload a PDF file for processing.")

# Load the PDF into LangChain documents; `data` is consumed by the cells below.
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

In [None]:
# Sanity check: length of the first loaded document's page_content.
len(data[0].page_content)

In [None]:
# Break the loaded documents into chunks before embedding them.
# Sizes are passed straight to the splitter: chunk_size=7500 with an
# overlap of 100 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=7500,
)
chunks = text_splitter.split_documents(data)

In [None]:
# Embed every chunk with the Ollama "nomic-embed-text" model and store the
# vectors in a Chroma collection named "local-rag".
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="local-rag",
)

In [None]:
# Chat model served by the local Ollama instance; used both for rewriting the
# user's question and for producing the final answer.
local_llm = "llama3.1"
llm = ChatOllama(model=local_llm)

# Prompt that asks the LLM to rephrase the user's question in five different
# ways, one per line, so retrieval is not limited to a single wording.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI Language model assistant. Your task is to generate five different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search. Provide these alternative questions separated by newlines. 
    Original question: {question} """
)

# Retriever that runs the rewritten questions against the vector store.
retriever = MultiQueryRetriever.from_llm(vector_db.as_retriever(), llm, prompt=QUERY_PROMPT)

# RAG prompt: the answer must be grounded in the retrieved context only.
template = """Answer the question based ONLY on the following context: 
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Args:
        docs: Iterable of retrieved documents, each exposing `page_content`.

    Returns:
        The documents' `page_content` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever yields Document objects. Without `format_docs`, the prompt
# would receive the Python repr of that list (metadata, escaped newlines)
# instead of the plain chunk text.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the chain on a single question and print the returned answer.
q = "Describe the primary objectives of this Act."
response = chain.invoke(q)

print(response)

In [None]:
# Run the chain on a single question and print the returned answer.
q = "What is the most controversial objective within this Act?"
response = chain.invoke(q)

print(response)

In [None]:
# Run the chain on a single question and print the returned answer.
q = "Resulting from this Act, which groups would be harmed most?"
response = chain.invoke(q)

print(response)

In [None]:
# Run the chain on a single question; the answer is stored in `response`.
# NOTE(review): no print/display of `response` is visible in this cell,
# unlike the earlier question cells — confirm whether output is intended.
q = "Resulting from this Act, which groups would benefit most?"
response = chain.invoke(q)