In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from tqdm.notebook import tqdm
import time
import itertools

In [2]:
# Path of the PDF to index; an empty value means "nothing to process".
local_path = "../pdf/BILLS-119hr1eh.pdf"

# Fail fast when no path is configured: later cells need `data`, so carrying
# on would only defer the failure to a confusing NameError (or silently reuse
# a stale `data` left in the kernel by an earlier run).
if not local_path:
    raise ValueError("Upload a PDF file for processing.")

# Load the PDF; `data` is the list of loaded documents used by the cells below.
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

In [3]:
# Length of the first loaded document's page_content -- a quick check that the PDF produced text.
len(data[0].page_content)

1136279

In [4]:
# Chunking parameters handed to RecursiveCharacterTextSplitter
chunk_size = 750
chunk_overlap = 200

# Break the loaded documents into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
chunks = text_splitter.split_documents(data)

# Embed every chunk with the "nomic-embed-text" model and store the result
# in a Chroma collection named "local-rag"
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    collection_name="local-rag",
)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [5]:
# Local Ollama chat model; used here for query rewriting and again in the answer chain
local_llm = "llama3.2"
llm = ChatOllama(model=local_llm)

# Prompt asking the LLM to rephrase the user's question three different ways.
# MultiQueryRetriever splits the model's reply on newlines and runs every
# non-empty line as its own search query, so the prompt must explicitly ask
# for newline-separated questions with no surrounding commentary; otherwise
# preambles and explanations are sent to the vector store as queries.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI Language model assistant. Your task is to generate three different versions of the given user question 
        to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is 
        to help the user overcome some of the limitations of the distance-based similarity search. Please be as concise as possible and 
        limit your response to 200 words or less. 
        Provide these alternative questions separated by newlines, with no numbering or extra commentary. 
        Original question: {question} """,
)

# Wrap the vector store's retriever so each user question fans out into
# several LLM-generated variants before the similarity search.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

In [6]:
# Use a ChatPromptTemplate so the caller-supplied role can steer the answer.
# `role` is the persona line supplied by the caller; `context` is the text of
# the documents retrieved for the question.
chat_template = """{role}
Answer the question based only on the following context: 
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(chat_template)


def _question_of(chain_input):
    """Return the question text from a plain string or a {'question': ...} dict."""
    if isinstance(chain_input, dict):
        return chain_input["question"]
    return chain_input


def _role_of(chain_input):
    """Return the caller's role text (sent under the 'context' key), or '' if none was given."""
    if isinstance(chain_input, dict):
        return chain_input.get("context", "")
    return ""


def _format_docs(docs):
    """Join the retrieved documents' page_content into one text block for the prompt."""
    return "\n\n".join(doc.page_content for doc in docs)


# Route each part of the input explicitly. Feeding the whole input dict to the
# retriever would make it search on the dict's repr, and the caller's
# 'context' value would be overwritten by the retrieved documents.
chain = (
    {
        "context": _question_of | retriever | _format_docs,
        "question": _question_of,
        "role": _role_of,
    }
    | prompt
    | llm
    | StrOutputParser()
)

### Context-Driven responses

In [7]:
def iterate_response(q, context):
    """Run the chain for one question/role pair and print the exchange.

    Args:
        q: Question text, sent to the chain under the 'question' key.
        context: Role description, sent to the chain under the 'context' key.
    """
    payload = {'context': context, 'question': q}
    answer = chain.invoke(input=payload)

    # Separator first, then the inputs, then the model's reply
    print('*** \n')
    print(f"Question - {q}")
    print(f"Context - {context} \n")
    print("Response: \n", answer)
    

# Questions put to the model
q_list = [
    'Resulting from this Act, which groups would be harmed most?',
    'Resulting from this Act, which groups would benefit most?',
]

# Roles the model is asked to answer under
c_list = [
    'Role: you are a staunch Democrat',
    'Role: you are a staunch Republican',
    'Role: you are a US citizen with no political affiliation',
    'Role: you are a wealthy investor with interests in oil, gas, and mining',
]

# Ask every question under every role; the question changes slowest
for question, role in itertools.product(q_list, c_list):
    iterate_response(q=question, context=role)

Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


*** 

Question - Resulting from this Act, which groups would be harmed most?
Context - Role: you are a staunch Democrat 

Response: 
 As a staunch Democrat, I would argue that this Act would harm the most low-income and marginalized communities, who already struggle with access to healthcare, education, and economic opportunities. The provisions in the Act, such as the expansion of health savings accounts and the reduced funding for Medicaid, would disproportionately affect these groups.

Additionally, the Act's emphasis on individual responsibility and market-based solutions would likely exacerbate existing systemic inequalities, such as racial and socioeconomic disparities in healthcare outcomes and access to quality education. Furthermore, the lack of protections for pre-existing conditions and the erosion of the Affordable Care Act would leave many vulnerable individuals without adequate health insurance coverage.

In particular, the following groups may be harmed most:

1. Low-inc