In [None]:
from dotenv import load_dotenv
import json
from time import monotonic
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.docstore.document import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI
from langchain_core.callbacks import BaseCallbackHandler
from langchain.chains.summarize import load_summarize_chain
from langchain.vectorstores import FAISS

from ragas import evaluate
from ragas.run_config import RunConfig
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness, context_precision, context_recall, answer_similarity
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from datasets import Dataset
import itertools
import pandas as pd

from support_functions import split_into_subtitle_sections, replace_double_lines_with_one_line

In [43]:
local_llm = "llama3.2"
llm = ChatOllama(model=local_llm)

In [2]:
load_dotenv()

True

In [3]:
local_path = "../pdf/BILLS-119hr1eh.pdf"

if local_path:
    loader = UnstructuredPDFLoader(file_path=local_path,
                                   split_pdf_page=True,
                                   split_pdf_concurrency_level=15)
    data = loader.load()
else:
    print("Upload a PDF file for processing.")

In [37]:
len(data[0].page_content)

1136279

In [38]:
# create PromptTemplate for summarization
summarization_prompt_template = """Write an extensive summary of the following:

{text}

SUMMARY:"""

summarization_prompt = PromptTemplate(
    template=summarization_prompt_template,
    input_variables=["text"]
)

In [None]:


def create_subtitle_summary(subtitle, llm):
    """
    Creates a summary of a subtitle section using a large language model (LLM).

    Args:
        subtitle: A Document object representing the subtitle section to summarize.

    Returns:
        A Document object containing the summary of the subtitle section.
    """

    # Extract the text content from the subtitle
    subtitle_txt = subtitle.page_content

    verbose = False  # Set to True for more detailed output

    try:
        import tiktoken
        encoding = tiktoken.get_encoding("cl100k_base")  # Use encoding name directly
        num_tokens = len(encoding.encode(subtitle_txt))
    except Exception as e:
        print(f"Token counting error: {e}")
        # Fallback: estimate tokens (rough approximation: ~4 chars per token)
        num_tokens = len(subtitle_txt) // 4

    # Choose the summarization chain type based on token count
    max_chars = 50000  # Adjust based on your model's context window
    
    if len(subtitle_txt) < max_chars:
        # For shorter subtitle sections, use the "stuff" chain type
        chain = load_summarize_chain(
            llm,
            chain_type="stuff",
            prompt=summarization_prompt,
            verbose=verbose
        )
    else:
        # For longer subtitle sections, use the "map_reduce" chain type
        chain = load_summarize_chain(
            llm,
            chain_type="map_reduce",
            map_prompt=summarization_prompt,
            combine_prompt=summarization_prompt,
            verbose=verbose
        )

    # Start timer to measure summarization time
    start_time = monotonic()

    # Create a Document object for the subtitle
    doc_subtitle = Document(page_content=subtitle_txt)

    try:
        summary_result = chain.invoke([doc_subtitle])
        print(f"Chain type: {chain.__class__.__name__}")
        print(f"Run time: {monotonic() - start_time}")
        
        if summary_result is None:
            print("Warning: Chain returned None, using fallback summary")
            summary_text = f"Summary unavailable for subtitle section: {subtitle_txt[:200]}..."
        elif isinstance(summary_result, dict) and "output_text" in summary_result:
            summary_text = replace_double_lines_with_one_line(summary_result["output_text"])
        else:
            print(f"Unexpected chain result type: {type(summary_result)}")
            print(f"Chain result: {summary_result}")
            # Fallback handling
            summary_text = f"Summary processing failed for subtitle section: {subtitle_txt[:200]}..."
            
    except Exception as e:
        print(f"Chain invocation error: {e}")
        summary_text = f"Error generating summary: {str(e)[:200]}..."

    # Create a Document object for the summary, preserving subtitle metadata
    doc_summary = Document(page_content=summary_text, metadata=subtitle.metadata)

    return doc_summary

def encode_subtitle_summaries(subtitle_summaries):
    """
    Encodes a list of subtitle summaries into a FAISS vector store using chosen embeddings.

    Args:
        subtitle_summaries (list): A list of Document objects representing the subtitle summaries.

    Returns:
        FAISS: A FAISS vector store containing the encoded subtitle summaries.
    """
    # Create OpenAI embeddings instance
    #embeddings = OpenAIEmbeddings()
    embeddings=OllamaEmbeddings(model="nomic-embed-text")
    
    # Encode the chapter summaries into a FAISS vector store
    subtitle_summaries_vectorstore = FAISS.from_documents(subtitle_summaries, embeddings)

    # Return the vector store
    return subtitle_summaries_vectorstore

def retrieve_context_per_question(state):
    """
    Retrieves relevant context for a given question by aggregating content from:
    - Subtitle summaries

    Args:
        state (dict): A dictionary containing the question to answer, with key "question".

    Returns:
        dict: A dictionary with keys:
            - "context": Aggregated context string from all sources.
            - "question": The original question.
    """
    question = state["question"]

    # Retrieve relevant subtitle summaries
    print("Retrieving relevant subtitle summaries...")
    docs_summaries = subtitle_summaries_query_retriever.get_relevant_documents(question)
    context_summaries = " ".join(
        f"{doc.page_content} (Subtitle {doc.metadata['Subtitle']})" for doc in docs_summaries
    )

    return {"context": context_summaries, "question": question}

### Use a LLM to Generate Subtitle Section Summaries and Vector Store

In [47]:
subtitle_sections = split_into_subtitle_sections(pdf_path=local_path)
# subtitle_sections

In [48]:
subtitle_summaries = []

# Iterate over each subtitle in the subtitles list and summarize it
for subtitle in subtitle_sections:
    summary = create_subtitle_summary(subtitle, llm=llm)
    subtitle_summaries.append(summary)

Token counting error: Duplicate encoding name gpt2 in tiktoken plugin tiktoken_ext.openai_public
Chain type: StuffDocumentsChain
Run time: 29.500236959000176
Token counting error: Duplicate encoding name gpt2 in tiktoken plugin tiktoken_ext.openai_public
Chain type: StuffDocumentsChain
Run time: 27.80193516700092
Token counting error: Duplicate encoding name gpt2 in tiktoken plugin tiktoken_ext.openai_public
Chain type: StuffDocumentsChain
Run time: 13.539345415998469
Token counting error: Duplicate encoding name gpt2 in tiktoken plugin tiktoken_ext.openai_public
Chain type: StuffDocumentsChain
Run time: 13.056082832999891
Token counting error: Duplicate encoding name gpt2 in tiktoken plugin tiktoken_ext.openai_public
Chain type: StuffDocumentsChain
Run time: 19.33263395799986
Token counting error: Duplicate encoding name gpt2 in tiktoken plugin tiktoken_ext.openai_public
Chain type: StuffDocumentsChain
Run time: 19.049033582999982
Token counting error: Duplicate encoding name gpt2 in 

In [50]:
subtitle_summaries[:5]

[Document(metadata={'Subtitle': 1}, page_content='The provided subsections (A) under the title "Subtitle A—Nutrition" outline various provisions related to food security, nutrition assistance programs, and other related matters in the United States. Here is a detailed summary of each section:\n**Sec. 10001. Thrifty Food Plan**\nThis section likely outlines the guidelines for the Thrifty Food Plan (TFP), which is used as the basis for the Supplemental Nutrition Assistance Program (SNAP) benefits. The TFP provides a standardized food plan to help determine eligible expenditures for SNAP recipients.\n**Sec. 10002. Able Bodied Adults without Dependents Work Requirements**\nThis section may establish or modify work requirements for able-bodied adults without dependents receiving SNAP benefits. Such requirements might include mandatory employment, job training, or other activities aimed at promoting self-sufficiency and reducing dependency on government assistance.\n**Sec. 10003. Able Bodied

In [None]:
# create vector store and retriever for subtitle summaries
subtitle_summary_vector_store = encode_subtitle_summaries(subtitle_summaries)

# create retriever for subtitle summaries
# This will allow us to retrieve the most relevant subtitle summary for a given question
subtitle_summaries_query_retriever = subtitle_summary_vector_store.as_retriever(search_kwargs={"k": 1})

### Chunk the document based on optimization research

In [25]:
#Split and chunk the data
chunk_size = 750
chunk_overlap = 200

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks = text_splitter.split_documents(data)

print(f"Number of documents after splitting with chunk_size={chunk_size} and chunk_overlap={chunk_overlap}: {len(chunks)}")

# Add the chunks to vector database, which uses nomic for model embeddings
vector_db = Chroma.from_documents(
                                    documents=chunks, 
                                    embedding=OllamaEmbeddings(model="nomic-embed-text"),
                                    collection_name="local-rag"
                                )

Number of documents after splitting with chunk_size=750 and chunk_overlap=200: 2114


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [57]:


# Set up a basic PromptTemplate as the backbones of the solution
# Ask the system to gather several responses and to limit response to 200 words
QUERY_PROMPT = PromptTemplate(
    input_variables = ["question"],
        template="""You are an AI Language model assistant. Your task is to generate three different versions of the given user question 
        to retrieve relevant documents from a vector database. Please be as concise as possible and limit your response to 200 words or less. 
        Original question: {question} """
)

retriever = MultiQueryRetriever.from_llm(vector_db.as_retriever(),llm, prompt=QUERY_PROMPT)

In [58]:
# use a ChatPromptTemplate to initiate a conversation, allowing the System to assume a Role
chat_template = """Answer the question based only on the following context: 
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(chat_template)

chain = (
    {"context":retriever, "question":RunnablePassthrough()}
    | prompt 
    | llm 
    | StrOutputParser()
)

### Context-Driven responses

In [None]:
# q_list = ['Resulting from this Act, which groups would be harmed most?',\
#           'Resulting from this Act, which groups would benefit most?']

# c_list = ['Role: you are a staunch Democrat',\
#           'Role: you are a staunch Republican',\
#           'Role: you are a US citizen with no political affiliation',\
#           'Role: you are a wealthy investor with interests in oil, gas, and mining',\
#           'Role: you are a person that enjoys recreating in public lands while living in Colorado']

In [59]:
def response_gen(q, context):
    """
    Invoke the language model chain with a given question and context, 
    then print the question, context, and the model's response.

    Args:
        q (str): The question to ask.
        context (str): The context or role to provide to the model.
    """
    response = chain.invoke(input={'context': context, 'question': q})
    
    print('*** \n')
    print(f"Question - {q}")
    print(f"Context - {context} \n")
    print("Response: \n", response)

    init_state = {"question": f"{q}"}

    # Bonus: Retrieve relevant context for the question from the subtitle summary vector store
    context_state_related_to_question = retrieve_context_per_question(init_state)

    return response, context_state_related_to_question


def iterate_responses(q_list, c_list):
    """
    Iterate over all combinations of questions and contexts, generate responses using the language model,
    and return a DataFrame with the results.

    Args:
        q_list (list): List of questions to ask.
        c_list (list): List of contexts or roles to provide to the model.

    Returns:
        pd.DataFrame: DataFrame containing columns 'question', 'context', and 'response' for each combination.
    """
    response_list = []
    df_responses = []
    
    for combo in itertools.product(q_list, c_list):
        response, context_state_related_to_question = response_gen(q=combo[0], context=combo[1])

        data = {
        'question': [combo[0]],
        'context': [combo[1]],
        'response': [response],
        'related_subtitles_context': [context_state_related_to_question]
        }
    
        df = pd.DataFrame(data)
    
        response_list.append(df)

    df_responses = pd.concat(response_list)
    df_responses.reset_index(inplace=True, drop=True)

    return df_responses

In [66]:
q_list1 = ['Resulting from this Act, which groups would be harmed most?']

c_comp_list = ['Role: you are a US citizen with no political affiliation',\
        'Role: you are a staunch Republican',\
        'Role: you are a staunch Republican',\
        'Role: you are a staunch Democrat',\
        'Role: you are a staunch Republican and Trump Supporter']

df_responses = iterate_responses(q_list=q_list1, c_list=c_comp_list)

*** 

Question - Resulting from this Act, which groups would be harmed most?
Context - Role: you are a US citizen with no political affiliation 

Response: 
 The context suggests that the Act being referred to is a bill related to various policy changes and amendments in the United States. Given the content of the provided excerpt, it appears to be a bill dealing with healthcare, food assistance, and financial regulations.

Without specific details about the overall impact of the Act, it's challenging to definitively state which groups would be harmed most. However, some potential groups that could be negatively affected include:

1. **Low-income households**: Changes in food assistance programs might affect their access to affordable nutrition.
2. **Individuals with pre-existing medical conditions**: Revisions to healthcare policies or insurance regulations could impact their ability to obtain necessary coverage or afford medications.
3. **Small businesses and entrepreneurs**: Tax all

In [65]:
pd.set_option('display.max_colwidth', None)
df_responses.head()

Unnamed: 0,question,context,response,related_subtitles_context
0,"Resulting from this Act, which groups would be harmed most?",Role: you are a US citizen with no political affiliation,"Based on the context provided, it appears that this Act is related to various amendments and additions to tax laws, education regulations, and agricultural policies. Without more specific information about the Act's provisions and their effects, it's challenging to pinpoint which groups would be harmed most.\n\nHowever, some potential groups that might be disproportionately affected by the Act's provisions include:\n\n1. Small business owners: The amendments related to qualified opportunity funds (Section 6039K) might impact small businesses that invest in these funds.\n2. Low-income individuals and families: The changes to Medicaid community engagement requirements (Subtitle D, Title IV) could affect low-income individuals and families who rely on these programs for health insurance and support services.\n3. Rural communities: The provisions related to qualified opportunity zones (Section 22 of the Internal Revenue Code) might disproportionately benefit rural areas, potentially at the expense of urban or more developed regions.\n4. Immigrant and refugee communities: The changes to citizenship and immigration regulations (Section 1902(ee)) could impact immigrant and refugee populations who are seeking citizenship or residency in the United States.\n\nIt's essential to note that these are speculative possibilities, and the actual effects of the Act on various groups would depend on the specific provisions and their implementations.","{'context': 'The provided text appears to be a summary of sections from the U.S. Code, specifically from Title 28 (Section 70300-70302). Here's an extensive summary: **Section 70300 - Limitation on Donations Made Pursuant to Settlement Agreements** This section limits the use of settlement agreements entered into by the United States government. Specifically: * An official within the Department of Justice cannot direct or provide for a payment to any person or entity other than the United States, except in cases where:  + The payment provides restitution for actual harm caused by the party making the payment.  + The payment constitutes payment for services rendered in connection with the case. * Violation of this subsection is punishable under Section 3302 of Title 31, United States Code. * This section applies only to settlement agreements entered into on or after the date of enactment of the Act. **Section 70301 - Solicitation of Orders Defined** This section amends Public Law 86-272 (73 Stat. 555) by adding a new definition for ""solicitation of orders."" The revised definition includes any business activity that facilitates the solicitation of orders, even if it serves an independently valuable function apart from solicitation. **Section 70302 - Restriction on Enforcement** This section restricts the enforcement of contempt citations in cases where no security was given when the injunction or temporary restraining order was issued. Specifically: * No court can enforce a contempt citation for failure to comply with an injunction or temporary restraining order if:  + No security was given when the injunction or order was issued.  + The injunction or order was issued pursuant to Federal Rule of Civil Procedure 65(c). * This section applies regardless of whether the injunction or order was issued prior to, on, or subsequent to the date of enactment. **Annual Audit Requirement** The Inspector General of the Department of Justice is required to submit an annual report to: * The Committee on the Judiciary of the Senate * The Committee on the Judiciary of the House of Representatives The report must include information about any settlement agreement entered into in violation of Section 70300. No additional funds are authorized for this purpose. Overall, these sections aim to regulate and limit the use of settlement agreements by the United States government, ensure transparency and accountability, and restrict enforcement in certain circumstances. (Subtitle 39)', 'question': 'Resulting from this Act, which groups would be harmed most?'}"


### Leverage RAGAS for Evaluating Prompts

In [None]:
def structure_eval_data(question, contexts, answer, ground_truth):
    """
    Structure the evaluation data into a Dataset format.
    
    Args:
        question (str): The evaluation question.
        contexts (list): List of contexts for the question.
        answer (str): The answer for the question.
        ground_truth (str): The ground truth for the evaluation.

    Returns:
        Dataset: A structured dataset containing the evaluation data.
    """
    eval_data = {
        'question': [question],
        'answer': [answer],
        'contexts': [contexts],
        'ground_truth': [ground_truth]
    }

    json_formatted_string = json.dumps(eval_data, indent=4)
    print("Structured Evaluation Data (in json format):")
    print(json_formatted_string)

    return Dataset.from_dict(eval_data)

In [None]:
class TestCallback(BaseCallbackHandler):

    def on_llm_start(self, serialized, prompts, **kwargs):
        print(f"**********Prompts*********:\n {prompts[0]}\n\n")

    def on_llm_end(self, response, **kwargs):
        print(f"**********Response**********:\n {response}\n\n")

In [None]:
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o", temperature=0.0, max_tokens=3000))
evaluator_embed = LangchainEmbeddingsWrapper(OllamaEmbeddings(model="llama3.2"))

In [None]:
def evaluate_llm_with_ragas(dataset, metrics, llm, embeddings, run_config, raise_exceptions=True, callbacks=None):
    """
    Evaluate a dataset using RAGAS metrics and print the results.

    Args:
        dataset: The dataset to evaluate.
        metrics: List of metrics to use for evaluation.
        llm: The language model wrapper for evaluation.
        embeddings: The embeddings wrapper for evaluation.
        run_config: The run configuration for evaluation.
        raise_exceptions (bool, optional): Whether to raise exceptions during evaluation. Defaults to True.
        callbacks (list, optional): List of callback handlers. Defaults to None.

    Returns:
        None. Prints the evaluation result.
    """
    # Evaluate using RAGAS
    result = evaluate(
        dataset=dataset,
        metrics=metrics,
        llm=llm,
        embeddings=embeddings,
        run_config=run_config,
        raise_exceptions=raise_exceptions,
        callbacks=callbacks
    )

    df = result.to_pandas()
    print(df.head())


In [None]:
dataset_repub = structure_eval_data(question="Resulting from this Act, which groups would be harmed most?", 
                              contexts=["Role: you are a staunch Republican"], 
                              answer="Based on the provided context, as a staunch Republican, I would argue that the groups most harmed by the resulting Act would likely be small business owners and rural communities. The Act appears to include several provisions aimed at expanding access to healthcare and addressing issues related to poverty and inequality. Specifically, Sections 111201 and 111202 of the Act expand the definition of 'rural emergency hospital' under the Medicare program, which could provide additional support for rural hospitals that serve underserved communities. Additionally, Section 44301 expands the exclusion for orphan drugs under the Drug Price Negotiation Program, which could help reduce costs for some patients who rely on these medications. However, it's also worth noting that some provisions of the Act, such as the elimination of certain tax credits and deductions, could have a negative impact on small business owners and low-income families. For example, Section 112005 terminates the energy efficient home improvement credit, which could make it more difficult for low-income households to access affordable housing options.\n\nOverall, while there may be some benefits to certain groups under the Act, I believe that small business owners and rural communities would be among the most harmed by the resulting changes.",
                              ground_truth="""Foreign nationals: The unfair foreign tax provision may affect foreign nationals who are subject to taxation by a foreign government.\
            Small businesses: Some provisions, such as the repeal of EPA rules and NHTSA standards, may benefit small businesses by reducing regulations and costs.\
            Low-income individuals: The legislation includes provisions related to health insurance, such as the exchange enrollment verification requirement and the premium adjustment percentage. These provisions may affect low-income individuals who rely on government-subsidized health insurance plans.\
            Environmental groups and advocates: The Act repeals EPA rules related to greenhouse gas emissions standards (Section 42201), which could harm environmental organizations that relied on these regulations to advocate for climate action.\
            Alien populations: The Act revises the definition of "eligible alien" in certain contexts, which could affect non-citizen populations who rely on specific healthcare programs or services.\
            Health care workers: The legislation includes provisions that may impact health care workers, particularly those involved in administering or managing government-subsidized health insurance programs.\
            Clean energy companies: The legislation includes provisions that may impact clean energy companies, particularly those involved in renewable energy projects and environmental regulations."""
                              )

In [None]:
dataset_democrat = structure_eval_data(question="Resulting from this Act, which groups would be harmed most?", 
                              contexts=["Role: you are a staunch Democrat"], 
                              answer="Based on my understanding of the content of the provided text, it appears that the Act is primarily focused on tax reform and regulatory changes.\
                                However, some provisions in the Act could potentially harm certain groups. Some potential groups that might be harmed by this Act include:\
                                    * Low- and middle-income individuals who would face increased taxes under certain provisions\
                                    * Environmentally conscious individuals who would see stricter regulations relaxed regarding greenhouse gas emissions standards for light-duty vehicles\
                                    * Workers in the healthcare industry who may face changes to their job security due to the new requirements on exchange enrollment verification\
                                    * Certain corporations or organizations that might be affected by the new rules on foreign research and experimental expenditures, as well as the restriction on donations made pursuant to settlement agreements.",
                              ground_truth="""Foreign nationals: The unfair foreign tax provision may affect foreign nationals who are subject to taxation by a foreign government.\
            Small businesses: Some provisions, such as the repeal of EPA rules and NHTSA standards, may benefit small businesses by reducing regulations and costs.\
            Low-income individuals: The legislation includes provisions related to health insurance, such as the exchange enrollment verification requirement and the premium adjustment percentage. These provisions may affect low-income individuals who rely on government-subsidized health insurance plans.\
            Environmental groups and advocates: The Act repeals EPA rules related to greenhouse gas emissions standards (Section 42201), which could harm environmental organizations that relied on these regulations to advocate for climate action.\
            Alien populations: The Act revises the definition of "eligible alien" in certain contexts, which could affect non-citizen populations who rely on specific healthcare programs or services.\
            Health care workers: The legislation includes provisions that may impact health care workers, particularly those involved in administering or managing government-subsidized health insurance programs.\
            Clean energy companies: The legislation includes provisions that may impact clean energy companies, particularly those involved in renewable energy projects and environmental regulations."""
                              )

In [None]:
evaluate_llm_with_ragas(
    dataset=dataset_repub, 
    metrics=[
        #faithfulness, 
        answer_relevancy, 
        answer_correctness, 
        #context_precision, 
        #context_recall, 
        answer_similarity
    ], 
    llm=evaluator_llm, 
    embeddings=evaluator_embed, 
    run_config=RunConfig(timeout=300, max_retries=10, max_wait=300, log_tenacity=False), 
    raise_exceptions=True, 
    callbacks=None
)

In [None]:
evaluate_llm_with_ragas(
    dataset=dataset_democrat, 
    metrics=[
        #faithfulness, 
        answer_relevancy, 
        answer_correctness, 
        #context_precision, 
        #context_recall, 
        answer_similarity
    ], 
    llm=evaluator_llm, 
    embeddings=evaluator_embed, 
    run_config=RunConfig(timeout=300, max_retries=10, max_wait=300, log_tenacity=False), 
    raise_exceptions=True, 
    callbacks=None
)