In [3]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from tqdm.notebook import tqdm
import time
import itertools

In [4]:
local_path = "../pdf/RL34273.26.pdf"

if local_path:
    loader = UnstructuredPDFLoader(file_path=local_path)
    data = loader.load()
else:
    print("Upload a PDF file for processing.")

In [5]:
len(data[0].page_content)

63504

In [22]:
def compare_chunk_size_performance(chunk_size, chunk_overlap):

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(data)
    
    # Add the chunks to vector database, which uses nomic for model embeddings
    vector_db = Chroma.from_documents(documents=chunks, 
                                    embedding=OllamaEmbeddings(model="nomic-embed-text"),
                                    collection_name="local-rag"
                                    )
    local_llm = "llama3.2"
    llm = ChatOllama(model=local_llm)
    
    # set up a basic PromptTemplate as the backbones of the solution
    QUERY_PROMPT = PromptTemplate(
        input_variables = ["question"],
        template="""You are an AI Language model assistant. Your task is to generate three different versions of the given user question 
        to retrieve relavant documents from a vector database. By generating multiple perspectives on the user question, your goal is 
        to help the user overcome some of the limitations of the distance-based similarity search. Please be as concise as possible and 
        limit your response to 200 words or less. 
        Original question: {question} """
    )
    
    retriever = MultiQueryRetriever.from_llm(vector_db.as_retriever(),llm, prompt=QUERY_PROMPT)

    # use a ChatPromptTemplate to initiate a conversation, allowing the System to assume a Role
    chat_template = """Answer the question based only on the following context: 
    {context}
    Question: {question}
    """
    
    prompt = ChatPromptTemplate.from_template(chat_template)
    
    chain = (
        {"context":retriever, "question":RunnablePassthrough()}
        | prompt 
        | llm 
        | StrOutputParser()
    )

    q = 'Can you summarize the primary proposals in this document?'
    context = 'Role: you are a person that enjoys recreating in public lands while living in Colorado'

    response = chain.invoke(input={'context': context, 'question': q})
    
    print('*** \n')
    print(f"Chunk_Size: {chunk_size}, Chunk_Overlap: {chunk_overlap}")
    print(f"Question - {q}")
    print(f"Context - {context} \n")
    print("Response: \n", response)

In [23]:
compare_chunk_size_performance(chunk_size=1000, chunk_overlap=100)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


*** 

Chunk_Size: 1000, Chunk_Overlap: 100
Question - Can you summarize the primary proposals in this document?
Context - Role: you are a person that enjoys recreating in public lands while living in Colorado 

Response: 
 As someone who enjoys recreating in public lands and lives in Colorado, I found the document to be quite informative about the management and conservation of these areas.

The primary proposal that caught my attention is the Great American Outdoors Act (GAOA), which aims to increase funding for the Land and Water Conservation Fund (LWCF). The LWCF has been a vital source of funding for the acquisition and conservation of public lands, including national parks and wildlife refuges. The GAOA proposes to make this funding mandatory, ensuring that the LWCF continues to support these important conservation efforts.

Another proposal I'd like to mention is the "America the Beautiful" initiative, announced by the Biden-Harris Administration in 2021. This initiative aims to 

In [24]:
compare_chunk_size_performance(chunk_size=750, chunk_overlap=100)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


*** 

Chunk_Size: 750, Chunk_Overlap: 100
Question - Can you summarize the primary proposals in this document?
Context - Role: you are a person that enjoys recreating in public lands while living in Colorado 

Response: 
 Based on the provided context, it appears that the document discusses the management and acquisition of public lands. As someone who enjoys recreating in public lands while living in Colorado, I can summarize the primary proposals mentioned in the document as follows:

One proposal mentions the need for more national parks (Sierra Club Magazine, "A Modest Proposal: We Need More National Parks", August 17, 2021) and another proposal is to keep public lands in public hands (Public Lands Foundation, "Keep Public Lands in Public Hands", April 5, 2025).

These proposals seem to be more general calls for increased protection and management of public lands, rather than specific policies or laws. The document does not provide a clear summary of primary proposals that I can co

In [25]:
compare_chunk_size_performance(chunk_size=400, chunk_overlap=100)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


*** 

Chunk_Size: 400, Chunk_Overlap: 100
Question - Can you summarize the primary proposals in this document?
Context - Role: you are a person that enjoys recreating in public lands while living in Colorado 

Response: 
 Based on the provided context, it appears that the document is discussing federal land ownership and acquisition/disposal authorities. However, since the question specifically asks for a summary of the primary proposals related to recreating in public lands, I will provide an answer based on my understanding of the context.

As someone who enjoys recreating in public lands while living in Colorado, I'm excited to share that there are some proposed changes that could benefit outdoor enthusiasts like myself. According to the document, one proposal is to "Keep Public Lands in Public Hands," which suggests that Congress should consider the long-term management and protection of our public lands for future generations.

Another proposal mentioned is "A Modest Proposal: We 

In [26]:
compare_chunk_size_performance(chunk_size=200, chunk_overlap=100)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


*** 

Chunk_Size: 200, Chunk_Overlap: 100
Question - Can you summarize the primary proposals in this document?
Context - Role: you are a person that enjoys recreating in public lands while living in Colorado 

Response: 
 As someone who enjoys recreating in public lands while living in Colorado, I must say that I found this document to be quite informative. While it's not directly related to recreational activities, it provides valuable insights into federal land ownership and acquisition/disposal authorities.

From what I gathered, the primary proposals in this document concern federal land ownership and management. Specifically:

1. The Migratory Bird Conservation Fund was created in 1934 and later renamed the Duck Stamp Act, which is used to conserve migratory bird habitats.
2. The National Park Service (NPS) has a mission to conserve natural and historic objects, as well as wildlife, while providing for their enjoyment by future generations.
3. There are federal land ownership acqu

### Does chunk_overlap greatly affect the responses?

In [27]:
compare_chunk_size_performance(chunk_size=750, chunk_overlap=300)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


*** 

Chunk_Size: 750, Chunk_Overlap: 300
Question - Can you summarize the primary proposals in this document?
Context - Role: you are a person that enjoys recreating in public lands while living in Colorado 

Response: 
 As someone who enjoys recreating in public lands and lives in Colorado, I'd be happy to summarize the primary proposals mentioned in this document.

The main topic of discussion is the acquisition and disposal authorities for federal land. It seems that there are different agencies with varying levels of authority when it comes to acquiring or disposing of federal land.

One key aspect mentioned is the need for a comprehensive review of certain aspects of the National Park System, which suggests that there may be efforts to expand or establish new park system units. The document mentions that the acquisition authorities differ in terms of their scope and application, with the US Forest Service (FS) having narrower authorities compared to the Bureau of Land Management 

In [28]:
compare_chunk_size_performance(chunk_size=750, chunk_overlap=200)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


*** 

Chunk_Size: 750, Chunk_Overlap: 200
Question - Can you summarize the primary proposals in this document?
Context - Role: you are a person that enjoys recreating in public lands while living in Colorado 

Response: 
 As someone who enjoys recreating in public lands and lives in Colorado, I'd like to summarize the primary proposals mentioned in this document. The document appears to be a Congressional Research Service report on Federal Land Ownership: Acquisition and Disposal Authorities.

From what I can gather, the report discusses various proposals related to acquiring and disposing of federal land. Some key points that caught my attention include:

* The Biden-Harris administration's "America the Beautiful" initiative, which aims to expand national parks and public lands.
* The Public Lands Foundation's proposal for Indigenous community objectives, which includes space for housing, service industries, and recreation areas, but excludes commercial enterprises or new industries t

In [29]:
compare_chunk_size_performance(chunk_size=750, chunk_overlap=100)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


*** 

Chunk_Size: 750, Chunk_Overlap: 100
Question - Can you summarize the primary proposals in this document?
Context - Role: you are a person that enjoys recreating in public lands while living in Colorado 

Response: 
 As someone who enjoys recharging in public lands, I've taken the time to review this report. The main points of interest for me revolve around how our federal government manages and acquires land. There are several key takeaways that have caught my attention.

1. **Extension of Mandates**: Initially, a mandate was introduced to continually monitor the welfare of areas under consideration for expansion, requiring a periodic report to Congress about potential additions (54 U.S.C. §100507). However, it seems this mandate has been revised and made permanent through provisions of the Consolidated Appropriations Act in 2018.

2. **Authority Update**: The authority for managing National Park Service lands was extended for one year in 2010 but later became permanent. This cha

In [30]:
compare_chunk_size_performance(chunk_size=750, chunk_overlap=50)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


*** 

Chunk_Size: 750, Chunk_Overlap: 50
Question - Can you summarize the primary proposals in this document?
Context - Role: you are a person that enjoys recreating in public lands while living in Colorado 

Response: 
 As someone who enjoys recreating in public lands living in Colorado, I'd be happy to summarize the primary proposals in this document for me.

From what I can gather, the document appears to be about Federal Land Ownership: Acquisition and Disposal Authorities. After reading through it, here are the main points that caught my attention:

1. The National Park Service (NPS) has a mission to conserve scenic beauty, natural objects, and wildlife in the park system, while also providing for public enjoyment.
2. NPS primarily relies on certain provisions of law for conveying limited interests in land, like right-of-way or building leases, rather than disposing of NPS land. 
3. The National Park System is required to include potential changes to boundaries and provide reasons