In [None]:
# !brew list
# !pip show langchain
# !pip show chromadb

In [27]:
from typing import Any
import pickle
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from pathlib import Path

path = "2020-Democratic-Citizenship_Our-Common-Purpose.pdf"

# Specify the directory and the filename for the pickle file
output_file_path = "/workspaces/Semi-structured-RAG/pickles/raw_pdf_elements.pkl"

# Make sure the pickle directory exists *before* the slow partitioning step,
# so a missing folder cannot throw away the partition result at dump time.
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

# Get elements
raw_pdf_elements = partition_pdf(
    filename=path,
    # Embedded image extraction is switched off for this run
    extract_images_in_pdf=False,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Hard cap of 4000 chars per chunk
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    # Must be a directory, not the PDF file itself; use the folder holding the PDF.
    # NOTE(review): presumably only consulted when image extraction is enabled —
    # confirm against the installed unstructured version.
    image_output_dir_path=str(Path(path).resolve().parent),
)

# Cache the partition result so later cells can reload it without re-parsing
with open(output_file_path, "wb") as file:
    pickle.dump(raw_pdf_elements, file)

In [None]:
import pickle

# Location of the cached partition output written by the previous cell
pickle_file_path = "/workspaces/Semi-structured-RAG/pickles/raw_pdf_elements.pkl"

# The context manager closes the handle as soon as deserialization finishes.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
with open(pickle_file_path, mode="rb") as pickle_file:
    raw_pdf_elements = pickle.load(pickle_file)

# `raw_pdf_elements` now holds the deserialized partition elements

In [None]:
# Tally how many elements of each Python type the partitioner produced
category_counts = {}

for pdf_element in raw_pdf_elements:
    type_label = str(type(pdf_element))
    category_counts[type_label] = category_counts.get(type_label, 0) + 1

# The distinct type labels that were seen
unique_categories = set(category_counts)
category_counts

In [None]:
class Element(BaseModel):
    """A categorized document chunk: a category label plus the chunk content."""

    # Category label; the categorization loop in this cell assigns "table" or "text"
    type: str
    # Chunk content; the categorization loop stores str(element) here
    text: Any


# Sort the partitioned elements into tables and text by inspecting the
# string form of their Python type; any other element type is dropped.
categorized_elements = []
for pdf_element in raw_pdf_elements:
    type_repr = str(type(pdf_element))
    if "unstructured.documents.elements.Table" in type_repr:
        kind = "table"
    elif "unstructured.documents.elements.CompositeElement" in type_repr:
        kind = "text"
    else:
        continue
    categorized_elements.append(Element(type=kind, text=str(pdf_element)))

# Split the categorized chunks by kind
table_elements = [item for item in categorized_elements if item.type == "table"]
text_elements = [item for item in categorized_elements if item.type == "text"]

# Report how many tables, then how many text chunks, were found
print(len(table_elements))
print(len(text_elements))

In [None]:
# Prompt
# Built from adjacent string literals instead of a backslash continuation:
# the previous "\ " (backslash followed by a space) was an invalid escape, so
# a literal backslash and a newline ended up inside the prompt text.
prompt_text = (
    "You are an assistant tasked with summarizing tables and text. "
    "Give a concise summary of the table or text. Table or text chunk: {element} "
)
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain: raw chunk -> prompt -> local OpenAI-compatible chat model -> plain string
# The API key is a placeholder passed to the locally hosted endpoint.
model = ChatOpenAI(temperature=0, openai_api_base="http://host.docker.internal:1234/v1", openai_api_key="dummy_key")
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [24]:
# At most this many chunks are summarized concurrently
batch_config = {"max_concurrency": 5}

# Summarize every table chunk
tables = [table_element.text for table_element in table_elements]
table_summaries = summarize_chain.batch(tables, batch_config)

# Summarize every text chunk
texts = [text_element.text for text_element in text_elements]
text_summaries = summarize_chain.batch(texts, batch_config)

# Bundle the originals with their summaries so they can be cached together
summaries = {
    "table_summaries": table_summaries,
    "tables": tables,
    "text_summaries": text_summaries,
    "texts": texts,
}

# Where the bundle is cached on disk
summaries_file_path = "/workspaces/Semi-structured-RAG/pickles/summaries.pkl"

with open(summaries_file_path, mode="wb") as summaries_file:
    pickle.dump(summaries, summaries_file)

NameError: name 'table_elements' is not defined

In [None]:
import pickle

# Location of the cached summaries bundle
pickle_file_path = "/workspaces/Semi-structured-RAG/pickles/summaries.pkl"

# The context manager closes the handle as soon as deserialization finishes.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
with open(pickle_file_path, mode="rb") as pickle_file:
    summaries = pickle.load(pickle_file)

# `summaries` now holds the deserialized dictionary of originals and summaries

In [25]:
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

# from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings

# The vectorstore to use to index the child chunks (the summaries)
vectorstore = Chroma(collection_name="summaries", embedding_function=OllamaEmbeddings(model="llama2", base_url='http://host.docker.internal:11434'))

# The storage layer for the parent documents (the original texts/tables)
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# Add texts
# The summaries cell stores the original chunks under "texts"; fall back to
# "text" so a bundle pickled with the older key name still loads. The previous
# `summaries.get("text")` returned None for the current layout, which made the
# comprehension below fail with a TypeError.
original_texts = summaries["texts"] if "texts" in summaries else summaries["text"]
doc_ids = [str(uuid.uuid4()) for _ in original_texts]
summary_texts = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(summaries["text_summaries"])
]
# Skip empty batches: there is nothing to index, and an empty insert may be
# rejected by the vector store (NOTE(review): confirm for the installed Chroma).
if summary_texts:
    retriever.vectorstore.add_documents(summary_texts)
    retriever.docstore.mset(list(zip(doc_ids, original_texts)))

# Add tables
original_tables = summaries["tables"]
table_ids = [str(uuid.uuid4()) for _ in original_tables]
summary_tables = [
    Document(page_content=s, metadata={id_key: table_ids[i]})
    for i, s in enumerate(summaries["table_summaries"])
]
# A document without tables yields an empty batch; skip it for the same reason.
if summary_tables:
    retriever.vectorstore.add_documents(summary_tables)
    retriever.docstore.mset(list(zip(table_ids, original_tables)))

In [30]:
from langchain_core.runnables import RunnablePassthrough

# Question-answering prompt: the retrieved context is injected above the question
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Chat model served from the local OpenAI-compatible endpoint
model = ChatOpenAI(
    temperature=0,
    openai_api_base="http://host.docker.internal:1234/v1",
    openai_api_key="dummy_key",
)

# RAG pipeline: the incoming question is sent to the retriever to build the
# context and is also passed through unchanged as the question itself.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | model | StrOutputParser()

In [32]:
# Ask the RAG chain a question; the cell's last expression is displayed as its output
question = "What are the recommendations from the Our Common Purpose document?"
chain.invoke(question)

'\nThe Our Common Purpose document recommends several policies to reduce the influence of money in politics, give weight to a wider range of voices, and increase the legitimacy of our institutions. These recommendations include:\n\n1. Campaign finance reform: The report suggests that campaign finance laws should be updated to limit the role of large donors and special interest groups in political campaigns. This could include measures such as limiting the amount of money that can be spent on campaigns, requiring disclosure of donations, and prohibiting foreign interference in U.S. elections.\n2. Ranked-choice voting: The report recommends implementing ranked-choice voting systems in federal and state elections to encourage more civil and issue-based campaigns. This system would allow voters to rank their preferences for multiple candidates, rather than just choosing between two candidates.\n3. Voter education and outreach: The report suggests that voter education and outreach efforts s