In [1]:
# !brew list
# !pip show langchain
# !pip show chromadb

In [2]:
from typing import Any
import pickle
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
path = "2020-Democratic-Citizenship_Our-Common-Purpose.pdf"

# Get elements
raw_pdf_elements = partition_pdf(
    filename=path,
    # Unstructured first finds embedded image blocks
    extract_images_in_pdf=False,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=path,
)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Create a dictionary to store counts of each type
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Unique_categories will have unique elements
unique_categories = set(category_counts.keys())
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 77,
 "<class 'unstructured.documents.elements.Table'>": 3}

In [6]:
class Element(BaseModel):
    type: str
    text: Any


# Categorize by type
categorized_elements = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        categorized_elements.append(Element(type="text", text=str(element)))

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]
print(len(table_elements))

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]
print(len(text_elements))

3
77


In [7]:
# Prompt
prompt_text = """You are an assistant tasked with summarizing tables and text. \ 
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain
model = ChatOpenAI(temperature=0, openai_api_base="http://host.docker.internal:1234/v1", openai_api_key="dummy_key")
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [8]:
# Apply to tables
tables = [i.text for i in table_elements]
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

# Apply to texts
texts = [i.text for i in text_elements]
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

# Adding summaries to a dictionary
summaries = {
    "table_summaries": table_summaries,
    "tables": tables,
    "text_summaries": text_summaries,
    "texts": texts
}

In [10]:
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

# from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings

# The vectorstore to use to index the child chunks
vectorstore = Chroma(collection_name="summaries", embedding_function=OllamaEmbeddings(model="mistral", base_url='http://host.docker.internal:11434'))

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# Add texts
doc_ids = [str(uuid.uuid4()) for _ in summaries.get("texts")]
summary_texts = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(summaries.get("text_summaries"))
]
retriever.vectorstore.add_documents(summary_texts)
retriever.docstore.mset(list(zip(doc_ids, summaries.get("texts"))))

# Add tables
table_ids = [str(uuid.uuid4()) for _ in summaries.get("tables")]
summary_tables = [
    Document(page_content=s, metadata={id_key: table_ids[i]})
    for i, s in enumerate(summaries.get("table_summaries"))
]
retriever.vectorstore.add_documents(summary_tables)
retriever.docstore.mset(list(zip(table_ids, summaries.get("tables"))))

In [11]:
from langchain_core.runnables import RunnablePassthrough

# Prompt template
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatOpenAI(temperature=0, openai_api_base="http://host.docker.internal:1234/v1", openai_api_key="dummy_key")

# RAG pipeline
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [15]:
import dill

# Define the directory and file name
directory = "pickles"  # Make sure this directory exists or the code will raise an error
file_name = "session.pkl"
full_path = f"{directory}/{file_name}"

# Use dill to save the session
dill.dump_session(full_path)

TypeError: no default __reduce__ due to non-trivial __cinit__

In [12]:
chain.invoke("What are the recommendations from the Our Common Purpose document?")

"\nThe Our Common Purpose document provides several recommendations to help equalize representation and even out the weighting of citizen voice in American democracy. These recommendations include:\n\n1. Establish a universal expectation of a year of national service and dramatically expand funding for service programs or fellowships that would offer young people paid service opportunities. Such opportunities should be made available not only in AmeriCorps or the military but also in local programs offered by municipal governments, local news outlets, and nonprofit organizations.\n2. Create new participatory opportunities that bring new voices and perspectives into the policy-making process on all levels of government. This can include mechanisms for individual members of Congress to interact directly with representative samples of their constituents and for Congress, as a whole, to interact with the people as a whole.\n3. Redesign public meetings like town halls, city council meetings