In [21]:
import os
import time
path = 'documents/'
files = os.listdir(path)
files

['2304.12244v2.pdf',
 '2402.05930v1.pdf',
 '2405.16506v1.pdf',
 '2102.00151.pdf',
 '2310.06825v1.pdf',
 '2112.07916v2.pdf',
 '2401.13919v4.pdf',
 '2312.12423.pdf',
 '2305.10601v2.pdf',
 '2406.05085v1.pdf',
 '2409.00149v1.pdf',
 '2212.10423v1.pdf',
 '2311.07587v2.pdf',
 '2409.04109v1.pdf',
 '2310.12931v2.pdf',
 '2309.14521.pdf',
 '2408.06292v3.pdf',
 '2402.15301v2.pdf',
 '2108.10447v1.pdf',
 '2406.12430v1.pdf',
 '2311.18751v2.pdf',
 '2402.03146v1.pdf',
 '2312.10997v5.pdf',
 '2404.00610v1.pdf',
 '2203.05794v1.pdf',
 '2303.18223v13.pdf',
 '2409.03284v1.pdf',
 '2308.02357v1.pdf',
 '2209.11755v1.pdf',
 '2309.15698v1.pdf',
 '2408.06292v1.pdf',
 '2305.05084v6.pdf',
 '2405.10292v2.pdf',
 '2402.18041v1.pdf',
 '2310.08184v1.pdf',
 '2312.10029v2.pdf',
 '2305.13453v2.pdf',
 '2310.11511v1.pdf',
 '2406.14550v1.pdf',
 'P10-1031.pdf',
 '2409.04004v2.pdf',
 '2109.05679v2.pdf',
 '2102.01187.pdf',
 '2312.15713v1.pdf',
 '2408.03010v1.pdf',
 '2305.17888v1.pdf',
 '2303.11366v4.pdf',
 '2307.03109v9.pdf',
 '2

In [7]:
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
# Get elements
raw_pdf_elements = partition_pdf(
    filename=path + files[0],
    # Using pdf format to find embedded image blocks
    extract_images_in_pdf=True,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    # Hard max on chunks
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=path,
)

In [8]:
# Create a dictionary to store counts of each type
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Unique_categories will have unique elements
unique_categories = set(category_counts.keys())
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 41,
 "<class 'unstructured.documents.elements.Table'>": 6}

In [9]:
class Element(BaseModel):
    type: str
    text: Any


# Categorize by type
categorized_elements = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        categorized_elements.append(Element(type="text", text=str(element)))

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]
print(len(table_elements))

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]
print(len(text_elements))

6
41


In [12]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
os.environ['OPENAI_API_KEY']='any'

In [13]:
# Prompt
prompt_text = """You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)
base_url = "http://localhost:8080/"
# Summary chain
model = ChatOpenAI(temperature=0, model="gpt-4o",base_url=base_url)
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [20]:
# Apply to text
texts = [i.text for i in text_elements]
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 1},)


2777


In [25]:
text_summaries = []
for e in text_elements:
    t = e.text
    s = summarize_chain.invoke(t)
    text_summaries.append(s)
    time.sleep(5)
    

In [26]:
import chromadb
client = chromadb.PersistentClient('./chroma_docs')
collection = client.get_or_create_collection('documents_summary')
#collection_table = client.get_or_create_collection('table_summary')


ids = [files[0]+'_'+ str(i) for i in range(len(text_summaries))]
collection.upsert(ids=ids,documents=text_summaries)
collection.count()




41

In [27]:
# Apply to tables
tables = [i.text for i in table_elements]
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 1})



In [29]:
ids = [files[0]+'_table_'+ str(i) for i in range(len(table_elements))]
collection.add(ids=ids,documents=table_summaries)
collection.count()

Add of existing embedding ID: 2304.12244v2.pdf_table_0
Add of existing embedding ID: 2304.12244v2.pdf_table_1
Add of existing embedding ID: 2304.12244v2.pdf_table_2
Add of existing embedding ID: 2304.12244v2.pdf_table_3
Add of existing embedding ID: 2304.12244v2.pdf_table_4
Add of existing embedding ID: 2304.12244v2.pdf_table_5
Insert of existing embedding ID: 2304.12244v2.pdf_table_0
Insert of existing embedding ID: 2304.12244v2.pdf_table_1
Insert of existing embedding ID: 2304.12244v2.pdf_table_2
Insert of existing embedding ID: 2304.12244v2.pdf_table_3
Insert of existing embedding ID: 2304.12244v2.pdf_table_4
Insert of existing embedding ID: 2304.12244v2.pdf_table_5


47