In [11]:
import uuid
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from unstructured.partition.pdf import partition_pdf
import pytesseract

In [13]:
filename = "../DocumentStore/The_Art_Of_War.pdf"
elements = partition_pdf(
    filename = filename,

    strategy="hi_res",
    infer_table_structure=True,
    model_name="yolox"
)

In [6]:
from lxml import html
from pydantic import BaseModel
from typing import Any, Optional

In [5]:
category_counts = {}

for element in elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Unique_categories will have unique elements
unique_categories = set(category_counts.keys())
category_counts

{"<class 'unstructured.documents.elements.Table'>": 224,
 "<class 'unstructured.documents.elements.Title'>": 798,
 "<class 'unstructured.documents.elements.ListItem'>": 219,
 "<class 'unstructured.documents.elements.NarrativeText'>": 844,
 "<class 'unstructured.documents.elements.Text'>": 123,
 "<class 'unstructured.documents.elements.Header'>": 150,
 "<class 'unstructured.documents.elements.Image'>": 71,
 "<class 'unstructured.documents.elements.Footer'>": 97,
 "<class 'unstructured.documents.elements.FigureCaption'>": 23,
 "<class 'unstructured.documents.elements.Formula'>": 1}

In [12]:
class Element(BaseModel):
    type: str
    text: Any

# Categorize by type
categorized_elements = []
for element in elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        categorized_elements.append(Element(type="text", text=str(element)))

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]
print(len(table_elements))

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]
print(len(text_elements))

224
0


In [10]:
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=OpenAIEmbeddings()
)

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# Add texts
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [Document(page_content=s,metadata={id_key: doc_ids[i]}) for i, s in enumerate(text_summaries)]
retriever.vectorstore.add_documents(summary_texts)
retriever.docstore.mset(list(zip(doc_ids, texts)))

# Add tables
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [Document(page_content=s,metadata={id_key: table_ids[i]}) for i, s in enumerate(table_summaries)]
retriever.vectorstore.add_documents(summary_tables)
retriever.docstore.mset(list(zip(table_ids, tables)))

AttributeError: 'list' object has no attribute 'unstructured'