In [2]:
#https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb?ref=blog.langchain.dev

In [3]:
import os
import time
path = 'documents/'
files = os.listdir(path)
len(files)

['2304.12244v2.pdf',
 '2402.05930v1.pdf',
 '2405.16506v1.pdf',
 '2102.00151.pdf',
 '2310.06825v1.pdf',
 '2112.07916v2.pdf',
 '2401.13919v4.pdf',
 '2312.12423.pdf',
 '2305.10601v2.pdf',
 '2406.05085v1.pdf',
 '2409.00149v1.pdf',
 '2212.10423v1.pdf',
 '2311.07587v2.pdf',
 '2409.04109v1.pdf',
 '2310.12931v2.pdf',
 '2309.14521.pdf',
 '2408.06292v3.pdf',
 '2402.15301v2.pdf',
 '2108.10447v1.pdf',
 '2406.12430v1.pdf',
 '2311.18751v2.pdf',
 '2402.03146v1.pdf',
 '2312.10997v5.pdf',
 '2404.00610v1.pdf',
 '2203.05794v1.pdf',
 '2303.18223v13.pdf',
 '2409.03284v1.pdf',
 '2308.02357v1.pdf',
 '2209.11755v1.pdf',
 '2309.15698v1.pdf',
 '2408.06292v1.pdf',
 '2305.05084v6.pdf',
 '2405.10292v2.pdf',
 '2402.18041v1.pdf',
 '2310.08184v1.pdf',
 '2312.10029v2.pdf',
 '2305.13453v2.pdf',
 '2310.11511v1.pdf',
 '2406.14550v1.pdf',
 'P10-1031.pdf',
 '2409.04004v2.pdf',
 '2109.05679v2.pdf',
 '2102.01187.pdf',
 '2312.15713v1.pdf',
 '2408.03010v1.pdf',
 '2305.17888v1.pdf',
 '2303.11366v4.pdf',
 '2307.03109v9.pdf',
 '2

In [4]:
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
# Get elements


In [5]:
class Element(BaseModel):
        type: str
        text: Any
def get_text_and_table(filename):
    raw_pdf_elements = partition_pdf(
    filename=path + filename,
    # Using pdf format to find embedded image blocks
    extract_images_in_pdf=False,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    # Hard max on chunks
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=path,
    )
    # Create a dictionary to store counts of each type
    category_counts = {}

    for element in raw_pdf_elements:
        category = str(type(element))
        if category in category_counts:
            category_counts[category] += 1
        else:
            category_counts[category] = 1

    # Unique_categories will have unique elements
    unique_categories = set(category_counts.keys())
    category_counts



    # Categorize by type
    categorized_elements = []
    for element in raw_pdf_elements:
        if "unstructured.documents.elements.Table" in str(type(element)):
            categorized_elements.append(Element(type="table", text=str(element)))
        elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
            categorized_elements.append(Element(type="text", text=str(element)))

    # Tables
    table_elements = [e for e in categorized_elements if e.type == "table"]
    print(len(table_elements))

    # Text
    text_elements = [e for e in categorized_elements if e.type == "text"]
    print(len(text_elements))
    
    return text_elements,table_elements
    

In [6]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
os.environ['OPENAI_API_KEY']='any'

In [7]:
# Prompt
prompt_text = """You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)
base_url = "http://localhost:8080/"
# Summary chain
model = ChatOpenAI(temperature=0, model="gpt-4o",base_url=base_url)
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [8]:

def batch_sleep(batch,sleep=2):
    new_list = []
    for t in batch:
        s = summarize_chain.invoke(t)
        new_list.append(s)
        time.sleep(sleep)
    
    return new_list
    
    

In [17]:
import chromadb
client = chromadb.PersistentClient('./chroma_docs')
collection = client.get_or_create_collection('documents_summary')
collection_raw= client.get_or_create_collection('documents_raw')
#collection_table = client.get_or_create_collection('table_summary')

for i,filename in enumerate(files[36:]):
    
    print(i,filename)
    text_elements,table_elements = get_text_and_table(filename)
    # Apply to text
    texts = [i.text for i in text_elements]
    tables = [i.text for i in table_elements]
    
    ids = [filename+'_'+ str(i) for i in range(len(texts))]
    ids_tables = [filename+'_table_'+ str(i) for i in range(len(tables))]
    
    if len(ids)>0:
        collection_raw.add(ids=ids,documents=texts)
        text_summaries = batch_sleep(texts)
        collection.add(ids=ids,documents=text_summaries)
    
    if len(ids_tables)>0:
        collection_raw.add(ids=ids_tables,documents=tables)
        
        table_summaries = batch_sleep(tables)
        collection.add(ids=ids_tables,documents=table_summaries)
        
    print(collection.count())





0 2305.13453v2.pdf


Insert of existing embedding ID: 2305.13453v2.pdf_0
Insert of existing embedding ID: 2305.13453v2.pdf_1
Insert of existing embedding ID: 2305.13453v2.pdf_2
Insert of existing embedding ID: 2305.13453v2.pdf_3
Insert of existing embedding ID: 2305.13453v2.pdf_4
Insert of existing embedding ID: 2305.13453v2.pdf_5
Insert of existing embedding ID: 2305.13453v2.pdf_6
Insert of existing embedding ID: 2305.13453v2.pdf_7
Insert of existing embedding ID: 2305.13453v2.pdf_8
Insert of existing embedding ID: 2305.13453v2.pdf_9
Insert of existing embedding ID: 2305.13453v2.pdf_10
Insert of existing embedding ID: 2305.13453v2.pdf_11
Add of existing embedding ID: 2305.13453v2.pdf_0
Add of existing embedding ID: 2305.13453v2.pdf_1
Add of existing embedding ID: 2305.13453v2.pdf_2
Add of existing embedding ID: 2305.13453v2.pdf_3
Add of existing embedding ID: 2305.13453v2.pdf_4
Add of existing embedding ID: 2305.13453v2.pdf_5
Add of existing embedding ID: 2305.13453v2.pdf_6
Add of existing embedding ID: 2

1
12
2078
1 2310.11511v1.pdf
7
39
2124
2 2406.14550v1.pdf
8
32
2164
3 P10-1031.pdf
2
17
2183
4 2409.04004v2.pdf
14
39
2236
5 2109.05679v2.pdf
1
29
2266
6 2102.01187.pdf
4
17
2287
7 2312.15713v1.pdf
8
31
2326
8 2408.03010v1.pdf
3
13
2342
9 2305.17888v1.pdf
12
21
2375
10 2303.11366v4.pdf
7
26
2408
11 2307.03109v9.pdf
13
69
2490
12 2408.08921v2.pdf
1
59
2550
13 2106.14807v1.pdf
2
9
2561
14 2304.12210.pdf
6
68
2635
15 2307.12856v4.pdf
11
35
2681
16 2404.16130.pdf
4
20
2705
17 2409.00786v1.pdf
12
34
2751
18 2403.08345v1.pdf
1
11
2763
19 2312.14238.pdf
38
54
2855
20 2402.03216v4.pdf
15
31
2901
21 2403.14403v2.pdf
7
26
2934
22 2209.11000v1.pdf
6
19
2959
23 2307.08621v4.pdf
8
21
2988
24 2406.11736v1.pdf
9
22
3019
25 2206.08896v1.pdf
2
46
3067
26 2210.03945v2.pdf
10
24
3101
27 2210.11416v5.pdf
46
64
3211
28 2112.08778v1.pdf
5
15
3231


: 