In [3]:
#https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb?ref=blog.langchain.dev

In [4]:
import os
import time
path = 'documents/'
files = os.listdir(path)
len(files)

66

In [5]:
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
# Get elements


In [6]:
class Element(BaseModel):
        type: str
        text: Any
def get_text_and_table(filename):
    raw_pdf_elements = partition_pdf(
    filename=path + filename,
    # Using pdf format to find embedded image blocks
    extract_images_in_pdf=False,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    # Hard max on chunks
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=path,
    )
    # Create a dictionary to store counts of each type
    category_counts = {}

    for element in raw_pdf_elements:
        category = str(type(element))
        if category in category_counts:
            category_counts[category] += 1
        else:
            category_counts[category] = 1

    # Unique_categories will have unique elements
    unique_categories = set(category_counts.keys())
    category_counts



    # Categorize by type
    categorized_elements = []
    for element in raw_pdf_elements:
        if "unstructured.documents.elements.Table" in str(type(element)):
            categorized_elements.append(Element(type="table", text=str(element)))
        elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
            categorized_elements.append(Element(type="text", text=str(element)))

    # Tables
    table_elements = [e for e in categorized_elements if e.type == "table"]
    print(len(table_elements))

    # Text
    text_elements = [e for e in categorized_elements if e.type == "text"]
    print(len(text_elements))
    
    return text_elements,table_elements
    

In [7]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
os.environ['OPENAI_API_KEY']='any'

In [8]:
# Prompt
prompt_text = """You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)
base_url = "http://localhost:8080/"
# Summary chain
model = ChatOpenAI(temperature=0, model="gpt-4o",base_url=base_url)
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [9]:

def batch_sleep(batch,sleep=2):
    new_list = []
    for t in batch:
        s = summarize_chain.invoke(t)
        new_list.append(s)
        time.sleep(sleep)
    
    return new_list
    
    

In [10]:
"""import chromadb
client = chromadb.PersistentClient('./chroma_docs')
collection = client.get_or_create_collection('documents_summary')
collection_raw= client.get_or_create_collection('documents_raw')
#collection_table = client.get_or_create_collection('table_summary')

for i,filename in enumerate(files[36:]):
    
    print(i,filename)
    text_elements,table_elements = get_text_and_table(filename)
    # Apply to text
    texts = [i.text for i in text_elements]
    tables = [i.text for i in table_elements]
    
    ids = [filename+'_'+ str(i) for i in range(len(texts))]
    ids_tables = [filename+'_table_'+ str(i) for i in range(len(tables))]
    
    if len(ids)>0:
        collection_raw.add(ids=ids,documents=texts)
        text_summaries = batch_sleep(texts)
        collection.add(ids=ids,documents=text_summaries)
    
    if len(ids_tables)>0:
        collection_raw.add(ids=ids_tables,documents=tables)
        
        table_summaries = batch_sleep(tables)
        collection.add(ids=ids_tables,documents=table_summaries)
        
    print(collection.count())



"""

"import chromadb\nclient = chromadb.PersistentClient('./chroma_docs')\ncollection = client.get_or_create_collection('documents_summary')\ncollection_raw= client.get_or_create_collection('documents_raw')\n#collection_table = client.get_or_create_collection('table_summary')\n\nfor i,filename in enumerate(files[36:]):\n    \n    print(i,filename)\n    text_elements,table_elements = get_text_and_table(filename)\n    # Apply to text\n    texts = [i.text for i in text_elements]\n    tables = [i.text for i in table_elements]\n    \n    ids = [filename+'_'+ str(i) for i in range(len(texts))]\n    ids_tables = [filename+'_table_'+ str(i) for i in range(len(tables))]\n    \n    if len(ids)>0:\n        collection_raw.add(ids=ids,documents=texts)\n        text_summaries = batch_sleep(texts)\n        collection.add(ids=ids,documents=text_summaries)\n    \n    if len(ids_tables)>0:\n        collection_raw.add(ids=ids_tables,documents=tables)\n        \n        table_summaries = batch_sleep(tables)\n

In [14]:
from utils import client,electronic_collection
files = ['soldering.pdf']
texts=[]
tabels=[]
ids=[]
ids_tables=[]
BS = 100
for i,filename in enumerate(files):
    
    print(i,filename)
    text_elements,table_elements = get_text_and_table(filename)
    # Apply to text
    texts += [i.text for i in text_elements]
    ids += [filename+'_'+ str(i) for i in range(len(texts))]

    
    while len(ids)>BS:
        electronic_collection.add(ids=ids[:BS],documents=texts[:BS])
        ids = ids[BS:]
        texts = texts[BS:]
    

    tabels += [i.text for i in table_elements]
    ids_tables += [filename+'_table_'+ str(i) for i in range(len(tabels))]

    while len(ids_tables)>BS:
        electronic_collection.add(ids=ids_tables[:BS],documents=tabels[:BS])
        ids_tables = ids_tables[BS:]
        tabels = tabels[BS:]
        
    print(electronic_collection.count())
    #break


electronic_collection.add(ids=ids,documents=texts)
electronic_collection.add(ids=ids_tables,documents=tabels)




0 soldering.pdf




37
270


[92mRequest to litellm:[0m
[92mlitellm.embedding('openai/text', input=["6G a®*: s +. i 2 a <, s e Ged SECOND EDITION\n\nSoldering in Electronics Assembly\n\nThis Page Intentionally Left Blank\n\nSoldering in Electronics Assembly   Mike Judd and Keith Brindley   Newnes  \n\nNewnes\n\nNewnes An imprint of Butterworth-Heinemann Linacre House, Jordan Hill, Oxford OX2 8DP 225 Wildwood Avenue, Woburn, MA 01801-2041 A division of Reed Educationa and Professional Publishing Ltd RQ A member of the Reed Elsevier plc group First published 1992 Second edition 1999 © Mike Judd & Keith Brindley 1992, 1999 All rights reserved. No part of this publication may be reproduced in any material form (including photocopying or storing in any medium by electronic Means and whether or not transiently or incidentally to some other use of this publication) without the written permission of the copyright holders except in accordance with the provisions of the Copyright, Design and Patents Act 1988 or 

Insert of existing embedding ID: soldering.pdf_0


RAW RESPONSE:
CreateEmbeddingResponse(data=[Embedding(embedding=[0.012247447, 0.03888274, 0.015037207, 0.025281178, 0.038108665, -0.019933999, 0.019961806, 0.052479, -0.0004996389, -0.047285352, -0.048603136, 0.02478564, 0.060537282, -0.0034965097, -0.042449858, -0.044364315, 0.035981197, 0.038776126, -0.10862808, -0.038030427, -0.016782666, 0.016504027, 0.004067207, 0.03043184, -0.014730855, -0.037212733, -0.024413751, -0.064463876, -0.036158305, -0.039569303, 0.020561438, 0.033672027, 0.009234581, 0.02646496, 0.037626244, 0.01763127, -0.01865369, 0.026104894, 0.022605743, -0.10472937, -0.048065875, -0.032018095, -0.035778057, 0.009301816, 0.0026889206, -0.024218967, -0.005288623, 0.026007425, -0.066284, 0.010543455, 0.04146419, 0.03312873, -0.03891705, 0.011815319, -0.041008446, 0.019816985, -0.0051985825, -0.0021894656, -0.039123468, -0.0030000347, 0.015279365, -0.014427295, -0.0609909, -0.010729981, 0.009567548, -0.016256656, -0.043141514, 0.0072875367, -0.028039837, -0.020372307, 

Add of existing embedding ID: soldering.pdf_0


101


In [23]:
from utils import client,electronic_collection

electronic_collection.count()



307

In [13]:
d = collection.query(query_texts='2 Electronics assemblies')
d


NameError: name 'collection' is not defined

In [None]:
d['ids'][0][1]


In [None]:
import pandas as pd
test_path = '/home/cip/ce/ix05ogym/Majid/LLM/GraphRag/elec_graph/input/'
#pd.DataFrame({"id":data['ids'],"text":data['documents']}).iloc[0:1,:].to_csv(test_path+'test.csv')
f = pd.DataFrame({"id":data['ids'],"text":data['documents']}).iloc[0:26]
f.to_csv(test_path+'soldering.csv')
f