In [2]:
# Adapted from the LangChain cookbook notebook: https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb?ref=blog.langchain.dev

In [18]:
import os
import time
# Directory holding the source PDFs. The trailing slash matters: file names are
# appended to this string directly when a document is partitioned.
path = '/home/cip/ce/ix05ogym/Majid/LLM/AI-Scientist/sciGraph/documents/'
collection_name = 'sciGraph'
# CSV export target, named after the collection.
csv_output_path = f'/home/cip/ce/ix05ogym/Majid/LLM/AI-Scientist/sciGraph/input/{collection_name}.csv'
# os.listdir returns entries in arbitrary order; sort them so that repeated runs
# process the documents in the same sequence.
files = sorted(os.listdir(path))
len(files)

1

In [4]:
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
# Get elements


In [5]:
class Element(BaseModel):
    """A parsed PDF chunk tagged with the kind of content it holds.

    ``type`` is the category label (this notebook uses "table" and "text");
    ``text`` is the chunk payload.
    """

    type: str
    text: Any
def get_text_and_table(filename, base_dir=None):
    """Partition one PDF into chunks and split them into text and table elements.

    Args:
        filename: Name of the PDF file, relative to ``base_dir``.
        base_dir: Directory that contains the file. Defaults to the
            notebook-level ``path`` setting, so existing single-argument calls
            behave as before.

    Returns:
        A ``(text_elements, table_elements)`` tuple; both are lists of
        ``Element`` objects whose ``text`` is the string form of the chunk.
    """
    source_dir = path if base_dir is None else base_dir

    raw_pdf_elements = partition_pdf(
        # os.path.join works whether or not the directory ends with a separator.
        filename=os.path.join(source_dir, filename),
        # Using pdf format to find embedded image blocks
        extract_images_in_pdf=False,
        # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
        # Titles are any sub-section of the document
        infer_table_structure=True,
        # Post processing to aggregate text once we have the title
        chunking_strategy="by_title",
        # Chunking params to aggregate text blocks
        # Hard max on chunks
        max_characters=4000,
        # Attempt to create a new chunk 3800 chars
        new_after_n_chars=3800,
        # Attempt to keep chunks > 2000 chars
        combine_text_under_n_chars=2000,
        image_output_dir_path=source_dir,
    )

    # Categorize by type in a single pass. Matching is done on the class path
    # string, so any class whose path contains the marker is accepted.
    text_elements = []
    table_elements = []
    for element in raw_pdf_elements:
        type_name = str(type(element))
        if "unstructured.documents.elements.Table" in type_name:
            table_elements.append(Element(type="table", text=str(element)))
        elif "unstructured.documents.elements.CompositeElement" in type_name:
            text_elements.append(Element(type="text", text=str(element)))

    # Progress output: number of tables, then number of text chunks.
    print(len(table_elements))
    print(len(text_elements))

    return text_elements, table_elements
    

In [6]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
# Placeholder API key; presumably not validated by the server this notebook
# talks to — TODO confirm.
os.environ["OPENAI_API_KEY"] = "any"

In [7]:
# Summarization prompt: the chunk to summarize is substituted for {element}.
prompt_text = """You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Endpoint the chat model client is pointed at.
base_url = "http://localhost:8080/"
model = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    base_url=base_url,
)

# Summary chain: place the raw input under the "element" key, fill the prompt,
# call the model, then pass the reply through StrOutputParser.
summarize_chain = (
    {"element": lambda x: x}
    | prompt
    | model
    | StrOutputParser()
)

In [8]:

def batch_sleep(batch, sleep=2, chain=None):
    """Summarize each item of ``batch`` one at a time, pausing after every call.

    Args:
        batch: Iterable of inputs; each one is passed to ``chain.invoke``.
        sleep: Seconds to wait after every call (default 2).
        chain: Object exposing ``invoke(item)``. Defaults to the notebook-level
            ``summarize_chain`` so existing calls keep working unchanged.

    Returns:
        List with one ``invoke`` result per input item, in input order.
    """
    runner = summarize_chain if chain is None else chain
    summaries = []
    for item in batch:
        summaries.append(runner.invoke(item))
        # Pause after every call, including the last one, so that back-to-back
        # batches stay spaced out as well.
        time.sleep(sleep)
    return summaries
    
    

In [9]:
# Disabled earlier ingestion variant, kept as a bare string literal so it never runs:
# it stored raw chunks in 'documents_raw' and their summaries in 'documents_summary'.
"""import chromadb
client = chromadb.PersistentClient('./chroma_docs')
collection = client.get_or_create_collection('documents_summary')
collection_raw= client.get_or_create_collection('documents_raw')
#collection_table = client.get_or_create_collection('table_summary')

for i,filename in enumerate(files[36:]):
    
    print(i,filename)
    text_elements,table_elements = get_text_and_table(filename)
    # Apply to text
    texts = [i.text for i in text_elements]
    tables = [i.text for i in table_elements]
    
    ids = [filename+'_'+ str(i) for i in range(len(texts))]
    ids_tables = [filename+'_table_'+ str(i) for i in range(len(tables))]
    
    if len(ids)>0:
        collection_raw.add(ids=ids,documents=texts)
        text_summaries = batch_sleep(texts)
        collection.add(ids=ids,documents=text_summaries)
    
    if len(ids_tables)>0:
        collection_raw.add(ids=ids_tables,documents=tables)
        
        table_summaries = batch_sleep(tables)
        collection.add(ids=ids_tables,documents=table_summaries)
        
    print(collection.count())



"""

"import chromadb\nclient = chromadb.PersistentClient('./chroma_docs')\ncollection = client.get_or_create_collection('documents_summary')\ncollection_raw= client.get_or_create_collection('documents_raw')\n#collection_table = client.get_or_create_collection('table_summary')\n\nfor i,filename in enumerate(files[36:]):\n    \n    print(i,filename)\n    text_elements,table_elements = get_text_and_table(filename)\n    # Apply to text\n    texts = [i.text for i in text_elements]\n    tables = [i.text for i in table_elements]\n    \n    ids = [filename+'_'+ str(i) for i in range(len(texts))]\n    ids_tables = [filename+'_table_'+ str(i) for i in range(len(tables))]\n    \n    if len(ids)>0:\n        collection_raw.add(ids=ids,documents=texts)\n        text_summaries = batch_sleep(texts)\n        collection.add(ids=ids,documents=text_summaries)\n    \n    if len(ids_tables)>0:\n        collection_raw.add(ids=ids_tables,documents=tables)\n        \n        table_summaries = batch_sleep(tables)\n

In [10]:
from utils import client,electronic_collection,Embed

# Open the target collection (created on first use) with the project's embedding
# function. "hnsw:space" selects the distance space: "l2", "ip", or "cosine".
collection = client.get_or_create_collection(
    collection_name,
    embedding_function=Embed(),
    metadata={"hnsw:space": "ip"},
)



In [11]:
# Show the file names found in the documents directory.
files

['survey.pdf']

In [13]:
# Load every document's text and table chunks into the collection, writing in
# batches of at most BS items. Pending items are carried over between files.
# NOTE: `tabels` keeps its original spelling so any later reference still works.
texts = []
tabels = []
ids = []
ids_tables = []
BS = 100
for i, filename in enumerate(files):

    print(i, filename)
    text_elements, table_elements = get_text_and_table(filename)

    # IDs are numbered per file. Count only this file's chunks: the pending
    # lists may still hold leftovers from earlier files, so sizing the ID range
    # by their total length would yield more IDs than documents.
    file_texts = [element.text for element in text_elements]
    texts += file_texts
    ids += [filename + '_' + str(n) for n in range(len(file_texts))]

    while len(ids) > BS:
        collection.add(ids=ids[:BS], documents=texts[:BS])
        ids = ids[BS:]
        texts = texts[BS:]

    file_tables = [element.text for element in table_elements]
    tabels += file_tables
    ids_tables += [filename + '_table_' + str(n) for n in range(len(file_tables))]

    while len(ids_tables) > BS:
        collection.add(ids=ids_tables[:BS], documents=tabels[:BS])
        ids_tables = ids_tables[BS:]
        tabels = tabels[BS:]

    print(collection.count())

# Flush what is left. Empty remainders are skipped: there is nothing to store,
# and an add with no IDs is at best a no-op.
if ids:
    collection.add(ids=ids, documents=texts)
if ids_tables:
    collection.add(ids=ids_tables, documents=tabels)




0 survey.pdf
1
59
0


In [14]:
from utils import client,electronic_collection

# Number of items now stored in the collection.
collection.count()



60

In [16]:
# Sanity-check retrieval: query the collection with the text 'train' and show the raw result.
d = collection.query(query_texts='train')
d


{'ids': [['survey.pdf_32',
   'survey.pdf_31',
   'survey.pdf_33',
   'survey.pdf_45',
   'survey.pdf_44',
   'survey.pdf_54',
   'survey.pdf_53',
   'survey.pdf_43',
   'survey.pdf_51',
   'survey.pdf_52']],
 'distances': [[0.570658791603757,
   0.5806293273093693,
   0.6052365071338619,
   0.6197181148438935,
   0.6261861649235646,
   0.6266750702960826,
   0.6283849403130295,
   0.6319310817943187,
   0.6339340472697477,
   0.6424901430602312]],
 'metadatas': [[None, None, None, None, None, None, None, None, None, None]],
 'embeddings': None,
 'documents': [['8.1 Training Strategies of Retriever\n\n8.1.1 Training-Free. There are two primary types of Training-Free Retrievers currently in use. The first type consists of non-parametric retrievers. These retrievers rely on pre-defined rules or traditional graph search algorithms rather than specific models [158, 189]. The second type utilizes pre-trained LMs as retrievers. Specifically, one group of works utilizes pre-trained embedding 

In [17]:
# Fetch the collection's stored records (no filter arguments are passed) and display them.
data= collection.get()
data

{'ids': ['survey.pdf_0',
  'survey.pdf_1',
  'survey.pdf_2',
  'survey.pdf_3',
  'survey.pdf_4',
  'survey.pdf_5',
  'survey.pdf_6',
  'survey.pdf_7',
  'survey.pdf_8',
  'survey.pdf_9',
  'survey.pdf_10',
  'survey.pdf_11',
  'survey.pdf_12',
  'survey.pdf_13',
  'survey.pdf_14',
  'survey.pdf_15',
  'survey.pdf_16',
  'survey.pdf_17',
  'survey.pdf_18',
  'survey.pdf_19',
  'survey.pdf_20',
  'survey.pdf_21',
  'survey.pdf_22',
  'survey.pdf_23',
  'survey.pdf_24',
  'survey.pdf_25',
  'survey.pdf_26',
  'survey.pdf_27',
  'survey.pdf_28',
  'survey.pdf_29',
  'survey.pdf_30',
  'survey.pdf_31',
  'survey.pdf_32',
  'survey.pdf_33',
  'survey.pdf_34',
  'survey.pdf_35',
  'survey.pdf_36',
  'survey.pdf_37',
  'survey.pdf_38',
  'survey.pdf_39',
  'survey.pdf_40',
  'survey.pdf_41',
  'survey.pdf_42',
  'survey.pdf_43',
  'survey.pdf_44',
  'survey.pdf_45',
  'survey.pdf_46',
  'survey.pdf_47',
  'survey.pdf_48',
  'survey.pdf_49',
  'survey.pdf_50',
  'survey.pdf_51',
  'survey.pdf_5

In [19]:
import pandas as pd
#pd.DataFrame({"id":data['ids'],"text":data['documents']}).iloc[0:1,:].to_csv(test_path+'test.csv')
# Build an id/text table from the fetched records, keep the first 26 rows,
# write them to the CSV input file, and display the result.
records = pd.DataFrame({"id": data['ids'], "text": data['documents']})
f = records.iloc[0:26]
f.to_csv(csv_output_path)
f

Unnamed: 0,id,text
0,survey.pdf_0,4\n\n2024\n\n2\n\n0\n\n2\n\np\n\ne S 0 1 ] I A...
1,survey.pdf_1,†Corresponding Author.\n\nAuthors’ Contact Inf...
2,survey.pdf_2,1 Introduction\n\nThe development of Large Lan...
3,survey.pdf_3,"J. ACM, Vol. 37, No. 4, Article 111. Publicati..."
4,survey.pdf_4,Response\n\nThe artistic movements of the 19th...
5,survey.pdf_5,• We discuss the core technologies underpinnin...
6,survey.pdf_6,2 Comparison with Related Techniques and Surve...
7,survey.pdf_7,2.2 LLMs on Graphs\n\nLLMs are revolutionizing...
8,survey.pdf_8,"3 Preliminaries\n\nIn this section, we introdu..."
9,survey.pdf_9,3.3 Language Models\n\nLanguage models (LMs) e...
