# Using unstructured

In [42]:
# Examples this notebook is based on:
# https://colab.research.google.com/gist/alejandro-ao/47db0b8b9d00b10a96ab42dd59d90b86/langchain-multimodal.ipynb#scrollTo=8326a750

# NOTE(review): `!more .env` was used here as a debugging aid. Keep it disabled:
# it would print the contents of .env (which may hold secrets) into the cell output.
import os
import base64
import uuid
from dotenv import load_dotenv

# Load the variables defined in the local .env file (python-dotenv).
load_dotenv() 

True

### Unstructured: Using chunking_strategy="by_title" and reconstructing orig_elements


In [32]:
from langchain_community.document_loaders import UnstructuredPDFLoader

path = "../data/"
file = "BV_page225.pdf"

# Load the document with UnstructuredPDFLoader, paying special attention to
# Images and Tables.
# Because the "by_title" chunking strategy is used, the images and tables end up
# inside the "orig_elements" metadata of each chunk.
partition_options = {
    "mode": "elements",
    "strategy": "hi_res",
    "extract_image_block_types": ["Image", "Table"],
    "extract_image_block_to_payload": True,
    "chunking_strategy": "by_title",
    "max_characters": 4000,
    "new_after_n_chars": 3800,
}
loader = UnstructuredPDFLoader(path + file, **partition_options)
pages = loader.load()
print(f"Loaded {len(pages)} documents from {file}")

Loaded 5 documents from BV_page225.pdf


## Text

In [34]:
# Concatenate the text of every loaded chunk into a single string.
# The chunks are separated by a blank line: joining them with "" would glue the
# last word of one chunk onto the first word of the next one. "\n\n" is also the
# separator already found between elements inside each chunk's page_content.
document_text = "\n\n".join(page.page_content for page in pages)
document_text

'Pt C, Ch 1, Sec 10\n\n1.4 Symbols and units\n\n1.4.1 The following symbols and related units are commonly used in this Section. Additional symbols, related to some formulae indicated in this Section, are listed wherever it is necessary.\n\n: Design pressure, in MPa\n\np\n\nT : Design temperature, in °C\n\nt : Rule required minimum thickness, in mm\n\nD : Pipe external diameter, in mm.\n\n1.5 Class of piping systems\n\n1.5.1 Purpose of the classes of piping systems\n\nPiping systems are subdivided into three classes, denoted as class |, class II and class III, for the purpose of acceptance of materials, selection of joints, heat treatment, welding, pressure testing and the certification of fittings.\n\n1.5.2 Definitions of the classes of piping systems\n\na) Classes |, Il and III are defined in Tab 3\n\nb) The following systems are not covered by Tab 3:\n\n* cargo piping for oil tankers, gas tankers and chemical tankers, and\n\n¢ fluids for refrigerating plants.\n\nTable 3 : Class of p

## Tables

In [36]:
# Collect the tables so they can be added to the document text.
# The helper is imported in this cell because it is needed here first; relying on
# an import made further down the notebook fails on "Restart Kernel -> Run All".
from unstructured.staging.base import elements_from_base64_gzipped_json

tables = []
for doc in pages:
    # Chunks keep their original elements as a base64/gzipped JSON blob in the
    # "orig_elements" metadata entry; skip documents that do not carry it.
    if 'orig_elements' in doc.metadata:
        for orig_element in elements_from_base64_gzipped_json(doc.metadata["orig_elements"]):
            if orig_element.category == "Table":
                tables.append(str(orig_element))
# Join all table elements into a single string
document_table = "\n".join(tables)
print(document_table)

Media conveyed by the piping system Class | Class Il (1) (4) Class Ill (7) Toxic media without special safeguards (3) | not applicable not applicable Corrosive media without special safeguards (3) | with special safeguards (3) | not applicable Flammable media: * heated above flashpoint, or ¢ having flashpoint < 60°C Liquefied gas without special safeguards (3) | with special safeguards (3) | not applicable Oxyacetylene irrespective of p not applicable not applicable Steam p> 1,6 or T > 300 other (2) p<0,7 andT<170 Thermal oil p> 1,6 or T > 300 other (2) p<0,7 andT<150 Fuel oil (8) Lubricating oil p>1,6o0rT>150 other (2) p <0,7 and T <60 Flammable hydraulic oil (5) Other media (5) (6) p>4orT>300 other (2) p<1,6 and T <200
Working temperature T, in °C Working pressure P, in bar T<60 T > 60 p<7 3 bar or max. working pressure, 3 bar or max. working pressure, ~ whichever is the greater whichever is the greater . 14 bar or max. working pressure, Po? max. working pressure whichever is the gre

In [39]:
# Full document: the running text followed by the extracted tables, one newline apart.
document_all = f"{document_text}\n{document_table}"
document_all

'Pt C, Ch 1, Sec 10\n\n1.4 Symbols and units\n\n1.4.1 The following symbols and related units are commonly used in this Section. Additional symbols, related to some formulae indicated in this Section, are listed wherever it is necessary.\n\n: Design pressure, in MPa\n\np\n\nT : Design temperature, in °C\n\nt : Rule required minimum thickness, in mm\n\nD : Pipe external diameter, in mm.\n\n1.5 Class of piping systems\n\n1.5.1 Purpose of the classes of piping systems\n\nPiping systems are subdivided into three classes, denoted as class |, class II and class III, for the purpose of acceptance of materials, selection of joints, heat treatment, welding, pressure testing and the certification of fittings.\n\n1.5.2 Definitions of the classes of piping systems\n\na) Classes |, Il and III are defined in Tab 3\n\nb) The following systems are not covered by Tab 3:\n\n* cargo piping for oil tankers, gas tankers and chemical tankers, and\n\n¢ fluids for refrigerating plants.\n\nTable 3 : Class of p

## Chunks

In [43]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the combined document (text + tables) into overlapping chunks.
# chunk_size and chunk_overlap are starting values; adjust them as needed.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
chunks = text_splitter.create_documents(texts=[document_all])
chunks

[Document(metadata={}, page_content='Pt C, Ch 1, Sec 10\n\n1.4 Symbols and units\n\n1.4.1 The following symbols and related units are commonly used in this Section. Additional symbols, related to some formulae indicated in this Section, are listed wherever it is necessary.\n\n: Design pressure, in MPa\n\np\n\nT : Design temperature, in °C\n\nt : Rule required minimum thickness, in mm\n\nD : Pipe external diameter, in mm.\n\n1.5 Class of piping systems\n\n1.5.1 Purpose of the classes of piping systems\n\nPiping systems are subdivided into three classes, denoted as class |, class II and class III, for the purpose of acceptance of materials, selection of joints, heat treatment, welding, pressure testing and the certification of fittings.\n\n1.5.2 Definitions of the classes of piping systems\n\na) Classes |, Il and III are defined in Tab 3\n\nb) The following systems are not covered by Tab 3:\n\n* cargo piping for oil tankers, gas tankers and chemical tankers, and\n\n¢ fluids for refrigera

## Images

In [5]:
from unstructured.staging.base import elements_from_base64_gzipped_json

# Decode the original elements stored in the first chunk's "orig_elements" metadata.
# The loader output lives in `pages`; the previously used name `sub_docs` is not
# defined anywhere in this notebook and raised a NameError on a fresh kernel.
orig_elements = elements_from_base64_gzipped_json(pages[0].metadata["orig_elements"])
# Spot-check the category of the second decoded element.
orig_elements[1].category

'Title'

In [11]:
# Spot-check the category of the element at index 7 of the decoded orig_elements.
orig_elements[7].category

'ListItem'

In [12]:
# Retrieve the image_base64 payloads from the orig_elements metadata and store them
# in a list on the owning document. More than one image can exist per chunk.
# Iterates over `pages` (the loader output); the previously used name `sub_docs`
# is not defined anywhere in this notebook.
for doc in pages:
    image_base64_list = []
    if 'orig_elements' in doc.metadata:
        for orig_element in elements_from_base64_gzipped_json(doc.metadata["orig_elements"]):
            image_base64 = orig_element.metadata.image_base64
            # Truthiness check: skips empty payloads and also a missing (None) one,
            # which the former `!= ""` comparison would have let through.
            if orig_element.category == "Image" and image_base64:
                image_base64_list.append(image_base64)
                # Report the payload size instead of dumping the whole base64 string,
                # which would flood the cell output and bloat the notebook file.
                print(f"    {doc.metadata['element_id']} "
                      f"page {doc.metadata['page_number']} "
                      f"{orig_element.category}: {len(image_base64)} base64 chars")
        if image_base64_list:
            # Attach the images to the chunk so that later steps can find them.
            doc.metadata['list_image_base64'] = image_base64_list
            print(f"Document {doc.metadata['element_id']} has {len(image_base64_list)} images")
    
            