In [None]:
#!pip install -r ../container_app/requirements.txt
#!pip install chromadb

# Using unstructured

In [1]:
#!more .env
import os
import base64
import uuid
from dotenv import load_dotenv

load_dotenv() 

True

## Creating Embeddings

In [2]:
from unstructured.partition.pdf import partition_pdf

path = "./doc_dev_image/products/"
file = "110-0076545A_Wind_Farm_Route_Planning_Option_9.x_OPM.pdf"

# Test
u_raw_pdf_elements = partition_pdf(
    mode="elements",
    filename=path+file,
    strategy="hi_res",
    #extract_images_in_pdf=True,                            # mandatory to set as ``True``
    extract_image_block_types=["Image"],
    extract_image_block_to_payload = True,
    image_output_dir_path=path,
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
)

# Working
# raw_pdf_elements = partition_pdf(
#    filename=path+file,
#    extract_images_in_pdf=True,
#    infer_table_structure=True,
#    chunking_strategy="by_title",
#    max_characters=4000,
#    new_after_n_chars=3800,
#    combine_text_under_n_chars=2000,
#    image_output_dir_path=path,
# )

In [3]:
# Create a dictionary to store counts of each type
category_counts = {}

for element in u_raw_pdf_elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1
# Unique_categories will have unique elements
unique_categories = set(category_counts.keys())
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 33,
 "<class 'unstructured.documents.elements.Table'>": 11}

In [4]:
# Categorize text elements by type
tables = []
texts = []
images = []
compositeelement = []
NarrativeText = []

for element in u_raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        tables.append(str(element))
    elif "unstructured.documents.elements.Image" in str(type(element)):
        images.append(str(element))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        compositeelement.append(str(element))
    elif "unstructured.documents.elements.Text" in str(type(element)):
        texts.append(str(element))
    elif "unstructured.documents.elements.NarrativeText" in str(type(element)):
        NarrativeText.append(str(element))

In [5]:
# Tables
print("The length of table elements are :", len(tables))

# Text
print("The length of compositeelement elements are :", len(compositeelement))
#print(len(texts[0]))

# Image
print("The length of image elements are :", len(images))


The length of table elements are : 11
The length of compositeelement elements are : 33
The length of image elements are : 0


In [6]:
# Get all original elements as it contains the images.
# https://github.com/Unstructured-IO/unstructured/issues/2603
orig_elements = [
    e
    for chunk in u_raw_pdf_elements if not chunk.metadata.is_continuation
    for e in chunk.metadata.orig_elements
]


In [7]:
for orig_element in orig_elements:
    if orig_element.category == "Image":
        print(orig_element.metadata.image_base64)

/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCADaAN0DASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD3+iiigAooooAKKKKACsy/8RaPpVwLe/1K3t5iobZI+Dg9/wBDWnXh/wAW/wDkco/+vOP/ANCepnLlVz0MswccXX9lJ2Vj1P8A4TTw1/0G7L/v4KP+E08Nf9Buy/7+CvnCisvas+h/1bofzv8AA+j/APhNPDX/AEG7L/v4KP8A

In [12]:
# Printing all metadata from orig_elements
print(orig_elements)
#for orig_element in orig_elements:
#    if orig_element.category == "Image":
#        print(orig_element.metadata.image_base64)

[<unstructured.documents.elements.NarrativeText object at 0x7f11c18572b0>, <unstructured.documents.elements.Image object at 0x7f11c1854040>, <unstructured.documents.elements.Title object at 0x7f11c1856980>, <unstructured.documents.elements.NarrativeText object at 0x7f11c1856050>, <unstructured.documents.elements.NarrativeText object at 0x7f11c1856320>, <unstructured.documents.elements.Title object at 0x7f11c1857160>, <unstructured.documents.elements.FigureCaption object at 0x7f11c1857a30>, <unstructured.documents.elements.NarrativeText object at 0x7f11c1855b40>, <unstructured.documents.elements.Title object at 0x7f11c39a1e70>, <unstructured.documents.elements.NarrativeText object at 0x7f11c1856860>, <unstructured.documents.elements.Title object at 0x7f11c19dcd90>, <unstructured.documents.elements.NarrativeText object at 0x7f11c1856590>, <unstructured.documents.elements.NarrativeText object at 0x7f11c1857640>, <unstructured.documents.elements.Title object at 0x7f11c18578b0>, <unstructur

In [10]:
# # Folder with pdf and extracted images
# output_path = "./figures/"#"/Users/rlm/Desktop/photos/"

# # Function to encode images
# def encode_image(image_path):
#     with open(image_path, "rb") as image_file:
#         return base64.b64encode(image_file.read()).decode('utf-8')
    
# for image_file in os.listdir(output_path):
#     if image_file.endswith(('.png', '.jpg', '.jpeg')):
#         image_path = os.path.join(output_path, image_file)
#         encoded_image = encode_image(image_path)
#         image_elements.append(encoded_image)

# # image
# print("The length of image elements are :",len(image_elements))

## Using Embeddings

In [48]:
from init_openai import init_embedding
from init_v2 import create_vectorstore_io_structure
from pathlib import Path
import logging
import logging.config

from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore, InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document

#location of the input documents
doc_dir = './doc_dev_image'

#location of the output vectorstores (root folder)
output_root_dir = './vectorstore_image' #'./vectorstore'   

embedding = init_embedding()
CONTEXTUALIZED=True

#Size of chunk and chunk overlap
chunk_size=4000
chunk_overlap=400


# Create chroma
vectorstore = Chroma(
    collection_name="mm_rag_clip_photos", embedding_function=embedding
)
store = InMemoryByteStore() #InMemoryStore()
id_key = "doc_id"

# Initialize the retriever
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

In [None]:
#By https://medium.com/@shravankoninti/multimodal-rag-with-gpt-4-vision-and-langchain-60a6a13a92e4

# Function to add documents to the retriever
def add_documents_to_retriever(summaries, original_contents):
    doc_ids = [str(uuid.uuid4()) for _ in summaries]
    print("The length of doc_ids are :", len(doc_ids))
    
    summary_docs = [
        Document(page_content=s, metadata={id_key: doc_ids[i]})
        for i, s in enumerate(summaries)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(doc_ids, original_contents)))


# Add text summaries
#add_documents_to_retriever(text_summaries, text_elements)

# Add table summaries
#add_documents_to_retriever(table_summaries, table_elements)

# Add image summaries
#add_documents_to_retriever(image_summaries, image_elements) # hopefully real images soon

# Add docs as summary and images
add_documents_to_retriever(texts, image_elements) # hopefully real images soon

In [None]:
# Search
filter = None
query = "how to add turbines to the planned route ?"
documents = vectorstore.similarity_search_with_score( query, k=5, filter=filter)
print(f"Found {len(documents)} documents")
for doc in documents:
        document = doc[0]
        score = doc[1]
        #text = ""
        #text += document.page_content.replace('\n', ' ').replace('\0', ' ') #replace new line with space and null character with space.
        #print(f"\n\n score: {score}, text: {text}")
        print(f"score: {score}, document: {document}")


In [None]:
retriever.vectorstore.similarity_search("how to add turbines to the planned route ?")[0]

In [26]:
#retriever.invoke("how to add turbines to the planned route ?")[0]
from IPython.display import HTML, display
import base64

#query = "how to add turbines to the planned route ?"
#query = "how to add GPS?"
query = "turbine"

def is_base64(s):
    """Check if a string is Base64 encoded"""
    try:
        return base64.b64encode(base64.b64decode(s)) == s.encode()
    except Exception:
        return False
    
def plt_img_base64(img_base64):
    # Create an HTML img tag with the base64 string as the source
    image_html = f'<img src="data:image/jpeg;base64,{img_base64}" />'

    # Display the image by rendering the HTML
    display(HTML(image_html))

def str_to_base64(s):
    return base64.b64encode(s.encode()).decode()

#docs = retriever.invoke(query, k=1)
#for doc in docs:
#    if is_base64(doc):
#        plt_img_base64(doc)
#    else:
#        print(doc)



## Using UnstructuredPDFLoader

In [None]:
# 
# https://docs.unstructured.io/open-source/core-functionality/chunking

from langchain_community.document_loaders import UnstructuredPDFLoader

path = "./doc_dev_image/products/"
file = "110-0076545A_Wind_Farm_Route_Planning_Option_9.x_OPM.pdf"

loader = UnstructuredPDFLoader(path+file,  
                               mode="elements", 
                               strategy="hi_res",
                               extract_image_block_types=["Image", "Table"],
                               extract_image_block_to_payload = True, 
                               #chunking_strategy="by_title", 
                               max_characters=4000,  
                               new_after_n_chars=3800
                               )
sub_docs = loader.load()
print(f"Loaded {len(sub_docs)} documents from {file}")

In [None]:
sub_docs[112].metadata

In [None]:
sub_docs[112].page_content

In [None]:
plt_img_base64(sub_docs[112].metadata['image_base64'])

In [None]:
i = 0
j = 0
for doc in sub_docs:
    j += 1
    if 'image_base64' in doc.metadata: 
        print(f"Image {i}, in document {j}")
        #plt_img_base64(doc.metadata['image_base64'])
        i += 1
print(f"Found {i} images")

### Unstructured: Using chunking_strategy="by_title" and reconstructing orig_elements


In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader

path = "./doc_dev_image/products/"
file = "110-0076545A_Wind_Farm_Route_Planning_Option_9.x_OPM.pdf"

# Using the UnstructuredPDFLoader to load the document. In special Images and Tables.
# Since we are usinng chunking strategy by_title, the image and tables are inside the orig_elements metadata.
loader = UnstructuredPDFLoader(path+file,  
                               mode="elements", 
                               strategy="hi_res",
                               extract_image_block_types=["Image", "Table"],
                               extract_image_block_to_payload = True, 
                               chunking_strategy="by_title", 
                               max_characters=4000,  
                               new_after_n_chars=3800
                               )
sub_docs = loader.load()
print(f"Loaded {len(sub_docs)} documents from {file}")

In [None]:
sub_docs[0].metadata

In [None]:
sub_docs[0].page_content

In [None]:
from unstructured.staging.base import elements_from_base64_gzipped_json

orig_elements = elements_from_base64_gzipped_json(sub_docs[0].metadata["orig_elements"])
orig_elements[1].category

In [None]:
# Retrive the image_base64 from orig_elements metadata and store in a list. More than one image can exist per chunk.
for doc in sub_docs:
    image_base64_list = []
    if 'orig_elements' in doc.metadata:
        for orig_element in elements_from_base64_gzipped_json(doc.metadata["orig_elements"]):
            if orig_element.category == "Image" and orig_element.metadata.image_base64 != "":
                image_base64_list.append(orig_element.metadata.image_base64)
                print(f"    {doc.metadata['element_id']} \
                      {doc.metadata['page_number']} \
                        {orig_element.category}: \
                            {orig_element.metadata.image_base64}")
        if (len(image_base64_list) > 0):
            doc.metadata['list_image_base64'] = image_base64_list
            print(f"Document {doc.metadata['element_id']} has {len(image_base64_list)} images")    
    
            

In [None]:
sub_docs[1].metadata

In [None]:
sub_docs[0].page_content

In [None]:
for i, doc in enumerate(sub_docs):
    if 'list_image_base64' in doc.metadata and doc.metadata['list_image_base64'] != []:
        print(f"{i}: {len(doc.metadata['list_image_base64'])} Image in page: {doc.metadata['page_number']}")
    


In [None]:
for doc in sub_docs:
    #if 'orig_elements' in doc.metadata:
    #    for orig_element in elements_from_base64_gzipped_json(doc.metadata["orig_elements"]):
    #        if orig_element.category == "Image":
    #            print(plt_img_base64(orig_element.metadata.image_base64))
    if 'list_image_base64' in doc.metadata and doc.metadata['list_image_base64'] != []:
        for i in doc.metadata['list_image_base64']:
            print(plt_img_base64(i))
                
                

In [None]:
# # Show text and images associated to the page
# for doc in sub_docs:
#     if 'list_image_base64' in doc.metadata and doc.metadata['list_image_base64'] != []:
#         print(f"Image in page: {doc.metadata['page_number']} \
#               with text: {doc.page_content} and figure")
#         #for img in doc.metadata['list_image_base64']:
#         #    plt_img_base64(img)

print(sub_docs[5].page_content)
for image in sub_docs[5].metadata['list_image_base64']:
    print(plt_img_base64(image))

In [None]:
for key in sub_docs[5].metadata.keys():
    print(f"key: {key}, type: {type(sub_docs[5].metadata[key])}")

## Add to vectorstore

In [None]:
from init_openai import init_embedding
from init_v2 import create_vectorstore_io_structure
from pathlib import Path
import logging
import logging.config
import uuid

from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore, InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.vectorstores import utils as chromautils


#location of the input documents
doc_dir = './doc_dev_image'

#location of the output vectorstores (root folder)
output_root_dir = './vectorstore_image' #'./vectorstore'   

embedding = init_embedding()
CONTEXTUALIZED=True

#Size of chunk and chunk overlap
chunk_size=4000
chunk_overlap=400


# Create chroma
vectorstore = Chroma(
    collection_name="mm_rag_clip_photos", embedding_function=embedding
)

# Use a unique delimiter that is unlikely to appear in the base64 strings
unique_delimiter = "||DELIMITER||"

my_sub_docs = sub_docs

for doc in my_sub_docs:
    if 'list_image_base64' in doc.metadata:
        doc.metadata['list_image_base64'] = unique_delimiter.join(doc.metadata['list_image_base64'])
        #doc.metadata['list_image_base64'] = ','.join(doc.metadata['list_image_base64'])


my_sub_docs = chromautils.filter_complex_metadata(sub_docs)

vectorstore.add_documents(my_sub_docs)




In [None]:
#docs[0].metadata.keys()
my_sub_docs[0].metadata.keys()


In [None]:
# Search
filter = None
query = "Key terms associated with a visit to a turbine"
#query = "Wind farm without (left) and with (right) “white background”"
#query = "Viewing the wind farm clearly on the display"
#query = "Specifying the docking gate for a turbine?"
#query = "how to add turbines to the planned route ?"
#query = "Viewing the wind farm clearly on the display"
documents = vectorstore.similarity_search_with_score( query, k=5, filter=filter)
print(f"Found {len(documents)} documents")
for doc in documents:
        #print(len(doc))
        print(doc[0].metadata.keys())
        if 'list_image_base64' in doc[0].metadata and doc[0].metadata['list_image_base64'] != '':
            print(f"score: {doc[1]}, document: {doc[0].metadata['page_number']}, \
              list_image_base64: {doc[0].metadata['list_image_base64']}")
            doc[0].metadata['list_image_base64'] = doc[0].metadata['list_image_base64'].split(unique_delimiter)
            print(len(doc[0].metadata['list_image_base64']))
            for image in doc[0].metadata['list_image_base64']:
                 if (is_base64(image)): plt_img_base64(image)
                 else:
                      plt_img_base64(str_to_base64(image))
                      #print(f"Not a base64 image: {image}")
                 
        


# PyPDF2

In [1]:
from PyPDF2 import PdfReader


In [4]:
path = "./doc_dev_image/products/"
file = "110-0076545A_Wind_Farm_Route_Planning_Option_9.x_OPM.pdf"
reader = PdfReader(path+file)


In [None]:
print(len(reader.pages))


In [10]:
# Extracting Page text
page1 = reader.pages[0]
textPage1 = page1.extract_text()
#print(textPage1)


In [None]:
# Extracting Images From PDF Files
# Ref. https://geekflare.com/extract-text-links-images-from-pdf-using-python/


