In [None]:
!apt-get update

In [None]:
# -y answers apt's confirmation prompt so the cell cannot stall during "Run All".
!apt-get install -y poppler-utils libleptonica-dev tesseract-ocr libtesseract-dev python3-pil tesseract-ocr-eng tesseract-ocr-script-latn

In [None]:
# %pip installs into the environment of the running kernel; the extras spec is quoted so the
# shell never treats the square brackets as a glob pattern.
%pip install "unstructured[all-docs]" cmake python-dotenv pdf2image python-dateutil faiss-cpu sentence-transformers langchain==0.2.5 langchain-community==0.2.5 langchain-core==0.2.9 langchain-openai==0.1.9 bitsandbytes accelerate xformers triton transformers

In [None]:
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json

In [None]:
# %pip (not !pip) so the upgrade lands in the kernel's own environment.
%pip install -U nltk

In [None]:
import os

# PDF to partition; the relative path is resolved against the working directory.
filename = "Codex - Adeptus Mechanicus.pdf"

# Partition the PDF with the high-resolution strategy and chunk the result by title.
# Table structure is inferred and blocks detected as 'Image' are extracted.
elements = partition_pdf(
    filename,
    strategy='hi_res',
    chunking_strategy='by_title',
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    infer_table_structure=True,
    extract_image_block_types=['Image'],
)

- extract_images_in_pdf:
Only applicable if `strategy=hi_res`.
If True, any detected images will be saved in the path specified by
'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
Deprecation Note: This parameter is marked for deprecation. Future versions will use
'extract_image_block_types' for broader extraction capabilities.

- extract_image_block_types:
Only applicable if `strategy=hi_res`.
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
saved in the path specified by 'extract_image_block_output_dir' or stored as base64
encoded data within metadata fields.

In [None]:
partition_pdf?

In [None]:
# Location of the extracted images

os.listdir('figures')

In [None]:
"""
- CompositeElement: 文字
# - Table: 表格
"""

elements

In [None]:
import configparser

from langchain_openai import ChatOpenAI


def credential_init():
  """Populate ``os.environ['OPENAI_API_KEY']`` for the OpenAI client.

  Lookup order:
    1. ``credentials.ini`` in the working directory, option ``api_key`` of
       the ``[openai]`` section, when that file exists.
    2. Otherwise the ``OPENAI`` environment variable, falling back to an
       already exported ``OPENAI_API_KEY``.

  Raises:
    KeyError: if no non-empty key can be found in the selected source.
  """

  credential_file = "credentials.ini"

  if os.path.exists(credential_file):
    credentials = configparser.ConfigParser()
    credentials.read(credential_file)
    # fallback=None covers both a missing section and a missing option, so
    # the failure can be reported with one explicit message below.
    api_key = credentials.get('openai', 'api_key', fallback=None)
    if not api_key:
      raise KeyError(f"{credential_file} must define 'api_key' in an [openai] section")
  else:
    api_key = os.environ.get('OPENAI') or os.environ.get('OPENAI_API_KEY')
    if not api_key:
      raise KeyError(
        f"No OpenAI key found: create {credential_file} or set the OPENAI "
        "(or OPENAI_API_KEY) environment variable")

  os.environ['OPENAI_API_KEY'] = api_key

# Make sure OPENAI_API_KEY is set in the environment before the client is built.
credential_init()


# Chat model shared by the summary helpers and the retrieval pipeline in later cells.
model = ChatOpenAI(openai_api_key=os.environ['OPENAI_API_KEY'],
           model_name="gpt-4o-2024-05-13", temperature=0)

In [None]:
prompt = f"Summarize the following text:\n\n{elements[0]}\n\nSummary:"

In [None]:
prompt

In [None]:
model.invoke(prompt)

In [None]:
prompt = f"Summarize the following text:\n\n{elements[7]}\n\nSummary:"
model.invoke(prompt)

## 要如何判別文字或是表格?

In [None]:
str(type(elements[0]))

In [None]:
for element in elements:
  print(str(type(element)))

## 影像，表格，文字 三位一體 檢索系統

一個簡單的範例，你當然可以做得很複雜。像是整合text的部分然後用Semantic Splitting拆分。

In [None]:
import io
import base64

from PIL import Image
from langchain.docstore.document import Document
from langchain_core.messages.human import HumanMessage


def image_to_base64(image_path):
  """Return the image at ``image_path`` re-encoded as JPEG, base64 text.

  Args:
    image_path: path of an image file readable by PIL.

  Returns:
    str: base64 encoding (UTF-8 decoded) of the JPEG bytes.
  """

  with Image.open(image_path) as image:
    # JPEG has no alpha channel or palette: saving an RGBA/LA/P image raises
    # OSError, so such images are converted to RGB first.
    jpeg_ready = image if image.mode in ('RGB', 'L') else image.convert('RGB')
    buffered = io.BytesIO()
    jpeg_ready.save(buffered, format="JPEG")
    image_str = base64.b64encode(buffered.getvalue())
  return image_str.decode('utf-8')


def get_summary(element, model):
  """Ask ``model`` to summarize one partitioned element.

  The prompt wording depends on the element's class: classes whose type
  string contains 'Table' get the table prompt, everything else
  (CompositeElement included) gets the text prompt. Previously an element
  that was neither left ``prompt`` unassigned and raised UnboundLocalError.

  Args:
    element: object whose ``str()`` is the content to summarize.
    model: object exposing ``invoke(prompt)`` that returns a response with
      a ``content`` attribute.

  Returns:
    The ``content`` of the model response.
  """

  str_type = str(type(element))

  # 'Table' is checked first so it keeps priority, as in the original
  # two-`if` sequence where the table prompt overwrote the text prompt.
  if 'Table' in str_type:
    prompt = f"Summarize the following table:\n\n{element}\n\nSummary:"
  else:
    prompt = f"Summarize the following text:\n\n{element}\n\nSummary:"

  response = model.invoke(prompt)

  return response.content


def get_image_summary(filename, model):
  """Ask ``model`` to describe the image stored at ``filename``.

  The image is embedded in a single human message as a base64 data URL
  next to the question 'What is in this image?'.

  Args:
    filename: path of the image file.
    model: chat model exposing ``invoke(messages)``.

  Returns:
    The ``content`` of the model response.
  """

  image_str = image_to_base64(filename)

  # image_to_base64 encodes as JPEG, so the data URL must declare
  # image/jpeg (it previously claimed image/png).
  human_message = HumanMessage(content=[{'type': 'text',
                        'text': 'What is in this image?'},
                        {'type': 'image_url',
                         'image_url': {
                          'url': f"data:image/jpeg;base64,{image_str}"}
                        }])

  response = model.invoke([human_message])

  return response.content


In [None]:
documents = []


for element in elements:

  # Exactly one label per element. With two independent `if`s the label could
  # stay unset, or silently keep the value of the previous element, whenever
  # the class was neither CompositeElement nor Table.
  if 'Table' in str(type(element)):
    type_ = 'table'
  else:
    type_ = 'text'

  summary = get_summary(element, model)

  documents.append(Document(page_content=summary, metadata={'type': type_}))

**Exercise: can you adapt the process above to use batching (e.g. `model.batch`) to speed it up?**

In [None]:
# Describe every file found in 'figures' with the chat model and index the
# description as an 'image' document that remembers its source path.
for image_file in os.listdir('figures'):
  image_path = 'figures/' + image_file
  summary = get_image_summary(image_path, model)
  documents.append(
    Document(
      page_content=summary,
      metadata={'type': 'image', 'filename': image_path},
    )
  )

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model applied to every summary document.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# FAISS index over the text, table and image summaries collected in `documents`.
vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)

In [None]:
from operator import itemgetter

from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough


# Retriever whose search is filtered to documents with metadata type == 'image'.
image_retriever = vectorstore.as_retriever(search_kwargs={"filter": {'type': 'image'}})

template = "\n\n".join([
    "Answer the question based only on the following context, which can include text, images, and tables:",
    "{context}",
    "Question: {question}",
])

prompt = ChatPromptTemplate.from_template(template)

# question -> retrieved image summaries as context -> prompt -> model -> plain string
pipeline_ = (
    RunnablePassthrough.assign(context=itemgetter("question") | image_retriever)
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
print(pipeline_.invoke({"question": "What is the chemical structure of Glycerin?"}))

In [None]:
image_retriever.invoke("What is the chemical structure of Glycerin?")

## Create a second vectorstore

In [None]:
from typing import List, Optional
from unstructured.documents.elements import CompositeElement


def create_trinity_vectorstore(filename: str):
    """
    Create a vectorstore (text, table and image summaries) from a pdf file.

    Extracted image and table blocks are written to ``<pdf stem>/figures``
    relative to the working directory.

    Args:
        filename: path of the PDF to process.

    Returns:
        The vectorstore built by ``elements_2_vectorstore``, or ``None`` when
        the figure directory already exists (the PDF is then treated as
        already processed and nothing is done).
    """

    # Derive the stem from the base name with os.path so that dots elsewhere
    # in the path or name (e.g. "./docs/v1.2 report.pdf") are handled;
    # splitting on the first '.' turned "./x.pdf" into an empty stem and
    # pointed fig_dir at "/figures".
    base_name = os.path.basename(filename)
    dir_ = os.path.splitext(base_name)[0]

    fig_dir = f"{dir_}/figures"

    if os.path.isdir(fig_dir):
        return None
    os.makedirs(fig_dir)

    elements = partition_pdf(filename,
                             chunking_strategy='by_title',
                             infer_table_structure=True,
                             extract_image_block_types=['Image', 'Table'],
                             max_characters=4000,
                             new_after_n_chars=3800,
                             combine_text_under_n_chars=2000,
                             extract_image_block_output_dir=fig_dir,
                             strategy='hi_res')

    vectorstore = elements_2_vectorstore(elements, filename=base_name, fig_dir=fig_dir)

    return vectorstore


def elements_2_vectorstore(elements: List[CompositeElement], filename: str,
                           fig_dir: Optional[str] = None,
                           ):
    """
    Summarize partitioned elements (and optional figures) into a FAISS store.

    Uses the notebook-level ``model`` for the summaries and ``embeddings``
    for the index.

    Args:
        elements: partitioned elements; classes whose type string contains
            'Table' are labelled 'table', all others 'text'.
        filename: source file name stored in every document's metadata.
        fig_dir: directory of extracted images to summarize. When ``None``
            no image documents are added.

    Returns:
        The FAISS vectorstore built from all summary documents.
    """
    documents = []

    for element in elements:

        # Single if/else so the label always belongs to the current element
        # and can never be unset or left over from the previous iteration.
        type_ = 'table' if 'Table' in str(type(element)) else 'text'

        summary = get_summary(element, model)

        documents.append(Document(page_content=summary, metadata={'type': type_, "filename": filename}))

    # os.listdir(None) lists the current working directory, which would send
    # arbitrary files to the image model, so the default must skip this step.
    if fig_dir is not None:
        # sorted(): os.listdir order is arbitrary; keep the index reproducible.
        for image_file in sorted(os.listdir(fig_dir)):
            image_path = f'{fig_dir}/{image_file}'

            summary = get_image_summary(image_path, model)

            documents.append(
                Document(page_content=summary, metadata={'type': 'image', 'image_source': image_path,
                                                         "filename": filename}))

    vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)

    return vectorstore


def image_to_base64(image_path):
  """Return the image at ``image_path`` re-encoded as JPEG, base64 text.

  NOTE(review): this repeats a definition from an earlier cell; the later
  definition is the one in effect once this cell has run.

  Args:
    image_path: path of an image file readable by PIL.

  Returns:
    str: base64 encoding (UTF-8 decoded) of the JPEG bytes.
  """

  with Image.open(image_path) as image:
    # JPEG has no alpha channel or palette: saving an RGBA/LA/P image raises
    # OSError, so such images are converted to RGB first.
    jpeg_ready = image if image.mode in ('RGB', 'L') else image.convert('RGB')
    buffered = io.BytesIO()
    jpeg_ready.save(buffered, format="JPEG")
    image_str = base64.b64encode(buffered.getvalue())
  return image_str.decode('utf-8')


def get_summary(element, model):
  """Ask ``model`` to summarize one partitioned element.

  Classes whose type string contains 'Table' get the table prompt; every
  other element (CompositeElement included) gets the text prompt, so
  ``prompt`` is always assigned and no UnboundLocalError can occur.

  NOTE(review): this repeats a definition from an earlier cell; the later
  definition is the one in effect once this cell has run.

  Args:
    element: object whose ``str()`` is the content to summarize.
    model: object exposing ``invoke(prompt)`` that returns a response with
      a ``content`` attribute.

  Returns:
    The ``content`` of the model response.
  """

  str_type = str(type(element))

  # 'Table' keeps priority, matching the original order of the two checks.
  if 'Table' in str_type:
    prompt = f"Summarize the following table:\n\n{element}\n\nSummary:"
  else:
    prompt = f"Summarize the following text:\n\n{element}\n\nSummary:"

  response = model.invoke(prompt)

  return response.content


def get_image_summary(filename, model):
  """Ask ``model`` to describe the image stored at ``filename``.

  The image is embedded in a single human message as a base64 data URL
  next to the question 'What is in this image?'.

  NOTE(review): this repeats a definition from an earlier cell; the later
  definition is the one in effect once this cell has run.

  Args:
    filename: path of the image file.
    model: chat model exposing ``invoke(messages)``.

  Returns:
    The ``content`` of the model response.
  """

  image_str = image_to_base64(filename)

  # image_to_base64 encodes as JPEG, so the data URL must declare
  # image/jpeg (it previously claimed image/png).
  human_message = HumanMessage(content=[{'type': 'text',
                        'text': 'What is in this image?'},
                        {'type': 'image_url',
                         'image_url': {
                          'url': f"data:image/jpeg;base64,{image_str}"}
                        }])

  response = model.invoke([human_message])

  return response.content

In [None]:
vectorstore_2 = create_trinity_vectorstore("Hypochlorous Acid.pdf")

### Save the vectorstore

In [None]:
vectorstore.save_local("faiss_index")

### Load the vectorstore

In [None]:
# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling the saved
# docstore; only load an index directory that you wrote yourself.
vectorstore_1 = FAISS.load_local(
    "faiss_index", embeddings, allow_dangerous_deserialization=True
)

# Inspect the loaded documents through the docstore's private `_dict` attribute (display only).
vectorstore_1.docstore._dict

### Concatenate vectorstore

In [None]:
# create_trinity_vectorstore returns None when its figure directory already exists
# (for example when the notebook is re-run), so guard the merge.
if vectorstore_2 is not None:
    vectorstore_1.merge_from(vectorstore_2)
else:
    print("vectorstore_2 is None (figure directory already existed); nothing merged.")

**Another example with the BERT thesis**

You might need a different strategy for a different PDF format.

In [None]:
def create_trinity_vectorstore(filename: str):
    """
    Create a vectorstore from a pdf file using OCR-only partitioning.

    NOTE(review): this redefines ``create_trinity_vectorstore`` from an
    earlier cell with different chunk sizes and ``strategy='ocr_only'``.
    Per the parameter notes earlier in this notebook, image extraction only
    applies to ``strategy=hi_res``, so ``<pdf path without ext>/figures`` is
    created here but not filled by this call.

    Args:
        filename: path of the PDF to process.

    Returns:
        The vectorstore built by ``elements_2_vectorstore``.
    """

    # os.path.splitext strips only the final extension; splitting on the
    # first '.' broke paths such as "./bertv2.pdf" or "v1.2.pdf".
    dir_ = os.path.splitext(filename)[0]

    fig_dir = f"{dir_}/figures"

    os.makedirs(fig_dir, exist_ok=True)

    elements = partition_pdf(filename,
                chunking_strategy='by_title',
                combine_text_under_n_chars=500,
                max_characters=1000,
                new_after_n_chars=800,
                strategy='ocr_only')

    vectorstore = elements_2_vectorstore(elements, filename=filename, fig_dir=fig_dir)

    return vectorstore

In [None]:
vectorstore_BERT = create_trinity_vectorstore("bertv2.pdf")