In [11]:
import os

# Define the target directory
target_directory = r"C:\Users\pablosal\Desktop\gbbai-azure-ai-document-intelligence"  # change your directory here

# Check if the directory exists
if os.path.exists(target_directory):
    # Change the current working directory
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")
else:
    print(f"Directory {target_directory} does not exist.")

Directory changed to C:\Users\pablosal\Desktop\gbbai-azure-ai-document-intelligence


In [12]:
from src.ocr.document_intelligence import AzureDocumentIntelligenceManager

document_intelligence_client = AzureDocumentIntelligenceManager()

In [10]:
# We will begin with the Fisher EWD/EWS/EWT Valves through NPS 12x8 Instruction Manual,
# which can be found at the following URL:
# https://www.emerson.com/documents/automation/instruction-manual-fisher-ewd-ews-ewt-valves-through-nps-12x8-en-124788.pdf
# We will use the 'prebuilt-layout' model for this task. This is the default model provided by Azure's Document Analysis Client,
# and it is capable of extracting text, tables, selection marks, and structure elements from the document.
# one of the latest feature is the abulity to extract content in a specific format, such as markdown.

document_url = "https://www.emerson.com/documents/automation/instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf"
model_type = "prebuilt-read"

result_ocr = document_intelligence_client.analyze_document(
    document_input=document_url,
    model_type=model_type,
    output_format="markdown",
    features=["OCR_HIGH_RESOLUTION"],
    pages="32-33",
)

In [8]:
import os
import fitz  # pip install --upgrade pip; pip install --upgrade pymupdf
from tqdm import tqdm  # pip install tqdm

In [9]:
import pdfquery

In [13]:
from typing import Any

from unstructured.partition.pdf import partition_pdf

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Define the paths
Path_File = "C:\\Users\\pablosal\\Desktop\\gbbai-azure-ai-document-intelligence\\notebooks\\dev\\instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf"
Path_File_2 = "C:\\Users\\pablosal\\Desktop\\gbbai-azure-ai-document-intelligence\\notebooks\\dev\\sample-pdf-with-images.pdf"
Path_File_3 = "C:\\Users\\pablosal\\Desktop\\gbbai-azure-ai-document-intelligence\\notebooks\\dev\\2308.08155.pdf"
workdir = "C:\\Users\\pablosal\\Desktop\\gbbai-azure-ai-document-intelligence\\notebooks\\dev\\output\\"

In [16]:
import pytesseract

# Set the tesseract_cmd variable to the tesseract executable in the specified directory
pytesseract.pytesseract.tesseract_cmd = (
    r"C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
)

In [15]:
# Get elements
raw_pdf_elements = partition_pdf(
    filename=Path_File,
    # Using pdf format to find embedded image blocks
    extract_images_in_pdf=True,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    # Hard max on chunks
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path="workdir",
)

TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.

In [5]:
import sys
from pathlib import Path
import shutil
import io
from binascii import b2a_hex
from pdfminer.high_level import extract_pages
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import resolve1
from PIL import Image, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


def get_meta_data(input_file_path):
    with open(input_file_path, "rb") as input_file:
        input_parser = PDFParser(input_file)
        input_document = PDFDocument(input_parser)
        total_pages = resolve1(input_document.catalog["Pages"])["Count"]
        return {"total_pages": total_pages}


def get_page_layouts(input_file_path, total_pages):
    generator = extract_pages(input_file_path)
    page_layouts = [page_layout for page_layout in generator]
    return page_layouts


def get_lt_figures(page_layout):
    lt_figures = [
        element for element in page_layout if type(element).__name__ == "LTFigure"
    ]
    return lt_figures


def get_image_type(stream_first_4_bytes):
    bytes_as_hex = b2a_hex(stream_first_4_bytes).decode()
    if bytes_as_hex.startswith("ffd8"):
        return "jpeg"
    if bytes_as_hex == "89504e47":
        return "png"
    if bytes_as_hex == "47494638":
        return "gif"
    if bytes_as_hex.startswith("424d"):
        return "bmp"
    return "png"  # return 'png' as a default image type


def get_image_info(element, page_index, image_index):
    try:
        if hasattr(element._objs[0], "stream"):
            stream = element._objs[0].stream.rawdata
            image = Image.open(io.BytesIO(stream))
            return {
                "type": get_image_type(stream[:4]),
                "width": element._objs[0].width,
                "height": element._objs[0].height,
                "bounding_box": {
                    "x1": element._objs[0].x0,
                    "y1": element._objs[0].y0,
                    "x2": element._objs[0].x1,
                    "y2": element._objs[0].y1,
                },
                "pil_image": image,
            }
        else:
            print("Skipping non-image figure: No image stream")
            return None
    except Exception as e:
        print(
            f"An error occurred on page {page_index} while extracting image {image_index}: {e}"
        )
        return None


def extract_images_from_pdf(input_file_path, output_path):
    meta_data = get_meta_data(input_file_path)
    page_layouts = get_page_layouts(input_file_path, meta_data["total_pages"])

    for page_index, page_layout in enumerate(page_layouts):
        figures = get_lt_figures(page_layout)
        for figure_index, figure in enumerate(figures):
            image_info = get_image_info(figure, page_index, figure_index)
            if image_info:
                image_save_path = Path(
                    output_path,
                    f"Page_{page_index+1}_Fig_{figure_index+1}.{image_info['type']}",
                )
                image_info["pil_image"].save(str(image_save_path))

In [6]:
from pathlib import Path
import shutil

# Define the paths
Path_File = "C:\\Users\\pablosal\\Desktop\\gbbai-azure-ai-document-intelligence\\notebooks\\dev\\instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf"
Path_File_2 = "C:\\Users\\pablosal\\Desktop\\gbbai-azure-ai-document-intelligence\\notebooks\\dev\\sample-pdf-with-images.pdf"
Path_File_3 = "C:\\Users\\pablosal\\Desktop\\gbbai-azure-ai-document-intelligence\\notebooks\\dev\\2308.08155.pdf"
workdir = "C:\\Users\\pablosal\\Desktop\\gbbai-azure-ai-document-intelligence\\notebooks\\dev\\output\\"

# Define the output directory
output_directory = Path(workdir)

# Remove the output directory if it exists
shutil.rmtree(output_directory, ignore_errors=True)

# Create the output directory
output_directory.mkdir(parents=True, exist_ok=True)

# Call the function to extract images from the PDF
extract_images_from_pdf(Path_File, output_directory)

An error occurred on page 0 while extracting image 1: cannot identify image file <_io.BytesIO object at 0x00000189A07A49A0>
An error occurred on page 0 while extracting image 2: cannot identify image file <_io.BytesIO object at 0x00000189A07A49A0>
An error occurred on page 2 while extracting image 0: cannot identify image file <_io.BytesIO object at 0x00000189A07A49A0>
An error occurred on page 2 while extracting image 1: cannot identify image file <_io.BytesIO object at 0x00000189A07A49A0>
An error occurred on page 2 while extracting image 2: cannot identify image file <_io.BytesIO object at 0x00000189A07A49A0>
An error occurred on page 3 while extracting image 0: cannot identify image file <_io.BytesIO object at 0x00000189A07A49A0>
An error occurred on page 3 while extracting image 1: cannot identify image file <_io.BytesIO object at 0x00000189A07A49A0>
An error occurred on page 4 while extracting image 0: cannot identify image file <_io.BytesIO object at 0x00000189A07A49A0>
An error

In [13]:
import sys
import io
from pathlib import Path
import shutil
from pdfminer.high_level import extract_pages
from PIL import Image, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True

In [None]:
import sys
import io
from pathlib import Path
import shutil
from pdfminer.high_level import extract_pages
from PIL import Image, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


def extract_images_from_pdf(input_file_path, output_dir):
    input_file_path_posix = Path(input_file_path)
    output_base_path = Path(output_dir)
    shutil.rmtree(output_base_path, ignore_errors=True)
    output_base_path.mkdir(parents=True, exist_ok=True)

    meta_data = get_meta_data(input_file_path)
    page_layouts = get_page_layouts(input_file_path, meta_data["total_pages"])

    for page_layout_index, page_layout in enumerate(page_layouts):
        figures = get_lt_figures(page_layout)
        for figure_index, figure in enumerate(figures):
            image_info = get_image_info(figure)
            if image_info is not False:
                if isinstance(image_info, list):
                    for image_info_index, candidate in enumerate(image_info):
                        try:
                            candidate["image_save_path"] = output_base_path.joinpath(
                                f"Page_{page_layout_index+1}_{figure_index+1}_{candidate['mode']}.{candidate['type']}"
                            )
                            candidate["pil_image"].save(
                                str(candidate["image_save_path"])
                            )
                        except Exception as e:
                            print(e)
                else:
                    if image_info["type"] is not False:
                        image_info["image_save_path"] = output_base_path.joinpath(
                            f"Page_{page_layout_index+1}_{figure_index+1}.{image_info['type']}"
                        )
                        image_info["pil_image"].save(str(image_info["image_save_path"]))

In [7]:
# import libraries
import fitz
import io
from PIL import Image

# Define the paths
file = "C:\\Users\\pablosal\\Desktop\\gbbai-azure-ai-document-intelligence\\notebooks\\dev\\instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf"
# Directory where images will be saved
output_directory = "C:\\Users\\pablosal\\Desktop\\gbbai-azure-ai-document-intelligence\\notebooks\\dev\\output2\\"

# open the file
pdf_file = fitz.open(file)

# STEP 3
# iterate over PDF pages
for page_index in range(len(pdf_file)):
    # get the page itself
    page = pdf_file[page_index]
    image_list = page.get_images()

    # printing number of images found in this page
    if image_list:
        print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
    else:
        print("[!] No images found on page", page_index)

    for image_index, img in enumerate(page.get_images(), start=1):
        try:
            # get the XREF of the image
            xref = img[0]

            # extract the image bytes
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]

            # get the image extension
            image_ext = base_image["ext"]

            # load it to PIL
            image = Image.open(io.BytesIO(image_bytes))

            # save the image to the specified directory
            image_save_path = f"{output_directory}image{page_index+1}_{image_index}.png"
            image.save(image_save_path)
            print(f"Image saved at: {image_save_path}")

        except Exception as e:
            print(
                f"An error occurred on page {page_index} while extracting image {image_index}: {e}"
            )

[+] Found a total of 3 images in page 0
Image saved at: C:\Users\pablosal\Desktop\gbbai-azure-ai-document-intelligence\notebooks\dev\output2\image1_1.png
Image saved at: C:\Users\pablosal\Desktop\gbbai-azure-ai-document-intelligence\notebooks\dev\output2\image1_2.png
Image saved at: C:\Users\pablosal\Desktop\gbbai-azure-ai-document-intelligence\notebooks\dev\output2\image1_3.png
[!] No images found on page 1
[+] Found a total of 3 images in page 2
Image saved at: C:\Users\pablosal\Desktop\gbbai-azure-ai-document-intelligence\notebooks\dev\output2\image3_1.png
Image saved at: C:\Users\pablosal\Desktop\gbbai-azure-ai-document-intelligence\notebooks\dev\output2\image3_2.png
Image saved at: C:\Users\pablosal\Desktop\gbbai-azure-ai-document-intelligence\notebooks\dev\output2\image3_3.png
[+] Found a total of 2 images in page 3
Image saved at: C:\Users\pablosal\Desktop\gbbai-azure-ai-document-intelligence\notebooks\dev\output2\image4_1.png
Image saved at: C:\Users\pablosal\Desktop\gbbai-azur

In [9]:
import os
import pytesseract

# Get the current working directory
script_dir = os.getcwd()

# Set the tesseract_cmd variable to the tesseract executable in the script's directory
pytesseract.pytesseract.tesseract_cmd = os.path.join(
    script_dir, "Tesseract-OCR", "tesseract.exe"
)

In [4]:
import pytesseract

# Set the tesseract_cmd variable to the tesseract executable in the specified directory
pytesseract.pytesseract.tesseract_cmd = (
    r"C:\Users\pablosal\AppData\Local\Tesseract-OCR\tesseract.exe"
)

In [None]:
pdf_document = fitz.open("file.pdf")
for current_page in range(len(pdf_document)):
    for image in pdf_document.getPageImageList(current_page):
        xref = image[0]
        pix = fitz.Pixmap(pdf_document, xref)
        if pix.n < 5:  # this is GRAY or RGB
            pix.writePNG("page%s-%s.png" % (current_page, xref))
        else:  # CMYK: convert to RGB first
            pix1 = fitz.Pixmap(fitz.csRGB, pix)
            pix1.writePNG("page%s-%s.png" % (current_page, xref))
            pix1 = None
        pix = None

In [11]:
# Get elements
raw_pdf_elements = partition_pdf(
    filename=Path_File,
    # Using pdf format to find embedded image blocks
    extract_images_in_pdf=True,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    # Hard max on chunks
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path="workdir",
)

TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.

In [17]:
doc = fitz.Document(Path_File)

In [21]:
import fitz  # PyMuPDF
import io
from PIL import Image

pdf_path = Path_File  # Update this to your PDF file path
doc = fitz.open(pdf_path)
extracted_images = []

for i in range(len(doc)):
    for img in doc.get_page_images(i):
        xref = img[0]
        base_image = doc.extract_image(xref)
        image_data = base_image.get("image")

        if image_data:
            try:
                img = Image.open(io.BytesIO(image_data))
                extracted_images.append(img)
            except IOError as e:
                print(f"Error processing image xref {xref} on page {i}: {e}")
        else:
            print(f"No image found for xref {xref} on page {i}")

Error processing image xref 22 on page 7: cannot identify image file <_io.BytesIO object at 0x000002723FC81680>
Error processing image xref 26 on page 8: cannot identify image file <_io.BytesIO object at 0x0000027240290180>
Error processing image xref 27 on page 8: cannot identify image file <_io.BytesIO object at 0x0000027240290180>
Error processing image xref 30 on page 9: cannot identify image file <_io.BytesIO object at 0x0000027240290180>
Error processing image xref 37 on page 11: cannot identify image file <_io.BytesIO object at 0x0000027240290130>
Error processing image xref 50 on page 15: cannot identify image file <_io.BytesIO object at 0x000002724028D180>
Error processing image xref 54 on page 16: cannot identify image file <_io.BytesIO object at 0x000002724028D180>
Error processing image xref 67 on page 20: cannot identify image file <_io.BytesIO object at 0x000002724028D180>
Error processing image xref 95 on page 28: cannot identify image file <_io.BytesIO object at 0x00000

In [19]:
from PIL import Image
import os
import io

for i in tqdm(range(len(doc)), desc="pages"):
    for img in tqdm(doc.get_page_images(i), desc="page_images"):
        xref = img[0]
        base = doc.extract_image(xref)
        image_data = base["image"]

        # Convert the image data to a PIL Image and display it
        img = Image.open(io.BytesIO(image_data))
        img.show()

page_images: 100%|██████████| 3/3 [00:12<00:00,  4.14s/it]
page_images: 0it [00:00, ?it/s]0:12<09:44, 12.44s/it]
page_images: 100%|██████████| 1/1 [00:03<00:00,  3.49s/it]
page_images: 100%|██████████| 1/1 [00:03<00:00,  3.42s/it]
page_images: 0it [00:00, ?it/s]0:19<03:02,  4.15s/it]
page_images: 100%|██████████| 1/1 [00:03<00:00,  3.39s/it]
page_images: 100%|██████████| 1/1 [00:03<00:00,  3.37s/it]
page_images:   0%|          | 0/2 [00:00<?, ?it/s]it]
pages:  15%|█▍        | 7/48 [00:26<02:33,  3.74s/it]


UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x000002723EAB12C0>

In [14]:
doc

Document('C:\Users\pablosal\Desktop\gbbai-azure-ai-document-intelligence\notebooks\dev\instruction-manual-fisher-ewd-ews-ewt-valves-through-nps-12x8-en-124788.pdf')

In [7]:
pdf = pdfquery.PDFQuery(Path_File)
pdf.load()

In [9]:
pdf.tree

<lxml.etree._ElementTree at 0x2723e6ac580>