In [None]:
from PIL import Image
import io 
import matplotlib.pyplot as plt

# Quarterly report used by the PyMuPDF / pymupdf4llm / unstructured experiments below.
intel_doc = "../data/docs/pdf/2022 Q3 INTC.pdf"

# Page numbers handed to doc.load_page() and to_markdown(pages=[...]) in later cells
# (PyMuPDF page numbers are 0-based).
page_with_image = 3
page_with_table = 34

In [None]:
def print_images(images):
    """Show a sequence of images side by side in a single matplotlib row.

    Args:
        images: Sequence of objects accepted by ``Axes.imshow`` (e.g. PIL
            images). May be empty.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    num_images = len(images)
    if num_images == 0:
        # plt.subplots(1, 0) raises ValueError, so there is nothing to draw.
        print("No images to display.")
        return

    # squeeze=False always yields a 2-D array of axes, so the single-image
    # case needs no special handling.
    _, axes = plt.subplots(1, num_images, figsize=(15, 5), squeeze=False)

    for ax, img in zip(axes[0], images):
        ax.imshow(img)
        ax.axis('off')  # Hide ticks and frame

    plt.tight_layout()
    plt.show()

## PyMuPDF

In [None]:
import fitz  

# Open the Intel report; the next cells read pages from this handle.
doc = fitz.open(intel_doc)

### Page with image

In [None]:
# Plain-text extraction of the page selected by page_with_image.
page = doc.load_page(page_with_image)
text = page.get_text()

print(text)


### Page with table

In [None]:
# Plain-text extraction of the page selected by page_with_table.
page = doc.load_page(page_with_table)
text = page.get_text()

print(text)


## Conclusion

Have to test/check how well it performs with retrieval.

Images need to be dealt with separately.
Tables need to be dealt with separately.

## PyMuPDF4LLM

In [None]:
import pymupdf4llm

# Convert only the image page to Markdown so it can be compared with page.get_text().
markdown = pymupdf4llm.to_markdown(intel_doc, pages=[page_with_image])
print(markdown)

In [None]:
# Convert only the table page to Markdown so it can be compared with page.get_text().
markdown = pymupdf4llm.to_markdown(intel_doc, pages=[page_with_table])
print(markdown)

## Conclusion

Markdown seems to improve handling of tables? Not entirely sure. Need to test both methods with retrieval.

Images need to be dealt with separately

## Unstructured

In [None]:
from unstructured.partition.pdf import partition_pdf

# Partition the whole Intel report into elements with the "hi_res" strategy.
# Image extraction is switched off; table-structure inference is switched on.
raw_pdf_elements = partition_pdf(
    filename=intel_doc,
    extract_images_in_pdf=False,
    infer_table_structure=True,
    strategy = "hi_res"
)

In [None]:
# Tally how many elements of each Python type the partitioner returned.
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    category_counts[category] = category_counts.get(category, 0) + 1

# The distinct element types that occurred at least once.
unique_categories = set(category_counts)
category_counts

In [None]:
# Inspect the text of one parsed element (hard-coded index 38).
raw_pdf_elements[38].text

## Conclusion

Only detected 1 table, so performance is not what I expected. Requires quite a bit of set-up, so not suitable for easy set-up / demo purposes.

## Handling images

In [None]:
# Pull every image embedded on the page out of the PDF and decode it with Pillow.
doc = fitz.open(intel_doc)
page = doc.load_page(page_with_image)
image_list = page.get_images(full=True)
    
high_res_images = []
for i, img in enumerate(image_list):
    xref = img[0]  # Image XREF
    base_image = doc.extract_image(xref)
    
    # extract_image returns a dict; the "image" entry holds the raw image bytes.
    image_bytes = base_image["image"]
    image = Image.open(io.BytesIO(image_bytes))    
    high_res_images.append(image)

print_images(high_res_images)

In [None]:
# Render the strip of page directly above each embedded image (its header).
doc = fitz.open(intel_doc)
page = doc.load_page(page_with_image)
image_list = page.get_images(full=True)

# Render at 2.5x in both directions so the header text stays legible.
zoom_x = 2.5  # horizontal zoom
zoom_y = 2.5  # vertical zoom
# fitz.Matrix is used because `pymupdf` is only imported in a later cell;
# referencing pymupdf.Matrix here fails on Restart & Run All.
mat = fitz.Matrix(zoom_x, zoom_y)

low_res_images = []
for img in image_list:
    xref = img[0]  # Image XREF
    # First placement of the image on the page: (bounding box, transform matrix).
    bbox, _transform = page.get_image_rects(xref, transform=True)[0]
    bbox_height = bbox[3] - bbox[1]
    # Extend upwards by 30% of the image height, clamped to the top of the page.
    extended_y0 = max(0, bbox[1] - 0.3 * bbox_height)
    # Region between the extended top edge and the image's own top edge.
    extended_bbox = (bbox[0], extended_y0, bbox[2], bbox[1])

    # Clip the page to the header region and rasterise it.
    pix = page.get_pixmap(clip=extended_bbox, matrix=mat)
    image = Image.open(io.BytesIO(pix.tobytes()))
    low_res_images.append(image)

print_images(low_res_images)


In [None]:
# Stack each rendered header strip (top_image) on top of its extracted image.
final_images = []
for image, top_image in zip(high_res_images, low_res_images):
    # Resize the top_image to match the width of the image
    if top_image.width != image.width:
        aspect_ratio = top_image.height / top_image.width
        new_width = image.width
        # Scale the height by the same factor so the header is not distorted.
        new_height = int(new_width * aspect_ratio)
        top_image = top_image.resize((new_width, new_height))
    
    # Create a new image with the combined height of the two images
    combined_height = top_image.height + image.height
    combined_image = Image.new("RGB", (image.width, combined_height))
    
    # Paste the top_image and image onto the combined_image
    combined_image.paste(top_image, (0, 0))
    combined_image.paste(image, (0, top_image.height))
    final_images.append(combined_image)

print_images(final_images)

In [None]:
# Save the second combined image (index 1) to disk; requires at least two combined images.
final_images[1].save("../data/test_image.png")

## Conclusion

Able to extract the images and their high-resolution headers. Tested with GPT-4o, and it is able to read them.

## Tables

In [None]:
import pymupdf
import fitz

# Rasterise page 13 of the Intel report at 2x zoom and show it inline.
doc = fitz.open(intel_doc)
page = doc.load_page(13)
zoom_x = 2  # horizontal zoom
zoom_y = 2  # vertical zoom
mat = pymupdf.Matrix(zoom_x, zoom_y)  # zoom factor 2 in each dimension
pix = page.get_pixmap(matrix=mat)  # use 'mat' instead of the identity matrix
# Create a Pillow Image object from the pixmap
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

display(image)

In [None]:
import pymupdf

# NOTE(review): this path has no "pdf/" folder, unlike intel_doc and the
# doc_root used for the same Apple file in a later cell — confirm which is right.
apple_doc = "../data/docs/2022 Q3 AAPL.pdf"
doc = fitz.open(apple_doc)
page = doc.load_page(3)
zoom_x = 2  # horizontal zoom
zoom_y = 2  # vertical zoom
mat = pymupdf.Matrix(zoom_x, zoom_y)  # zoom factor 2 in each dimension
pix = page.get_pixmap(matrix=mat)  # use 'mat' instead of the identity matrix
# Create a Pillow Image object from the pixmap
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

display(image)

In [None]:
import os

import requests

API_URL = "https://api-inference.huggingface.co/models/microsoft/table-transformer-detection"
# Read the token from the environment instead of pasting it into the notebook.
# Falls back to an empty token when HF_API_TOKEN is not set.
headers = {"Authorization": "Bearer " + os.environ.get("HF_API_TOKEN", "")}

doc_root = "../data/docs/pdf/"
doc_name = "2022 Q3 AAPL.pdf"

doc = fitz.open(doc_root + doc_name)


def query():
    """Send the current global ``image`` to the table-detection endpoint.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")  # serialise the page image as PNG bytes
        data = buffer.getvalue()
    response = requests.post(API_URL, headers=headers, data=data)
    # Fail here rather than hand an error payload to the cropping cell.
    response.raise_for_status()
    return response.json()


# The zoom matrix is the same for every page, so build it once.
zoom_x = 2  # horizontal zoom
zoom_y = 2  # vertical zoom
mat = pymupdf.Matrix(zoom_x, zoom_y)  # zoom factor 2 in each dimension

page_outputs = []  # one detection result per page, in page order
for page in doc:
    pix = page.get_pixmap(matrix=mat)
    # Create a Pillow Image object from the pixmap
    image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    output = query()
    page_outputs.append(output)

# After the loop, `image` and `output` refer to the last page of the document.
output

In [None]:
# Crop every detected table out of the rendered page, with a margin on each side.
if not isinstance(output, list):
    # Each detection is read as a dict with a 'box' entry, so anything other
    # than a list of detections (presumably an error payload) cannot be cropped.
    raise RuntimeError(f"Unexpected table-detection response: {output!r}")

# Fraction of the box width/height added as padding on every side.
extension_percentage = 0.10

images = []
for table in output:
    box = table['box']
    width = box['xmax'] - box['xmin']
    height = box['ymax'] - box['ymin']

    extended_width = int(width * extension_percentage)
    extended_height = int(height * extension_percentage)

    # Update the coordinates with the extension, ensuring they do not go out of bounds
    new_xmin = max(box['xmin'] - extended_width, 0)
    new_ymin = max(box['ymin'] - extended_height, 0)
    new_xmax = min(box['xmax'] + extended_width, image.width)
    new_ymax = min(box['ymax'] + extended_height, image.height)

    # New crop coordinates
    crop_coordinates = (new_xmin, new_ymin, new_xmax, new_ymax)
    table_image = image.crop(crop_coordinates)
    images.append(table_image)

if images:
    print_images(images)
else:
    # Plotting zero images would fail, so report the empty result instead.
    print("No tables detected on this page.")

In [None]:
import pymupdf

# Rasterise page 38 of the Intel report at 3x zoom and show it inline.
doc = fitz.open(intel_doc)
page = doc.load_page(38)
zoom_x = 3  # horizontal zoom
zoom_y = 3  # vertical zoom
mat = pymupdf.Matrix(zoom_x, zoom_y)  # zoom factor 3 in each dimension
pix = page.get_pixmap(matrix=mat)  # use 'mat' instead of the identity matrix
# Create a Pillow Image object from the pixmap
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

display(image)

In [None]:
import os

import requests


import pymupdf

# Rasterise page 38 of the Intel report at 3x zoom.
doc = fitz.open(intel_doc)
page = doc.load_page(38)
zoom_x = 3  # horizontal zoom
zoom_y = 3  # vertical zoom
mat = pymupdf.Matrix(zoom_x, zoom_y)  # zoom factor 3 in each dimension
pix = page.get_pixmap(matrix=mat)  # use 'mat' instead of the identity matrix
# Create a Pillow Image object from the pixmap
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

display(image)

API_URL = "https://api-inference.huggingface.co/models/microsoft/table-transformer-detection"
# Read the token from the environment instead of pasting it into the notebook.
# Falls back to an empty token when HF_API_TOKEN is not set.
headers = {"Authorization": "Bearer " + os.environ.get("HF_API_TOKEN", "")}


def query():
    """Send the current global ``image`` to the table-detection endpoint.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")  # serialise the page image as PNG bytes
        data = buffer.getvalue()
    response = requests.post(API_URL, headers=headers, data=data)
    # Fail here rather than return an error payload as if it were detections.
    response.raise_for_status()
    return response.json()


output = query()
output

### Can ChatGPT properly read Markdown / page.get_text()?


In [None]:
# Markdown rendering of page 38, to compare against the plain-text extraction.
markdown = pymupdf4llm.to_markdown(intel_doc, pages=[38])
print(markdown)

In [None]:
# Plain-text extraction of whichever page the most recently run cell bound to `page`.
print(page.get_text())

In [None]:
# Open only the document that is actually scanned; opening another file first
# and rebinding `doc` straight away would leave that handle open and unused.
# NOTE(review): this path has no "pdf/" folder, unlike intel_doc — confirm.
doc = fitz.open("../data/docs/2022 Q3 MSFT.pdf")

for page in doc:
    tabs = page.find_tables()
    if tabs.tables:
        # Print the extracted cell contents of the first table on the page.
        print(tabs[0].extract())

In [None]:
import pdfplumber

# Open the PDF and extract pages
# NOTE(review): this path has no "pdf/" folder, unlike intel_doc — confirm.
with pdfplumber.open("../data/docs/2022 Q3 MSFT.pdf") as pdf:
    for page in pdf.pages:
        tables = page.extract_tables()  # Extract tables
        print(tables)

In [None]:
import pymupdf4llm

# Dump the Markdown of page 38 to a temporary text file for partition_text.
markdown = pymupdf4llm.to_markdown(intel_doc, pages=[38])
# The context manager closes the file even if the write fails. UTF-8 is set
# explicitly so non-ASCII characters in the Markdown do not depend on the
# platform's default encoding.
with open("test.txt", "w", encoding="utf-8") as text_file:
    text_file.write(markdown)

In [None]:
from unstructured.partition.text import partition_text

# Partition the Markdown dump written to test.txt.
raw_elements = partition_text(filename="test.txt")

# Tally how many elements of each Python type the partitioner returned.
category_counts = {}

for element in raw_elements:
    category = str(type(element))
    category_counts[category] = category_counts.get(category, 0) + 1

# The distinct element types that occurred at least once.
unique_categories = set(category_counts)
category_counts

In [None]:
import os
# Delete the temporary Markdown dump.
os.remove("test.txt")

In [None]:
from unstructured.partition.pdf import partition_pdf

# Path of the 2022 Q3 Intel report.
fname = "../data/docs/pdf/2022 Q3 INTC.pdf"

# Partition with the "hi_res" strategy and table-structure inference enabled.
elements = partition_pdf(filename=fname,
                         infer_table_structure=True,
                         strategy='hi_res',
           )

# Keep only the elements whose category is "Table".
tables = [el for el in elements if el.category == "Table"]

In [None]:
# Show the HTML stored in the metadata of the fifth detected table (hard-coded index 4).
tables[4].metadata.text_as_html

In [None]:
import pymupdf
import fitz
from PIL import Image

# NOTE: rebinds intel_doc to the 2023 Q2 report; cells run after this one that
# read intel_doc will use this file.
intel_doc = "../data/docs/pdf/2023 Q2 INTC.pdf"
doc = fitz.open(intel_doc)
page = doc.load_page(5)
zoom_x = 3  # horizontal zoom
zoom_y = 3  # vertical zoom
mat = pymupdf.Matrix(zoom_x, zoom_y)  # zoom factor 3 in each dimension
pix = page.get_pixmap(matrix=mat)  # use 'mat' instead of the identity matrix
# Create a Pillow Image object from the pixmap
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

# Show the rendered page: the `image` instance, not the PIL `Image` module.
display(image)

In [None]:
from ultralyticsplus import YOLO, render_result

# load model
model = YOLO('foduucom/table-detection-and-extraction')

# set model parameters
model.overrides['conf'] = 0.25  # NMS confidence threshold
model.overrides['iou'] = 0.45  # NMS IoU threshold
model.overrides['agnostic_nms'] = False  # NMS class-agnostic
model.overrides['max_det'] = 1000  # maximum number of detections per image

In [None]:
# Run the detector on the rendered page and draw the first result onto it.
results = model.predict(image)
render = render_result(model=model, image=image, result=results[0])
display(render)

## Conclusion

One eternity later, but we can successfully detect tables. Now to extract them to Markdown.

In [None]:
from PIL import Image
from transformers import TableTransformerForObjectDetection
from transformers import DetrFeatureExtractor
import torch
import pandas as pd
import pytesseract

# NOTE(review): DetrFeatureExtractor is deprecated in recent transformers
# releases in favour of DetrImageProcessor — confirm against the installed version.
feature_extractor = DetrFeatureExtractor()
# Rebinds `model` (the YOLO detector in the cells above) to the table-structure-recognition model.
model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-structure-recognition")

In [None]:
def compute_boxes(image_path):
    """Run the table-structure model on an image file.

    Args:
        image_path: Path of the image to analyse.

    Returns:
        A ``(boxes, labels)`` pair of plain Python lists taken from
        ``post_process_object_detection`` with a 0.7 threshold and the
        image's ``(height, width)`` as target size.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    img_width, img_height = rgb_image.size

    model_inputs = feature_extractor(rgb_image, return_tensors="pt")

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        raw_outputs = model(**model_inputs)

    detections = feature_extractor.post_process_object_detection(
        raw_outputs,
        threshold=0.7,
        target_sizes=[(img_height, img_width)],
    )[0]

    return detections['boxes'].tolist(), detections['labels'].tolist()

def _ocr_cell(image, box, scale=4):
    """Crop ``box`` from ``image``, upscale it and return its OCR text without trailing whitespace."""
    cell_image = image.crop(box)
    enlarged_size = (cell_image.width * scale, cell_image.height * scale)
    cell_image = cell_image.resize(enlarged_size, resample=Image.LANCZOS)
    return pytesseract.image_to_string(cell_image).rstrip()


def extract_table(image_path):
    """OCR a table image into a DataFrame using the detected row/column grid.

    Every detected row box is intersected with every detected column box to
    form a cell. The top row of cells supplies the column headers and each
    following row of cells becomes one DataFrame row. Cell text is kept in
    full.

    Args:
        image_path: Path of the cropped table image.

    Returns:
        pandas.DataFrame with one column per detected table column. An empty
        DataFrame is returned when no row or no column is detected.
    """
    image = Image.open(image_path).convert("RGB")
    boxes, labels = compute_boxes(image_path)

    # Label 2 marks a row box and label 1 a column box: a cell takes its x
    # extent from the label-1 box and its y extent from the label-2 box.
    # NOTE(review): matches the model's id2label as far as I can tell — confirm.
    row_boxes = [box for box, label in zip(boxes, labels) if label == 2]
    column_boxes = [box for box, label in zip(boxes, labels) if label == 1]

    num_columns = len(column_boxes)
    if not row_boxes or num_columns == 0:
        # Without a grid there are no cells to read.
        return pd.DataFrame()

    cell_locations = [
        (col[0], row[1], col[2], row[3])
        for row in row_boxes
        for col in column_boxes
    ]
    # Reading order: top-to-bottom, then left-to-right.
    cell_locations.sort(key=lambda cell: (cell[1], cell[0]))

    # Each row contributes exactly one cell per column, so the first
    # num_columns cells form the header row (also for a single-row table).
    headers = [_ocr_cell(image, box) for box in cell_locations[:num_columns]]

    body_cells = cell_locations[num_columns:]
    rows = [
        [_ocr_cell(image, box) for box in body_cells[start:start + num_columns]]
        for start in range(0, len(body_cells), num_columns)
    ]

    # Build the frame in one go instead of growing it row by row.
    return pd.DataFrame(rows, columns=headers)

# Cropped table image to run through extract_table.
image_path = "../data/tables/2022 Q3 AAPL/5_0.png"

df = extract_table(image_path)

df

In [None]:
# Open a new PDF
import fitz
pdf_document = fitz.open()

# Create a new PDF page with the same dimensions as the image
img = fitz.Pixmap(image_path)
page = pdf_document.new_page(width=img.width, height=img.height)
# Insert the image into the PDF page
page.insert_image(page.rect, pixmap=img)
tables = page.find_tables()  # detect the tables on the current page

# NOTE(review): `a` is not read anywhere in the visible notebook; looks like a leftover debugging statement.
a = 2


In [None]:
image_path = "../data/tables/2022 Q3 AAPL/7_0.png"


def image_to_pdf(image_path):
    """Wrap an image file in a new one-page PDF whose page has the image's pixel dimensions.

    Args:
        image_path: Path of the image to embed.

    Returns:
        The in-memory fitz document holding the single page.
    """
    pixmap = fitz.Pixmap(image_path)

    # fitz.open() without arguments creates a new, empty PDF.
    pdf = fitz.open()
    new_page = pdf.new_page(width=pixmap.width, height=pixmap.height)

    # Place the image so that it fills the whole page rectangle.
    new_page.insert_image(new_page.rect, pixmap=pixmap)
    return pdf

markdown = pymupdf4llm.to_markdown(image_to_pdf(image_path))
print(markdown)