# PDF and image experiments
Leverages PyMuPDF, Image and other libraries

In [29]:
import pymupdf
import easyocr, torch
import pytesseract
import yaml
from datetime import datetime
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch
import cv2
from PIL import Image
import io
from IPython.display import display
import numpy as np
from Levenshtein import distance

In [5]:
# Read config
with open("config.yml", "r") as f:
    config = yaml.safe_load(f)

# Extract images from PDFs
pdf_file = config["sample_pdf"]["doc1"]

# Read PDF
doc = pymupdf.open(pdf_file)
print(f"PDF file: {pdf_file}")
print(f"Number of pages: {doc.page_count}")
print(f"creator: {doc.metadata['creator']}")
print(f"producer: {doc.metadata['producer']}")
date_format = "%Y%m%d%H%M%S"
creation_date = doc.metadata['creationDate'][2:16]
creation_str = datetime.strptime(creation_date, date_format)
modified_date = doc.metadata['modDate'][2:16]
modified_str = datetime.strptime(modified_date, date_format)
print(f"creationDate: {creation_str}")
print(f"modifiedDate: {modified_str}")

PDF file: /Users/xyz-ai/Developer/data/jfk/docs/pdf/20250318/198-10007-10029.pdf
Number of pages: 16
creator: Aspose Pty Ltd.
producer: Aspose.PDF for .NET 23.2.0
creationDate: 2025-03-18 03:49:29
modifiedDate: 2025-03-18 03:49:29


The get_page_images(pno, full=False) returns followings:

 - xref (int) is the image object number
 - smask (int) is the object number of its soft-mask image
 - width (int) is the image width
 - height (int) is the image height
 - bpc (int) denotes the number of bits per component (normally 8)
 - colorspace (str) a string naming the colorspace (like DeviceRGB)
 - alt_colorspace (str) is any alternate colorspace depending on the value of colorspace
 - name (str) is the symbolic name by which the image is referenced
 - filter (str) is the decode filter of the image (Adobe PDF References, pp. 22).
 - referencer (int) the xref of the referencer. Zero if directly referenced by the page. Only present if full=True.

In [None]:
# Print the number of pages in the PDF
page_number = 0
for page in doc:
    page_number += 1
    image_count = 0
    images = page.get_images()
    for image in images:
        image_count += 1
        xref = image[0] # Reference number of the image object
        w, h = image[2:4] # width and height of the image
        bw = image[4] # bits per pixel, 1 is black and white
        print(f"Page: {page_number}, Images: {image_count}: {xref} of {w}x{h}")

In [None]:
# Test the first page images
page1_images = doc[0].get_images()
for image in page1_images:
    print(image)
first_image_xref = page1_images[0][0]

In [5]:
# Extract the first image and show it
pix = pymupdf.Pixmap(doc, first_image_xref)  # create a Pixmap
img = Image.open(io.BytesIO(pix.tobytes("png")))
#display(img)

## Tesseract
Use the [pytesseract](https://pypi.org/project/pytesseract/) to perform the OCR.

In [None]:
# Perform OCR on the image, using pytesseract
text_pytesseract = pytesseract.image_to_string(img)
print(text_pytesseract)

## EasyOCR
[EasyOCR: A Comprehensive Guide](https://medium.com/@adityamahajan.work/easyocr-a-comprehensive-guide-5ff1cb850168) is a easy guide for OCR. 

In [None]:
# Perform OCR on the image, using easyocr
# Set device to MPS if available, else fallback to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize EasyOCR reader
reader = easyocr.Reader(['en'], gpu=True) # gpu=True for MPS
reader.detector.to(device) # Move detection model to MPS
reader.recognizer.to(device) # Move recognition model to MPS


In [None]:
type(img)

In [12]:

# Load and process image
# direct PNG file: image_path = config["sample_image"]["page1"]
if isinstance(img, str):
    image_path = img
else:
    image_path = np.array(img)

text_easyocr = reader.readtext(image_path)

In [None]:
# Print results
for (bbox, text, prob) in text_easyocr:
    print(f"{text}")

## Microsoft TrOCR


In [30]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch
import cv2
import numpy as np

# Load TrOCR processor and model
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed')

# Optional: Move to GPU (CUDA or MPS) if available
device = torch.device("mps" if torch.backends.mps.is_available(
) else "cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.50.0"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

Using device: mps


In [35]:
# Load and preprocess the image
image_path = config["sample_image"]["page1"]
# Load the image in grayscale, better for text recognition
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
image = cv2.convertScaleAbs(image, alpha=1.5, beta=10)
image_denoised = cv2.fastNlMeansDenoising(image, h=20)
_, image_binary = cv2.threshold(image_denoised, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
kernel = np.ones((2, 2), np.uint8)  # Smaller kernel to avoid over-connection
image_binary = cv2.dilate(image_binary, kernel, iterations=1)  # Reduced iterations

# Pytesseract detection
custom_config = r'--psm 11 --oem 3 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:- -c min_characters_to_try=3'
boxes = pytesseract.image_to_data(image_binary, output_type=pytesseract.Output.DICT, config=custom_config)

In [36]:
# Collect detected regions with stricter filters
# Collect detected regions
detected_regions = []
for i in range(len(boxes['text'])):
    text = boxes['text'][i].strip()
    if text and len(text) >= 3:
        x, y, w, h = (boxes['left'][i], boxes['top'][i],
                      boxes['width'][i], boxes['height'][i])
        if w > 30 and h > 8:
            detected_regions.append((x, y, w, h))

# Contour detection
contours, _ = cv2.findContours(
    image_binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for contour in contours:
    x, y, w, h = cv2.boundingRect(contour)
    if w > 30 and h > 8 and not any(abs(r[0] - x) < 15 and abs(r[1] - y) < 15 for r in detected_regions):
        detected_regions.append((x, y, w, h))

In [37]:
# Merge overlapping regions
def merge_regions(regions):
    merged = []
    regions = sorted(regions, key=lambda r: (r[1], r[0]))  # Sort by y, then x
    while regions:
        x1, y1, w1, h1 = regions.pop(0)
        i = 0
        while i < len(regions):
            x2, y2, w2, h2 = regions[i]
            if (abs(y1 - y2) < 15 and  # Vertical overlap/close
                    max(x1, x2) < min(x1 + w1, x2 + w2) + 20):  # Horizontal overlap with buffer
                w1 = max(x1 + w1, x2 + w2) - min(x1, x2)
                h1 = max(h1, h2)
                x1 = min(x1, x2)
                y1 = min(y1, y2)
                regions.pop(i)
            else:
                i += 1
        merged.append((x1, y1, w1, h1))
    return merged

In [38]:
merged_regions = merge_regions(detected_regions)

In [44]:
seen_texts = set()
text_trocr = []
for (x, y, w, h) in merged_regions:
    line_image = image[y:y+h, x:x+w]
    line_pil = Image.fromarray(line_image).convert('RGB')
    pixel_values = processor(line_pil, return_tensors="pt").pixel_values.to(device)
    generated_ids = model.generate(pixel_values, num_beams=5, max_length=50)
    trocr_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    if len(trocr_text) >= 1:
        seen_texts.add(trocr_text)
        text_trocr.append(trocr_text)


In [46]:
print("\n".join(text_trocr))

2125 RELEASE UNDER THE PRESIDENT JOHN F. KENNEDY ASSASSNATION RECORDS ACT OF 1992
198-10007-10029
JFK ASSASSINATION SYSTEM
DATE:
6/24/201
IDENTIFICATION FORM
AGENCY INFORMATION
AGENCY
ARMY
RECORD NUMBER
198-10007-10029
RECORD SERIES
CALIFANO PAPERS
AGENCY FILE NUMBER
DOCUMENT INFORMATION
ORIGINATOR
CIA
FROM
C. TRACY BARNES
REPORT ON THE COLD WAR USE OF RADIO BROADCASTING BY CLA
TITLE
06/25/1953
DATE
PAGES
SUBJECTS
RADIO BROADCASTING - CIA
DOCUMENT TYPE
PAPER, TEXTUAL DOCUMENT
CLASSIFICATION
SECRET
RESTRICTIONS
1A; 1B
CURRENT STATUS
REDACT
DATE OF LAST REVIEW
04/03/1998
OPENING CRITERIA
CALIFANO PAPERS, BOX 1, FOLDER 2, MEMO FROM BAMES RE: REPOT ON THE COLD WAR USE OF RATIO
COMMENTS :
BROADCASTING BY CIA.
JFK RAV:UN
JF
K
AVV
AV
DEPARTMENT OF THE ARMY EO 13526
DELASILY EXCLUDE D
EXEMPT
AUTHORITY
D REFER TO
DATE ANGIS BY MARNGLV/6-9
REVIEW
WWWWWWWWWWWWWWWW
V9.1
NW 50955 DOCID:32424022 PAGE 1


In [27]:
# Optional: Visualize bounding boxes
image_color = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
for (x, y, w, h) in merged_regions:
    cv2.rectangle(image_color, (x, y), (x+w, y+h), (0, 255, 0), 2)

In [28]:
# convert image_color to PIL image
image_color_pil = Image.fromarray(image_color)
# display(image_color_pil)
# Save the image
cv2.imwrite('/Users/xyz-ai/Downloads/jfk_page1_annotated.png', image_color)

True

In [None]:
print(np.version.version)