# OCR
This project implements OCR for the first 10 pages of a general PDF. The program will be compatible with SQL.

### Things to Note
- To use a Jupyter notebook in VS Code through uv, you must do the following:
    - uv init the project
    - create a venv
    - uv add --dev ipykernel
    - in VS Code, when selecting a kernel for the notebook, select Python, then your venv
- Using `easyocr`
    - in order to read PDF documents, I've had to use the package pdf2image, which requires the poppler binary files that are attached in this project
    - this introduces a system dependency on the bin files
    - also, I've been experiencing some crashes on my computer, I think because the model is running on my CPU
    - I have a GPU, though I'm not exactly sure how to get the model to run on it
    - it would be ideal if I could use Google Colab's GPUs (I already have an account with them)
    - installing easyocr automatically pulls in a CPU version of torch; I even tried uninstalling easyocr and downloading a CUDA 12.6 torch version,
      but "uv add easyocr" deletes the GPU version and reinstalls the CPU version

### Next Steps
- increase computation speed (switch to gpu)
- visualize which words are read on pdf (box detected letters)
- add compatibility with SQL

In [3]:
import sys

# Environment check: show which Python interpreter is running this kernel,
# so it is obvious whether the project's venv was picked up.
interpreter_path = sys.executable
print(interpreter_path)

c:\Users\dsbor\OneDrive\Desktop\Personal\Davis-PDF-Extraction\.venv\Scripts\python.exe


In [2]:
import torch

# GPU check: report whether this torch build can reach a CUDA device.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

False


In [None]:
# Smoke test for EasyOCR: read a single image and list every detection.
import easyocr

reader = easyocr.Reader(['en'], gpu=True)
result = reader.readtext('good_ex.jpg')

# Each detection unpacks as (bounding box, text, confidence);
# only the text and confidence are reported here.
for detection in result:
    _box, text, prob = detection
    print(f"Text: {text}, Probability: {prob:.4f}")

In [None]:
# PDF -> image -> text: render pages 1-10 of the PDF and OCR each one.
from pdf2image import convert_from_path
import easyocr
import numpy as np

pages = convert_from_path(
    'HW3-1.pdf',
    poppler_path=r'poppler_bin',
    first_page=1,
    last_page=10,
)

reader = easyocr.Reader(['en'], gpu=True)

for i, page_image in enumerate(pages, start=1):
    # readtext is given the page as a NumPy array built from the PIL image.
    result = reader.readtext(np.array(page_image))

    print(f"\n--- Text on page {i} ---")
    # Only the recognized text is printed; box and confidence are ignored.
    for _box, text, _prob in result:
        print(text)

In [None]:
# Putting it all together: OCR each page with EasyOCR, box and label what is
# recognized, and save one annotated PNG per page.
from pdf2image import convert_from_path
import easyocr
import numpy as np
import cv2

pages = convert_from_path('example.pdf', poppler_path=r'poppler_bin', first_page=1, last_page=10)

reader = easyocr.Reader(['en'], gpu=True)

for i, page_image in enumerate(pages, start=1):

    img_np = np.array(page_image)
    # Separate BGR copy for OpenCV drawing/writing; the page array is treated
    # as RGB here, and EasyOCR is given the unconverted array.
    img_cv = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)

    result = reader.readtext(img_np)

    for bbox, text, _prob in result:
        # Outline the detected region; the corner points are cast to int32
        # before being handed to polylines.
        pts = np.array(bbox, dtype=np.int32)
        cv2.polylines(img_cv, [pts], isClosed=True, color=(0, 255, 0), thickness=2)

        # Label origin sits 5 px above the first corner. Plain Python ints are
        # used so the point does not depend on OpenCV accepting NumPy scalars.
        label_x = int(pts[0][0])
        label_y = int(pts[0][1]) - 5
        cv2.putText(img_cv, text, (label_x, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)

    out_path = f'page_{i}_boxed.png'
    # cv2.imwrite reports failure through its return value, so check it
    # before claiming the file was saved.
    if cv2.imwrite(out_path, img_cv):
        print(f"Page {i} processed, saved as {out_path}")
    else:
        print(f"Page {i} processed, but saving {out_path} failed")


In [1]:
import cv2

# Confirm OpenCV is importable from this kernel and show which version it is.
opencv_version = cv2.__version__
print(opencv_version)


ModuleNotFoundError: No module named 'cv2'

In [2]:
# OCR pages 1-10 of the PDF with Tesseract, box and label every confident
# word, and save one annotated PNG per page.
from pdf2image import convert_from_path
import pytesseract
import numpy as np
import cv2

# If needed on Windows:
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

pages = convert_from_path(
    'example.pdf',
    poppler_path=r'poppler_bin',
    first_page=1,
    last_page=10
)

for i, page_image in enumerate(pages, start=1):

    img_np = np.array(page_image)
    # BGR copy is only for OpenCV drawing/writing.
    img_cv = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)

    # OCR with bounding boxes. pytesseract expects RGB channel order, so it
    # gets the unconverted page array rather than the BGR copy.
    data = pytesseract.image_to_data(
        img_np,
        lang='eng',
        output_type=pytesseract.Output.DICT
    )

    # The DICT output holds parallel lists; walk them together, one entry per
    # detected element.
    entries = zip(
        data['text'],
        data['conf'],
        data['left'],
        data['top'],
        data['width'],
        data['height'],
    )

    for raw_text, raw_conf, x, y, w, h in entries:
        text = raw_text.strip()
        # Confidence can arrive as an int, a string like '-1', or a float
        # string like '96.06'; int() rejects the last form, float() takes all.
        conf = float(raw_conf)

        # Skip empty or low-confidence text
        if text == "" or conf < 40:
            continue

        # Draw bounding box
        cv2.rectangle(
            img_cv,
            (x, y),
            (x + w, y + h),
            (0, 255, 0),
            2
        )

        # Put recognized text just above the box
        cv2.putText(
            img_cv,
            text,
            (x, y - 5),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 255),
            1,
            cv2.LINE_AA
        )

    out_path = f'page_{i}_boxed.png'
    # cv2.imwrite reports failure through its return value, so check it
    # before claiming the file was saved.
    if cv2.imwrite(out_path, img_cv):
        print(f"Page {i} processed, saved as {out_path}")
    else:
        print(f"Page {i} processed, but saving {out_path} failed")


ModuleNotFoundError: No module named 'cv2'