In [None]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

# from src.io.read import read_pdfs, _read_single_pdf
from src.config import DATA_DIR

import torch
import pandas as pd
import numpy as np

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from pdf2image import convert_from_path
import pytesseract

from PIL import Image, ImageDraw

import warnings

warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    module="transformers"
)


from tqdm import tqdm


In [None]:
def _read_single_pdf_pytesseract(path: Path):
    print(f"Reading {path.name} with pytesseract")
    pages = convert_from_path(path)
    
    data = []
    for page in pages[:1]:  # limit to first 10 pages for now
        print(f'Processing page {len(data)+1} / {len(pages)}')
        data.append(
            pd.DataFrame(
                pytesseract.image_to_data(
                    image = page,
                    lang = 'eng',
                    output_type=pytesseract.Output.DICT)
                        )
                    )

    return pages[0], data[0] # TODO: limited to first page for now


def _read_pdf(path: Path, method = 'pytesseract'):
    pdfs = []
    if method == 'pytesseract':
        for i in tqdm(range(30)):
            pdfs.append(_read_single_pdf_pytesseract(path / f"record{i}.pdf"))
    else:
        raise ValueError(f"Unknown method {method} for reading PDF")
    return zip(*pdfs)

In [4]:
pages, ocrs = _read_pdf(DATA_DIR, method='pytesseract')

# page, ocr_data = _read_single_pdf_pytesseract(DATA_DIR / f"record{4}.pdf")

# len(ocr_data['text'])

Reading record0.pdf with pytesseract
Processing page 1 / 1
Reading record1.pdf with pytesseract
Processing page 1 / 2
Reading record2.pdf with pytesseract
Processing page 1 / 4
Reading record3.pdf with pytesseract
Processing page 1 / 194
Reading record4.pdf with pytesseract
Processing page 1 / 24
Reading record5.pdf with pytesseract
Processing page 1 / 22
Reading record6.pdf with pytesseract
Processing page 1 / 27
Reading record7.pdf with pytesseract
Processing page 1 / 17
Reading record8.pdf with pytesseract
Processing page 1 / 11
Reading record9.pdf with pytesseract
Processing page 1 / 11
Reading record10.pdf with pytesseract
Processing page 1 / 24
Reading record11.pdf with pytesseract
Processing page 1 / 24
Reading record12.pdf with pytesseract
Processing page 1 / 13
Reading record13.pdf with pytesseract
Processing page 1 / 13
Reading record14.pdf with pytesseract
Processing page 1 / 6
Reading record15.pdf with pytesseract
Processing page 1 / 11
Reading record16.pdf with pytesseract

In [5]:
len(pages)
len(ocrs)

30

In [6]:
def draw_boxes(ocr_data, page):
  coordinates = ocr_data[['left', 'top', 'width', 'height']]
  actual_boxes = []
  for idx, row in coordinates.iterrows():
      x, y, w, h = tuple(row) # the row comes in (left, top, width, height) format
      actual_box = [x, y, x+w, y+h] # we turn it into (left, top, left+width, top+height) to get the actual box 
      actual_boxes.append(actual_box)

  draw = ImageDraw.Draw(page, "RGB")
  for box in actual_boxes:
    draw.rectangle(box, outline='red')
  return page

def concatenate(s):
    return ' '.join([str(t) for t in s if str(t) != 'nan' and str(t).strip() != ''])

def display_by_line(ocr_data):
    lines = ocr_data.groupby(
        ['page_num','block_num','line_num'])[['text']].agg(concatenate)
    return str('\n'.join(lines['text'].values))

In [8]:
def convert_text_to_boxes(ocr_data: pd.DataFrame):
    words = []
    boxes = []

    page_sizes = ocr_data[['width', 'height']].max().values
    w, h = page_sizes[0], page_sizes[1]

    for i, text in enumerate(ocr_data['text']):
        
        if text.strip() == "" and ocr_data['level'][i] != 5:
            continue
        words.append(text)
        x0 = int(ocr_data['left'][i] / w * 1000)
        y0 = int(ocr_data['top'][i] / h * 1000)
        x1 = int((ocr_data['left'][i] + ocr_data['width'][i]) / w * 1000)
        y1 = int((ocr_data['top'][i] + ocr_data['height'][i]) / h * 1000)
        boxes.append([x0, y0, x1, y1])

    print(f'Converted OCR data into {len(words)} words and {len(boxes)} boxes')
    return words, boxes

In [9]:
from transformers import AutoTokenizer, LayoutLMForQuestionAnswering, AutoModel, LayoutLMForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlmv3-base")
model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")

tokenizer_for_QA = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa")
model_for_QA = LayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", dtype=torch.float16)

In [10]:
def get_embeddings(ocrs):
    embeddings = []
    for ocr_data in ocrs:
        words, boxes = convert_text_to_boxes(ocr_data)

        encoding = tokenizer(
            words,
            boxes=boxes,
            return_token_type_ids=True,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
        )
        encoding["bbox"] = torch.tensor([boxes])
        pd.DataFrame(encoding).head()

        bbox = []
        for i, s, w in zip(encoding.input_ids[0], encoding.sequence_ids(0), encoding.word_ids(0)):
            if s == 1 and w is not None:
                bbox.append(boxes[w])
            elif i == tokenizer.sep_token_id:
                bbox.append([1000] * 4)
            else:
                bbox.append([0] * 4)
        encoding["bbox"] = torch.tensor([bbox])

        outputs = model(**encoding)
        embedding = outputs.last_hidden_state.detach().numpy()
        embeddings.append(embedding.mean(axis=1))

    return np.array(embeddings).squeeze()


In [11]:
embeddings = get_embeddings(ocrs)
embeddings.shape

Converted OCR data into 117 words and 117 boxes
Converted OCR data into 283 words and 283 boxes
Converted OCR data into 490 words and 490 boxes
Converted OCR data into 294 words and 294 boxes
Converted OCR data into 164 words and 164 boxes
Converted OCR data into 403 words and 403 boxes
Converted OCR data into 95 words and 95 boxes
Converted OCR data into 1 words and 1 boxes
Converted OCR data into 125 words and 125 boxes
Converted OCR data into 125 words and 125 boxes
Converted OCR data into 374 words and 374 boxes
Converted OCR data into 374 words and 374 boxes
Converted OCR data into 127 words and 127 boxes
Converted OCR data into 127 words and 127 boxes
Converted OCR data into 221 words and 221 boxes
Converted OCR data into 164 words and 164 boxes
Converted OCR data into 222 words and 222 boxes
Converted OCR data into 118 words and 118 boxes
Converted OCR data into 159 words and 159 boxes
Converted OCR data into 291 words and 291 boxes
Converted OCR data into 418 words and 418 boxe

(30, 768)

In [None]:
def ask_a_question(question = "what kind/type/category of document is this?", ocrs = ocrs):
    types = []
    for ocr_data in tqdm(ocrs):
        words, boxes = convert_text_to_boxes(ocr_data)

        encoding = tokenizer_for_QA(
            question.split(),
            words,
            is_split_into_words=True,
            return_token_type_ids=True,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
        )
        encoding["bbox"] = torch.tensor([boxes])

        bbox = []
        for i, s, w in zip(encoding.input_ids[0], encoding.sequence_ids(0), encoding.word_ids(0)):
            if s == 1 and w is not None:
                bbox.append(boxes[w])
            elif i == tokenizer.sep_token_id:
                bbox.append([1000] * 4)
            else:
                bbox.append([0] * 4)
        encoding["bbox"] = torch.tensor([bbox])

        outputs = model_for_QA(**encoding)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

        start_probabilities = torch.softmax(start_scores, dim=1).squeeze()
        end_probabilities = torch.softmax(end_scores, dim=1).squeeze()

        topk_start = torch.topk(start_probabilities, k=3)[1].squeeze()
        topk_end = torch.topk(end_probabilities, k=3)[1].squeeze()

        word_ids = encoding.word_ids(0)
        possible_answers = []
        for start_token, end_token in zip(topk_start, topk_end):
            if start_probabilities[start_token] > 0.1 and end_probabilities[end_token] > 0.1:
                start_word, end_word = word_ids[start_token], word_ids[end_token]

                if start_word is not None and end_word is not None and start_word <= end_word and end_word < start_word + 10:
                    possible_answers.append(" ".join(words[start_word : end_word + 1]))

        types.append(possible_answers if len(possible_answers) > 0 else ["N/A"])

    return types


In [57]:
possible_types = ask_a_question()

Converted OCR data into 117 words and 117 boxes
Converted OCR data into 283 words and 283 boxes
Converted OCR data into 490 words and 490 boxes
Converted OCR data into 294 words and 294 boxes
Converted OCR data into 164 words and 164 boxes
Converted OCR data into 403 words and 403 boxes
Converted OCR data into 95 words and 95 boxes
Converted OCR data into 1 words and 1 boxes
Converted OCR data into 125 words and 125 boxes
Converted OCR data into 125 words and 125 boxes
Converted OCR data into 374 words and 374 boxes
Converted OCR data into 374 words and 374 boxes
Converted OCR data into 127 words and 127 boxes
Converted OCR data into 127 words and 127 boxes
Converted OCR data into 221 words and 221 boxes
Converted OCR data into 164 words and 164 boxes
Converted OCR data into 222 words and 222 boxes
Converted OCR data into 118 words and 118 boxes
Converted OCR data into 159 words and 159 boxes
Converted OCR data into 291 words and 291 boxes
Converted OCR data into 418 words and 418 boxe

In [58]:
from sklearn.cluster import KMeans

kmeans_model = KMeans(n_clusters=9, random_state=42)
kmeans_model.fit(embeddings)
labels = kmeans_model.labels_
print(len(labels))


for label in set(labels):
    print(f"Cluster {label}:")
    for i, ocr_data in enumerate(ocrs):
        if labels[i] == label:
            print(f" - Document {i} - Possible types: {possible_types[i]}")

30
Cluster 0:
 - Document 10 - Possible types: ['Incident Report Report Cover Sheet']
 - Document 11 - Possible types: ['Incident Report Report Cover Sheet']
 - Document 19 - Possible types: ['Passenger Car/ Automobile']
Cluster 1:
 - Document 12 - Possible types: ['CONTROLLED DOCUMENT - DO NOT DUPLICATE Arrest/Detention Information']
 - Document 13 - Possible types: ['CONTROLLED DOCUMENT - DO NOT DUPLICATE Arrest/Detention Information']
 - Document 18 - Possible types: ['N/A']
 - Document 24 - Possible types: ['N/A']
 - Document 27 - Possible types: ['SWORN EMPLOYEE DISCIPLINARY PACKAGE CHECKLIST']
 - Document 28 - Possible types: ['Discovery Package']
Cluster 2:
 - Document 0 - Possible types: ['Action']
 - Document 17 - Possible types: ['ShotSpotter°']
 - Document 22 - Possible types: ['ShotSpotter°']
 - Document 23 - Possible types: ['ShotSpotter°']
Cluster 3:
 - Document 21 - Possible types: ['Large overview']
 - Document 26 - Possible types: ['Large overview']
Cluster 4:
 - Docum

  ret = a @ b
  ret = a @ b
  ret = a @ b


In [286]:
from transformers import LayoutLMv3FeatureExtractor, LayoutLMv3Model
feature_extractor = LayoutLMv3FeatureExtractor.from_pretrained("microsoft/layoutlmv3-base")
image = page.convert("RGB")
print(image.size)
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values




(1642, 2152)


In [283]:
pixel_values.shape

torch.Size([1, 3, 224, 224])