In [1]:
import sys
import json

In [2]:
import pdfminer3

In [4]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTChar, LTPage


In [22]:
def group_characters_into_words(characters, threshold=1.5, toffset=1):
    """Split a run of characters into words using the horizontal gaps between them.

    The gap between two neighbours is the next character's left edge minus the
    current character's right edge.  A word break is placed wherever that gap
    is strictly greater than ``median(gaps) * threshold + toffset``.

    Note: a later cell of this notebook defines another function with the same
    name but a different signature and return shape; once that cell has run it
    replaces this definition.

    Args:
        characters: Sequence of dicts, each holding a 4-item ``'bbox'``.  The
            first item is used as the left x-coordinate and the third as the
            right x-coordinate.
        threshold: Multiplier applied to the median gap.
        toffset: Constant added to the scaled median gap.

    Returns:
        A list of words, where each word is a list of the original character
        dicts.  Empty input yields ``[]``; a single character yields one
        one-character word.
    """
    from statistics import median

    if not characters:
        return []

    # A lone character has no neighbour, hence no gaps to take a median of.
    # (Previously this case crashed with IndexError inside the median helper.)
    if len(characters) == 1:
        return [[characters[0]]]

    gaps = []
    for current, following in zip(characters, characters[1:]):
        _, _, current_right, _ = current['bbox']
        following_left, _, _, _ = following['bbox']
        gaps.append(following_left - current_right)

    cutoff = median(gaps) * threshold + toffset

    words = []
    current_word = []
    for character, gap in zip(characters, gaps):
        current_word.append(character)
        if gap > cutoff:
            words.append(current_word)
            current_word = []

    # The loop pairs each character with the gap that follows it, so the last
    # character has not been placed in a word yet.
    current_word.append(characters[-1])
    words.append(current_word)

    return words


In [74]:
def estimate_threshold(characters):
    """Return the median horizontal gap between consecutive characters.

    A gap is the next character's left x-coordinate minus the current
    character's right x-coordinate (items 0 and 2 of each 4-item ``'bbox'``).

    Returns 0 when there are no characters or only one, i.e. when there is no
    pair of neighbours to measure.  With exactly one gap that gap is returned
    unchanged; otherwise the result is the average of the two middle values of
    the sorted gaps (which are the same value when the count is odd).
    """
    if not characters or len(characters) < 2:
        return 0

    gaps = []
    for left_char, right_char in zip(characters, characters[1:]):
        _, _, left_char_end, _ = left_char['bbox']
        right_char_start, _, _, _ = right_char['bbox']
        gaps.append(right_char_start - left_char_end)

    if len(gaps) == 1:
        return gaps[0]

    ordered = sorted(gaps)
    half = len(ordered) // 2
    # ~half == -half - 1: the element mirrored from the other end of the list.
    return (ordered[half] + ordered[~half]) / 2


In [123]:
from typing import List, Dict

def create_word_dict(characters: List[Dict[str, str]]) -> Dict[str, str]:
    """Merge a non-empty run of characters into a single word record.

    Args:
        characters: Character dicts, each with a ``'text'`` string and a
            ``'bbox'`` indexable as ``(x0, y0, x1, y1)``.

    Returns:
        ``{'word': <concatenated text>, 'bbox': <enclosing box>}`` where the
        enclosing box takes the minimum of items 0 and 1 and the maximum of
        items 2 and 3 over all character boxes.

    Raises:
        ValueError: if ``characters`` is empty (``min`` of an empty sequence).
    """
    boxes = [char['bbox'] for char in characters]

    enclosing_box = (
        min(box[0] for box in boxes),
        min(box[1] for box in boxes),
        max(box[2] for box in boxes),
        max(box[3] for box in boxes),
    )
    text = ''.join(char['text'] for char in characters)

    return {'word': text, 'bbox': enclosing_box}

def group_characters_into_words(characters: List[Dict[str, str]], threshold: float) -> List[Dict[str, str]]:
    """Group consecutive characters into word records using a fixed gap threshold.

    Two neighbouring characters are put into different words when either

    * the next character starts more than 10 units to the left of where the
      current one ends (presumably a jump back to the start of a new line --
      the constant 10 is hard-coded), or
    * the horizontal gap between them is strictly greater than ``threshold``.

    Args:
        characters: Character dicts with a 4-item ``'bbox'``; item 0 is used
            as the left edge and item 2 as the right edge.
        threshold: Largest gap that still keeps two characters in one word.

    Returns:
        One ``create_word_dict`` result per word, in input order.  Empty input
        yields an empty list.
    """
    if not characters:
        return []

    finished_words = []
    pending = [characters[0]]

    # Walk over neighbouring pairs; `pending` always ends with `left_char`.
    for left_char, right_char in zip(characters, characters[1:]):
        _, _, left_char_end, _ = left_char['bbox']
        right_char_start, _, _, _ = right_char['bbox']

        jumps_back = right_char_start < left_char_end-10
        gap_too_wide = right_char_start - left_char_end > threshold
        if jumps_back or gap_too_wide:
            finished_words.append(create_word_dict(pending))
            pending = []

        pending.append(right_char)

    # Whatever is still pending forms the final word.
    finished_words.append(create_word_dict(pending))
    return finished_words


In [124]:
# Lower-cased words loaded by read_system_dictionary(); consulted by the
# dictionary lookup helper defined in a later cell.
global_dictionary = set()

def read_system_dictionary(path='/usr/share/dict/words', encoding=None):
    """Load a one-word-per-line word list into ``global_dictionary``.

    Each line is stripped of surrounding whitespace and lower-cased before it
    is added.  Lines that are empty after stripping are skipped: an empty
    string in the set would make any token consisting solely of non-word
    characters look like a dictionary word.

    The set is updated in place, so calling this more than once (or with
    several files) accumulates words.

    Args:
        path: Word-list file to read.  Defaults to the Unix system dictionary.
        encoding: Text encoding passed to ``open``; ``None`` keeps Python's
            platform-dependent default.

    Raises:
        OSError: if the file cannot be opened (e.g. ``FileNotFoundError`` on
            systems without a word list at ``path``).
    """
    with open(path, 'r', encoding=encoding) as file:
        for line in file:
            word = line.strip().lower()
            if word:
                global_dictionary.add(word)

# Populate global_dictionary when this cell runs, so the set is filled before it is used.
read_system_dictionary()


In [125]:
import re

def is_word_in_dictionary(word):
    """Report whether ``word`` is present in ``global_dictionary``.

    Before the lookup every run of non-word characters (regex ``\\W+``) is
    removed and the remainder is lower-cased, so ``"Hello,"`` is looked up as
    ``"hello"``.
    """
    letters_only = re.sub(r'\W+', '', word)
    lookup_key = letters_only.lower()
    return lookup_key in global_dictionary


In [126]:
def find_optimal_grouping(characters, predicate, thresholds, verbose=True):
    """Pick the gap threshold whose word grouping satisfies ``predicate`` most often.

    Every threshold is tried with ``group_characters_into_words``; the score of
    a grouping is the number of its words for which ``predicate(word_text)`` is
    truthy.  The first grouping with the highest score wins.

    The best score starts below zero so that a grouping is still returned when
    no word satisfies the predicate under any threshold.  Starting at zero
    made the function return an empty list in that case, silently dropping
    all of the text.

    Args:
        characters: Character dicts passed through to
            ``group_characters_into_words``.
        predicate: Callable taking a word string and returning a truthy value
            for "good" words.
        thresholds: Iterable of candidate thresholds, tried in order.
        verbose: When true (the default) print ``>>> threshold count`` for
            each candidate.

    Returns:
        The winning list of word records, or ``[]`` if ``thresholds`` is empty.
    """
    best_count = -1
    best_words = []

    for threshold in thresholds:
        words = group_characters_into_words(characters, threshold)
        count = sum(1 for word_info in words if predicate(word_info['word']))
        if verbose:
            print(">>>", threshold, count)

        if count > best_count:
            best_count = count
            best_words = words

    return best_words


In [127]:
from typing import List, Dict

def estimate_and_group_words(characters: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Group characters into words, choosing the gap threshold by dictionary hits.

    Integer thresholds 1 through 9 are each tried with
    ``group_characters_into_words``.  A grouping's quality is the number of its
    words accepted by ``is_word_in_dictionary``.  The grouping with the highest
    quality is returned; on a tie the smallest threshold wins.
    """
    best_words = None
    best_quality = -1

    for gap_threshold in range(1, 10):
        grouped = group_characters_into_words(characters, gap_threshold)
        hits = 0
        for word_info in grouped:
            if is_word_in_dictionary(word_info['word']):
                hits += 1
        # Strict comparison keeps the earliest threshold among equal scores.
        if hits > best_quality:
            best_quality = hits
            best_words = grouped

    return best_words

def extract_word_bounding_boxes(pdf_path: str) -> List[Dict[str, object]]:
    """Extract per-page word bounding boxes from a PDF, echoing the text as it goes.

    For every page the characters of each ``LTTextBox`` are collected, grouped
    into words by ``estimate_and_group_words``, printed (wrapped at 80
    columns, preceded by a ``---`` separator) and accumulated.

    Layout elements that are not text boxes are skipped entirely.  Earlier the
    grouping and printing ran for every element, which produced an empty
    ``---`` block for each non-text element.

    Note: a later cell of this notebook defines another
    ``extract_word_bounding_boxes``; once that cell has run it replaces this
    definition.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One dict per page with ``'page_bbox'`` (the page's bbox, or ``None``
        if the layout object is not an ``LTPage``) and ``'word_bboxes'`` (the
        list of word records for that page).
    """
    # Imported here because the notebook only imports textwrap in a later
    # cell; without this the function fails with NameError if run before it.
    import textwrap

    # Type check for pdf_path
    assert isinstance(pdf_path, str), f"pdf_path should be a string, got {type(pdf_path)} instead."

    pages_bounding_boxes = []

    with open(pdf_path, 'rb') as file:
        parser = PDFParser(file)
        document = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page in the PDF (page numbers are 1-based in the log).
        for pageno, page in enumerate(PDFPage.create_pages(document), start=1):
            print("page", pageno)
            interpreter.process_page(page)
            layout = device.get_result()

            # Get the bounding box for the entire page
            page_bbox = layout.bbox if isinstance(layout, LTPage) else None

            word_bounding_boxes = []

            for element in layout:
                # Only text boxes carry characters.
                if not isinstance(element, LTTextBox):
                    continue

                # Character boxes of every text line inside this text box.
                char_bounding_boxes = [
                    {'text': character.get_text(), 'bbox': character.bbox}
                    for text_line in element
                    if isinstance(text_line, LTTextLine)
                    for character in text_line
                    if isinstance(character, LTChar)
                ]

                # Estimate and group words for the whole text box
                words = estimate_and_group_words(char_bounding_boxes)
                word_bounding_boxes.extend(words)

                text = " ".join([w["word"] for w in words])
                print("---")
                print(textwrap.fill(text, 80))

            # Add the page bounding box and word bounding boxes to the output list
            pages_bounding_boxes.append({
                'page_bbox': page_bbox,
                'word_bboxes': word_bounding_boxes
            })

    return pages_bounding_boxes


In [131]:
def print_words(words):
    """Print a ``---`` separator followed by the words joined by spaces, wrapped at 80 columns.

    Args:
        words: Iterable of dicts that each have a ``'word'`` string.
    """
    # Imported here because the notebook only imports textwrap in a later
    # cell; without this the function fails with NameError if called before it.
    import textwrap

    text = " ".join([w["word"] for w in words])
    print("---")
    print(textwrap.fill(text, 80))

In [134]:
from typing import List, Dict
from pdfminer.layout import LTFigure

def extract_image_bounding_boxes(page: LTPage) -> List[Dict[str, tuple]]:
    """Return the bounding box of every ``LTFigure`` directly contained in ``page``.

    Only the page's top-level layout elements are inspected.

    Returns:
        A list of ``{'bbox': <figure bbox>}`` dicts in layout order.
    """
    return [
        {'bbox': figure.bbox}
        for figure in page
        if isinstance(figure, LTFigure)
    ]


In [137]:
from typing import List, Dict

def process_page(page: LTPage) -> Dict[str, object]:
    """Collect the word bounding boxes of one analysed page.

    For each ``LTTextBox`` on the page, the ``LTChar`` items of all its
    ``LTTextLine`` children are gathered, grouped into words with
    ``estimate_and_group_words``, printed via ``print_words`` and added to the
    page's word list.  Other layout elements are ignored.

    Returns:
        ``{'page_bbox': page.bbox, 'word_bboxes': [...]}``.
    """
    collected_words = []

    for text_box in page:
        if not isinstance(text_box, LTTextBox):
            continue

        # All characters of the box, line by line, as text + bbox records.
        box_characters = [
            {'text': glyph.get_text(), 'bbox': glyph.bbox}
            for line in text_box
            if isinstance(line, LTTextLine)
            for glyph in line
            if isinstance(glyph, LTChar)
        ]

        # Words are estimated once per text box, not per line.
        box_words = estimate_and_group_words(box_characters)
        print_words(box_words)
        collected_words.extend(box_words)

    return {'page_bbox': page.bbox, 'word_bboxes': collected_words}

def extract_word_bounding_boxes(pdf_path: str) -> List[Dict[str, object]]:
    """Run layout analysis on every page of a PDF and gather its bounding boxes.

    Each page is interpreted with pdfminer using default ``LAParams`` and then
    handed to ``process_page`` (word boxes, which also prints the page's text)
    and ``extract_image_bounding_boxes`` (figure boxes).

    Args:
        pdf_path: Path of the PDF file to read; must be a ``str``.

    Returns:
        One dict per page, in document order, with the keys ``'page_bbox'``,
        ``'word_bboxes'`` and ``'image_bboxes'``.
    """
    assert isinstance(pdf_path, str), f"pdf_path should be a string, got {type(pdf_path)} instead."

    per_page_results = []

    with open(pdf_path, 'rb') as pdf_file:
        pdf_document = PDFDocument(PDFParser(pdf_file))
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

        for raw_page in PDFPage.create_pages(pdf_document):
            # Interpreting a page makes its layout available from the aggregator.
            page_interpreter.process_page(raw_page)
            page_layout = aggregator.get_result()

            # Words first, then figures, merged into a single record for the page.
            page_record = process_page(page_layout)
            page_record["image_bboxes"] = extract_image_bounding_boxes(page_layout)
            per_page_results.append(page_record)

    return per_page_results


In [138]:
import textwrap

# Run the extraction on the sample PDF; the text of every text box is printed
# as a side effect (process_page calls print_words).
result = extract_word_bounding_boxes("test.pdf")
# Index 2 is the third per-page entry; this raises IndexError if the list has
# fewer than three entries.
page = result[2]
words = page['word_bboxes']
# result = " | ".join([w["word"] for w in words])
# print(textwrap.fill(result, 80))

---
6 1 0 2
---
p e S 2
---
]
---
V C . s c [
---
2 v 1 7 9 5 0 . 8 0 6 1 : v i X r a
---
STFCN: Spatio-Temporal FCN for Semantic Video Segmentation
---
Mohsen Fayyaz1, Mohammad Hajizadeh Saﬀar1, Mohammad Sabokrou1, Mahmood Fathy2,
Reinhard Klette3, Fay Huang4
---
1Malek-Ashtar University of Technology, 2 Iran University of Science and
Technology, 3Auckland University of Technology, 3National Ilan University
---
Abstract. This paper presents a novel method to involve both spa- tial and
temporal features for semantic segmentation of street scenes. Current work on
convolutional neural networks (CNNs) has shown that CNNs provide advanced
spatial features supporting a very good perfor- mance of solutions for the
semantic segmentation task. We investigate how involving temporal features also
has a good eﬀect on segmenting video data. We propose a module based on a long
short-term memory (LSTM) architecture of a recurrent neural network for
interpreting the temporal characteristics of video 