Skip to content

Commit

Permalink
Turn on stricter checks, and untangle the layout mess with generics
Browse files Browse the repository at this point in the history
Status:
$ mypy pdfminer
pdfminer/ccitt.py:565: error: Cannot find implementation or library stub for module named "pygame"
pdfminer/ccitt.py:565: note: See https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports
pdfminer/pdfdocument.py:7: error: Skipping analyzing "cryptography.hazmat.backends": found module but no type hints or library stubs
pdfminer/pdfdocument.py:8: error: Skipping analyzing "cryptography.hazmat.primitives.ciphers": found module but no type hints or library stubs
pdfminer/pdfdevice.py:191: error: Argument 1 to "write" of "IO" has incompatible type "str"; expected "bytes"
pdfminer/image.py:84: error: Cannot find implementation or library stub for module named "PIL"
Found 5 errors in 4 files (checked 27 source files)

pdfdevice.py:191 (a "str" passed to IO.write where "bytes" is expected) appears to be a real bug
  • Loading branch information
0xabu committed Aug 21, 2021
1 parent 5c9c0b1 commit ff4b6a9
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 79 deletions.
18 changes: 18 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Global mypy settings. The option names below correspond to the individual
# checks that mypy's documentation lists under `--strict`; they are switched
# on one by one here instead of enabling `strict` wholesale.
[mypy]
warn_unused_configs = True
disallow_any_generics = True
disallow_subclassing_any = True
# The next four checks are commented out, i.e. NOT enabled.
# NOTE(review): presumably the code base is not yet fully annotated -- confirm
# before uncommenting.
#disallow_untyped_calls = True
#disallow_untyped_defs = True
#disallow_incomplete_defs = True
#check_untyped_defs = True
disallow_untyped_decorators = True
no_implicit_optional = True
warn_redundant_casts = True
warn_unused_ignores = True
warn_return_any = True
no_implicit_reexport = True
strict_equality = True

# Per-module section: applies to `pdfminer` and all of its submodules.
# `ignore_missing_imports` suppresses errors about imports that mypy cannot
# resolve (modules without stubs or type hints).
[mypy-pdfminer.*]
ignore_missing_imports = True
12 changes: 6 additions & 6 deletions pdfminer/converter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import io
import logging
from pdfminer.pdfcolor import PDFColorSpace
from typing import List
from typing import Any, List, Optional, Sequence
import re
import sys

Expand All @@ -27,7 +27,7 @@
from .pdfinterp import PDFGraphicState, PDFResourceManager
from .pdfpage import PDFPage
from .pdftypes import PDFStream
from .utils import Matrix, Rect
from .utils import Matrix, Rect, PathSegment
from .utils import apply_matrix_pt
from .utils import bbox2str
from .utils import enc
Expand All @@ -41,7 +41,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
ctm: Matrix

def __init__(self, rsrcmgr: PDFResourceManager, pageno: int = 1,
laparams: LAParams = None):
laparams: Optional[LAParams] = None):
PDFTextDevice.__init__(self, rsrcmgr)
self.pageno = pageno
self.laparams = laparams
Expand Down Expand Up @@ -70,7 +70,7 @@ def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
return

def end_figure(self, _) -> None:
def end_figure(self, _: Any) -> None:
fig = self.cur_item
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
self.cur_item = self._stack.pop()
Expand All @@ -85,8 +85,8 @@ def render_image(self, name: str, stream: PDFStream) -> None:
self.cur_item.add(item)
return

def paint_path(self, gstate: PDFGraphicState, stroke, fill, evenodd, path
) -> None:
def paint_path(self, gstate: PDFGraphicState, stroke: bool, fill: bool,
evenodd: bool, path: Sequence[PathSegment]) -> None:
"""Paint paths described in section 4.4 of the PDF reference manual"""
shape = ''.join(x[0] for x in path)

Expand Down
89 changes: 50 additions & 39 deletions pdfminer/layout.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import heapq
import logging
from typing import (Any, Dict, Generic, Iterable, Iterator, List, Optional,
Sequence, Set, Tuple, TypeVar, cast)
Sequence, Set, Tuple, TypeVar, Union, cast)

from .utils import INF
from .utils import Matrix
Expand Down Expand Up @@ -296,8 +296,9 @@ def get_text(self) -> str:
class LTChar(LTComponent, LTText):
"""Actual letter in the text as a Unicode string."""

def __init__(self, matrix: Matrix, font: PDFFont, fontsize, scaling, rise,
text: str, textwidth, textdisp, ncs: PDFColorSpace,
def __init__(self, matrix: Matrix, font: PDFFont, fontsize: float,
scaling: float, rise: float, text: str, textwidth: float,
textdisp: Point, ncs: PDFColorSpace,
graphicstate: PDFGraphicState):
LTText.__init__(self)
self._text = text
Expand Down Expand Up @@ -351,15 +352,15 @@ def is_compatible(self, obj: Any) -> bool:
return True


LTContainerElement = TypeVar('LTContainerElement', LTItem, LTComponent)
LTItemT = TypeVar('LTItemT', bound=LTItem)


class LTContainer(LTComponent, Generic[LTContainerElement]):
class LTContainer(LTComponent, Generic[LTItemT]):
"""Object that can be extended and analyzed"""

def __init__(self, bbox: Rect):
LTComponent.__init__(self, bbox)
self._objs: List[LTContainerElement] = []
self._objs: List[LTItemT] = []
return

def __iter__(self):
Expand All @@ -368,11 +369,11 @@ def __iter__(self):
def __len__(self):
return len(self._objs)

def add(self, obj: LTContainerElement) -> None:
def add(self, obj: LTItemT) -> None:
self._objs.append(obj)
return

def extend(self, objs: Iterable[LTContainerElement]) -> None:
def extend(self, objs: Iterable[LTItemT]) -> None:
for obj in objs:
self.add(obj)
return
Expand All @@ -383,19 +384,21 @@ def analyze(self, laparams: LAParams) -> None:
return


class LTExpandableContainer(LTContainer):
class LTExpandableContainer(LTContainer[LTItemT]):
def __init__(self):
LTContainer.__init__(self, (+INF, +INF, -INF, -INF))
return

def add(self, obj: LTComponent) -> None:
LTContainer.add(self, obj)
# Incompatible override: we take an LTComponent (with bounding box), but
# super() LTContainer only considers LTItem (no bounding box).
def add(self, obj: LTComponent) -> None: # type: ignore[override]
LTContainer.add(self, cast(LTItemT, obj))
self.set_bbox((min(self.x0, obj.x0), min(self.y0, obj.y0),
max(self.x1, obj.x1), max(self.y1, obj.y1)))
return


class LTTextContainer(LTExpandableContainer, LTText):
class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
def __init__(self):
LTText.__init__(self)
LTExpandableContainer.__init__(self)
Expand All @@ -406,15 +409,18 @@ def get_text(self) -> str:
if isinstance(obj, LTText))


class LTTextLine(LTTextContainer):
TextLineElement = Union[LTChar, LTAnno]


class LTTextLine(LTTextContainer[TextLineElement]):
"""Contains a list of LTChar objects that represent a single text line.
The characters are aligned either horizontally or vertically, depending on
the text's writing mode.
"""

def __init__(self, word_margin: float):
LTTextContainer.__init__(self)
super().__init__()
self.word_margin = word_margin
return

Expand All @@ -428,27 +434,28 @@ def analyze(self, laparams: LAParams) -> None:
LTContainer.add(self, LTAnno('\n'))
return

def find_neighbors(self, plane, ratio):
def find_neighbors(self, plane: Plane, ratio: float) -> List["LTTextLine"]:
raise NotImplementedError


class LTTextLineHorizontal(LTTextLine):
def __init__(self, word_margin):
def __init__(self, word_margin: float):
LTTextLine.__init__(self, word_margin)
self._x1 = +INF
self._x1: float = +INF
return

def add(self, obj: LTComponent) -> None:
# Incompatible override: we take an LTComponent (with bounding box), but
# LTContainer only considers LTItem (no bounding box).
def add(self, obj: LTComponent) -> None: # type: ignore[override]
if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * max(obj.width, obj.height)
if self._x1 < obj.x0 - margin:
LTContainer.add(self, LTAnno(' '))
self._x1 = obj.x1
LTTextLine.add(self, obj)
super().add(obj)
return

def find_neighbors(self, plane: Plane, ratio: float
) -> List["LTTextLineHorizontal"]:
def find_neighbors(self, plane: Plane, ratio: float) -> List[LTTextLine]:
"""
Finds neighboring LTTextLineHorizontals in the plane.
Expand Down Expand Up @@ -494,22 +501,23 @@ def _is_same_height_as(self, other: LTComponent, tolerance: float = 0


class LTTextLineVertical(LTTextLine):
def __init__(self, word_margin):
def __init__(self, word_margin: float):
LTTextLine.__init__(self, word_margin)
self._y0 = -INF
self._y0: float = -INF
return

def add(self, obj: LTComponent) -> None:
# Incompatible override: we take an LTComponent (with bounding box), but
# LTContainer only considers LTItem (no bounding box).
def add(self, obj: LTComponent) -> None: # type: ignore[override]
if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * max(obj.width, obj.height)
if obj.y1 + margin < self._y0:
LTContainer.add(self, LTAnno(' '))
self._y0 = obj.y0
LTTextLine.add(self, obj)
super().add(obj)
return

def find_neighbors(self, plane: Plane, ratio: float
) -> List["LTTextLineVertical"]:
def find_neighbors(self, plane: Plane, ratio: float) -> List[LTTextLine]:
"""
Finds neighboring LTTextLineVerticals in the plane.
Expand Down Expand Up @@ -553,7 +561,7 @@ def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool:
return abs(other.width - self.width) <= tolerance


class LTTextBox(LTTextContainer):
class LTTextBox(LTTextContainer[LTTextLine]):
"""Represents a group of text chunks in a rectangular area.
Note that this box is created by geometric analysis and does not
Expand All @@ -563,7 +571,7 @@ class LTTextBox(LTTextContainer):

def __init__(self):
LTTextContainer.__init__(self)
self.index = -1
self.index: int = -1
return

def __repr__(self):
Expand All @@ -574,7 +582,7 @@ def __repr__(self):

class LTTextBoxHorizontal(LTTextBox):
def analyze(self, laparams: LAParams) -> None:
LTTextBox.analyze(self, laparams)
super().analyze(laparams)
self._objs.sort(key=lambda obj: -obj.y1)
return

Expand All @@ -584,24 +592,27 @@ def get_writing_mode(self) -> str:

class LTTextBoxVertical(LTTextBox):
def analyze(self, laparams: LAParams) -> None:
LTTextBox.analyze(self, laparams)
super().analyze(laparams)
self._objs.sort(key=lambda obj: -obj.x1)
return

def get_writing_mode(self) -> str:
return 'tb-rl'


class LTTextGroup(LTTextContainer):
def __init__(self, objs: Iterable[LTContainerElement]):
LTTextContainer.__init__(self)
TextGroupElement = Union[LTTextBox, "LTTextGroup"]


class LTTextGroup(LTTextContainer[TextGroupElement]):
def __init__(self, objs: Iterable[TextGroupElement]):
super().__init__()
self.extend(objs)
return


class LTTextGroupLRTB(LTTextGroup):
def analyze(self, laparams: LAParams) -> None:
LTTextGroup.analyze(self, laparams)
super().analyze(laparams)
# reorder the objects from top-left to bottom-right.
self._objs.sort(
key=lambda obj: (1 - laparams.boxes_flow) * obj.x0
Expand All @@ -611,15 +622,15 @@ def analyze(self, laparams: LAParams) -> None:

class LTTextGroupTBRL(LTTextGroup):
def analyze(self, laparams: LAParams) -> None:
LTTextGroup.analyze(self, laparams)
super().analyze(laparams)
# reorder the objects from top-right to bottom-left.
self._objs.sort(
key=lambda obj: - (1 + laparams.boxes_flow) * (obj.x0 + obj.x1)
- (1 - laparams.boxes_flow) * obj.y1)
return


class LTLayoutContainer(LTContainer):
class LTLayoutContainer(LTContainer[LTComponent]):
def __init__(self, bbox: Rect):
LTContainer.__init__(self, bbox)
self.groups: Optional[List[LTTextGroup]] = None
Expand Down Expand Up @@ -782,8 +793,8 @@ def isany(obj1: LTComponent, obj2: LTComponent) -> Set[LTComponent]:
objs = set(plane.find((x0, y0, x1, y1)))
return objs.difference((obj1, obj2))

dists: List[Tuple[bool, float, int, int, LTTextContainer,
LTTextContainer]] = []
dists: List[Tuple[bool, float, int, int, Union[LTTextBox, LTTextGroup],
Union[LTTextBox, LTTextGroup]]] = []
for i in range(len(boxes)):
box1 = boxes[i]
for j in range(i+1, len(boxes)):
Expand Down

0 comments on commit ff4b6a9

Please sign in to comment.