From ff4b6a9bd46b352583d823d39065652c9a6f05f4 Mon Sep 17 00:00:00 2001 From: Andrew Baumann Date: Fri, 20 Aug 2021 22:49:06 -0700 Subject: [PATCH] turn on more strict checks, and untangle the layout mess with generics Status: $ mypy pdfminer pdfminer/ccitt.py:565: error: Cannot find implementation or library stub for module named "pygame" pdfminer/ccitt.py:565: note: See https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports pdfminer/pdfdocument.py:7: error: Skipping analyzing "cryptography.hazmat.backends": found module but no type hints or library stubs pdfminer/pdfdocument.py:8: error: Skipping analyzing "cryptography.hazmat.primitives.ciphers": found module but no type hints or library stubs pdfminer/pdfdevice.py:191: error: Argument 1 to "write" of "IO" has incompatible type "str"; expected "bytes" pdfminer/image.py:84: error: Cannot find implementation or library stub for module named "PIL" Found 5 errors in 4 files (checked 27 source files) The error at pdfdevice.py:191 appears to be a real bug: a str is passed to write() on a binary stream. The remaining errors are missing stubs or type hints for third-party modules. --- mypy.ini | 18 +++++++++ pdfminer/converter.py | 12 +++--- pdfminer/layout.py | 89 ++++++++++++++++++++++++------------------- pdfminer/pdfdevice.py | 36 ++++++++--------- pdfminer/pdfinterp.py | 17 +++++---- pdfminer/pdfpage.py | 4 +- pdfminer/psparser.py | 2 +- pdfminer/utils.py | 14 ++++--- 8 files changed, 113 insertions(+), 79 deletions(-) create mode 100644 mypy.ini diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..c276f0d3 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,18 @@ +[mypy] +warn_unused_configs = True +disallow_any_generics = True +disallow_subclassing_any = True +#disallow_untyped_calls = True +#disallow_untyped_defs = True +#disallow_incomplete_defs = True +#check_untyped_defs = True +disallow_untyped_decorators = True +no_implicit_optional = True +warn_redundant_casts = True +warn_unused_ignores = True +warn_return_any = True +no_implicit_reexport = True +strict_equality = True + +[mypy-pdfminer.*] +ignore_missing_imports = True 
diff --git a/pdfminer/converter.py b/pdfminer/converter.py index ec1735bf..0b978876 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -1,7 +1,7 @@ import io import logging from pdfminer.pdfcolor import PDFColorSpace -from typing import List +from typing import Any, List, Optional, Sequence import re import sys @@ -27,7 +27,7 @@ from .pdfinterp import PDFGraphicState, PDFResourceManager from .pdfpage import PDFPage from .pdftypes import PDFStream -from .utils import Matrix, Rect +from .utils import Matrix, Rect, PathSegment from .utils import apply_matrix_pt from .utils import bbox2str from .utils import enc @@ -41,7 +41,7 @@ class PDFLayoutAnalyzer(PDFTextDevice): ctm: Matrix def __init__(self, rsrcmgr: PDFResourceManager, pageno: int = 1, - laparams: LAParams = None): + laparams: Optional[LAParams] = None): PDFTextDevice.__init__(self, rsrcmgr) self.pageno = pageno self.laparams = laparams @@ -70,7 +70,7 @@ def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None: self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm)) return - def end_figure(self, _) -> None: + def end_figure(self, _: Any) -> None: fig = self.cur_item assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item)) self.cur_item = self._stack.pop() @@ -85,8 +85,8 @@ def render_image(self, name: str, stream: PDFStream) -> None: self.cur_item.add(item) return - def paint_path(self, gstate: PDFGraphicState, stroke, fill, evenodd, path - ) -> None: + def paint_path(self, gstate: PDFGraphicState, stroke: bool, fill: bool, + evenodd: bool, path: Sequence[PathSegment]) -> None: """Paint paths described in section 4.4 of the PDF reference manual""" shape = ''.join(x[0] for x in path) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 7f34f160..f1c5652e 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,7 +1,7 @@ import heapq import logging from typing import (Any, Dict, Generic, Iterable, Iterator, List, Optional, - Sequence, Set, Tuple, 
TypeVar, cast) + Sequence, Set, Tuple, TypeVar, Union, cast) from .utils import INF from .utils import Matrix @@ -296,8 +296,9 @@ def get_text(self) -> str: class LTChar(LTComponent, LTText): """Actual letter in the text as a Unicode string.""" - def __init__(self, matrix: Matrix, font: PDFFont, fontsize, scaling, rise, - text: str, textwidth, textdisp, ncs: PDFColorSpace, + def __init__(self, matrix: Matrix, font: PDFFont, fontsize: float, + scaling: float, rise: float, text: str, textwidth: float, + textdisp: Point, ncs: PDFColorSpace, graphicstate: PDFGraphicState): LTText.__init__(self) self._text = text @@ -351,15 +352,15 @@ def is_compatible(self, obj: Any) -> bool: return True -LTContainerElement = TypeVar('LTContainerElement', LTItem, LTComponent) +LTItemT = TypeVar('LTItemT', bound=LTItem) -class LTContainer(LTComponent, Generic[LTContainerElement]): +class LTContainer(LTComponent, Generic[LTItemT]): """Object that can be extended and analyzed""" def __init__(self, bbox: Rect): LTComponent.__init__(self, bbox) - self._objs: List[LTContainerElement] = [] + self._objs: List[LTItemT] = [] return def __iter__(self): @@ -368,11 +369,11 @@ def __iter__(self): def __len__(self): return len(self._objs) - def add(self, obj: LTContainerElement) -> None: + def add(self, obj: LTItemT) -> None: self._objs.append(obj) return - def extend(self, objs: Iterable[LTContainerElement]) -> None: + def extend(self, objs: Iterable[LTItemT]) -> None: for obj in objs: self.add(obj) return @@ -383,19 +384,21 @@ def analyze(self, laparams: LAParams) -> None: return -class LTExpandableContainer(LTContainer): +class LTExpandableContainer(LTContainer[LTItemT]): def __init__(self): LTContainer.__init__(self, (+INF, +INF, -INF, -INF)) return - def add(self, obj: LTComponent) -> None: - LTContainer.add(self, obj) + # Incompatible override: we take an LTComponent (with bounding box), but + # super() LTContainer only considers LTItem (no bounding box). 
+ def add(self, obj: LTComponent) -> None: # type: ignore[override] + LTContainer.add(self, cast(LTItemT, obj)) self.set_bbox((min(self.x0, obj.x0), min(self.y0, obj.y0), max(self.x1, obj.x1), max(self.y1, obj.y1))) return -class LTTextContainer(LTExpandableContainer, LTText): +class LTTextContainer(LTExpandableContainer[LTItemT], LTText): def __init__(self): LTText.__init__(self) LTExpandableContainer.__init__(self) @@ -406,7 +409,10 @@ def get_text(self) -> str: if isinstance(obj, LTText)) -class LTTextLine(LTTextContainer): +TextLineElement = Union[LTChar, LTAnno] + + +class LTTextLine(LTTextContainer[TextLineElement]): """Contains a list of LTChar objects that represent a single text line. The characters are aligned either horizontally or vertically, depending on @@ -414,7 +420,7 @@ class LTTextLine(LTTextContainer): """ def __init__(self, word_margin: float): - LTTextContainer.__init__(self) + super().__init__() self.word_margin = word_margin return @@ -428,27 +434,28 @@ def analyze(self, laparams: LAParams) -> None: LTContainer.add(self, LTAnno('\n')) return - def find_neighbors(self, plane, ratio): + def find_neighbors(self, plane: Plane, ratio: float) -> List["LTTextLine"]: raise NotImplementedError class LTTextLineHorizontal(LTTextLine): - def __init__(self, word_margin): + def __init__(self, word_margin: float): LTTextLine.__init__(self, word_margin) - self._x1 = +INF + self._x1: float = +INF return - def add(self, obj: LTComponent) -> None: + # Incompatible override: we take an LTComponent (with bounding box), but + # LTContainer only considers LTItem (no bounding box). 
+ def add(self, obj: LTComponent) -> None: # type: ignore[override] if isinstance(obj, LTChar) and self.word_margin: margin = self.word_margin * max(obj.width, obj.height) if self._x1 < obj.x0 - margin: LTContainer.add(self, LTAnno(' ')) self._x1 = obj.x1 - LTTextLine.add(self, obj) + super().add(obj) return - def find_neighbors(self, plane: Plane, ratio: float - ) -> List["LTTextLineHorizontal"]: + def find_neighbors(self, plane: Plane, ratio: float) -> List[LTTextLine]: """ Finds neighboring LTTextLineHorizontals in the plane. @@ -494,22 +501,23 @@ def _is_same_height_as(self, other: LTComponent, tolerance: float = 0 class LTTextLineVertical(LTTextLine): - def __init__(self, word_margin): + def __init__(self, word_margin: float): LTTextLine.__init__(self, word_margin) - self._y0 = -INF + self._y0: float = -INF return - def add(self, obj: LTComponent) -> None: + # Incompatible override: we take an LTComponent (with bounding box), but + # LTContainer only considers LTItem (no bounding box). + def add(self, obj: LTComponent) -> None: # type: ignore[override] if isinstance(obj, LTChar) and self.word_margin: margin = self.word_margin * max(obj.width, obj.height) if obj.y1 + margin < self._y0: LTContainer.add(self, LTAnno(' ')) self._y0 = obj.y0 - LTTextLine.add(self, obj) + super().add(obj) return - def find_neighbors(self, plane: Plane, ratio: float - ) -> List["LTTextLineVertical"]: + def find_neighbors(self, plane: Plane, ratio: float) -> List[LTTextLine]: """ Finds neighboring LTTextLineVerticals in the plane. @@ -553,7 +561,7 @@ def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool: return abs(other.width - self.width) <= tolerance -class LTTextBox(LTTextContainer): +class LTTextBox(LTTextContainer[LTTextLine]): """Represents a group of text chunks in a rectangular area. 
Note that this box is created by geometric analysis and does not @@ -563,7 +571,7 @@ class LTTextBox(LTTextContainer): def __init__(self): LTTextContainer.__init__(self) - self.index = -1 + self.index: int = -1 return def __repr__(self): @@ -574,7 +582,7 @@ def __repr__(self): class LTTextBoxHorizontal(LTTextBox): def analyze(self, laparams: LAParams) -> None: - LTTextBox.analyze(self, laparams) + super().analyze(laparams) self._objs.sort(key=lambda obj: -obj.y1) return @@ -584,7 +592,7 @@ def get_writing_mode(self) -> str: class LTTextBoxVertical(LTTextBox): def analyze(self, laparams: LAParams) -> None: - LTTextBox.analyze(self, laparams) + super().analyze(laparams) self._objs.sort(key=lambda obj: -obj.x1) return @@ -592,16 +600,19 @@ def get_writing_mode(self) -> str: return 'tb-rl' -class LTTextGroup(LTTextContainer): - def __init__(self, objs: Iterable[LTContainerElement]): - LTTextContainer.__init__(self) +TextGroupElement = Union[LTTextBox, "LTTextGroup"] + + +class LTTextGroup(LTTextContainer[TextGroupElement]): + def __init__(self, objs: Iterable[TextGroupElement]): + super().__init__() self.extend(objs) return class LTTextGroupLRTB(LTTextGroup): def analyze(self, laparams: LAParams) -> None: - LTTextGroup.analyze(self, laparams) + super().analyze(laparams) # reorder the objects from top-left to bottom-right. self._objs.sort( key=lambda obj: (1 - laparams.boxes_flow) * obj.x0 @@ -611,7 +622,7 @@ def analyze(self, laparams: LAParams) -> None: class LTTextGroupTBRL(LTTextGroup): def analyze(self, laparams: LAParams) -> None: - LTTextGroup.analyze(self, laparams) + super().analyze(laparams) # reorder the objects from top-right to bottom-left. 
self._objs.sort( key=lambda obj: - (1 + laparams.boxes_flow) * (obj.x0 + obj.x1) @@ -619,7 +630,7 @@ def analyze(self, laparams: LAParams) -> None: return -class LTLayoutContainer(LTContainer): +class LTLayoutContainer(LTContainer[LTComponent]): def __init__(self, bbox: Rect): LTContainer.__init__(self, bbox) self.groups: Optional[List[LTTextGroup]] = None @@ -782,8 +793,8 @@ def isany(obj1: LTComponent, obj2: LTComponent) -> Set[LTComponent]: objs = set(plane.find((x0, y0, x1, y1))) return objs.difference((obj1, obj2)) - dists: List[Tuple[bool, float, int, int, LTTextContainer, - LTTextContainer]] = [] + dists: List[Tuple[bool, float, int, int, Union[LTTextBox, LTTextGroup], + Union[LTTextBox, LTTextGroup]]] = [] for i in range(len(boxes)): box1 = boxes[i] for j in range(i+1, len(boxes)): diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index e800d555..a2cb3487 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -1,4 +1,4 @@ -from typing import (Any, IO, Iterable, List, Optional, Sequence, Tuple, +from typing import (Any, BinaryIO, Iterable, List, Optional, Sequence, Tuple, TYPE_CHECKING) from . 
import utils from .utils import Matrix, Point, Rect @@ -39,13 +39,13 @@ def set_ctm(self, ctm: Matrix) -> None: self.ctm = ctm return - def begin_tag(self, tag: Any, props=None) -> None: + def begin_tag(self, tag: Any, props: Any = None) -> None: return def end_tag(self) -> None: return - def do_tag(self, tag: Any, props=None) -> None: + def do_tag(self, tag: Any, props: Any = None) -> None: return def begin_page(self, page: PDFPage, ctm: Matrix) -> None: @@ -68,7 +68,7 @@ def paint_path(self, graphicstate: "PDFGraphicState", stroke: bool, def render_image(self, name: str, stream: PDFStream) -> None: return - def render_string(self, textstate: "PDFTextState", seq: Iterable, + def render_string(self, textstate: "PDFTextState", seq: Iterable[Any], ncs: PDFColorSpace, graphicstate: "PDFGraphicState" ) -> None: return @@ -76,7 +76,7 @@ def render_string(self, textstate: "PDFTextState", seq: Iterable, class PDFTextDevice(PDFDevice): - def render_string(self, textstate: "PDFTextState", seq: Iterable, + def render_string(self, textstate: "PDFTextState", seq: Iterable[Any], ncs: PDFColorSpace, graphicstate: "PDFGraphicState" ) -> None: assert self.ctm is not None @@ -103,10 +103,11 @@ def render_string(self, textstate: "PDFTextState", seq: Iterable, graphicstate) return - def render_string_horizontal(self, seq: Iterable, matrix: Matrix, - pos: Point, font: PDFFont, fontsize: float, - scaling: float, charspace: float, - wordspace: float, rise: float, dxscale: float, + def render_string_horizontal(self, seq: Iterable[Any], + matrix: Matrix, pos: Point, font: PDFFont, + fontsize: float, scaling: float, + charspace: float, wordspace: float, + rise: float, dxscale: float, ncs: PDFColorSpace, graphicstate: "PDFGraphicState") -> Point: (x, y) = pos @@ -127,10 +128,11 @@ def render_string_horizontal(self, seq: Iterable, matrix: Matrix, needcharspace = True return (x, y) - def render_string_vertical(self, seq: Iterable, matrix: Matrix, pos: Point, - font: PDFFont, fontsize: float, 
scaling: float, - charspace: float, wordspace: float, rise: float, - dxscale: float, ncs: PDFColorSpace, + def render_string_vertical(self, seq: Iterable[Any], matrix: Matrix, + pos: Point, font: PDFFont, fontsize: float, + scaling: float, charspace: float, + wordspace: float, rise: float, dxscale: float, + ncs: PDFColorSpace, graphicstate: "PDFGraphicState") -> Point: (x, y) = pos needcharspace = False @@ -158,7 +160,7 @@ def render_char(self, matrix: Matrix, font: PDFFont, fontsize: float, class TagExtractor(PDFDevice): - def __init__(self, rsrcmgr: "PDFResourceManager", outfp: IO, + def __init__(self, rsrcmgr: "PDFResourceManager", outfp: BinaryIO, codec: str = 'utf-8'): PDFDevice.__init__(self, rsrcmgr) self.outfp = outfp @@ -167,7 +169,7 @@ def __init__(self, rsrcmgr: "PDFResourceManager", outfp: IO, self._stack: List[Any] = [] return - def render_string(self, textstate: "PDFTextState", seq: Iterable, + def render_string(self, textstate: "PDFTextState", seq: Iterable[Any], ncs: PDFColorSpace, graphicstate: "PDFGraphicState" ) -> None: font = textstate.font @@ -200,7 +202,7 @@ def end_page(self, page: PDFPage) -> None: self.pageno += 1 return - def begin_tag(self, tag: Any, props=None) -> None: + def begin_tag(self, tag: Any, props: Any = None) -> None: s = '' if isinstance(props, dict): s = ''.join(' {}="{}"'.format(utils.enc(k), utils.enc(str(v))) @@ -217,7 +219,7 @@ def end_tag(self) -> None: self.outfp.write(utils.make_compat_bytes(out_s)) return - def do_tag(self, tag: Any, props=None) -> None: + def do_tag(self, tag: Any, props: Any = None) -> None: self.begin_tag(tag, props) self._stack.pop(-1) return diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 2015ccd6..01902971 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -1,6 +1,7 @@ import re import logging -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union +from typing import (Any, Dict, Iterable, List, Mapping, Optional, Sequence, + Tuple, 
Union) from io import BytesIO from .cmapdb import CMapDB from .cmapdb import CMap @@ -32,7 +33,7 @@ from .pdffont import PDFCIDFont from .pdfcolor import PDFColorSpace from .pdfcolor import PREDEFINED_COLORSPACE -from .utils import Matrix, Point +from .utils import Matrix, Point, PathSegment from .utils import choplist from .utils import mult_matrix from .utils import MATRIX_IDENTITY @@ -179,7 +180,7 @@ def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase: raise return CMap() - def get_font(self, objid: Any, spec) -> PDFFont: + def get_font(self, objid: Any, spec: Mapping[str, Any]) -> PDFFont: if objid and objid in self._cached_fonts: font = self._cached_fonts[objid] else: @@ -340,7 +341,7 @@ def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice): def dup(self) -> "PDFPageInterpreter": return self.__class__(self.rsrcmgr, self.device) - def init_resources(self, resources) -> None: + def init_resources(self, resources: Any) -> None: """Prepare the fonts and XObjects listed in the Resource attribute.""" self.resources = resources self.fontmap = {} @@ -349,7 +350,7 @@ def init_resources(self, resources) -> None: if not resources: return - def get_colorspace(spec) -> Optional[PDFColorSpace]: + def get_colorspace(spec: Any) -> Optional[PDFColorSpace]: if isinstance(spec, list): name = literal_name(spec[0]) else: @@ -391,7 +392,7 @@ def init_state(self, ctm: Matrix) -> None: self.device.set_ctm(self.ctm) self.textstate = PDFTextState() self.graphicstate = PDFGraphicState() - self.curpath: List[Tuple[str, float, float]] = [] + self.curpath: List[PathSegment] = [] # argstack: stack for command arguments. self.argstack: List[Any] = [] # set some global states. 
@@ -819,7 +820,7 @@ def do_T_a(self): self.textstate.linematrix = (0, 0) return - def do_TJ(self, seq: Iterable): + def do_TJ(self, seq: Iterable[Any]) -> None: """Show text, allowing individual glyph positioning""" if self.textstate.font is None: if settings.STRICT: @@ -923,7 +924,7 @@ def process_page(self, page: PDFPage) -> None: self.device.end_page(page) return - def render_contents(self, resources, streams: Sequence, + def render_contents(self, resources: Any, streams: Sequence[Any], ctm: Matrix = MATRIX_IDENTITY) -> None: """Render the content streams. diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index b5b89a53..bfcd013b 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -41,7 +41,7 @@ class PDFPage: beads: a chain that represents natural reading order. """ - def __init__(self, doc: PDFDocument, pageid: Any, attrs): + def __init__(self, doc: PDFDocument, pageid: Any, attrs: Any): """Initialize a page object. doc: a PDFDocument object. @@ -67,7 +67,7 @@ def __init__(self, doc: PDFDocument, pageid: Any, attrs): contents = [] if not isinstance(contents, list): contents = [contents] - self.contents: List = contents + self.contents: List[Any] = contents return def __repr__(self) -> str: diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index caed19df..5668edca 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -198,7 +198,7 @@ def close(self) -> None: def tell(self) -> int: return self.bufpos+self.charpos - def poll(self, pos=None, n=80) -> None: + def poll(self, pos: Optional[int] = None, n: int = 80) -> None: pos0 = self.fp.tell() if not pos: pos = self.bufpos+self.charpos diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 807ce11b..25f09728 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -5,7 +5,7 @@ import pathlib import struct from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional, - Set, Tuple, TypeVar, Union, TYPE_CHECKING) + Set, Tuple, TypeVar, Union, TYPE_CHECKING, cast) from 
html import escape if TYPE_CHECKING: @@ -71,7 +71,8 @@ def shorten_str(s: str, size: int) -> str: def compatible_encode_method(bytesorstring: Union[bytes, str], - encoding='utf-8', erraction='ignore') -> str: + encoding: str = 'utf-8', + erraction: str = 'ignore') -> str: """When Py2 str.encode is called, it often means bytes.encode in Py3. This does either. @@ -127,6 +128,7 @@ def apply_png_predictor(pred: Any, colors: int, columns: int, Point = Tuple[float, float] Rect = Tuple[float, float, float, float] Matrix = Tuple[float, float, float, float, float, float] +PathSegment = Tuple[str, float, float] # Matrix operations MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0) @@ -242,13 +244,13 @@ def nunpack(s: bytes, default: int = 0) -> int: elif length == 1: return ord(s) elif length == 2: - return struct.unpack('>H', s)[0] + return cast(int, struct.unpack('>H', s)[0]) elif length == 3: - return struct.unpack('>L', b'\x00' + s)[0] + return cast(int, struct.unpack('>L', b'\x00' + s)[0]) elif length == 4: - return struct.unpack('>L', s)[0] + return cast(int, struct.unpack('>L', s)[0]) elif length == 8: - return struct.unpack('>Q', s)[0] + return cast(int, struct.unpack('>Q', s)[0]) else: raise TypeError('invalid length: %d' % length)