From 9406040d8e4d627dab3d53ab88b16be43f20bcf1 Mon Sep 17 00:00:00 2001 From: Andrew Baumann <0xabu@users.noreply.github.com> Date: Sat, 9 Oct 2021 07:23:28 -0700 Subject: [PATCH] Add type annotations (#661) Squashed commit of the following: commit fa229f7b7591c07aea4e5a4545f9e0c34246e1cd Merge: eaab3c6 c3e3499 Author: Andrew Baumann Date: Mon Sep 6 20:33:06 2021 -0700 Merge branch 'develop' into mypy (and fixed types) commit eaab3c65e2e3ab5f1f400cfc5186a3834c4ffe34 Author: Andrew Baumann Date: Mon Sep 6 20:00:45 2021 -0700 reformat all multi-line function defs to one-arg-per-line commit 3fe2b69eed9197009d9da6776462f580ebf0dfa3 Author: Andrew Baumann Date: Mon Sep 6 15:58:48 2021 -0700 ccitt nit -- avoid casting needlessly commit 15983d8c1e7162632fde43752c9d1c15938cd980 Author: Andrew Baumann Date: Mon Sep 6 15:58:36 2021 -0700 tweak CHANGELOG commit 13dc0babf782938e7d5b5e482d4c5adf92d82702 Author: Andrew Baumann Date: Mon Sep 6 15:43:46 2021 -0700 add failing tests for dumppdf crash commit 6b509c517876b8c15ac5a98a963884e23bd2e4d8 Author: Andrew Baumann Date: Mon Sep 6 15:24:23 2021 -0700 ccitt: apply misc PR feedback commit feb031ba86d3f22e41cfbbda13f17c039359f1e6 Author: Andrew Baumann Date: Mon Sep 6 15:18:26 2021 -0700 add missing None return type to all __init__ methods commit c0d62d6c54c7ec37b40bea54a3f6a7a618ec0ec6 Author: Andrew Baumann Date: Mon Sep 6 15:13:08 2021 -0700 minor cleanup, remove a few more Any types commit b52a0594e1998a492c172538a9b35491c5fc5f52 Author: Andrew Baumann Date: Sun Sep 5 22:37:28 2021 -0700 tighten up types, avoid Any in favour of explicit casts commit e58fd48bd14f31bebd2de8259f12630ac02756d6 Author: Andrew Baumann Date: Sun Sep 5 14:10:49 2021 -0700 annotate ccitt.py, and fix one definite bug (array.tostring was renamed tobytes) commit 605290633e55595e5e0045840df5c5b1d9de843a Author: Andrew Baumann Date: Sat Sep 4 22:37:38 2021 -0700 python 3.7 back-compat commit 4dbcf8760f8a1d3e3d99f085476f86e6a043c80c Author: Andrew Baumann 
Date: Sat Sep 4 22:32:43 2021 -0700 annotate pdfminer.jbig2 commit 0d40b7c03a8028dc44acd3f457eac71abd681827 Author: Andrew Baumann Date: Sat Sep 4 22:31:33 2021 -0700 annotate pdf2txt.py commit 5f82eb4f5646b5d1285252689191e0a14557ec7b Author: Andrew Baumann Date: Sat Sep 4 09:16:31 2021 -0700 cleanup: make Plane generic commit 624fc92b88473ff36a174760883f34c22109da2b Author: Andrew Baumann Date: Fri Sep 3 23:16:51 2021 -0700 bluntly ignore calls to cryptography.hazmat commit 96b20439c169f40dbb114cabba6a582ad1ebe91e Author: Andrew Baumann Date: Fri Sep 3 23:01:06 2021 -0700 finish annotating, and disallow_untyped_defs for pdfminer.* _except_ ccitt and jbig2 commit 0ab586347861b72b1d16880dc9293f9ad597e20a Author: Andrew Baumann Date: Fri Sep 3 21:51:56 2021 -0700 annotate pdffont commit 4b689f1bcbdaf654feb9de81023e318ca310a12e Author: Andrew Baumann Date: Fri Sep 3 18:30:02 2021 -0700 annotate a couple more scripts; document sketchy code commit 291981ff3d273952ec9c92ef8ab948473558b787 Author: Andrew Baumann Date: Fri Sep 3 15:02:01 2021 -0700 pacify flake8 commit 45d2ce91ff333f3b7e34322b16e9c52b99b7a972 Author: Andrew Baumann Date: Fri Sep 3 14:31:48 2021 -0700 annotate dumppdf, and comment likely bugs commit 7278d83851cb336a1be3803a0993b5ec0ad39b4c Author: Andrew Baumann Date: Fri Sep 3 13:49:58 2021 -0700 enable mypy on tests and tools, fix one implicit reexport bug commit 4a83166ef4e4733cd2113f43188b585a4fda392b Author: Andrew Baumann Date: Fri Sep 3 13:25:59 2021 -0700 pdfdocument: per dumppdf.py, get_dest accepts either bytes or str commit 43701e1bee068df98f378a253c9c2150ee4ad9f7 Author: Andrew Baumann Date: Fri Sep 3 13:25:00 2021 -0700 layout: LAParams.boxes_flow may be None commit 164f81652f1788e74837466f0ab593e94079bc0f Author: Andrew Baumann Date: Fri Sep 3 09:45:09 2021 -0700 add whitespace, pacify flake8 commit 893b9fb9ec918032b36a30456fc0b7a217da86d8 Author: Andrew Baumann Date: Fri Sep 3 09:40:33 2021 -0700 support old Python without typing.Protocol 
commit dc245084102b7b04c3f5599d75b5d62ba4290787 Author: Andrew Baumann Date: Fri Sep 3 09:12:03 2021 -0700 Move "# type: ignore" comments to fix mypy on Python < 3.8 The placement of these comments got more flexible in 3.8 due to https://github.com/python/mypy/issues/1032 Satisfying older Python and fitting in flake8's 79-character line limit was quite a challenge! commit da03afe7bd2cf3336e611f467f1c901455940ae8 Author: Andrew Baumann Date: Thu Sep 2 22:59:58 2021 -0700 fix text output from HTMLConverter commit 5401276a2ed3b74a385ebcab5152485224146161 Author: Andrew Baumann Date: Thu Sep 2 22:40:22 2021 -0700 annotate high_level.py and the immediately-reachable internal APIs (mostly converters) commit cc490513f8f17a7adc0bcbab2e0e86f37e832300 Author: Andrew Baumann Date: Thu Sep 2 17:04:35 2021 -0700 * expand and improve annotations in cmap, encryption/decompression and fonts * disallow untyped calls; this way, we have a core set of typed code that can grow over time (just not for ccitt, because there's a ton of work lurking there) * expand "typing: none" comments to suppress a specific error code commit 92df54ba1d53d5dbbd5442757dd85be5b1851f99 Author: Andrew Baumann Date: Wed Sep 1 20:50:59 2021 -0700 update CHANGELOG commit f72aaead45d0615e472a9b3190c9551a6b67b36e Merge: ff787a9 8ea9f10 Author: Andrew Baumann Date: Wed Sep 1 20:47:03 2021 -0700 Merge branch 'develop' into mypy commit ff787a93986c60361536a97182a41774f4a53ac3 Author: Andrew Baumann Date: Sat Aug 21 21:46:14 2021 -0700 be more precise about types on ps/pdf stacks, remove most of the Any annotations commit be1550189e10717f6827dbb7009d6e8c8b3f4c62 Author: Andrew Baumann Date: Sat Aug 21 10:13:58 2021 -0700 silence missing imports, (maybe?) 
hook to tox commit ff4b6a9bd46b352583d823d39065652c9a6f05f4 Author: Andrew Baumann Date: Fri Aug 20 22:49:06 2021 -0700 turn on more strict checks, and untangle the layout mess with generics Status: $ mypy pdfminer pdfminer/ccitt.py:565: error: Cannot find implementation or library stub for module named "pygame" pdfminer/ccitt.py:565: note: See https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports pdfminer/pdfdocument.py:7: error: Skipping analyzing "cryptography.hazmat.backends": found module but no type hints or library stubs pdfminer/pdfdocument.py:8: error: Skipping analyzing "cryptography.hazmat.primitives.ciphers": found module but no type hints or library stubs pdfminer/pdfdevice.py:191: error: Argument 1 to "write" of "IO" has incompatible type "str"; expected "bytes" pdfminer/image.py:84: error: Cannot find implementation or library stub for module named "PIL" Found 5 errors in 4 files (checked 27 source files) pdfdevice.py:191 appears to be a real bug commit 5c9c0b19d26ae391aea0e69c2c819261cc04460c Author: Andrew Baumann Date: Fri Aug 20 17:22:41 2021 -0700 finish annotating layout commit 0e6871c16abb29df2868ab145b4ce451b4b6c777 Author: Andrew Baumann Date: Fri Aug 20 16:54:46 2021 -0700 general progress on annotations * finish utils * annotate more of pdfinterp, pdfdevice * document reason for # type: ignore comments * fix cyclic imports * satisfy flake8 commit 17d59f42917fbf9b2b2eb844d3e83a8f2a3f123a Author: Andrew Baumann Date: Thu Aug 19 21:38:50 2021 -0700 WIP on type annotations With the possible exception of psparser.py, this is far from complete. 
$ mypy pdfminer pdfminer/ccitt.py:565: error: Cannot find implementation or library stub for module named "pygame" pdfminer/ccitt.py:565: note: See https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports pdfminer/pdfdocument.py:7: error: Skipping analyzing "cryptography.hazmat.backends": found module but no type hints or library stubs pdfminer/pdfdocument.py:8: error: Skipping analyzing "cryptography.hazmat.primitives.ciphers": found module but no type hints or library stubs pdfminer/image.py:84: error: Cannot find implementation or library stub for module named "PIL" --- CHANGELOG.md | 1 + docs/source/conf.py | 3 +- mypy.ini | 27 +++ pdfminer/_saslprep.py | 5 +- pdfminer/arcfour.py | 7 +- pdfminer/ascii85.py | 6 +- pdfminer/ccitt.py | 102 ++++---- pdfminer/cmapdb.py | 114 +++++---- pdfminer/converter.py | 300 +++++++++++++++++------- pdfminer/encodingdb.py | 23 +- pdfminer/high_level.py | 63 +++-- pdfminer/image.py | 36 ++- pdfminer/jbig2.py | 157 ++++++++----- pdfminer/latin_enc.py | 7 +- pdfminer/layout.py | 394 ++++++++++++++++++++----------- pdfminer/lzw.py | 25 +- pdfminer/pdfcolor.py | 7 +- pdfminer/pdfdevice.py | 176 ++++++++++---- pdfminer/pdfdocument.py | 335 ++++++++++++++++---------- pdfminer/pdffont.py | 326 ++++++++++++++++---------- pdfminer/pdfinterp.py | 453 ++++++++++++++++++++++-------------- pdfminer/pdfpage.py | 45 ++-- pdfminer/pdfparser.py | 35 +-- pdfminer/pdftypes.py | 111 ++++++--- pdfminer/psparser.py | 148 +++++++----- pdfminer/runlength.py | 2 +- pdfminer/utils.py | 153 ++++++++---- setup.py | 2 +- tests/test_tools_dumppdf.py | 12 +- tools/conv_afm.py | 2 +- tools/conv_cmap.py | 2 +- tools/conv_glyphlist.py | 2 +- tools/dumppdf.py | 89 ++++--- tools/pdf2txt.py | 55 +++-- tools/pdfdiff.py | 13 +- tools/pdfstats.py | 15 +- tools/prof.py | 22 +- tox.ini | 1 + 38 files changed, 2165 insertions(+), 1111 deletions(-) create mode 100644 mypy.ini diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a508b25..29059ac5 100644 --- 
a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Added - Add support for PDF 2.0 (ISO 32000-2) AES-256 encryption ([#614](https://github.com/pdfminer/pdfminer.six/pull/614)) - Support for Paeth PNG filter compression (predictor value = 4) ([#537](https://github.com/pdfminer/pdfminer.six/pull/537)) +- Type annotations ([#661](https://github.com/pdfminer/pdfminer.six/pull/661)) ### Fixed - `KeyError` when `'Encrypt'` but not `'ID'` present in `trailer` ([#594](https://github.com/pdfminer/pdfminer.six/pull/594)) diff --git a/docs/source/conf.py b/docs/source/conf.py index fcbf595d..ccb6ec1d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,6 +12,7 @@ import os import sys +from typing import List import pdfminer @@ -48,7 +49,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns = [] +exclude_patterns: List[str] = [] # -- Options for HTML output ------------------------------------------------- diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..eaddd861 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,27 @@ +[mypy] +warn_unused_configs = True +disallow_any_generics = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_incomplete_defs = True +disallow_untyped_decorators = True +no_implicit_optional = True +warn_redundant_casts = True +warn_return_any = True +no_implicit_reexport = True +strict_equality = True + +# This seems impossible to turn on in a version-independent manner +warn_unused_ignores = False + +[mypy-pdfminer.*] +disallow_untyped_defs = True + +[mypy-cryptography.hazmat.*] +ignore_missing_imports = True + +[mypy-nose.*] +ignore_missing_imports = True + +[mypy-setuptools] +ignore_missing_imports = True diff --git a/pdfminer/_saslprep.py b/pdfminer/_saslprep.py index 067a077f..32c68cb2 100644 --- a/pdfminer/_saslprep.py +++ b/pdfminer/_saslprep.py @@ -21,10 +21,11 @@ __all__ = ['saslprep'] import stringprep +from typing import Callable, Tuple import unicodedata # RFC4013 section 2.3 prohibited output. -_PROHIBITED = ( +_PROHIBITED: Tuple[Callable[[str], bool], ...] = ( # A strict reading of RFC 4013 requires table c12 here, but # characters from it are mapped to SPACE in the Map step. Can # normalization reintroduce them somehow? @@ -39,7 +40,7 @@ stringprep.in_table_c9) -def saslprep(data: str, prohibit_unassigned_code_points=True) -> str: +def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str: """An implementation of RFC4013 SASLprep. :param data: The string to SASLprep. 
diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py index e40b0804..dd2697ce 100644 --- a/pdfminer/arcfour.py +++ b/pdfminer/arcfour.py @@ -5,9 +5,12 @@ """ +from typing import Sequence + + class Arcfour: - def __init__(self, key): + def __init__(self, key: Sequence[int]) -> None: # because Py3 range is not indexable s = [i for i in range(256)] j = 0 @@ -19,7 +22,7 @@ def __init__(self, key): (self.i, self.j) = (0, 0) return - def process(self, data): + def process(self, data: bytes) -> bytes: (i, j) = (self.i, self.j) s = self.s r = b'' diff --git a/pdfminer/ascii85.py b/pdfminer/ascii85.py index cde3f908..7c7c757f 100644 --- a/pdfminer/ascii85.py +++ b/pdfminer/ascii85.py @@ -9,7 +9,7 @@ # ascii85decode(data) -def ascii85decode(data): +def ascii85decode(data: bytes) -> bytes: """ In ASCII85 encoding, every four bytes are encoded with five ASCII letters, using 85 different types of characters (as 256**4 < 85**5). @@ -47,7 +47,7 @@ def ascii85decode(data): trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE) -def asciihexdecode(data): +def asciihexdecode(data: bytes) -> bytes: """ ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1 For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the @@ -57,7 +57,7 @@ def asciihexdecode(data): the EOD marker after reading an odd number of hexadecimal digits, it will behave as if a 0 followed the last digit. 
""" - def decode(x): + def decode(x: bytes) -> bytes: i = int(x, 16) return bytes((i,)) diff --git a/pdfminer/ccitt.py b/pdfminer/ccitt.py index 1c00eb0e..4dadc813 100644 --- a/pdfminer/ccitt.py +++ b/pdfminer/ccitt.py @@ -11,25 +11,39 @@ # FOR GROUP 4 FACSIMILE APPARATUS" -import sys import array +from typing import (Any, Callable, Dict, Iterator, List, MutableSequence, + Optional, Sequence, Union, cast) -def get_bytes(data): +def get_bytes(data: bytes) -> Iterator[int]: yield from data +# Workaround https://github.com/python/mypy/issues/731 +BitParserState = MutableSequence[Any] +# A better definition (not supported by mypy) would be: +# BitParserState = MutableSequence[Union["BitParserState", int, str, None]] + + class BitParser: - def __init__(self): + _state: BitParserState + + # _accept is declared Optional solely as a workaround for + # https://github.com/python/mypy/issues/708 + _accept: Optional[Callable[[Any], BitParserState]] + + def __init__(self) -> None: self._pos = 0 return @classmethod - def add(cls, root, v, bits): - p = root + def add(cls, root: BitParserState, v: Union[int, str], bits: str) -> None: + p: BitParserState = root b = None for i in range(len(bits)): if 0 < i: + assert b is not None if p[b] is None: p[b] = [None, None] p = p[b] @@ -37,16 +51,17 @@ def add(cls, root, v, bits): b = 1 else: b = 0 + assert b is not None p[b] = v return - def feedbytes(self, data): + def feedbytes(self, data: bytes) -> None: for byte in get_bytes(data): for m in (128, 64, 32, 16, 8, 4, 2, 1): self._parse_bit(byte & m) return - def _parse_bit(self, x): + def _parse_bit(self, x: object) -> None: if x: v = self._state[1] else: @@ -55,6 +70,7 @@ def _parse_bit(self, x): if isinstance(v, list): self._state = v else: + assert self._accept is not None self._state = self._accept(v) return @@ -318,14 +334,16 @@ class InvalidData(Exception): class ByteSkip(Exception): pass - def __init__(self, width, bytealign=False): + _color: int + + def __init__(self, width: int, 
bytealign: bool = False) -> None: BitParser.__init__(self) self.width = width self.bytealign = bytealign self.reset() return - def feedbytes(self, data): + def feedbytes(self, data: bytes) -> None: for byte in get_bytes(data): try: for m in (128, 64, 32, 16, 8, 4, 2, 1): @@ -337,7 +355,7 @@ def feedbytes(self, data): break return - def _parse_mode(self, mode): + def _parse_mode(self, mode: object) -> BitParserState: if mode == 'p': self._do_pass() self._flush_line() @@ -361,7 +379,7 @@ def _parse_mode(self, mode): else: raise self.InvalidData(mode) - def _parse_horiz1(self, n): + def _parse_horiz1(self, n: Any) -> BitParserState: if n is None: raise self.InvalidData self._n1 += n @@ -374,7 +392,7 @@ def _parse_horiz1(self, n): else: return self.BLACK - def _parse_horiz2(self, n): + def _parse_horiz2(self, n: Any) -> BitParserState: if n is None: raise self.InvalidData self._n2 += n @@ -389,7 +407,7 @@ def _parse_horiz2(self, n): else: return self.BLACK - def _parse_uncompressed(self, bits): + def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState: if not bits: raise self.InvalidData if bits.startswith('T'): @@ -401,10 +419,10 @@ def _parse_uncompressed(self, bits): self._do_uncompressed(bits) return self.UNCOMPRESSED - def _get_bits(self): + def _get_bits(self) -> str: return ''.join(str(b) for b in self._curline[:self._curpos]) - def _get_refline(self, i): + def _get_refline(self, i: int) -> str: if i < 0: return '[]'+''.join(str(b) for b in self._refline) elif len(self._refline) <= i: @@ -414,7 +432,7 @@ def _get_refline(self, i): '['+str(self._refline[i])+']' + ''.join(str(b) for b in self._refline[i+1:])) - def reset(self): + def reset(self) -> None: self._y = 0 self._curline = array.array('b', [1]*self.width) self._reset_line() @@ -422,18 +440,18 @@ def reset(self): self._state = self.MODE return - def output_line(self, y, bits): + def output_line(self, y: int, bits: Sequence[int]) -> None: print(y, ''.join(str(b) for b in bits)) return - def 
_reset_line(self): + def _reset_line(self) -> None: self._refline = self._curline self._curline = array.array('b', [1]*self.width) self._curpos = -1 self._color = 1 return - def _flush_line(self): + def _flush_line(self) -> None: if self.width <= self._curpos: self.output_line(self._y, self._curline) self._y += 1 @@ -442,7 +460,7 @@ def _flush_line(self): raise self.ByteSkip return - def _do_vertical(self, dx): + def _do_vertical(self, dx: int) -> None: x1 = self._curpos+1 while 1: if x1 == 0: @@ -467,7 +485,7 @@ def _do_vertical(self, dx): self._color = 1-self._color return - def _do_pass(self): + def _do_pass(self) -> None: x1 = self._curpos+1 while 1: if x1 == 0: @@ -494,7 +512,7 @@ def _do_pass(self): self._curpos = x1 return - def _do_horizontal(self, n1, n2): + def _do_horizontal(self, n1: int, n2: int) -> None: if self._curpos < 0: self._curpos = 0 x = self._curpos @@ -511,7 +529,7 @@ def _do_horizontal(self, n1, n2): self._curpos = x return - def _do_uncompressed(self, bits): + def _do_uncompressed(self, bits: str) -> None: for c in bits: self._curline[self._curpos] = int(c) self._curpos += 1 @@ -521,32 +539,33 @@ def _do_uncompressed(self, bits): class CCITTFaxDecoder(CCITTG4Parser): - def __init__(self, width, bytealign=False, reversed=False): + def __init__(self, width: int, bytealign: bool = False, + reversed: bool = False) -> None: CCITTG4Parser.__init__(self, width, bytealign=bytealign) self.reversed = reversed self._buf = b'' return - def close(self): + def close(self) -> bytes: return self._buf - def output_line(self, y, bits): - bytes = array.array('B', [0]*((len(bits)+7)//8)) + def output_line(self, y: int, bits: Sequence[int]) -> None: + arr = array.array('B', [0]*((len(bits)+7)//8)) if self.reversed: bits = [1-b for b in bits] for (i, b) in enumerate(bits): if b: - bytes[i//8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8] - self._buf += bytes.tostring() + arr[i//8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8] + self._buf += arr.tobytes() return -def 
ccittfaxdecode(data, params): +def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes: K = params.get('K') - cols = params.get('Columns') - bytealign = params.get('EncodedByteAlign') - reversed = params.get('BlackIs1') if K == -1: + cols = cast(int, params.get('Columns')) + bytealign = cast(bool, params.get('EncodedByteAlign')) + reversed = cast(bool, params.get('BlackIs1')) parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed) else: raise ValueError(K) @@ -555,19 +574,20 @@ def ccittfaxdecode(data, params): # test -def main(argv): +def main(argv: List[str]) -> None: if not argv[1:]: import unittest - return unittest.main() + unittest.main() + return class Parser(CCITTG4Parser): - def __init__(self, width, bytealign=False): - import pygame + def __init__(self, width: int, bytealign: bool = False) -> None: + import pygame # type: ignore[import] CCITTG4Parser.__init__(self, width, bytealign=bytealign) self.img = pygame.Surface((self.width, 1000)) return - def output_line(self, y, bits): + def output_line(self, y: int, bits: Sequence[int]) -> None: for (x, b) in enumerate(bits): if b: self.img.set_at((x, y), (255, 255, 255)) @@ -575,7 +595,7 @@ def output_line(self, y, bits): self.img.set_at((x, y), (0, 0, 0)) return - def close(self): + def close(self) -> None: import pygame pygame.image.save(self.img, 'out.bmp') return @@ -587,7 +607,3 @@ def close(self): parser.close() fp.close() return - - -if __name__ == '__main__': - sys.exit(main(sys.argv)) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 35ced14f..853d877a 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -16,9 +16,12 @@ import pickle as pickle import struct import logging +from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, + MutableMapping, Optional, TextIO, Tuple, Union, cast) from .psparser import PSStackParser from .psparser import PSSyntaxError from .psparser import PSEOF +from .psparser import PSKeyword from .psparser import PSLiteral from 
.psparser import literal_name from .psparser import KWD @@ -38,44 +41,48 @@ class CMapBase: debug = 0 - def __init__(self, **kwargs): - self.attrs = kwargs.copy() + def __init__(self, **kwargs: object) -> None: + self.attrs: MutableMapping[str, object] = kwargs.copy() return - def is_vertical(self): + def is_vertical(self) -> bool: return self.attrs.get('WMode', 0) != 0 - def set_attr(self, k, v): + def set_attr(self, k: str, v: object) -> None: self.attrs[k] = v return - def add_code2cid(self, code, cid): + def add_code2cid(self, code: str, cid: int) -> None: return - def add_cid2unichr(self, cid, code): + def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int] + ) -> None: return - def use_cmap(self, cmap): + def use_cmap(self, cmap: "CMapBase") -> None: return + def decode(self, code: bytes) -> Iterable[int]: + raise NotImplementedError + class CMap(CMapBase): - def __init__(self, **kwargs): + def __init__(self, **kwargs: Union[str, int]) -> None: CMapBase.__init__(self, **kwargs) - self.code2cid = {} + self.code2cid: Dict[int, object] = {} return - def __repr__(self): + def __repr__(self) -> str: return '<CMap: %s>' % self.attrs.get('CMapName') - def use_cmap(self, cmap): + def use_cmap(self, cmap: CMapBase) -> None: assert isinstance(cmap, CMap), str(type(cmap)) - def copy(dst, src): + def copy(dst: Dict[int, object], src: Dict[int, object]) -> None: for (k, v) in src.items(): if isinstance(v, dict): - d = {} + d: Dict[int, object] = {} dst[k] = d copy(d, v) else: @@ -83,20 +90,24 @@ def copy(dst, src): copy(self.code2cid, cmap.code2cid) return - def decode(self, code): + def decode(self, code: bytes) -> Iterator[int]: log.debug('decode: %r, %r', self, code) d = self.code2cid for i in iter(code): if i in d: - d = d[i] - if isinstance(d, int): - yield d + x = d[i] + if isinstance(x, int): + yield x d = self.code2cid + else: + d = cast(Dict[int, object], x) else: d = self.code2cid return - def dump(self, out=sys.stdout, code2cid=None, code=None): + def 
dump(self, out: TextIO = sys.stdout, + code2cid: Optional[Dict[int, object]] = None, + code: Tuple[int, ...] = ()) -> None: if code2cid is None: code2cid = self.code2cid code = () @@ -105,13 +116,13 @@ def dump(self, out=sys.stdout, code2cid=None, code=None): if isinstance(v, int): out.write('code %r = cid %d\n' % (c, v)) else: - self.dump(out=out, code2cid=v, code=c) + self.dump(out=out, code2cid=cast(Dict[int, object], v), code=c) return class IdentityCMap(CMapBase): - def decode(self, code): + def decode(self, code: bytes) -> Tuple[int, ...]: n = len(code)//2 if n: return struct.unpack('>%dH' % n, code) @@ -121,7 +132,7 @@ def decode(self, code): class IdentityCMapByte(IdentityCMap): - def decode(self, code): + def decode(self, code: bytes) -> Tuple[int, ...]: n = len(code) if n: return struct.unpack('>%dB' % n, code) @@ -131,19 +142,19 @@ def decode(self, code): class UnicodeMap(CMapBase): - def __init__(self, **kwargs): + def __init__(self, **kwargs: Union[str, int]) -> None: CMapBase.__init__(self, **kwargs) - self.cid2unichr = {} + self.cid2unichr: Dict[int, str] = {} return - def __repr__(self): + def __repr__(self) -> str: return '<UnicodeMap: %s>' % self.attrs.get('CMapName') - def get_unichr(self, cid): + def get_unichr(self, cid: int) -> str: log.debug('get_unichr: %r, %r', self, cid) return self.cid2unichr[cid] - def dump(self, out=sys.stdout): + def dump(self, out: TextIO = sys.stdout) -> None: for (k, v) in sorted(self.cid2unichr.items()): out.write('cid %d = unicode %r\n' % (k, v)) return @@ -151,29 +162,31 @@ def dump(self, out=sys.stdout): class FileCMap(CMap): - def add_code2cid(self, code, cid): + def add_code2cid(self, code: str, cid: int) -> None: assert isinstance(code, str) and isinstance(cid, int),\ str((type(code), type(cid))) d = self.code2cid for c in code[:-1]: - c = ord(c) - if c in d: - d = d[c] + ci = ord(c) + if ci in d: + d = cast(Dict[int, object], d[ci]) else: - t = {} - d[c] = t + t: Dict[int, object] = {} + d[ci] = t d = t - c = ord(code[-1]) 
- d[c] = cid + ci = ord(code[-1]) + d[ci] = cid return class FileUnicodeMap(UnicodeMap): - def add_cid2unichr(self, cid, code): + def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int] + ) -> None: assert isinstance(cid, int), str(type(cid)) if isinstance(code, PSLiteral): # Interpret as an Adobe glyph name. + assert isinstance(code.name, str) self.cid2unichr[cid] = name2unicode(code.name) elif isinstance(code, bytes): # Interpret as UTF-16BE. @@ -187,8 +200,8 @@ def add_cid2unichr(self, cid, code): class PyCMap(CMap): - def __init__(self, name, module): - CMap.__init__(self, CMapName=name) + def __init__(self, name: str, module: Any) -> None: + super().__init__(CMapName=name) self.code2cid = module.CODE2CID if module.IS_VERTICAL: self.attrs['WMode'] = 1 @@ -197,8 +210,8 @@ def __init__(self, name, module): class PyUnicodeMap(UnicodeMap): - def __init__(self, name, module, vertical): - UnicodeMap.__init__(self, CMapName=name) + def __init__(self, name: str, module: Any, vertical: bool) -> None: + super().__init__(CMapName=name) if vertical: self.cid2unichr = module.CID2UNICHR_V self.attrs['WMode'] = 1 @@ -209,14 +222,14 @@ def __init__(self, name, module, vertical): class CMapDB: - _cmap_cache = {} - _umap_cache = {} + _cmap_cache: Dict[str, PyCMap] = {} + _umap_cache: Dict[str, List[PyUnicodeMap]] = {} class CMapNotFound(CMapError): pass @classmethod - def _load_data(cls, name): + def _load_data(cls, name: str) -> Any: name = name.replace("\0", "") filename = '%s.pickle.gz' % name log.info('loading: %r', name) @@ -234,7 +247,7 @@ def _load_data(cls, name): raise CMapDB.CMapNotFound(name) @classmethod - def get_cmap(cls, name): + def get_cmap(cls, name: str) -> CMapBase: if name == 'Identity-H': return IdentityCMap(WMode=0) elif name == 'Identity-V': @@ -252,7 +265,7 @@ def get_cmap(cls, name): return cmap @classmethod - def get_unicode_map(cls, name, vertical=False): + def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap: try: 
return cls._umap_cache[name][vertical] except KeyError: @@ -263,16 +276,16 @@ def get_unicode_map(cls, name, vertical=False): return cls._umap_cache[name][vertical] -class CMapParser(PSStackParser): +class CMapParser(PSStackParser[PSKeyword]): - def __init__(self, cmap, fp): + def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None: PSStackParser.__init__(self, fp) self.cmap = cmap # some ToUnicode maps don't have "begincmap" keyword. self._in_cmap = True return - def run(self): + def run(self) -> None: try: self.nextobject() except PSEOF: @@ -296,7 +309,7 @@ def run(self): KEYWORD_BEGINNOTDEFRANGE = KWD(b'beginnotdefrange') KEYWORD_ENDNOTDEFRANGE = KWD(b'endnotdefrange') - def do_keyword(self, pos, token): + def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_BEGINCMAP: self._in_cmap = True self.popall() @@ -380,6 +393,7 @@ def do_keyword(self, pos, token): for i in range(e1-s1+1): self.cmap.add_cid2unichr(s1+i, code[i]) else: + assert isinstance(code, bytes) var = code[-4:] base = nunpack(var) prefix = code[:-4] @@ -410,7 +424,7 @@ def do_keyword(self, pos, token): return -def main(argv): +def main(argv: List[str]) -> None: args = argv[1:] for fname in args: fp = open(fname, 'rb') @@ -422,4 +436,4 @@ def main(argv): if __name__ == '__main__': - sys.exit(main(sys.argv)) + main(sys.argv) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 812f6682..bffbb89e 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -1,13 +1,19 @@ import io import logging +from pdfminer.pdfcolor import PDFColorSpace +from typing import (BinaryIO, Dict, Generic, List, Optional, Sequence, TextIO, + Tuple, TypeVar, Union, cast) import re from . 
import utils +from .layout import LAParams, LTComponent, TextGroupElement from .layout import LTChar from .layout import LTContainer from .layout import LTCurve from .layout import LTFigure from .layout import LTImage +from .layout import LTItem +from .layout import LTLayoutContainer from .layout import LTLine from .layout import LTPage from .layout import LTRect @@ -17,25 +23,38 @@ from .layout import LTTextGroup from .layout import LTTextLine from .pdfdevice import PDFTextDevice +from .pdffont import PDFFont from .pdffont import PDFUnicodeNotDefined +from .pdfinterp import PDFGraphicState, PDFResourceManager +from .pdfpage import PDFPage +from .pdftypes import PDFStream +from .utils import AnyIO, Point, Matrix, Rect, PathSegment from .utils import apply_matrix_pt from .utils import bbox2str from .utils import enc from .utils import mult_matrix +from .image import ImageWriter log = logging.getLogger(__name__) class PDFLayoutAnalyzer(PDFTextDevice): - - def __init__(self, rsrcmgr, pageno=1, laparams=None): + cur_item: LTLayoutContainer + ctm: Matrix + + def __init__( + self, + rsrcmgr: PDFResourceManager, + pageno: int = 1, + laparams: Optional[LAParams] = None + ) -> None: PDFTextDevice.__init__(self, rsrcmgr) self.pageno = pageno self.laparams = laparams - self._stack = [] + self._stack: List[LTLayoutContainer] = [] return - def begin_page(self, page, ctm): + def begin_page(self, page: PDFPage, ctm: Matrix) -> None: (x0, y0, x1, y1) = page.mediabox (x0, y0) = apply_matrix_pt(ctm, (x0, y0)) (x1, y1) = apply_matrix_pt(ctm, (x1, y1)) @@ -43,7 +62,7 @@ def begin_page(self, page, ctm): self.cur_item = LTPage(self.pageno, mediabox) return - def end_page(self, page): + def end_page(self, page: PDFPage) -> None: assert not self._stack, str(len(self._stack)) assert isinstance(self.cur_item, LTPage), str(type(self.cur_item)) if self.laparams is not None: @@ -52,19 +71,19 @@ def end_page(self, page): self.receive_layout(self.cur_item) return - def begin_figure(self, name, 
bbox, matrix): + def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None: self._stack.append(self.cur_item) self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm)) return - def end_figure(self, _): + def end_figure(self, _: str) -> None: fig = self.cur_item assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item)) self.cur_item = self._stack.pop() self.cur_item.add(fig) return - def render_image(self, name, stream): + def render_image(self, name: str, stream: PDFStream) -> None: assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item)) item = LTImage(name, stream, (self.cur_item.x0, self.cur_item.y0, @@ -72,7 +91,14 @@ def render_image(self, name, stream): self.cur_item.add(item) return - def paint_path(self, gstate, stroke, fill, evenodd, path): + def paint_path( + self, + gstate: PDFGraphicState, + stroke: bool, + fill: bool, + evenodd: bool, + path: Sequence[PathSegment] + ) -> None: """Paint paths described in section 4.4 of the PDF reference manual""" shape = ''.join(x[0] for x in path) @@ -90,7 +116,8 @@ def paint_path(self, gstate, stroke, fill, evenodd, path): # And, per Section 4.4's Table 4.9, all other path commands place # their point-position in their final two arguments. (Any preceding # arguments represent control points on Bézier curves.) 
- raw_pts = [p[-2:] if p[0] != 'h' else path[0][-2:] for p in path] + raw_pts = [cast(Point, p[-2:] if p[0] != 'h' else path[0][-2:]) + for p in path] pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts] if shape in {'mlh', 'ml'}: @@ -123,8 +150,17 @@ def paint_path(self, gstate, stroke, fill, evenodd, path): gstate.scolor, gstate.ncolor) self.cur_item.add(curve) - def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, - graphicstate): + def render_char( + self, + matrix: Matrix, + font: PDFFont, + fontsize: float, + scaling: float, + rise: float, + cid: int, + ncs: PDFColorSpace, + graphicstate: PDFGraphicState + ) -> float: try: text = font.to_unichr(cid) assert isinstance(text, str), str(type(text)) @@ -137,40 +173,56 @@ def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, self.cur_item.add(item) return item.adv - def handle_undefined_char(self, font, cid): + def handle_undefined_char(self, font: PDFFont, cid: int) -> str: log.info('undefined: %r, %r', font, cid) return '(cid:%d)' % cid - def receive_layout(self, ltpage): + def receive_layout(self, ltpage: LTPage) -> None: return class PDFPageAggregator(PDFLayoutAnalyzer): - def __init__(self, rsrcmgr, pageno=1, laparams=None): + def __init__( + self, + rsrcmgr: PDFResourceManager, + pageno: int = 1, + laparams: Optional[LAParams] = None + ) -> None: PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) - self.result = None + self.result: Optional[LTPage] = None return - def receive_layout(self, ltpage): + def receive_layout(self, ltpage: LTPage) -> None: self.result = ltpage return - def get_result(self): + def get_result(self) -> LTPage: + assert self.result is not None return self.result -class PDFConverter(PDFLayoutAnalyzer): - def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, - laparams=None): +# Some PDFConverter children support only binary I/O +IOType = TypeVar('IOType', TextIO, BinaryIO, AnyIO) + + +class 
PDFConverter(PDFLayoutAnalyzer, Generic[IOType]): + def __init__( + self, + rsrcmgr: PDFResourceManager, + outfp: IOType, + codec: str = 'utf-8', + pageno: int = 1, + laparams: Optional[LAParams] = None + ) -> None: PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) - self.outfp = outfp + self.outfp: IOType = outfp self.codec = codec self.outfp_binary = self._is_binary_stream(self.outfp) @staticmethod - def _is_binary_stream(outfp): + def _is_binary_stream(outfp: AnyIO) -> bool: """Test if an stream is binary or not""" if 'b' in getattr(outfp, 'mode', ''): return True @@ -187,24 +239,33 @@ def _is_binary_stream(outfp): return True -class TextConverter(PDFConverter): - def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, - showpageno=False, imagewriter=None): - PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, - laparams=laparams) +class TextConverter(PDFConverter[AnyIO]): + def __init__( + self, + rsrcmgr: PDFResourceManager, + outfp: AnyIO, + codec: str = 'utf-8', + pageno: int = 1, + laparams: Optional[LAParams] = None, + showpageno: bool = False, + imagewriter: Optional[ImageWriter] = None + ) -> None: + super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, + laparams=laparams) self.showpageno = showpageno self.imagewriter = imagewriter return - def write_text(self, text): + def write_text(self, text: str) -> None: text = utils.compatible_encode_method(text, self.codec, 'ignore') if self.outfp_binary: - text = text.encode() - self.outfp.write(text) + cast(BinaryIO, self.outfp).write(text.encode()) + else: + cast(TextIO, self.outfp).write(text) return - def receive_layout(self, ltpage): - def render(item): + def receive_layout(self, ltpage: LTPage) -> None: + def render(item: LTItem) -> None: if isinstance(item, LTContainer): for child in item: render(child) @@ -224,17 +285,24 @@ def render(item): # Some dummy functions to save memory/CPU when all that is wanted # is text. 
This stops all the image and drawing output from being # recorded and taking up RAM. - def render_image(self, name, stream): + def render_image(self, name: str, stream: PDFStream) -> None: if self.imagewriter is None: return PDFConverter.render_image(self, name, stream) return - def paint_path(self, gstate, stroke, fill, evenodd, path): + def paint_path( + self, + gstate: PDFGraphicState, + stroke: bool, + fill: bool, + evenodd: bool, + path: Sequence[PathSegment] + ) -> None: return -class HTMLConverter(PDFConverter): +class HTMLConverter(PDFConverter[AnyIO]): RECT_COLORS = { 'figure': 'yellow', 'textline': 'magenta', @@ -249,12 +317,30 @@ class HTMLConverter(PDFConverter): 'char': 'black', } - def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, - scale=1, fontscale=1.0, layoutmode='normal', showpageno=True, - pagemargin=50, imagewriter=None, debug=0, rect_colors=None, - text_colors=None): + def __init__( + self, + rsrcmgr: PDFResourceManager, + outfp: AnyIO, + codec: str = 'utf-8', + pageno: int = 1, + laparams: Optional[LAParams] = None, + scale: float = 1, + fontscale: float = 1.0, + layoutmode: str = 'normal', + showpageno: bool = True, + pagemargin: int = 50, + imagewriter: Optional[ImageWriter] = None, + debug: int = 0, + rect_colors: Optional[Dict[str, str]] = None, + text_colors: Optional[Dict[str, str]] = None + ) -> None: PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) + + # write() assumes a codec for binary I/O, or no codec for text I/O. 
+ if self.outfp_binary == (not self.codec): + raise ValueError("Codec is required for a binary I/O output") + if text_colors is None: text_colors = {'char': 'black'} if rect_colors is None: @@ -271,19 +357,20 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, if debug: self.rect_colors.update(self.RECT_COLORS) self.text_colors.update(self.TEXT_COLORS) - self._yoffset = self.pagemargin - self._font = None - self._fontstack = [] + self._yoffset: float = self.pagemargin + self._font: Optional[Tuple[str, float]] = None + self._fontstack: List[Optional[Tuple[str, float]]] = [] self.write_header() return - def write(self, text): + def write(self, text: str) -> None: if self.codec: - text = text.encode(self.codec) - self.outfp.write(text) + cast(BinaryIO, self.outfp).write(text.encode(self.codec)) + else: + cast(TextIO, self.outfp).write(text) return - def write_header(self): + def write_header(self) -> None: self.write('\n') if self.codec: s = '{}'.format(i, i) for i in range(1, self.pageno)] s = '
Page: %s
\n' % \ @@ -303,28 +390,49 @@ def write_footer(self): self.write('\n') return - def write_text(self, text): + def write_text(self, text: str) -> None: self.write(enc(text)) return - def place_rect(self, color, borderwidth, x, y, w, h): - color = self.rect_colors.get(color) - if color is not None: + def place_rect( + self, + color: str, + borderwidth: int, + x: float, + y: float, + w: float, + h: float + ) -> None: + color2 = self.rect_colors.get(color) + if color2 is not None: s = '\n' % \ - (color, borderwidth, x * self.scale, + (color2, borderwidth, x * self.scale, (self._yoffset - y) * self.scale, w * self.scale, h * self.scale) self.write( s) return - def place_border(self, color, borderwidth, item): + def place_border( + self, + color: str, + borderwidth: int, + item: LTComponent + ) -> None: self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height) return - def place_image(self, item, borderwidth, x, y, w, h): + def place_image( + self, + item: LTImage, + borderwidth: int, + x: float, + y: float, + w: float, + h: float + ) -> None: if self.imagewriter is not None: name = self.imagewriter.export_image(item) s = '' % \ - (color, x * self.scale, (self._yoffset - y) * self.scale, + (color2, x * self.scale, (self._yoffset - y) * self.scale, size * self.scale * self.fontscale) self.write(s) self.write_text(text) self.write('\n') return - def begin_div(self, color, borderwidth, x, y, w, h, writing_mode=False): + def begin_div( + self, + color: str, + borderwidth: int, + x: float, + y: float, + w: float, + h: float, + writing_mode: str = 'False' + ) -> None: self._fontstack.append(self._font) self._font = None s = '
None: if self.codec: - text = text.encode(self.codec) - self.outfp.write(text) + cast(BinaryIO, self.outfp).write(text.encode(self.codec)) + else: + cast(TextIO, self.outfp).write(text) return - def write_header(self): + def write_header(self) -> None: if self.codec: self.write('\n' % self.codec) else: @@ -487,18 +626,18 @@ def write_header(self): self.write('\n') return - def write_footer(self): + def write_footer(self) -> None: self.write('\n') return - def write_text(self, text): + def write_text(self, text: str) -> None: if self.stripcontrol: text = self.CONTROL.sub('', text) self.write(enc(text)) return - def receive_layout(self, ltpage): - def show_group(item): + def receive_layout(self, ltpage: LTPage) -> None: + def show_group(item: LTItem) -> None: if isinstance(item, LTTextBox): self.write('\n' % (item.index, bbox2str(item.bbox))) @@ -509,7 +648,8 @@ def show_group(item): self.write('\n') return - def render(item): + def render(item: LTItem) -> None: + child: LTItem if isinstance(item, LTPage): s = '\n' % \ (item.pageid, bbox2str(item.bbox), item.rotate) @@ -580,6 +720,6 @@ def render(item): render(ltpage) return - def close(self): + def close(self) -> None: self.write_footer() return diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index 58998a90..3db476f5 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -1,5 +1,6 @@ import logging import re +from typing import Dict, Iterable, Optional, cast from .glyphlist import glyphname2unicode from .latin_enc import ENCODING @@ -10,7 +11,7 @@ log = logging.getLogger(__name__) -def name2unicode(name): +def name2unicode(name: str) -> str: """Converts Adobe glyph names to Unicode numbers. 
In contrast to the specification, this raises a KeyError instead of return @@ -32,7 +33,7 @@ def name2unicode(name): else: if name in glyphname2unicode: - return glyphname2unicode.get(name) + return glyphname2unicode[name] elif name.startswith('uni'): name_without_uni = name.strip('uni') @@ -59,7 +60,7 @@ def name2unicode(name): 'it does not match specification' % name) -def raise_key_error_for_invalid_unicode(unicode_digit): +def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None: """Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16 @@ -72,10 +73,10 @@ def raise_key_error_for_invalid_unicode(unicode_digit): class EncodingDB: - std2unicode = {} - mac2unicode = {} - win2unicode = {} - pdf2unicode = {} + std2unicode: Dict[int, str] = {} + mac2unicode: Dict[int, str] = {} + win2unicode: Dict[int, str] = {} + pdf2unicode: Dict[int, str] = {} for (name, std, mac, win, pdf) in ENCODING: c = name2unicode(name) if std: @@ -95,7 +96,11 @@ class EncodingDB: } @classmethod - def get_encoding(cls, name, diff=None): + def get_encoding( + cls, + name: str, + diff: Optional[Iterable[object]] = None + ) -> Dict[int, str]: cid2unicode = cls.encodings.get(name, cls.std2unicode) if diff: cid2unicode = cid2unicode.copy() @@ -105,7 +110,7 @@ def get_encoding(cls, name, diff=None): cid = x elif isinstance(x, PSLiteral): try: - cid2unicode[cid] = name2unicode(x.name) + cid2unicode[cid] = name2unicode(cast(str, x.name)) except (KeyError, ValueError) as e: log.debug(str(e)) cid += 1 diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py index 33f661c0..f8c5ca4d 100644 --- a/pdfminer/high_level.py +++ b/pdfminer/high_level.py @@ -3,22 +3,36 @@ import logging import sys from io import StringIO +from typing import Any, BinaryIO, Container, Iterator, Optional, cast from .converter import XMLConverter, HTMLConverter, TextConverter, \ PDFPageAggregator from .image import ImageWriter -from .layout import LAParams 
-from .pdfdevice import TagExtractor +from .layout import LAParams, LTPage +from .pdfdevice import PDFDevice, TagExtractor from .pdfinterp import PDFResourceManager, PDFPageInterpreter from .pdfpage import PDFPage -from .utils import open_filename - - -def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', - laparams=None, maxpages=0, page_numbers=None, - password="", scale=1.0, rotation=0, layoutmode='normal', - output_dir=None, strip_control=False, debug=False, - disable_caching=False, **kwargs): +from .utils import open_filename, FileOrName, AnyIO + + +def extract_text_to_fp( + inf: BinaryIO, + outfp: AnyIO, + output_type: str = 'text', + codec: str = 'utf-8', + laparams: Optional[LAParams] = None, + maxpages: int = 0, + page_numbers: Optional[Container[int]] = None, + password: str = "", + scale: float = 1.0, + rotation: int = 0, + layoutmode: str = 'normal', + output_dir: Optional[str] = None, + strip_control: bool = False, + debug: bool = False, + disable_caching: bool = False, + **kwargs: Any +) -> None: """Parses text from inf-file and writes to outfp file-like object. Takes loads of optional arguments but the defaults are somewhat sane. @@ -56,7 +70,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', imagewriter = ImageWriter(output_dir) rsrcmgr = PDFResourceManager(caching=not disable_caching) - device = None + device: Optional[PDFDevice] = None if output_type != 'text' and outfp == sys.stdout: outfp = sys.stdout.buffer @@ -76,13 +90,15 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', imagewriter=imagewriter) elif output_type == 'tag': - device = TagExtractor(rsrcmgr, outfp, codec=codec) + # Binary I/O is required, but we have no good way to test it here. 
+ device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec) else: msg = f"Output type can be text, html, xml or tag but is " \ f"{output_type}" raise ValueError(msg) + assert device is not None interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.get_pages(inf, page_numbers, @@ -95,8 +111,15 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', device.close() -def extract_text(pdf_file, password='', page_numbers=None, maxpages=0, - caching=True, codec='utf-8', laparams=None): +def extract_text( + pdf_file: FileOrName, + password: str = '', + page_numbers: Optional[Container[int]] = None, + maxpages: int = 0, + caching: bool = True, + codec: str = 'utf-8', + laparams: Optional[LAParams] = None +) -> str: """Parse and return the text contained in a PDF file. :param pdf_file: Either a file path or a file-like object for the PDF file @@ -114,6 +137,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0, laparams = LAParams() with open_filename(pdf_file, "rb") as fp, StringIO() as output_string: + fp = cast(BinaryIO, fp) # we opened in binary mode rsrcmgr = PDFResourceManager(caching=caching) device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams) @@ -131,8 +155,14 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0, return output_string.getvalue() -def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0, - caching=True, laparams=None): +def extract_pages( + pdf_file: FileOrName, + password: str = '', + page_numbers: Optional[Container[int]] = None, + maxpages: int = 0, + caching: bool = True, + laparams: Optional[LAParams] = None +) -> Iterator[LTPage]: """Extract and yield LTPage objects :param pdf_file: Either a file path or a file-like object for the PDF file @@ -149,6 +179,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0, laparams = LAParams() with open_filename(pdf_file, "rb") as fp: + fp = cast(BinaryIO, fp) # we 
opened in binary mode resource_manager = PDFResourceManager(caching=caching) device = PDFPageAggregator(resource_manager, laparams=laparams) interpreter = PDFPageInterpreter(resource_manager, device) diff --git a/pdfminer/image.py b/pdfminer/image.py index 77d14810..83f9a7aa 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -2,20 +2,28 @@ import os.path import struct from io import BytesIO +from typing import BinaryIO, Tuple from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter +from .layout import LTImage from .pdfcolor import LITERAL_DEVICE_CMYK from .pdfcolor import LITERAL_DEVICE_GRAY from .pdfcolor import LITERAL_DEVICE_RGB from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE -def align32(x): +def align32(x: int) -> int: return ((x+3)//4)*4 class BMPWriter: - def __init__(self, fp, bits, width, height): + def __init__( + self, + fp: BinaryIO, + bits: int, + width: int, + height: int + ) -> None: self.fp = fp self.bits = bits self.width = width @@ -51,7 +59,7 @@ def __init__(self, fp, bits, width, height): self.pos1 = self.pos0 + self.datasize return - def write_line(self, y, data): + def write_line(self, y: int, data: bytes) -> None: self.fp.seek(self.pos1 - (y+1)*self.linesize) self.fp.write(data) return @@ -63,13 +71,13 @@ class ImageWriter: Supports various image types: JPEG, JBIG2 and bitmaps """ - def __init__(self, outdir): + def __init__(self, outdir: str) -> None: self.outdir = outdir if not os.path.exists(self.outdir): os.makedirs(self.outdir) return - def export_image(self, image): + def export_image(self, image: LTImage) -> str: (width, height) = image.srcsize is_jbig2 = self.is_jbig2_image(image) @@ -80,8 +88,9 @@ def export_image(self, image): fp = open(path, 'wb') if ext == '.jpg': raw_data = image.stream.get_rawdata() + assert raw_data is not None if LITERAL_DEVICE_CMYK in image.colorspace: - from PIL import Image + from PIL import Image # type: ignore[import] from PIL import ImageChops ifp = BytesIO(raw_data) i = 
Image.open(ifp) @@ -128,7 +137,7 @@ def export_image(self, image): return name @staticmethod - def is_jbig2_image(image): + def is_jbig2_image(image: LTImage) -> bool: filters = image.stream.get_filters() is_jbig2 = False for filter_name, params in filters: @@ -138,7 +147,12 @@ def is_jbig2_image(image): return is_jbig2 @staticmethod - def _get_image_extension(image, width, height, is_jbig2): + def _get_image_extension( + image: LTImage, + width: int, + height: int, + is_jbig2: bool + ) -> str: filters = image.stream.get_filters() if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: ext = '.jpg' @@ -154,7 +168,11 @@ def _get_image_extension(image, width, height, is_jbig2): return ext @staticmethod - def _create_unique_image_name(dirname, image_name, ext): + def _create_unique_image_name( + dirname: str, + image_name: str, + ext: str + ) -> Tuple[str, str]: name = image_name + ext path = os.path.join(dirname, name) img_index = 0 diff --git a/pdfminer/jbig2.py b/pdfminer/jbig2.py index 4299629b..10ee7e6f 100644 --- a/pdfminer/jbig2.py +++ b/pdfminer/jbig2.py @@ -1,6 +1,7 @@ import math import os from struct import pack, unpack, calcsize +from typing import BinaryIO, Dict, Iterable, List, Optional, Tuple, Union, cast # segment structure base SEG_STRUCT = [ @@ -34,15 +35,15 @@ FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010 -def bit_set(bit_pos, value): +def bit_set(bit_pos: int, value: int) -> bool: return bool((value >> bit_pos) & 1) -def check_flag(flag, value): +def check_flag(flag: int, value: int) -> bool: return bool(flag & value) -def masked_value(mask, value): +def masked_value(mask: int, value: int) -> int: for bit_pos in range(0, 31): if bit_set(bit_pos, mask): return (value & mask) >> bit_pos @@ -50,7 +51,7 @@ def masked_value(mask, value): raise Exception("Invalid mask or value") -def mask_value(mask, value): +def mask_value(mask: int, value: int) -> int: for bit_pos in range(0, 31): if bit_set(bit_pos, mask): return (value & (mask >> bit_pos)) << 
bit_pos @@ -58,25 +59,34 @@ def mask_value(mask, value): raise Exception("Invalid mask or value") +def unpack_int(format: str, buffer: bytes) -> int: + assert format in {">B", ">I", ">L"} + [result] = cast(Tuple[int], unpack(format, buffer)) + return result + + +JBIG2SegmentFlags = Dict[str, Union[int, bool]] +JBIG2RetentionFlags = Dict[str, Union[int, List[int], List[bool]]] +JBIG2Segment = Dict[str, Union[bool, int, bytes, JBIG2SegmentFlags, + JBIG2RetentionFlags]] + + class JBIG2StreamReader: """Read segments from a JBIG2 byte stream""" - - def __init__(self, stream): + def __init__(self, stream: BinaryIO) -> None: self.stream = stream - def get_segments(self): - segments = [] + def get_segments(self) -> List[JBIG2Segment]: + segments: List[JBIG2Segment] = [] while not self.is_eof(): - segment = {} + segment: JBIG2Segment = {} for field_format, name in SEG_STRUCT: field_len = calcsize(field_format) field = self.stream.read(field_len) if len(field) < field_len: segment["_error"] = True break - value = unpack(field_format, field) - if len(value) == 1: - [value] = value + value = unpack_int(field_format, field) parser = getattr(self, "parse_%s" % name, None) if callable(parser): value = parser(segment, value, field) @@ -86,21 +96,31 @@ def get_segments(self): segments.append(segment) return segments - def is_eof(self): + def is_eof(self) -> bool: if self.stream.read(1) == b'': return True else: self.stream.seek(-1, os.SEEK_CUR) return False - def parse_flags(self, segment, flags, field): + def parse_flags( + self, + segment: JBIG2Segment, + flags: int, + field: bytes + ) -> JBIG2SegmentFlags: return { "deferred": check_flag(HEADER_FLAG_DEFERRED, flags), "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags), "type": masked_value(SEG_TYPE_MASK, flags) } - def parse_retention_flags(self, segment, flags, field): + def parse_retention_flags( + self, + segment: JBIG2Segment, + flags: int, + field: bytes + ) -> JBIG2RetentionFlags: ref_count = 
masked_value(REF_COUNT_SHORT_MASK, flags) retain_segments = [] ref_segments = [] @@ -110,15 +130,16 @@ def parse_retention_flags(self, segment, flags, field): retain_segments.append(bit_set(bit_pos, flags)) else: field += self.stream.read(3) - [ref_count] = unpack(">L", field) + ref_count = unpack_int(">L", field) ref_count = masked_value(REF_COUNT_LONG_MASK, ref_count) ret_bytes_count = int(math.ceil((ref_count + 1) / 8)) for ret_byte_index in range(ret_bytes_count): - [ret_byte] = unpack(">B", self.stream.read(1)) + ret_byte = unpack_int(">B", self.stream.read(1)) for bit_pos in range(7): retain_segments.append(bit_set(bit_pos, ret_byte)) seg_num = segment["number"] + assert isinstance(seg_num, int) if seg_num <= 256: ref_format = ">B" elif seg_num <= 65536: @@ -129,8 +150,8 @@ def parse_retention_flags(self, segment, flags, field): ref_size = calcsize(ref_format) for ref_index in range(ref_count): - ref = self.stream.read(ref_size) - [ref] = unpack(ref_format, ref) + ref_data = self.stream.read(ref_size) + ref = unpack_int(ref_format, ref_data) ref_segments.append(ref) return { @@ -139,15 +160,26 @@ def parse_retention_flags(self, segment, flags, field): "ref_segments": ref_segments, } - def parse_page_assoc(self, segment, page, field): - if segment["flags"]["page_assoc_long"]: + def parse_page_assoc( + self, + segment: JBIG2Segment, + page: int, + field: bytes + ) -> int: + if cast(JBIG2SegmentFlags, segment["flags"])["page_assoc_long"]: field += self.stream.read(3) - [page] = unpack(">L", field) + page = unpack_int(">L", field) return page - def parse_data_length(self, segment, length, field): + def parse_data_length( + self, + segment: JBIG2Segment, + length: int, + field: bytes + ) -> int: if length: - if (segment["flags"]["type"] == SEG_TYPE_IMMEDIATE_GEN_REGION) \ + if (cast(JBIG2SegmentFlags, segment["flags"])["type"] == + SEG_TYPE_IMMEDIATE_GEN_REGION) \ and (length == DATA_LEN_UNKNOWN): raise NotImplementedError( @@ -163,25 +195,36 @@ def 
parse_data_length(self, segment, length, field): class JBIG2StreamWriter: """Write JBIG2 segments to a file in JBIG2 format""" - def __init__(self, stream): + EMPTY_RETENTION_FLAGS: JBIG2RetentionFlags = { + 'ref_count': 0, + 'ref_segments': cast(List[int], []), + 'retain_segments': cast(List[bool], []) + } + + def __init__(self, stream: BinaryIO) -> None: self.stream = stream - def write_segments(self, segments, fix_last_page=True): + def write_segments( + self, + segments: Iterable[JBIG2Segment], + fix_last_page: bool = True + ) -> int: data_len = 0 - current_page = None - seg_num = None + current_page: Optional[int] = None + seg_num: Optional[int] = None for segment in segments: data = self.encode_segment(segment) self.stream.write(data) data_len += len(data) - seg_num = segment["number"] + seg_num = cast(Optional[int], segment["number"]) if fix_last_page: - seg_page = segment.get("page_assoc") + seg_page = cast(int, segment.get("page_assoc")) - if segment["flags"]["type"] == SEG_TYPE_END_OF_PAGE: + if cast(JBIG2SegmentFlags, segment["flags"])["type"] == \ + SEG_TYPE_END_OF_PAGE: current_page = None elif seg_page: current_page = seg_page @@ -194,7 +237,11 @@ def write_segments(self, segments, fix_last_page=True): return data_len - def write_file(self, segments, fix_last_page=True): + def write_file( + self, + segments: Iterable[JBIG2Segment], + fix_last_page: bool = True + ) -> int: header = FILE_HEADER_ID header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN header += pack(">B", header_flags) @@ -205,7 +252,7 @@ def write_file(self, segments, fix_last_page=True): seg_num = 0 for segment in segments: - seg_num = segment["number"] + seg_num = cast(int, segment["number"]) eof_segment = self.get_eof_segment(seg_num + 1) data = self.encode_segment(eof_segment) @@ -215,7 +262,7 @@ def write_file(self, segments, fix_last_page=True): return data_len - def encode_segment(self, segment): + def encode_segment(self, segment: JBIG2Segment) -> bytes: data = 
b'' for field_format, name in SEG_STRUCT: value = segment.get(name) @@ -227,7 +274,8 @@ def encode_segment(self, segment): data += field return data - def encode_flags(self, value, segment): + def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment + ) -> bytes: flags = 0 if value.get("deferred"): flags |= HEADER_FLAG_DEFERRED @@ -237,17 +285,22 @@ def encode_flags(self, value, segment): if value["page_assoc_long"] else flags else: flags |= HEADER_FLAG_PAGE_ASSOC_LONG \ - if segment.get("page", 0) > 255 else flags + if cast(int, segment.get("page", 0)) > 255 else flags flags |= mask_value(SEG_TYPE_MASK, value["type"]) return pack(">B", flags) - def encode_retention_flags(self, value, segment): + def encode_retention_flags( + self, + value: JBIG2RetentionFlags, + segment: JBIG2Segment + ) -> bytes: flags = [] flags_format = ">B" ref_count = value["ref_count"] - retain_segments = value.get("retain_segments", []) + assert isinstance(ref_count, int) + retain_segments = cast(List[bool], value.get("retain_segments", [])) if ref_count <= 4: flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count) @@ -271,9 +324,9 @@ def encode_retention_flags(self, value, segment): flags.append(ret_byte) - ref_segments = value.get("ref_segments", []) + ref_segments = cast(List[int], value.get("ref_segments", [])) - seg_num = segment["number"] + seg_num = cast(int, segment["number"]) if seg_num <= 256: ref_format = "B" elif seg_num <= 65536: @@ -287,35 +340,31 @@ def encode_retention_flags(self, value, segment): return pack(flags_format, *flags) - def encode_data_length(self, value, segment): + def encode_data_length(self, value: int, segment: JBIG2Segment) -> bytes: data = pack(">L", value) - data += segment["raw_data"] + data += cast(bytes, segment["raw_data"]) return data - def get_eop_segment(self, seg_number, page_number): + def get_eop_segment( + self, + seg_number: int, + page_number: int + ) -> JBIG2Segment: return { 'data_length': 0, 'flags': {'deferred': False, 
'type': SEG_TYPE_END_OF_PAGE}, 'number': seg_number, 'page_assoc': page_number, 'raw_data': b'', - 'retention_flags': { - 'ref_count': 0, - 'ref_segments': [], - 'retain_segments': [] - } + 'retention_flags': JBIG2StreamWriter.EMPTY_RETENTION_FLAGS } - def get_eof_segment(self, seg_number): + def get_eof_segment(self, seg_number: int) -> JBIG2Segment: return { 'data_length': 0, 'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_FILE}, 'number': seg_number, 'page_assoc': 0, 'raw_data': b'', - 'retention_flags': { - 'ref_count': 0, - 'ref_segments': [], - 'retain_segments': [] - } + 'retention_flags': JBIG2StreamWriter.EMPTY_RETENTION_FLAGS } diff --git a/pdfminer/latin_enc.py b/pdfminer/latin_enc.py index fae26ff3..d579aea1 100644 --- a/pdfminer/latin_enc.py +++ b/pdfminer/latin_enc.py @@ -5,7 +5,12 @@ """ -ENCODING = [ +from typing import List, Optional, Tuple + +EncodingRow = \ + Tuple[str, Optional[int], Optional[int], Optional[int], Optional[int]] + +ENCODING: List[EncodingRow] = [ # (name, std, mac, win, pdf) ('A', 65, 65, 65, 65), ('AE', 225, 174, 198, 198), diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 8bce26bc..b9f3d105 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,25 +1,36 @@ import heapq import logging +from typing import (Dict, Generic, Iterable, Iterator, List, Optional, + Sequence, Set, Tuple, TypeVar, Union, cast) from .utils import INF +from .utils import LTComponentT +from .utils import Matrix from .utils import Plane +from .utils import Point +from .utils import Rect from .utils import apply_matrix_pt from .utils import bbox2str from .utils import fsplit from .utils import get_bound from .utils import matrix2str from .utils import uniq +from .pdfcolor import PDFColorSpace +from .pdftypes import PDFStream +from .pdfinterp import Color +from .pdfinterp import PDFGraphicState +from .pdffont import PDFFont logger = logging.getLogger(__name__) class IndexAssigner: - def __init__(self, index=0): + def __init__(self, index: 
int = 0) -> None: self.index = index return - def run(self, obj): + def run(self, obj: "LTItem") -> None: if isinstance(obj, LTTextBox): obj.index = self.index self.index += 1 @@ -57,14 +68,16 @@ class LAParams: figures. """ - def __init__(self, - line_overlap=0.5, - char_margin=2.0, - line_margin=0.5, - word_margin=0.1, - boxes_flow=0.5, - detect_vertical=False, - all_texts=False): + def __init__( + self, + line_overlap: float = 0.5, + char_margin: float = 2.0, + line_margin: float = 0.5, + word_margin: float = 0.1, + boxes_flow: Optional[float] = 0.5, + detect_vertical: bool = False, + all_texts: bool = False + ) -> None: self.line_overlap = line_overlap self.char_margin = char_margin self.line_margin = line_margin @@ -76,7 +89,7 @@ def __init__(self, self._validate() return - def _validate(self): + def _validate(self) -> None: if self.boxes_flow is not None: boxes_flow_err_msg = ("LAParam boxes_flow should be None, or a " "number between -1 and +1") @@ -86,7 +99,7 @@ def _validate(self): if not -1 <= self.boxes_flow <= 1: raise ValueError(boxes_flow_err_msg) - def __repr__(self): + def __repr__(self) -> str: return '' % \ (self.char_margin, self.line_margin, self.word_margin, @@ -96,7 +109,7 @@ def __repr__(self): class LTItem: """Interface for things that can be analyzed""" - def analyze(self, laparams): + def analyze(self, laparams: LAParams) -> None: """Perform the layout analysis.""" return @@ -104,11 +117,11 @@ def analyze(self, laparams): class LTText: """Interface for things that have text""" - def __repr__(self): + def __repr__(self) -> str: return ('<%s %r>' % (self.__class__.__name__, self.get_text())) - def get_text(self): + def get_text(self) -> str: """Text contained in this object""" raise NotImplementedError @@ -116,29 +129,29 @@ def get_text(self): class LTComponent(LTItem): """Object with a bounding box""" - def __init__(self, bbox): + def __init__(self, bbox: Rect) -> None: LTItem.__init__(self) self.set_bbox(bbox) return - def __repr__(self): 
+ def __repr__(self) -> str: return ('<%s %s>' % (self.__class__.__name__, bbox2str(self.bbox))) # Disable comparison. - def __lt__(self, _): + def __lt__(self, _: object) -> bool: raise ValueError - def __le__(self, _): + def __le__(self, _: object) -> bool: raise ValueError - def __gt__(self, _): + def __gt__(self, _: object) -> bool: raise ValueError - def __ge__(self, _): + def __ge__(self, _: object) -> bool: raise ValueError - def set_bbox(self, bbox): + def set_bbox(self, bbox: Rect) -> None: (x0, y0, x1, y1) = bbox self.x0 = x0 self.y0 = y0 @@ -149,39 +162,39 @@ def set_bbox(self, bbox): self.bbox = bbox return - def is_empty(self): + def is_empty(self) -> bool: return self.width <= 0 or self.height <= 0 - def is_hoverlap(self, obj): + def is_hoverlap(self, obj: "LTComponent") -> bool: assert isinstance(obj, LTComponent), str(type(obj)) return obj.x0 <= self.x1 and self.x0 <= obj.x1 - def hdistance(self, obj): + def hdistance(self, obj: "LTComponent") -> float: assert isinstance(obj, LTComponent), str(type(obj)) if self.is_hoverlap(obj): return 0 else: return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) - def hoverlap(self, obj): + def hoverlap(self, obj: "LTComponent") -> float: assert isinstance(obj, LTComponent), str(type(obj)) if self.is_hoverlap(obj): return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) else: return 0 - def is_voverlap(self, obj): + def is_voverlap(self, obj: "LTComponent") -> bool: assert isinstance(obj, LTComponent), str(type(obj)) return obj.y0 <= self.y1 and self.y0 <= obj.y1 - def vdistance(self, obj): + def vdistance(self, obj: "LTComponent") -> float: assert isinstance(obj, LTComponent), str(type(obj)) if self.is_voverlap(obj): return 0 else: return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) - def voverlap(self, obj): + def voverlap(self, obj: "LTComponent") -> float: assert isinstance(obj, LTComponent), str(type(obj)) if self.is_voverlap(obj): return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) @@ -192,8 +205,16 @@ def 
voverlap(self, obj): class LTCurve(LTComponent): """A generic Bezier curve""" - def __init__(self, linewidth, pts, stroke=False, fill=False, evenodd=False, - stroking_color=None, non_stroking_color=None): + def __init__( + self, + linewidth: float, + pts: List[Point], + stroke: bool = False, + fill: bool = False, + evenodd: bool = False, + stroking_color: Optional[Color] = None, + non_stroking_color: Optional[Color] = None + ) -> None: LTComponent.__init__(self, get_bound(pts)) self.pts = pts self.linewidth = linewidth @@ -204,7 +225,7 @@ def __init__(self, linewidth, pts, stroke=False, fill=False, evenodd=False, self.non_stroking_color = non_stroking_color return - def get_pts(self): + def get_pts(self) -> str: return ','.join('%.3f,%.3f' % p for p in self.pts) @@ -214,8 +235,17 @@ class LTLine(LTCurve): Could be used for separating text or figures. """ - def __init__(self, linewidth, p0, p1, stroke=False, fill=False, - evenodd=False, stroking_color=None, non_stroking_color=None): + def __init__( + self, + linewidth: float, + p0: Point, + p1: Point, + stroke: bool = False, + fill: bool = False, + evenodd: bool = False, + stroking_color: Optional[Color] = None, + non_stroking_color: Optional[Color] = None + ) -> None: LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd, stroking_color, non_stroking_color) return @@ -227,8 +257,16 @@ class LTRect(LTCurve): Could be used for framing another pictures or figures. 
""" - def __init__(self, linewidth, bbox, stroke=False, fill=False, - evenodd=False, stroking_color=None, non_stroking_color=None): + def __init__( + self, + linewidth: float, + bbox: Rect, + stroke: bool = False, + fill: bool = False, + evenodd: bool = False, + stroking_color: Optional[Color] = None, + non_stroking_color: Optional[Color] = None + ) -> None: (x0, y0, x1, y1) = bbox LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)], stroke, @@ -242,7 +280,7 @@ class LTImage(LTComponent): Embedded images can be in JPEG, Bitmap or JBIG2. """ - def __init__(self, name, stream, bbox): + def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None: LTComponent.__init__(self, bbox) self.name = name self.stream = stream @@ -255,7 +293,7 @@ def __init__(self, name, stream, bbox): self.colorspace = [self.colorspace] return - def __repr__(self): + def __repr__(self) -> str: return ('<%s(%s) %s %r>' % (self.__class__.__name__, self.name, bbox2str(self.bbox), self.srcsize)) @@ -269,19 +307,30 @@ class LTAnno(LTItem, LTText): according to the relationship between two characters (e.g. a space). """ - def __init__(self, text): + def __init__(self, text: str) -> None: self._text = text return - def get_text(self): + def get_text(self) -> str: return self._text class LTChar(LTComponent, LTText): """Actual letter in the text as a Unicode string.""" - def __init__(self, matrix, font, fontsize, scaling, rise, - text, textwidth, textdisp, ncs, graphicstate): + def __init__( + self, + matrix: Matrix, + font: PDFFont, + fontsize: float, + scaling: float, + rise: float, + text: str, + textwidth: float, + textdisp: Union[float, Tuple[Optional[float], float]], + ncs: PDFColorSpace, + graphicstate: PDFGraphicState + ) -> None: LTText.__init__(self) self._text = text self.matrix = matrix @@ -292,6 +341,7 @@ def __init__(self, matrix, font, fontsize, scaling, rise, # compute the boundary rectangle. 
if font.is_vertical(): # vertical + assert isinstance(textdisp, tuple) (vx, vy) = textdisp if vx is None: vx = fontsize * 0.5 @@ -320,114 +370,129 @@ def __init__(self, matrix, font, fontsize, scaling, rise, self.size = self.height return - def __repr__(self): + def __repr__(self) -> str: return ('<%s %s matrix=%s font=%r adv=%s text=%r>' % (self.__class__.__name__, bbox2str(self.bbox), matrix2str(self.matrix), self.fontname, self.adv, self.get_text())) - def get_text(self): + def get_text(self) -> str: return self._text - def is_compatible(self, obj): + def is_compatible(self, obj: object) -> bool: """Returns True if two characters can coexist in the same line.""" return True -class LTContainer(LTComponent): +LTItemT = TypeVar('LTItemT', bound=LTItem) + + +class LTContainer(LTComponent, Generic[LTItemT]): """Object that can be extended and analyzed""" - def __init__(self, bbox): + def __init__(self, bbox: Rect) -> None: LTComponent.__init__(self, bbox) - self._objs = [] + self._objs: List[LTItemT] = [] return - def __iter__(self): + def __iter__(self) -> Iterator[LTItemT]: return iter(self._objs) - def __len__(self): + def __len__(self) -> int: return len(self._objs) - def add(self, obj): + def add(self, obj: LTItemT) -> None: self._objs.append(obj) return - def extend(self, objs): + def extend(self, objs: Iterable[LTItemT]) -> None: for obj in objs: self.add(obj) return - def analyze(self, laparams): + def analyze(self, laparams: LAParams) -> None: for obj in self._objs: obj.analyze(laparams) return -class LTExpandableContainer(LTContainer): - def __init__(self): +class LTExpandableContainer(LTContainer[LTItemT]): + def __init__(self) -> None: LTContainer.__init__(self, (+INF, +INF, -INF, -INF)) return - def add(self, obj): - LTContainer.add(self, obj) + # Incompatible override: we take an LTComponent (with bounding box), but + # super() LTContainer only considers LTItem (no bounding box). 
+ def add(self, obj: LTComponent) -> None: # type: ignore[override] + LTContainer.add(self, cast(LTItemT, obj)) self.set_bbox((min(self.x0, obj.x0), min(self.y0, obj.y0), max(self.x1, obj.x1), max(self.y1, obj.y1))) return -class LTTextContainer(LTExpandableContainer, LTText): - def __init__(self): +class LTTextContainer(LTExpandableContainer[LTItemT], LTText): + def __init__(self) -> None: LTText.__init__(self) LTExpandableContainer.__init__(self) return - def get_text(self): - return ''.join(obj.get_text() for obj in self + def get_text(self) -> str: + return ''.join(cast(LTText, obj).get_text() for obj in self if isinstance(obj, LTText)) -class LTTextLine(LTTextContainer): +TextLineElement = Union[LTChar, LTAnno] + + +class LTTextLine(LTTextContainer[TextLineElement]): """Contains a list of LTChar objects that represent a single text line. The characters are aligned either horizontally or vertically, depending on the text's writing mode. """ - def __init__(self, word_margin): - LTTextContainer.__init__(self) + def __init__(self, word_margin: float) -> None: + super().__init__() self.word_margin = word_margin return - def __repr__(self): + def __repr__(self) -> str: return ('<%s %s %r>' % (self.__class__.__name__, bbox2str(self.bbox), self.get_text())) - def analyze(self, laparams): + def analyze(self, laparams: LAParams) -> None: LTTextContainer.analyze(self, laparams) LTContainer.add(self, LTAnno('\n')) return - def find_neighbors(self, plane, ratio): + def find_neighbors(self, plane: Plane[LTComponentT], ratio: float + ) -> List["LTTextLine"]: raise NotImplementedError class LTTextLineHorizontal(LTTextLine): - def __init__(self, word_margin): + def __init__(self, word_margin: float) -> None: LTTextLine.__init__(self, word_margin) - self._x1 = +INF + self._x1: float = +INF return - def add(self, obj): + # Incompatible override: we take an LTComponent (with bounding box), but + # LTContainer only considers LTItem (no bounding box). 
+ def add(self, obj: LTComponent) -> None: # type: ignore[override] if isinstance(obj, LTChar) and self.word_margin: margin = self.word_margin * max(obj.width, obj.height) if self._x1 < obj.x0 - margin: LTContainer.add(self, LTAnno(' ')) self._x1 = obj.x1 - LTTextLine.add(self, obj) + super().add(obj) return - def find_neighbors(self, plane, ratio): + def find_neighbors( + self, + plane: Plane[LTComponentT], + ratio: float + ) -> List[LTTextLine]: """ Finds neighboring LTTextLineHorizontals in the plane. @@ -445,45 +510,67 @@ def find_neighbors(self, plane, ratio): self._is_right_aligned_with(obj, tolerance=d) or self._is_centrally_aligned_with(obj, tolerance=d)))] - def _is_left_aligned_with(self, other, tolerance=0): + def _is_left_aligned_with( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: """ Whether the left-hand edge of `other` is within `tolerance`. """ return abs(other.x0 - self.x0) <= tolerance - def _is_right_aligned_with(self, other, tolerance=0): + def _is_right_aligned_with( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: """ Whether the right-hand edge of `other` is within `tolerance`. """ return abs(other.x1 - self.x1) <= tolerance - def _is_centrally_aligned_with(self, other, tolerance=0): + def _is_centrally_aligned_with( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: """ Whether the horizontal center of `other` is within `tolerance`. 
""" return abs( (other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance - def _is_same_height_as(self, other, tolerance): + def _is_same_height_as( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: return abs(other.height - self.height) <= tolerance class LTTextLineVertical(LTTextLine): - def __init__(self, word_margin): + def __init__(self, word_margin: float) -> None: LTTextLine.__init__(self, word_margin) - self._y0 = -INF + self._y0: float = -INF return - def add(self, obj): + # Incompatible override: we take an LTComponent (with bounding box), but + # LTContainer only considers LTItem (no bounding box). + def add(self, obj: LTComponent) -> None: # type: ignore[override] if isinstance(obj, LTChar) and self.word_margin: margin = self.word_margin * max(obj.width, obj.height) if obj.y1 + margin < self._y0: LTContainer.add(self, LTAnno(' ')) self._y0 = obj.y0 - LTTextLine.add(self, obj) + super().add(obj) return - def find_neighbors(self, plane, ratio): + def find_neighbors( + self, + plane: Plane[LTComponentT], + ratio: float + ) -> List[LTTextLine]: """ Finds neighboring LTTextLineVerticals in the plane. @@ -501,30 +588,42 @@ def find_neighbors(self, plane, ratio): self._is_upper_aligned_with(obj, tolerance=d) or self._is_centrally_aligned_with(obj, tolerance=d)))] - def _is_lower_aligned_with(self, other, tolerance=0): + def _is_lower_aligned_with( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: """ Whether the lower edge of `other` is within `tolerance`. """ return abs(other.y0 - self.y0) <= tolerance - def _is_upper_aligned_with(self, other, tolerance=0): + def _is_upper_aligned_with( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: """ Whether the upper edge of `other` is within `tolerance`. 
""" return abs(other.y1 - self.y1) <= tolerance - def _is_centrally_aligned_with(self, other, tolerance=0): + def _is_centrally_aligned_with( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: """ Whether the vertical center of `other` is within `tolerance`. """ return abs( (other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance - def _is_same_width_as(self, other, tolerance): + def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool: return abs(other.width - self.width) <= tolerance -class LTTextBox(LTTextContainer): +class LTTextBox(LTTextContainer[LTTextLine]): """Represents a group of text chunks in a rectangular area. Note that this box is created by geometric analysis and does not @@ -532,72 +631,86 @@ class LTTextBox(LTTextContainer): of LTTextLine objects. """ - def __init__(self): + def __init__(self) -> None: LTTextContainer.__init__(self) - self.index = -1 + self.index: int = -1 return - def __repr__(self): + def __repr__(self) -> str: return ('<%s(%s) %s %r>' % (self.__class__.__name__, self.index, bbox2str(self.bbox), self.get_text())) + def get_writing_mode(self) -> str: + raise NotImplementedError + class LTTextBoxHorizontal(LTTextBox): - def analyze(self, laparams): - LTTextBox.analyze(self, laparams) + def analyze(self, laparams: LAParams) -> None: + super().analyze(laparams) self._objs.sort(key=lambda obj: -obj.y1) return - def get_writing_mode(self): + def get_writing_mode(self) -> str: return 'lr-tb' class LTTextBoxVertical(LTTextBox): - def analyze(self, laparams): - LTTextBox.analyze(self, laparams) + def analyze(self, laparams: LAParams) -> None: + super().analyze(laparams) self._objs.sort(key=lambda obj: -obj.x1) return - def get_writing_mode(self): + def get_writing_mode(self) -> str: return 'tb-rl' -class LTTextGroup(LTTextContainer): - def __init__(self, objs): - LTTextContainer.__init__(self) +TextGroupElement = Union[LTTextBox, "LTTextGroup"] + + +class 
LTTextGroup(LTTextContainer[TextGroupElement]): + def __init__(self, objs: Iterable[TextGroupElement]) -> None: + super().__init__() self.extend(objs) return class LTTextGroupLRTB(LTTextGroup): - def analyze(self, laparams): - LTTextGroup.analyze(self, laparams) + def analyze(self, laparams: LAParams) -> None: + super().analyze(laparams) + assert laparams.boxes_flow is not None + boxes_flow = laparams.boxes_flow # reorder the objects from top-left to bottom-right. self._objs.sort( - key=lambda obj: (1 - laparams.boxes_flow) * obj.x0 - - (1 + laparams.boxes_flow) * (obj.y0 + obj.y1)) + key=lambda obj: (1 - boxes_flow) * obj.x0 + - (1 + boxes_flow) * (obj.y0 + obj.y1)) return class LTTextGroupTBRL(LTTextGroup): - def analyze(self, laparams): - LTTextGroup.analyze(self, laparams) + def analyze(self, laparams: LAParams) -> None: + super().analyze(laparams) + assert laparams.boxes_flow is not None + boxes_flow = laparams.boxes_flow # reorder the objects from top-right to bottom-left. self._objs.sort( - key=lambda obj: - (1 + laparams.boxes_flow) * (obj.x0 + obj.x1) - - (1 - laparams.boxes_flow) * obj.y1) + key=lambda obj: - (1 + boxes_flow) * (obj.x0 + obj.x1) + - (1 - boxes_flow) * obj.y1) return -class LTLayoutContainer(LTContainer): - def __init__(self, bbox): +class LTLayoutContainer(LTContainer[LTComponent]): + def __init__(self, bbox: Rect) -> None: LTContainer.__init__(self, bbox) - self.groups = None + self.groups: Optional[List[LTTextGroup]] = None return # group_objects: group text object to textlines. 
- def group_objects(self, laparams, objs): + def group_objects( + self, + laparams: LAParams, + objs: Iterable[LTComponent] + ) -> Iterator[LTTextLine]: obj0 = None line = None for obj1 in objs: @@ -667,15 +780,20 @@ def group_objects(self, laparams, objs): obj0 = obj1 if line is None: line = LTTextLineHorizontal(laparams.word_margin) + assert obj0 is not None line.add(obj0) yield line return - def group_textlines(self, laparams, lines): + def group_textlines( + self, + laparams: LAParams, + lines: Iterable[LTTextLine] + ) -> Iterator[LTTextBox]: """Group neighboring lines to textboxes""" - plane = Plane(self.bbox) + plane: Plane[LTTextLine] = Plane(self.bbox) plane.extend(lines) - boxes = {} + boxes: Dict[LTTextLine, LTTextBox] = {} for line in lines: neighbors = line.find_neighbors(plane, laparams.line_margin) members = [line] @@ -684,7 +802,7 @@ def group_textlines(self, laparams, lines): if obj1 in boxes: members.extend(boxes.pop(obj1)) if isinstance(line, LTTextLineHorizontal): - box = LTTextBoxHorizontal() + box: LTTextBox = LTTextBoxHorizontal() else: box = LTTextBoxVertical() for obj in uniq(members): @@ -702,7 +820,11 @@ def group_textlines(self, laparams, lines): yield box return - def group_textboxes(self, laparams, boxes): + def group_textboxes( + self, + laparams: LAParams, + boxes: Sequence[LTTextBox] + ) -> List[LTTextGroup]: """Group textboxes hierarchically. Get pair-wise distances, via dist func defined below, and then merge @@ -718,10 +840,13 @@ def group_textboxes(self, laparams, boxes): :param laparams: LAParams object. :param boxes: All textbox objects to be grouped. - :return: a list that has only one element, the final top level textbox. + :return: a list that has only one element, the final top level group. """ - def dist(obj1, obj2): + ElementT = Union[LTTextBox, LTTextGroup] + plane: Plane[ElementT] = Plane(self.bbox) + + def dist(obj1: LTComponent, obj2: LTComponent) -> float: """A distance function between two TextBoxes. 
Consider the bounding rectangle for obj1 and obj2. @@ -740,7 +865,7 @@ def dist(obj1, obj2): return (x1 - x0) * (y1 - y0) \ - obj1.width*obj1.height - obj2.width*obj2.height - def isany(obj1, obj2): + def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]: """Check if there's any other object between obj1 and obj2.""" x0 = min(obj1.x0, obj2.x0) y0 = min(obj1.y0, obj2.y0) @@ -749,16 +874,15 @@ def isany(obj1, obj2): objs = set(plane.find((x0, y0, x1, y1))) return objs.difference((obj1, obj2)) - dists = [] + dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = [] for i in range(len(boxes)): - obj1 = boxes[i] + box1 = boxes[i] for j in range(i+1, len(boxes)): - obj2 = boxes[j] - dists.append((False, dist(obj1, obj2), id(obj1), id(obj2), - obj1, obj2)) + box2 = boxes[j] + dists.append((False, dist(box1, box2), id(box1), id(box2), + box1, box2)) heapq.heapify(dists) - plane = Plane(self.bbox) plane.extend(boxes) done = set() while len(dists) > 0: @@ -770,7 +894,7 @@ def isany(obj1, obj2): continue if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \ isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)): - group = LTTextGroupTBRL([obj1, obj2]) + group: LTTextGroup = LTTextGroupTBRL([obj1, obj2]) else: group = LTTextGroupLRTB([obj1, obj2]) plane.remove(obj1) @@ -781,9 +905,10 @@ def isany(obj1, obj2): heapq.heappush(dists, (False, dist(group, other), id(group), id(other), group, other)) plane.add(group) - return list(plane) + # By now only groups are in the plane + return list(cast(LTTextGroup, g) for g in plane) - def analyze(self, laparams): + def analyze(self, laparams: LAParams) -> None: # textobjs is a list of LTChar objects, i.e. # it has all the individual characters in the page. 
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), @@ -801,7 +926,7 @@ def analyze(self, laparams): for textbox in textboxes: textbox.analyze(laparams) - def getkey(box): + def getkey(box: LTTextBox) -> Tuple[int, float, float]: if isinstance(box, LTTextBoxVertical): return (0, -box.x1, -box.y0) else: @@ -814,7 +939,8 @@ def getkey(box): group.analyze(laparams) assigner.run(group) textboxes.sort(key=lambda box: box.index) - self._objs = textboxes + otherobjs + empties + self._objs = (cast(List[LTComponent], textboxes) + otherobjs + + cast(List[LTComponent], empties)) return @@ -826,7 +952,7 @@ class LTFigure(LTLayoutContainer): recursively. """ - def __init__(self, name, bbox, matrix): + def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None: self.name = name self.matrix = matrix (x, y, w, h) = bbox @@ -835,12 +961,12 @@ def __init__(self, name, bbox, matrix): LTLayoutContainer.__init__(self, bbox) return - def __repr__(self): + def __repr__(self) -> str: return ('<%s(%s) %s matrix=%s>' % (self.__class__.__name__, self.name, bbox2str(self.bbox), matrix2str(self.matrix))) - def analyze(self, laparams): + def analyze(self, laparams: LAParams) -> None: if not laparams.all_texts: return LTLayoutContainer.analyze(self, laparams) @@ -854,13 +980,13 @@ class LTPage(LTLayoutContainer): LTCurve and LTLine. 
""" - def __init__(self, pageid, bbox, rotate=0): + def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None: LTLayoutContainer.__init__(self, bbox) self.pageid = pageid self.rotate = rotate return - def __repr__(self): + def __repr__(self) -> str: return ('<%s(%r) %s rotate=%r>' % (self.__class__.__name__, self.pageid, bbox2str(self.bbox), self.rotate)) diff --git a/pdfminer/lzw.py b/pdfminer/lzw.py index f0ed8a87..31c085ed 100644 --- a/pdfminer/lzw.py +++ b/pdfminer/lzw.py @@ -1,5 +1,6 @@ from io import BytesIO import logging +from typing import BinaryIO, Iterator, List, Optional, cast logger = logging.getLogger(__name__) @@ -11,16 +12,17 @@ class CorruptDataError(Exception): class LZWDecoder: - def __init__(self, fp): + def __init__(self, fp: BinaryIO) -> None: self.fp = fp self.buff = 0 self.bpos = 8 self.nbits = 9 - self.table = None - self.prevbuf = None + # NB: self.table stores None only in indices 256 and 257 + self.table: Optional[List[Optional[bytes]]] = None + self.prevbuf: Optional[bytes] = None return - def readbits(self, bits): + def readbits(self, bits: int) -> int: v = 0 while 1: # the number of remaining bits we can get from the current buffer. 
@@ -45,7 +47,7 @@ def readbits(self, bits): self.bpos = 0 return v - def feed(self, code): + def feed(self, code: int) -> bytes: x = b'' if code == 256: self.table = [bytes((c,)) for c in range(256)] # 0-255 @@ -56,14 +58,16 @@ def feed(self, code): elif code == 257: pass elif not self.prevbuf: - x = self.prevbuf = self.table[code] + assert self.table is not None + x = self.prevbuf = cast(bytes, self.table[code]) # assume not None else: + assert self.table is not None if code < len(self.table): - x = self.table[code] + x = cast(bytes, self.table[code]) # assume not None self.table.append(self.prevbuf+x[:1]) elif code == len(self.table): self.table.append(self.prevbuf+self.prevbuf[:1]) - x = self.table[code] + x = cast(bytes, self.table[code]) else: raise CorruptDataError table_length = len(self.table) @@ -76,7 +80,7 @@ def feed(self, code): self.prevbuf = x return x - def run(self): + def run(self) -> Iterator[bytes]: while 1: try: code = self.readbits(self.nbits) @@ -88,12 +92,13 @@ def run(self): # just ignore corrupt data and stop yielding there break yield x + assert self.table is not None logger.debug('nbits=%d, code=%d, output=%r, table=%r' % (self.nbits, code, x, self.table[258:])) return -def lzwdecode(data): +def lzwdecode(data: bytes) -> bytes: fp = BytesIO(data) s = LZWDecoder(fp).run() return b''.join(s) diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py index ff28d54e..df685ed1 100644 --- a/pdfminer/pdfcolor.py +++ b/pdfminer/pdfcolor.py @@ -1,4 +1,5 @@ import collections +from typing import Dict from .psparser import LIT @@ -9,17 +10,17 @@ class PDFColorSpace: - def __init__(self, name, ncomponents): + def __init__(self, name: str, ncomponents: int) -> None: self.name = name self.ncomponents = ncomponents return - def __repr__(self): + def __repr__(self) -> str: return '' % \ (self.name, self.ncomponents) -PREDEFINED_COLORSPACE = collections.OrderedDict() +PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict() for (name, n) 
in [ ('DeviceGray', 1), # default value first diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 82ede760..0a370633 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -1,66 +1,116 @@ +from pdfminer.psparser import PSLiteral +from typing import (BinaryIO, Iterable, List, Optional, Sequence, + TYPE_CHECKING, Union, cast) from . import utils +from .utils import Matrix, Point, Rect, PathSegment +from .pdfcolor import PDFColorSpace +from .pdffont import PDFFont from .pdffont import PDFUnicodeNotDefined +from .pdfpage import PDFPage +from .pdftypes import PDFStream + +if TYPE_CHECKING: + from .pdfinterp import PDFGraphicState + from .pdfinterp import PDFResourceManager + from .pdfinterp import PDFTextState + from .pdfinterp import PDFStackT + + +PDFTextSeq = Iterable[Union[int, float, bytes]] class PDFDevice: """Translate the output of PDFPageInterpreter to the output that is needed """ - def __init__(self, rsrcmgr): + def __init__(self, rsrcmgr: "PDFResourceManager") -> None: self.rsrcmgr = rsrcmgr - self.ctm = None + self.ctm: Optional[Matrix] = None return - def __repr__(self): + def __repr__(self) -> str: return '' - def __enter__(self): + def __enter__(self) -> "PDFDevice": return self - def __exit__(self, exc_type, exc_val, exc_tb): + def __exit__( + self, + exc_type: object, + exc_val: object, + exc_tb: object + ) -> None: self.close() - def close(self): + def close(self) -> None: return - def set_ctm(self, ctm): + def set_ctm(self, ctm: Matrix) -> None: self.ctm = ctm return - def begin_tag(self, tag, props=None): + def begin_tag( + self, + tag: PSLiteral, + props: Optional["PDFStackT"] = None + ) -> None: return - def end_tag(self): + def end_tag(self) -> None: return - def do_tag(self, tag, props=None): + def do_tag( + self, + tag: PSLiteral, + props: Optional["PDFStackT"] = None + ) -> None: return - def begin_page(self, page, ctm): + def begin_page(self, page: PDFPage, ctm: Matrix) -> None: return - def end_page(self, page): + def 
end_page(self, page: PDFPage) -> None: return - def begin_figure(self, name, bbox, matrix): + def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None: return - def end_figure(self, name): + def end_figure(self, name: str) -> None: return - def paint_path(self, graphicstate, stroke, fill, evenodd, path): + def paint_path( + self, + graphicstate: "PDFGraphicState", + stroke: bool, + fill: bool, + evenodd: bool, + path: Sequence[PathSegment] + ) -> None: return - def render_image(self, name, stream): + def render_image(self, name: str, stream: PDFStream) -> None: return - def render_string(self, textstate, seq, ncs, graphicstate): + def render_string( + self, + textstate: "PDFTextState", + seq: PDFTextSeq, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState" + ) -> None: return class PDFTextDevice(PDFDevice): - def render_string(self, textstate, seq, ncs, graphicstate): + def render_string( + self, + textstate: "PDFTextState", + seq: PDFTextSeq, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState" + ) -> None: + assert self.ctm is not None matrix = utils.mult_matrix(textstate.matrix, self.ctm) font = textstate.font fontsize = textstate.fontsize @@ -68,6 +118,7 @@ def render_string(self, textstate, seq, ncs, graphicstate): charspace = textstate.charspace * scaling wordspace = textstate.wordspace * scaling rise = textstate.rise + assert font is not None if font.is_multibyte(): wordspace = 0 dxscale = .001 * fontsize * scaling @@ -83,13 +134,25 @@ def render_string(self, textstate, seq, ncs, graphicstate): graphicstate) return - def render_string_horizontal(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, - rise, dxscale, ncs, graphicstate): + def render_string_horizontal( + self, + seq: PDFTextSeq, + matrix: Matrix, + pos: Point, + font: PDFFont, + fontsize: float, + scaling: float, + charspace: float, + wordspace: float, + rise: float, + dxscale: float, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState" + ) -> Point: (x, 
y) = pos needcharspace = False for obj in seq: - if utils.isnumber(obj): + if isinstance(obj, (int, float)): x -= obj*dxscale needcharspace = True else: @@ -104,13 +167,25 @@ def render_string_horizontal(self, seq, matrix, pos, needcharspace = True return (x, y) - def render_string_vertical(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, - rise, dxscale, ncs, graphicstate): + def render_string_vertical( + self, + seq: PDFTextSeq, + matrix: Matrix, + pos: Point, + font: PDFFont, + fontsize: float, + scaling: float, + charspace: float, + wordspace: float, + rise: float, + dxscale: float, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState" + ) -> Point: (x, y) = pos needcharspace = False for obj in seq: - if utils.isnumber(obj): + if isinstance(obj, (int, float)): y -= obj*dxscale needcharspace = True else: @@ -125,23 +200,44 @@ def render_string_vertical(self, seq, matrix, pos, needcharspace = True return (x, y) - def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, - graphicstate): + def render_char( + self, + matrix: Matrix, + font: PDFFont, + fontsize: float, + scaling: float, + rise: float, + cid: int, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState" + ) -> float: return 0 class TagExtractor(PDFDevice): - def __init__(self, rsrcmgr, outfp, codec='utf-8'): + def __init__( + self, + rsrcmgr: "PDFResourceManager", + outfp: BinaryIO, + codec: str = 'utf-8' + ) -> None: PDFDevice.__init__(self, rsrcmgr) self.outfp = outfp self.codec = codec self.pageno = 0 - self._stack = [] + self._stack: List[PSLiteral] = [] return - def render_string(self, textstate, seq, ncs, graphicstate): + def render_string( + self, + textstate: "PDFTextState", + seq: PDFTextSeq, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState" + ) -> None: font = textstate.font + assert font is not None text = '' for obj in seq: if isinstance(obj, str): @@ -158,40 +254,42 @@ def render_string(self, textstate, seq, ncs, graphicstate): 
self._write(utils.enc(text)) return - def begin_page(self, page, ctm): + def begin_page(self, page: PDFPage, ctm: Matrix) -> None: output = '' %\ (self.pageno, utils.bbox2str(page.mediabox), page.rotate) self._write(output) return - def end_page(self, page): + def end_page(self, page: PDFPage) -> None: self._write('\n') self.pageno += 1 return - def begin_tag(self, tag, props=None): + def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None + ) -> None: s = '' if isinstance(props, dict): s = ''.join([ ' {}="{}"'.format(utils.enc(k), utils.make_compat_str(v)) for (k, v) in sorted(props.items()) ]) - out_s = '<{}{}>'.format(utils.enc(tag.name), s) + out_s = '<{}{}>'.format(utils.enc(cast(str, tag.name)), s) self._write(out_s) self._stack.append(tag) return - def end_tag(self): + def end_tag(self) -> None: assert self._stack, str(self.pageno) tag = self._stack.pop(-1) - out_s = '' % utils.enc(tag.name) + out_s = '' % utils.enc(cast(str, tag.name)) self._write(out_s) return - def do_tag(self, tag, props=None): + def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None + ) -> None: self.begin_tag(tag, props) self._stack.pop(-1) return - def _write(self, s: str): + def _write(self, s: str) -> None: self.outfp.write(s.encode(self.codec)) diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 6a576f57..88589706 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -2,16 +2,18 @@ import re import struct from hashlib import sha256, md5, sha384, sha512 +from typing import (Any, Callable, Dict, Iterable, Iterator, KeysView, List, + Optional, Sequence, Tuple, Type, Union, cast) from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes from . 
import settings from .arcfour import Arcfour -from .pdfparser import PDFSyntaxError, PDFStreamParser -from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \ +from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser +from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream,\ PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \ - dict_value, stream_value + uint_value, dict_value, stream_value from .psparser import PSEOF, literal_name, LIT, KWD from .utils import choplist, nunpack, decode_text @@ -51,7 +53,7 @@ class PDFTextExtractionNotAllowed(PDFEncryptionError): class PDFTextExtractionNotAllowedError(PDFTextExtractionNotAllowed): - def __init__(self, *args): + def __init__(self, *args: object) -> None: from warnings import warn warn('PDFTextExtractionNotAllowedError will be removed in the future. ' 'Use PDFTextExtractionNotAllowed instead.', DeprecationWarning) @@ -65,31 +67,33 @@ def __init__(self, *args): class PDFBaseXRef: - - def get_trailer(self): + def get_trailer(self) -> Dict[str, Any]: raise NotImplementedError - def get_objids(self): + def get_objids(self) -> Iterable[int]: return [] # Must return # (strmid, index, genno) # or (None, pos, genno) - def get_pos(self, objid): + def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]: raise KeyError(objid) + def load(self, parser: PDFParser) -> None: + raise NotImplementedError + class PDFXRef(PDFBaseXRef): - def __init__(self): - self.offsets = {} - self.trailer = {} + def __init__(self) -> None: + self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {} + self.trailer: Dict[str, Any] = {} return - def __repr__(self): + def __repr__(self) -> str: return '' % (self.offsets.keys()) - def load(self, parser): + def load(self, parser: PDFParser) -> None: while True: try: (pos, line) = parser.nextline() @@ -123,15 +127,15 @@ def load(self, parser): error_msg = 'Invalid XRef format: {!r}, line={!r}'\ .format(parser, line) raise 
PDFNoValidXRef(error_msg) - (pos, genno, use) = f - if use != b'n': + (pos_b, genno_b, use_b) = f + if use_b != b'n': continue - self.offsets[objid] = (None, int(pos), int(genno)) + self.offsets[objid] = (None, int(pos_b), int(genno_b)) log.info('xref objects: %r', self.offsets) self.load_trailer(parser) return - def load_trailer(self, parser): + def load_trailer(self, parser: PDFParser) -> None: try: (_, kwd) = parser.nexttoken() assert kwd is KWD(b'trailer'), str(kwd) @@ -145,13 +149,13 @@ def load_trailer(self, parser): log.debug('trailer=%r', self.trailer) return - def get_trailer(self): + def get_trailer(self) -> Dict[str, Any]: return self.trailer - def get_objids(self): + def get_objids(self) -> KeysView[int]: return self.offsets.keys() - def get_pos(self, objid): + def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]: try: return self.offsets[objid] except KeyError: @@ -160,30 +164,30 @@ def get_pos(self, objid): class PDFXRefFallback(PDFXRef): - def __repr__(self): + def __repr__(self) -> str: return '' % (self.offsets.keys()) PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b') - def load(self, parser): + def load(self, parser: PDFParser) -> None: parser.seek(0) while 1: try: - (pos, line) = parser.nextline() + (pos, line_bytes) = parser.nextline() except PSEOF: break - if line.startswith(b'trailer'): + if line_bytes.startswith(b'trailer'): parser.seek(pos) self.load_trailer(parser) log.info('trailer: %r', self.trailer) break - line = line.decode('latin-1') # default pdf encoding + line = line_bytes.decode('latin-1') # default pdf encoding m = self.PDFOBJ_CUE.match(line) if not m: continue - (objid, genno) = m.groups() - objid = int(objid) - genno = int(genno) + (objid_s, genno_s) = m.groups() + objid = int(objid_s) + genno = int(genno_s) self.offsets[objid] = (None, pos, genno) # expand ObjStm. 
parser.seek(pos) @@ -198,11 +202,11 @@ def load(self, parser): raise PDFSyntaxError('N is not defined: %r' % stream) n = 0 parser1 = PDFStreamParser(stream.get_data()) - objs = [] + objs: List[int] = [] try: while 1: (_, obj) = parser1.nextobject() - objs.append(obj) + objs.append(cast(int, obj)) except PSEOF: pass n = min(n, len(objs)//2) @@ -214,17 +218,19 @@ def load(self, parser): class PDFXRefStream(PDFBaseXRef): - def __init__(self): - self.data = None - self.entlen = None - self.fl1 = self.fl2 = self.fl3 = None - self.ranges = [] + def __init__(self) -> None: + self.data: Optional[bytes] = None + self.entlen: Optional[int] = None + self.fl1: Optional[int] = None + self.fl2: Optional[int] = None + self.fl3: Optional[int] = None + self.ranges: List[Tuple[int, int]] = [] return - def __repr__(self): + def __repr__(self) -> str: return '' % (self.ranges) - def load(self, parser): + def load(self, parser: PDFParser) -> None: (_, objid) = parser.nexttoken() # ignored (_, genno) = parser.nexttoken() # ignored (_, kwd) = parser.nexttoken() @@ -236,8 +242,11 @@ def load(self, parser): index_array = stream.get('Index', (0, size)) if len(index_array) % 2 != 0: raise PDFSyntaxError('Invalid index number') - self.ranges.extend(choplist(2, index_array)) + self.ranges.extend(cast(Iterator[Tuple[int, int]], + choplist(2, index_array))) (self.fl1, self.fl2, self.fl3) = stream['W'] + assert (self.fl1 is not None and self.fl2 is not None + and self.fl3 is not None) self.data = stream.get_data() self.entlen = self.fl1+self.fl2+self.fl3 self.trailer = stream.attrs @@ -246,12 +255,14 @@ def load(self, parser): self.fl1, self.fl2, self.fl3) return - def get_trailer(self): + def get_trailer(self) -> Dict[str, Any]: return self.trailer - def get_objids(self): + def get_objids(self) -> Iterator[int]: for (start, nobjs) in self.ranges: for i in range(nobjs): + assert self.entlen is not None + assert self.data is not None offset = self.entlen * i ent = 
self.data[offset:offset+self.entlen] f1 = nunpack(ent[:self.fl1], 1) @@ -259,7 +270,7 @@ def get_objids(self): yield start+i return - def get_pos(self, objid): + def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]: index = 0 for (start, nobjs) in self.ranges: if start <= objid and objid < start+nobjs: @@ -269,6 +280,10 @@ def get_pos(self, objid): index += nobjs else: raise KeyError(objid) + assert self.entlen is not None + assert self.data is not None + assert (self.fl1 is not None and self.fl2 is not None + and self.fl3 is not None) offset = self.entlen * index ent = self.data[offset:offset+self.entlen] f1 = nunpack(ent[:self.fl1], 1) @@ -287,16 +302,21 @@ class PDFStandardSecurityHandler: PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08' b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz') - supported_revisions = (2, 3) - - def __init__(self, docid, param, password=''): + supported_revisions: Tuple[int, ...] = (2, 3) + + def __init__( + self, + docid: Sequence[bytes], + param: Dict[str, Any], + password: str = '' + ) -> None: self.docid = docid self.param = param self.password = password self.init() return - def init(self): + def init(self) -> None: self.init_params() if self.r not in self.supported_revisions: error_msg = 'Unsupported revision: param=%r' % self.param @@ -304,7 +324,7 @@ def init(self): self.init_key() return - def init_params(self): + def init_params(self) -> None: self.v = int_value(self.param.get('V', 0)) self.r = int_value(self.param['R']) self.p = uint_value(self.param['P'], 32) @@ -313,22 +333,22 @@ def init_params(self): self.length = int_value(self.param.get('Length', 40)) return - def init_key(self): + def init_key(self) -> None: self.key = self.authenticate(self.password) if self.key is None: raise PDFPasswordIncorrect return - def is_printable(self): + def is_printable(self) -> bool: return bool(self.p & 4) - def is_modifiable(self): + def is_modifiable(self) -> bool: return bool(self.p & 8) - def is_extractable(self): + 
def is_extractable(self) -> bool: return bool(self.p & 16) - def compute_u(self, key): + def compute_u(self, key: bytes) -> bytes: if self.r == 2: # Algorithm 3.4 return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2 @@ -343,7 +363,7 @@ def compute_u(self, key): result += result # 6 return result - def compute_encryption_key(self, password): + def compute_encryption_key(self, password: bytes) -> bytes: # Algorithm 3.2 password = (password + self.PASSWORD_PADDING)[:32] # 1 hash = md5(password) # 2 @@ -352,7 +372,7 @@ def compute_encryption_key(self, password): hash.update(struct.pack('= 4: - if not self.encrypt_metadata: + if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata: hash.update(b'\xff\xff\xff\xff') result = hash.digest() n = 5 @@ -362,28 +382,28 @@ def compute_encryption_key(self, password): result = md5(result[:n]).digest() return result[:n] - def authenticate(self, password): - password = password.encode("latin1") - key = self.authenticate_user_password(password) + def authenticate(self, password: str) -> Optional[bytes]: + password_bytes = password.encode("latin1") + key = self.authenticate_user_password(password_bytes) if key is None: - key = self.authenticate_owner_password(password) + key = self.authenticate_owner_password(password_bytes) return key - def authenticate_user_password(self, password): + def authenticate_user_password(self, password: bytes) -> Optional[bytes]: key = self.compute_encryption_key(password) if self.verify_encryption_key(key): return key else: return None - def verify_encryption_key(self, key): + def verify_encryption_key(self, key: bytes) -> bool: # Algorithm 3.6 u = self.compute_u(key) if self.r == 2: return u == self.u return u[:16] == self.u[:16] - def authenticate_owner_password(self, password): + def authenticate_owner_password(self, password: bytes) -> Optional[bytes]: # Algorithm 3.7 password = (password + self.PASSWORD_PADDING)[:32] hash = md5(password) @@ -403,12 +423,19 @@ def 
authenticate_owner_password(self, password): user_password = Arcfour(k).decrypt(user_password) return self.authenticate_user_password(user_password) - def decrypt(self, objid, genno, data, attrs=None): + def decrypt( + self, + objid: int, + genno: int, + data: bytes, + attrs: Optional[Dict[str, Any]] = None + ) -> bytes: return self.decrypt_rc4(objid, genno, data) - def decrypt_rc4(self, objid, genno, data): + def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes: + assert self.key is not None key = self.key + struct.pack(' None: super().init_params() self.length = 128 self.cf = dict_value(self.param.get('CF')) @@ -442,7 +469,10 @@ def init_params(self): raise PDFEncryptionError(error_msg) return - def get_cfm(self, name): + def get_cfm( + self, + name: str + ) -> Optional[Callable[[int, int, bytes], bytes]]: if name == 'V2': return self.decrypt_rc4 elif name == 'AESV2': @@ -450,7 +480,14 @@ def get_cfm(self, name): else: return None - def decrypt(self, objid, genno, data, attrs=None, name=None): + def decrypt( + self, + objid: int, + genno: int, + data: bytes, + attrs: Optional[Dict[str, Any]] = None, + name: Optional[str] = None + ) -> bytes: if not self.encrypt_metadata and attrs is not None: t = attrs.get('Type') if t is not None and literal_name(t) == 'Metadata': @@ -459,27 +496,28 @@ def decrypt(self, objid, genno, data, attrs=None, name=None): name = self.strf return self.cfm[name](objid, genno, data) - def decrypt_identity(self, objid, genno, data): + def decrypt_identity(self, objid: int, genno: int, data: bytes) -> bytes: return data - def decrypt_aes128(self, objid, genno, data): + def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes: + assert self.key is not None key = self.key + struct.pack(' None: super().init_params() self.length = 256 self.oe = str_value(self.param['OE']) @@ -492,31 +530,34 @@ def init_params(self): self.u_key_salt = self.u[40:] return - def get_cfm(self, name): + def get_cfm( + self, + name: str + ) 
-> Optional[Callable[[int, int, bytes], bytes]]: if name == 'AESV3': return self.decrypt_aes256 else: return None - def authenticate(self, password): - password = self._normalize_password(password) - hash = self._password_hash(password, self.o_validation_salt, self.u) + def authenticate(self, password: str) -> Optional[bytes]: + password_b = self._normalize_password(password) + hash = self._password_hash(password_b, self.o_validation_salt, self.u) if hash == self.o_hash: - hash = self._password_hash(password, self.o_key_salt, self.u) + hash = self._password_hash(password_b, self.o_key_salt, self.u) cipher = Cipher(algorithms.AES(hash), modes.CBC(b'\0' * 16), - backend=default_backend()) - return cipher.decryptor().update(self.oe) - hash = self._password_hash(password, self.u_validation_salt) + backend=default_backend()) # type: ignore + return cipher.decryptor().update(self.oe) # type: ignore + hash = self._password_hash(password_b, self.u_validation_salt) if hash == self.u_hash: - hash = self._password_hash(password, self.u_key_salt) + hash = self._password_hash(password_b, self.u_key_salt) cipher = Cipher(algorithms.AES(hash), modes.CBC(b'\0' * 16), - backend=default_backend()) - return cipher.decryptor().update(self.ue) + backend=default_backend()) # type: ignore + return cipher.decryptor().update(self.ue) # type: ignore return None - def _normalize_password(self, password): + def _normalize_password(self, password: str) -> bytes: if self.r == 6: # saslprep expects non-empty strings, apparently if not password: @@ -525,7 +566,12 @@ def _normalize_password(self, password): password = saslprep(password) return password.encode('utf-8')[:127] - def _password_hash(self, password, salt, vector=None): + def _password_hash( + self, + password: bytes, + salt: bytes, + vector: Optional[bytes] = None + ) -> bytes: """ Compute password hash depending on revision number """ @@ -533,7 +579,12 @@ def _password_hash(self, password, salt, vector=None): return 
self._r5_password(password, salt, vector) return self._r6_password(password, salt[0:8], vector) - def _r5_password(self, password, salt, vector): + def _r5_password( + self, + password: bytes, + salt: bytes, + vector: Optional[bytes] = None + ) -> bytes: """ Compute the password for revision 5 """ @@ -543,7 +594,12 @@ def _r5_password(self, password, salt, vector): hash.update(vector) return hash.digest() - def _r6_password(self, password, salt, vector): + def _r6_password( + self, + password: bytes, + salt: bytes, + vector: Optional[bytes] = None + ) -> bytes: """ Compute the password for revision 6 """ @@ -568,22 +624,28 @@ def _r6_password(self, password, salt, vector): return k[:32] @staticmethod - def _bytes_mod_3(input_bytes): + def _bytes_mod_3(input_bytes: bytes) -> int: # 256 is 1 mod 3, so we can just sum 'em return sum(b % 3 for b in input_bytes) % 3 - def _aes_cbc_encrypt(self, key, iv, data): + def _aes_cbc_encrypt( + self, + key: bytes, + iv: bytes, + data: bytes + ) -> bytes: cipher = Cipher(algorithms.AES(key), modes.CBC(iv)) - encryptor = cipher.encryptor() - return encryptor.update(data) + encryptor.finalize() + encryptor = cipher.encryptor() # type: ignore + return encryptor.update(data) + encryptor.finalize() # type: ignore - def decrypt_aes256(self, objid, genno, data): + def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes: initialization_vector = data[:16] ciphertext = data[16:] + assert self.key is not None cipher = Cipher(algorithms.AES(self.key), modes.CBC(initialization_vector), - backend=default_backend()) - return cipher.decryptor().update(ciphertext) + backend=default_backend()) # type: ignore + return cipher.decryptor().update(ciphertext) # type: ignore class PDFDocument: @@ -599,24 +661,30 @@ class PDFDocument: """ - security_handler_registry = { + security_handler_registry: Dict[int, Type[PDFStandardSecurityHandler]] = { 1: PDFStandardSecurityHandler, 2: PDFStandardSecurityHandler, 4: PDFStandardSecurityHandlerV4, 
5: PDFStandardSecurityHandlerV5, } - def __init__(self, parser, password='', caching=True, fallback=True): + def __init__( + self, + parser: PDFParser, + password: str = '', + caching: bool = True, + fallback: bool = True + ) -> None: "Set the document to use a given PDFParser object." self.caching = caching - self.xrefs = [] + self.xrefs: List[PDFBaseXRef] = [] self.info = [] - self.catalog = None - self.encryption = None - self.decipher = None + self.catalog: Dict[str, Any] = {} + self.encryption: Optional[Tuple[Any, Any]] = None + self.decipher: Optional[DecipherCallable] = None self._parser = None - self._cached_objs = {} - self._parsed_objs = {} + self._cached_objs: Dict[int, Tuple[object, int]] = {} + self._parsed_objs: Dict[int, Tuple[List[object], int]] = {} self._parser = parser self._parser.set_document(self) self.is_printable = self.is_modifiable = self.is_extractable = True @@ -629,9 +697,9 @@ def __init__(self, parser, password='', caching=True, fallback=True): pass # fallback = True if fallback: parser.fallback = True - xref = PDFXRefFallback() - xref.load(parser) - self.xrefs.append(xref) + newxref = PDFXRefFallback() + newxref.load(parser) + self.xrefs.append(newxref) for xref in self.xrefs: trailer = xref.get_trailer() if not trailer: @@ -665,7 +733,8 @@ def __init__(self, parser, password='', caching=True, fallback=True): # _initialize_password(password=b'') # Perform the initialization with a given password. 
- def _initialize_password(self, password=''): + def _initialize_password(self, password: str = '') -> None: + assert self.encryption is not None (docid, param) = self.encryption if literal_name(param.get('Filter')) != 'Standard': raise PDFEncryptionError('Unknown filter: param=%r' % param) @@ -678,15 +747,22 @@ def _initialize_password(self, password=''): self.is_printable = handler.is_printable() self.is_modifiable = handler.is_modifiable() self.is_extractable = handler.is_extractable() + assert self._parser is not None self._parser.fallback = False # need to read streams with exact length return - def _getobj_objstm(self, stream, index, objid): + def _getobj_objstm( + self, + stream: PDFStream, + index: int, + objid: int + ) -> object: if stream.objid in self._parsed_objs: (objs, n) = self._parsed_objs[stream.objid] else: (objs, n) = self._get_objects(stream) if self.caching: + assert stream.objid is not None self._parsed_objs[stream.objid] = (objs, n) i = n*2+index try: @@ -695,19 +771,19 @@ def _getobj_objstm(self, stream, index, objid): raise PDFSyntaxError('index too big: %r' % index) return obj - def _get_objects(self, stream): + def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]: if stream.get('Type') is not LITERAL_OBJSTM: if settings.STRICT: raise PDFSyntaxError('Not a stream object: %r' % stream) try: - n = stream['N'] + n = cast(int, stream['N']) except KeyError: if settings.STRICT: raise PDFSyntaxError('N is not defined: %r' % stream) n = 0 parser = PDFStreamParser(stream.get_data()) parser.set_document(self) - objs = [] + objs: List[object] = [] try: while 1: (_, obj) = parser.nextobject() @@ -716,7 +792,8 @@ def _get_objects(self, stream): pass return (objs, n) - def _getobj_parse(self, pos, objid): + def _getobj_parse(self, pos: int, objid: int) -> object: + assert self._parser is not None self._parser.seek(pos) (_, objid1) = self._parser.nexttoken() # objid (_, genno) = self._parser.nexttoken() # genno @@ -744,7 +821,7 @@ def 
_getobj_parse(self, pos, objid): return obj # can raise PDFObjectNotFound - def getobj(self, objid): + def getobj(self, objid: int) -> object: """Get object from PDF :raises PDFException if PDFDocument is not initialized @@ -783,11 +860,14 @@ def getobj(self, objid): self._cached_objs[objid] = (obj, genno) return obj - def get_outlines(self): + OutlineType = Tuple[Any, Any, Any, Any, Any] + + def get_outlines(self) -> Iterator[OutlineType]: if 'Outlines' not in self.catalog: raise PDFNoOutlines - def search(entry, level): + def search(entry: object, level: int + ) -> Iterator[PDFDocument.OutlineType]: entry = dict_value(entry) if 'Title' in entry: if 'A' in entry or 'Dest' in entry: @@ -803,7 +883,11 @@ def search(entry, level): return return search(self.catalog['Outlines'], 0) - def lookup_name(self, cat, key): + def lookup_name( + self, + cat: str, + key: Union[str, bytes] + ) -> Any: try: names = dict_value(self.catalog['Names']) except (PDFTypeError, KeyError): @@ -811,14 +895,15 @@ def lookup_name(self, cat, key): # may raise KeyError d0 = dict_value(names[cat]) - def lookup(d): + def lookup(d: Dict[str, Any]) -> Any: if 'Limits' in d: (k1, k2) = list_value(d['Limits']) if key < k1 or k2 < key: return None if 'Names' in d: objs = list_value(d['Names']) - names = dict(choplist(2, objs)) + names = dict(cast(Iterator[Tuple[Union[str, bytes], Any]], + choplist(2, objs))) return names[key] if 'Kids' in d: for c in list_value(d['Kids']): @@ -828,7 +913,7 @@ def lookup(d): raise KeyError((cat, key)) return lookup(d0) - def get_dest(self, name): + def get_dest(self, name: Union[str, bytes]) -> Any: try: # PDF-1.2 or later obj = self.lookup_name('Dests', name) @@ -843,7 +928,7 @@ def get_dest(self, name): return obj # find_xref - def find_xref(self, parser): + def find_xref(self, parser: PDFParser) -> int: """Internal function used to locate the first XRef.""" # search the last xref table by scanning the file backwards. 
prev = None @@ -857,10 +942,16 @@ def find_xref(self, parser): else: raise PDFNoValidXRef('Unexpected EOF') log.info('xref found: pos=%r', prev) + assert prev is not None return int(prev) # read xref table - def read_xref_from(self, parser, start, xrefs): + def read_xref_from( + self, + parser: PDFParser, + start: int, + xrefs: List[PDFBaseXRef] + ) -> None: """Reads XRefs from the given location.""" parser.seek(start) parser.reset() @@ -873,7 +964,7 @@ def read_xref_from(self, parser, start, xrefs): # XRefStream: PDF-1.5 parser.seek(pos) parser.reset() - xref = PDFXRefStream() + xref: PDFBaseXRef = PDFXRefStream() xref.load(parser) else: if token is parser.KEYWORD_XREF: diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 74ad6a61..df0813d5 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -2,11 +2,15 @@ import struct import sys from io import BytesIO +from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, Mapping, + Optional, Tuple, Union, cast, TYPE_CHECKING) from . 
import settings from .cmapdb import CMap +from .cmapdb import CMapBase from .cmapdb import CMapDB from .cmapdb import CMapParser +from .cmapdb import UnicodeMap from .cmapdb import FileUnicodeMap from .encodingdb import EncodingDB from .encodingdb import name2unicode @@ -22,52 +26,59 @@ from .psparser import KWD from .psparser import LIT from .psparser import PSEOF +from .psparser import PSKeyword from .psparser import PSLiteral from .psparser import PSStackParser from .psparser import literal_name +from .utils import Matrix, Point +from .utils import Rect from .utils import apply_matrix_norm from .utils import choplist -from .utils import isnumber from .utils import nunpack +if TYPE_CHECKING: + from .pdfinterp import PDFResourceManager + log = logging.getLogger(__name__) -def get_widths(seq): - widths = {} - r = [] +def get_widths(seq: Iterable[object]) -> Dict[int, float]: + """Build a mapping of character widths for horizontal writing.""" + widths: Dict[int, float] = {} + r: List[float] = [] for v in seq: if isinstance(v, list): if r: char1 = r[-1] for (i, w) in enumerate(v): - widths[char1+i] = w + widths[cast(int, char1) + i] = w r = [] - elif isnumber(v): + elif isinstance(v, (int, float)): # == utils.isnumber(v) r.append(v) if len(r) == 3: (char1, char2, w) = r - for i in range(char1, char2+1): + for i in range(cast(int, char1), cast(int, char2) + 1): widths[i] = w r = [] return widths -def get_widths2(seq): - widths = {} - r = [] +def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]: + """Build a mapping of character widths for vertical writing.""" + widths: Dict[int, Tuple[float, Point]] = {} + r: List[float] = [] for v in seq: if isinstance(v, list): if r: char1 = r[-1] for (i, (w, vx, vy)) in enumerate(choplist(3, v)): - widths[char1+i] = (w, (vx, vy)) + widths[cast(int, char1) + i] = (w, (vx, vy)) r = [] - elif isnumber(v): + elif isinstance(v, (int, float)): # == utils.isnumber(v) r.append(v) if len(r) == 5: (char1, char2, w, vx, 
vy) = r - for i in range(char1, char2+1): + for i in range(cast(int, char1), cast(int, char2) + 1): widths[i] = (w, (vx, vy)) r = [] return widths @@ -76,11 +87,13 @@ def get_widths2(seq): class FontMetricsDB: @classmethod - def get_metrics(cls, fontname): + def get_metrics(cls, fontname: str + ) -> Tuple[Dict[str, object], Dict[str, int]]: return FONT_METRICS[fontname] -class Type1FontHeaderParser(PSStackParser): +# int here means that we're not extending PSStackParser with additional types. +class Type1FontHeaderParser(PSStackParser[int]): KEYWORD_BEGIN = KWD(b'begin') KEYWORD_END = KWD(b'end') @@ -91,12 +104,12 @@ class Type1FontHeaderParser(PSStackParser): KEYWORD_READONLY = KWD(b'readonly') KEYWORD_FOR = KWD(b'for') - def __init__(self, data): + def __init__(self, data: BinaryIO) -> None: PSStackParser.__init__(self, data) - self._cid2unicode = {} + self._cid2unicode: Dict[int, str] = {} return - def get_encoding(self): + def get_encoding(self) -> Dict[int, str]: """Parse the font encoding. The Type1 font encoding maps character codes to character names. 
These @@ -116,12 +129,12 @@ def get_encoding(self): except PSEOF: break try: - self._cid2unicode[cid] = name2unicode(name) + self._cid2unicode[cid] = name2unicode(cast(str, name)) except KeyError as e: log.debug(str(e)) return self._cid2unicode - def do_keyword(self, pos, token): + def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_PUT: ((_, key), (_, value)) = self.pop(2) if (isinstance(key, int) and isinstance(value, PSLiteral)): @@ -140,10 +153,10 @@ def do_keyword(self, pos, token): } -def getdict(data): - d = {} +def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]: + d: Dict[int, List[Union[float, int]]] = {} fp = BytesIO(data) - stack = [] + stack: List[Union[float, int]] = [] while 1: c = fp.read(1) if not c: @@ -162,7 +175,9 @@ def getdict(data): if n == 15: loop = False else: - s += NIBBLES[n] + nibble = NIBBLES[n] + assert nibble is not None + s += nibble value = float(s) elif 32 <= b0 and b0 <= 246: value = b0-139 @@ -270,9 +285,9 @@ class CFFFont: class INDEX: - def __init__(self, fp): + def __init__(self, fp: BinaryIO) -> None: self.fp = fp - self.offsets = [] + self.offsets: List[int] = [] (count, offsize) = struct.unpack('>HB', self.fp.read(3)) for i in range(count+1): self.offsets.append(nunpack(self.fp.read(offsize))) @@ -280,20 +295,20 @@ def __init__(self, fp): self.fp.seek(self.base+self.offsets[-1]) return - def __repr__(self): + def __repr__(self) -> str: return '' % len(self) - def __len__(self): + def __len__(self) -> int: return len(self.offsets)-1 - def __getitem__(self, i): + def __getitem__(self, i: int) -> bytes: self.fp.seek(self.base+self.offsets[i]) return self.fp.read(self.offsets[i+1]-self.offsets[i]) - def __iter__(self): + def __iter__(self) -> Iterator[bytes]: return iter(self[i] for i in range(len(self))) - def __init__(self, name, fp): + def __init__(self, name: str, fp: BinaryIO) -> None: self.name = name self.fp = fp # Header @@ -314,13 +329,13 @@ def __init__(self, name, fp): 
(encoding_pos,) = self.top_dict.get(16, [0]) (charstring_pos,) = self.top_dict.get(17, [0]) # CharStrings - self.fp.seek(charstring_pos) + self.fp.seek(cast(int, charstring_pos)) self.charstring = self.INDEX(self.fp) self.nglyphs = len(self.charstring) # Encodings self.code2gid = {} self.gid2code = {} - self.fp.seek(encoding_pos) + self.fp.seek(cast(int, encoding_pos)) format = self.fp.read(1) if format == b'\x00': # Format 0 @@ -344,17 +359,18 @@ def __init__(self, name, fp): # Charsets self.name2gid = {} self.gid2name = {} - self.fp.seek(charset_pos) + self.fp.seek(cast(int, charset_pos)) format = self.fp.read(1) if format == b'\x00': # Format 0 n = self.nglyphs-1 - for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, - self.fp.read(2*n))): + for (gid, sid) in enumerate( + cast(Tuple[int, ...], + struct.unpack('>' + 'H' * n, self.fp.read(2 * n)))): gid += 1 - name = self.getstr(sid) - self.name2gid[name] = gid - self.gid2name[gid] = name + sidname = self.getstr(sid) + self.name2gid[sidname] = gid + self.gid2name[gid] = sidname elif format == b'\x01': # Format 1 (n,) = struct.unpack('B', self.fp.read(1)) @@ -362,9 +378,9 @@ def __init__(self, name, fp): for i in range(n): (first, nleft) = struct.unpack('BB', self.fp.read(2)) for gid in range(first, first+nleft+1): - name = self.getstr(sid) - self.name2gid[name] = gid - self.gid2name[gid] = name + sidname = self.getstr(sid) + self.name2gid[sidname] = gid + self.gid2name[gid] = sidname sid += 1 elif format == b'\x02': # Format 2 @@ -373,7 +389,9 @@ def __init__(self, name, fp): raise ValueError('unsupported charset format: %r' % format) return - def getstr(self, sid): + def getstr(self, sid: int) -> Union[str, bytes]: + # This returns str for one of the STANDARD_STRINGS but bytes otherwise, + # and appears to be a needless source of type complexity. 
if sid < len(self.STANDARD_STRINGS): return self.STANDARD_STRINGS[sid] return self.string_index[sid-len(self.STANDARD_STRINGS)] @@ -384,17 +402,19 @@ class TrueTypeFont: class CMapNotFound(Exception): pass - def __init__(self, name, fp): + def __init__(self, name: str, fp: BinaryIO) -> None: self.name = name self.fp = fp - self.tables = {} + self.tables: Dict[bytes, Tuple[int, int]] = {} self.fonttype = fp.read(4) try: - (ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8)) + (ntables, _1, _2, _3) = cast(Tuple[int, int, int, int], + struct.unpack('>HHHH', fp.read(8))) for _ in range(ntables): - (name, tsum, offset, length) = struct.unpack('>4sLLL', - fp.read(16)) - self.tables[name] = (offset, length) + (name_bytes, tsum, offset, length) = \ + cast(Tuple[bytes, int, int, int], + struct.unpack('>4sLLL', fp.read(16))) + self.tables[name_bytes] = (offset, length) except struct.error: # Do not fail if there are not enough bytes to read. Even for # corrupted PDFs we would like to get as much information as @@ -402,34 +422,40 @@ def __init__(self, name, fp): pass return - def create_unicode_map(self): + def create_unicode_map(self) -> FileUnicodeMap: if b'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound (base_offset, length) = self.tables[b'cmap'] fp = self.fp fp.seek(base_offset) - (version, nsubtables) = struct.unpack('>HH', fp.read(4)) - subtables = [] + (version, nsubtables) = \ + cast(Tuple[int, int], struct.unpack('>HH', fp.read(4))) + subtables: List[Tuple[int, int, int]] = [] for i in range(nsubtables): - subtables.append(struct.unpack('>HHL', fp.read(8))) - char2gid = {} + subtables.append( + cast(Tuple[int, int, int], struct.unpack('>HHL', fp.read(8)))) + char2gid: Dict[int, int] = {} # Only supports subtable type 0, 2 and 4. 
for (_1, _2, st_offset) in subtables: fp.seek(base_offset+st_offset) - (fmttype, fmtlen, fmtlang) = struct.unpack('>HHH', fp.read(6)) + (fmttype, fmtlen, fmtlang) = \ + cast(Tuple[int, int, int], struct.unpack('>HHH', fp.read(6))) if fmttype == 0: - char2gid.update(enumerate(struct.unpack('>256B', - fp.read(256)))) + char2gid.update(enumerate( + cast(Tuple[int, ...], + struct.unpack('>256B', fp.read(256))))) elif fmttype == 2: - subheaderkeys = struct.unpack('>256H', fp.read(512)) + subheaderkeys = cast(Tuple[int, ...], + struct.unpack('>256H', fp.read(512))) firstbytes = [0]*8192 for (i, k) in enumerate(subheaderkeys): firstbytes[k//8] = i nhdrs = max(subheaderkeys)//8 + 1 - hdrs = [] + hdrs: List[Tuple[int, int, int, int, int]] = [] for i in range(nhdrs): (firstcode, entcount, delta, offset) = \ - struct.unpack('>HHhH', fp.read(8)) + cast(Tuple[int, int, int, int], + struct.unpack('>HHhH', fp.read(8))) hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset)) for (i, firstcode, entcount, delta, pos) in hdrs: @@ -438,24 +464,36 @@ def create_unicode_map(self): first = firstcode + (firstbytes[i] << 8) fp.seek(pos) for c in range(entcount): - gid = struct.unpack('>H', fp.read(2)) + gid = cast(Tuple[int], + struct.unpack('>H', fp.read(2)))[0] if gid: gid += delta char2gid[first+c] = gid elif fmttype == 4: - (segcount, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8)) + (segcount, _1, _2, _3) = \ + cast(Tuple[int, int, int, int], + struct.unpack('>HHHH', fp.read(8))) segcount //= 2 - ecs = struct.unpack('>%dH' % segcount, fp.read(2*segcount)) + ecs = cast(Tuple[int, ...], + struct.unpack('>%dH' % segcount, + fp.read(2*segcount))) fp.read(2) - scs = struct.unpack('>%dH' % segcount, fp.read(2*segcount)) - idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount)) + scs = cast(Tuple[int, ...], + struct.unpack('>%dH' % segcount, + fp.read(2*segcount))) + idds = cast(Tuple[int, ...], + struct.unpack('>%dh' % segcount, + fp.read(2*segcount))) pos = fp.tell() - idrs 
= struct.unpack('>%dH' % segcount, fp.read(2*segcount)) + idrs = cast(Tuple[int, ...], + struct.unpack('>%dH' % segcount, + fp.read(2*segcount))) for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs): if idr: fp.seek(pos+idr) for c in range(sc, ec+1): - b = struct.unpack('>H', fp.read(2))[0] + b = cast(Tuple[int], + struct.unpack('>H', fp.read(2)))[0] char2gid[c] = (b + idd) & 0xffff else: for c in range(sc, ec+1): @@ -480,12 +518,21 @@ class PDFUnicodeNotDefined(PDFFontError): LITERAL_STANDARD_ENCODING = LIT('StandardEncoding') LITERAL_TYPE1C = LIT('Type1C') +# Font widths are maintained in a dict type that maps from *either* unicode +# chars or integer character IDs. +FontWidthDict = Union[Dict[int, float], Dict[str, float]] + class PDFFont: - def __init__(self, descriptor, widths, default_width=None): + def __init__( + self, + descriptor: Mapping[str, Any], + widths: FontWidthDict, + default_width: Optional[float] = None + ) -> None: self.descriptor = descriptor - self.widths = resolve_all(widths) + self.widths: FontWidthDict = resolve_all(widths) self.fontname = resolve1(descriptor.get('FontName', 'unknown')) if isinstance(self.fontname, PSLiteral): self.fontname = literal_name(self.fontname) @@ -498,8 +545,8 @@ def __init__(self, descriptor, widths, default_width=None): else: self.default_width = default_width self.leading = num_value(descriptor.get('Leading', 0)) - self.bbox = list_value(resolve_all(descriptor.get('FontBBox', - (0, 0, 0, 0)))) + self.bbox = cast(Rect, list_value( + resolve_all(descriptor.get('FontBBox', (0, 0, 0, 0))))) self.hscale = self.vscale = .001 # PDF RM 9.8.1 specifies /Descent should always be a negative number. 
@@ -510,57 +557,72 @@ def __init__(self, descriptor, widths, default_width=None): self.descent = -self.descent return - def __repr__(self): + def __repr__(self) -> str: return '' - def is_vertical(self): + def is_vertical(self) -> bool: return False - def is_multibyte(self): + def is_multibyte(self) -> bool: return False - def decode(self, bytes): + def decode(self, bytes: bytes) -> Iterable[int]: return bytearray(bytes) # map(ord, bytes) - def get_ascent(self): + def get_ascent(self) -> float: """Ascent above the baseline, in text space units""" return self.ascent * self.vscale - def get_descent(self): + def get_descent(self) -> float: """Descent below the baseline, in text space units; always negative""" return self.descent * self.vscale - def get_width(self): + def get_width(self) -> float: w = self.bbox[2]-self.bbox[0] if w == 0: w = -self.default_width return w * self.hscale - def get_height(self): + def get_height(self) -> float: h = self.bbox[3]-self.bbox[1] if h == 0: h = self.ascent - self.descent return h * self.vscale - def char_width(self, cid): + def char_width(self, cid: int) -> float: + # Because character widths may be mapping either IDs or strings, + # we try to lookup the character ID first, then its str equivalent. try: - return self.widths[cid] * self.hscale + return cast(Dict[int, float], self.widths)[cid] * self.hscale except KeyError: + str_widths = cast(Dict[str, float], self.widths) try: - return self.widths[self.to_unichr(cid)] * self.hscale + return str_widths[self.to_unichr(cid)] * self.hscale except (KeyError, PDFUnicodeNotDefined): return self.default_width * self.hscale - def char_disp(self, cid): + def char_disp( + self, + cid: int + ) -> Union[float, Tuple[Optional[float], float]]: + "Returns an integer for horizontal fonts, a tuple for vertical fonts." 
return 0 - def string_width(self, s): + def string_width(self, s: bytes) -> float: return sum(self.char_width(cid) for cid in self.decode(s)) + def to_unichr(self, cid: int) -> str: + raise NotImplementedError + class PDFSimpleFont(PDFFont): - def __init__(self, descriptor, widths, spec): + def __init__( + self, + descriptor: Mapping[str, Any], + widths: FontWidthDict, + spec: Mapping[str, Any] + ) -> None: # Font encoding is specified either by a name of # built-in encoding or a dictionary that describes # the differences. @@ -575,7 +637,7 @@ def __init__(self, descriptor, widths, spec): self.cid2unicode = EncodingDB.get_encoding(name, diff) else: self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding)) - self.unicode_map = None + self.unicode_map: Optional[UnicodeMap] = None if 'ToUnicode' in spec: strm = stream_value(spec['ToUnicode']) self.unicode_map = FileUnicodeMap() @@ -583,7 +645,7 @@ def __init__(self, descriptor, widths, spec): PDFFont.__init__(self, descriptor, widths) return - def to_unichr(self, cid): + def to_unichr(self, cid: int) -> str: if self.unicode_map: try: return self.unicode_map.get_unichr(cid) @@ -597,21 +659,28 @@ def to_unichr(self, cid): class PDFType1Font(PDFSimpleFont): - def __init__(self, rsrcmgr, spec): + def __init__( + self, + rsrcmgr: "PDFResourceManager", + spec: Mapping[str, Any] + ) -> None: try: self.basefont = literal_name(spec['BaseFont']) except KeyError: if settings.STRICT: raise PDFFontError('BaseFont is missing') self.basefont = 'unknown' + + widths: FontWidthDict try: - (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont) + (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont) + widths = cast(Dict[str, float], int_widths) # implicit int->float except KeyError: descriptor = dict_value(spec.get('FontDescriptor', {})) firstchar = int_value(spec.get('FirstChar', 0)) # lastchar = int_value(spec.get('LastChar', 255)) - widths = list_value(spec.get('Widths', [0]*256)) - widths = 
{i+firstchar: w for (i, w) in enumerate(widths)} + width_list = list_value(spec.get('Widths', [0]*256)) + widths = {i+firstchar: w for (i, w) in enumerate(width_list)} PDFSimpleFont.__init__(self, descriptor, widths, spec) if 'Encoding' not in spec and 'FontFile' in descriptor: # try to recover the missing encoding info from the font file. @@ -622,41 +691,51 @@ def __init__(self, rsrcmgr, spec): self.cid2unicode = parser.get_encoding() return - def __repr__(self): + def __repr__(self) -> str: return '' % self.basefont class PDFTrueTypeFont(PDFType1Font): - def __repr__(self): + def __repr__(self) -> str: return '' % self.basefont class PDFType3Font(PDFSimpleFont): - def __init__(self, rsrcmgr, spec): + def __init__( + self, + rsrcmgr: "PDFResourceManager", + spec: Mapping[str, Any] + ) -> None: firstchar = int_value(spec.get('FirstChar', 0)) # lastchar = int_value(spec.get('LastChar', 0)) - widths = list_value(spec.get('Widths', [0]*256)) - widths = {i+firstchar: w for (i, w) in enumerate(widths)} + width_list = list_value(spec.get('Widths', [0]*256)) + widths = {i+firstchar: w for (i, w) in enumerate(width_list)} if 'FontDescriptor' in spec: descriptor = dict_value(spec['FontDescriptor']) else: descriptor = {'Ascent': 0, 'Descent': 0, 'FontBBox': spec['FontBBox']} PDFSimpleFont.__init__(self, descriptor, widths, spec) - self.matrix = tuple(list_value(spec.get('FontMatrix'))) + self.matrix = cast(Matrix, tuple(list_value(spec.get('FontMatrix')))) (_, self.descent, _, self.ascent) = self.bbox (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1)) return - def __repr__(self): + def __repr__(self) -> str: return '' class PDFCIDFont(PDFFont): - - def __init__(self, rsrcmgr, spec, strict=settings.STRICT): + default_disp: Union[float, Tuple[Optional[float], float]] + + def __init__( + self, + rsrcmgr: "PDFResourceManager", + spec: Mapping[str, Any], + strict: bool = settings.STRICT + ) -> None: try: self.basefont = literal_name(spec['BaseFont']) except 
KeyError: @@ -669,7 +748,7 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): cid_ordering = resolve1( self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1") self.cidcoding = '{}-{}'.format(cid_registry, cid_ordering) - self.cmap = self.get_cmap_from_spec(spec, strict) + self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict) try: descriptor = dict_value(spec['FontDescriptor']) @@ -682,7 +761,7 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): self.fontfile = stream_value(descriptor.get('FontFile2')) ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data())) - self.unicode_map = None + self.unicode_map: Optional[UnicodeMap] = None if 'ToUnicode' in spec: strm = stream_value(spec['ToUnicode']) self.unicode_map = FileUnicodeMap() @@ -703,12 +782,12 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): self.vertical = self.cmap.is_vertical() if self.vertical: # writing mode: vertical - widths = get_widths2(list_value(spec.get('W2', []))) + widths2 = get_widths2(list_value(spec.get('W2', []))) self.disps = {cid: (vx, vy) - for (cid, (_, (vx, vy))) in widths.items()} + for (cid, (_, (vx, vy))) in widths2.items()} (vy, w) = resolve1(spec.get('DW2', [880, -1000])) self.default_disp = (None, vy) - widths = {cid: w for (cid, (w, _)) in widths.items()} + widths = {cid: w for (cid, (w, _)) in widths2.items()} default_width = w else: # writing mode: horizontal @@ -719,7 +798,11 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): PDFFont.__init__(self, descriptor, widths, default_width=default_width) return - def get_cmap_from_spec(self, spec, strict): + def get_cmap_from_spec( + self, + spec: Mapping[str, Any], + strict: bool + ) -> CMapBase: """Get cmap from font specification For certain PDFs, Encoding Type isn't mentioned as an attribute of @@ -738,7 +821,7 @@ def get_cmap_from_spec(self, spec, strict): return CMap() @staticmethod - def _get_cmap_name(spec, strict): + def _get_cmap_name(spec: Mapping[str, 
Any], strict: bool) -> str: """Get cmap name from font specification""" cmap_name = 'unknown' # default value @@ -752,34 +835,37 @@ def _get_cmap_name(spec, strict): if strict: raise PDFFontError('Encoding is unspecified') - if type(cmap_name) is PDFStream: - if 'CMapName' in cmap_name: - cmap_name = cmap_name.get('CMapName').name + if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap] + cmap_name_stream: PDFStream = cast(PDFStream, cmap_name) + if 'CMapName' in cmap_name_stream: + cmap_name = cmap_name_stream.get('CMapName').name else: if strict: raise PDFFontError('CMapName unspecified for encoding') - cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name) - return cmap_name + return IDENTITY_ENCODER.get(cmap_name, cmap_name) - def __repr__(self): + def __repr__(self) -> str: return ''\ .format(self.basefont, self.cidcoding) - def is_vertical(self): + def is_vertical(self) -> bool: return self.vertical - def is_multibyte(self): + def is_multibyte(self) -> bool: return True - def decode(self, bytes): + def decode(self, bytes: bytes) -> Iterable[int]: return self.cmap.decode(bytes) - def char_disp(self, cid): + def char_disp( + self, + cid: int + ) -> Union[float, Tuple[Optional[float], float]]: "Returns an integer for horizontal fonts, a tuple for vertical fonts." 
return self.disps.get(cid, self.default_disp) - def to_unichr(self, cid): + def to_unichr(self, cid: int) -> str: try: if not self.unicode_map: raise KeyError(cid) @@ -788,7 +874,7 @@ def to_unichr(self, cid): raise PDFUnicodeNotDefined(self.cidcoding, cid) -def main(argv): +def main(argv: List[str]) -> None: for fname in argv[1:]: fp = open(fname, 'rb') font = CFFFont(fname, fp) @@ -798,4 +884,4 @@ def main(argv): if __name__ == '__main__': - sys.exit(main(sys.argv)) + main(sys.argv) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index ef67947c..6387b42b 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -1,9 +1,12 @@ import re import logging +from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast from io import BytesIO from .cmapdb import CMapDB from .cmapdb import CMap -from .psparser import PSTypeError +from .cmapdb import CMapBase +from .psparser import PSLiteral, PSTypeError +from .psparser import PSStackType from .psparser import PSEOF from .psparser import PSKeyword from .psparser import literal_name @@ -12,6 +15,9 @@ from .psparser import LIT from .psparser import KWD from . 
import settings +from .pdfdevice import PDFDevice +from .pdfdevice import PDFTextSeq +from .pdfpage import PDFPage from .pdftypes import PDFException from .pdftypes import PDFStream from .pdftypes import PDFObjRef @@ -19,6 +25,7 @@ from .pdftypes import list_value from .pdftypes import dict_value from .pdftypes import stream_value +from .pdffont import PDFFont from .pdffont import PDFFontError from .pdffont import PDFType1Font from .pdffont import PDFTrueTypeFont @@ -26,6 +33,7 @@ from .pdffont import PDFCIDFont from .pdfcolor import PDFColorSpace from .pdfcolor import PREDEFINED_COLORSPACE +from .utils import Matrix, Point, PathSegment, Rect from .utils import choplist from .utils import mult_matrix from .utils import MATRIX_IDENTITY @@ -50,22 +58,24 @@ class PDFInterpreterError(PDFException): class PDFTextState: - - def __init__(self): - self.font = None - self.fontsize = 0 - self.charspace = 0 - self.wordspace = 0 - self.scaling = 100 - self.leading = 0 - self.render = 0 - self.rise = 0 + matrix: Matrix + linematrix: Point + + def __init__(self) -> None: + self.font: Optional[PDFFont] = None + self.fontsize: float = 0 + self.charspace: float = 0 + self.wordspace: float = 0 + self.scaling: float = 100 + self.leading: float = 0 + self.render: int = 0 + self.rise: float = 0 self.reset() # self.matrix is set # self.linematrix is set return - def __repr__(self): + def __repr__(self) -> str: return '' \ @@ -73,7 +83,7 @@ def __repr__(self): self.scaling, self.leading, self.render, self.rise, self.matrix, self.linematrix) - def copy(self): + def copy(self) -> "PDFTextState": obj = PDFTextState() obj.font = self.font obj.fontsize = self.fontsize @@ -87,31 +97,37 @@ def copy(self): obj.linematrix = self.linematrix return obj - def reset(self): + def reset(self) -> None: self.matrix = MATRIX_IDENTITY self.linematrix = (0, 0) return +Color = Union[ + float, # Greyscale + Tuple[float, float, float], # R, G, B + Tuple[float, float, float, float]] # C, M, Y, K + + class 
PDFGraphicState: - def __init__(self): - self.linewidth = 0 - self.linecap = None - self.linejoin = None - self.miterlimit = None - self.dash = None - self.intent = None - self.flatness = None + def __init__(self) -> None: + self.linewidth: float = 0 + self.linecap: Optional[object] = None + self.linejoin: Optional[object] = None + self.miterlimit: Optional[object] = None + self.dash: Optional[Tuple[object, object]] = None + self.intent: Optional[object] = None + self.flatness: Optional[object] = None # stroking color - self.scolor = None + self.scolor: Optional[Color] = None # non stroking color - self.ncolor = None + self.ncolor: Optional[Color] = None return - def copy(self): + def copy(self) -> "PDFGraphicState": obj = PDFGraphicState() obj.linewidth = self.linewidth obj.linecap = self.linecap @@ -124,7 +140,7 @@ def copy(self): obj.ncolor = self.ncolor return obj - def __repr__(self): + def __repr__(self) -> str: return ('' % @@ -141,12 +157,12 @@ class PDFResourceManager: allocated multiple times. 
""" - def __init__(self, caching=True): + def __init__(self, caching: bool = True) -> None: self.caching = caching - self._cached_fonts = {} + self._cached_fonts: Dict[object, PDFFont] = {} return - def get_procset(self, procs): + def get_procset(self, procs: Sequence[object]) -> None: for proc in procs: if proc is LITERAL_PDF: pass @@ -156,7 +172,7 @@ def get_procset(self, procs): pass return - def get_cmap(self, cmapname, strict=False): + def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase: try: return CMapDB.get_cmap(cmapname) except CMapDB.CMapNotFound: @@ -164,7 +180,7 @@ def get_cmap(self, cmapname, strict=False): raise return CMap() - def get_font(self, objid, spec): + def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont: if objid and objid in self._cached_fonts: font = self._cached_fonts[objid] else: @@ -209,15 +225,18 @@ def get_font(self, objid, spec): return font -class PDFContentParser(PSStackParser): +class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): - def __init__(self, streams): + def __init__(self, streams: Sequence[object]) -> None: self.streams = streams self.istream = 0 - PSStackParser.__init__(self, None) + # PSStackParser.__init__(fp=None) is safe only because we've overloaded + # all the methods that would attempt to access self.fp without first + # calling self.fillfp(). 
+ PSStackParser.__init__(self, None) # type: ignore[arg-type] return - def fillfp(self): + def fillfp(self) -> None: if not self.fp: if self.istream < len(self.streams): strm = stream_value(self.streams[self.istream]) @@ -227,12 +246,12 @@ def fillfp(self): self.fp = BytesIO(strm.get_data()) return - def seek(self, pos): + def seek(self, pos: int) -> None: self.fillfp() PSStackParser.seek(self, pos) return - def fillbuf(self): + def fillbuf(self) -> None: if self.charpos < len(self.buf): return while 1: @@ -241,19 +260,23 @@ def fillbuf(self): self.buf = self.fp.read(self.BUFSIZ) if self.buf: break - self.fp = None + self.fp = None # type: ignore[assignment] self.charpos = 0 return - def get_inline_data(self, pos, target=b'EI'): + def get_inline_data( + self, + pos: int, + target: bytes = b'EI' + ) -> Tuple[int, bytes]: self.seek(pos) i = 0 data = b'' while i <= len(target): self.fillbuf() if i: - c = self.buf[self.charpos] - c = bytes((c,)) + ci = self.buf[self.charpos] + c = bytes((ci,)) data += c self.charpos += 1 if len(target) <= i and c.isspace(): @@ -275,7 +298,7 @@ def get_inline_data(self, pos, target=b'EI'): data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data) return (pos, data) - def flush(self): + def flush(self) -> None: self.add_results(*self.popall()) return @@ -283,7 +306,7 @@ def flush(self): KEYWORD_ID = KWD(b'ID') KEYWORD_EI = KWD(b'EI') - def do_keyword(self, pos, token): + def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_BI: # inline image within a content stream self.start_type(pos, 'inline') @@ -307,30 +330,34 @@ def do_keyword(self, pos, token): return +PDFStackT = PSStackType[PDFStream] +"""Types that may appear on the PDF argument stack.""" + + class PDFPageInterpreter: """Processor for the content of a PDF page Reference: PDF Reference, Appendix A, Operator Summary """ - def __init__(self, rsrcmgr, device): + def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice) -> None: self.rsrcmgr = 
rsrcmgr self.device = device return - def dup(self): + def dup(self) -> "PDFPageInterpreter": return self.__class__(self.rsrcmgr, self.device) - def init_resources(self, resources): + def init_resources(self, resources: Dict[object, object]) -> None: """Prepare the fonts and XObjects listed in the Resource attribute.""" self.resources = resources - self.fontmap = {} + self.fontmap: Dict[object, PDFFont] = {} self.xobjmap = {} - self.csmap = PREDEFINED_COLORSPACE.copy() + self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy() if not resources: return - def get_colorspace(spec): + def get_colorspace(spec: object) -> Optional[PDFColorSpace]: if isinstance(spec, list): name = literal_name(spec[0]) else: @@ -343,6 +370,7 @@ def get_colorspace(spec): return PDFColorSpace(name, len(list_value(spec[1]))) else: return PREDEFINED_COLORSPACE.get(name) + for (k, v) in dict_value(resources).items(): log.debug('Resource: %r: %r', k, v) if k == 'Font': @@ -354,7 +382,9 @@ def get_colorspace(spec): self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) elif k == 'ColorSpace': for (csid, spec) in dict_value(v).items(): - self.csmap[csid] = get_colorspace(resolve1(spec)) + colorspace = get_colorspace(resolve1(spec)) + if colorspace is not None: + self.csmap[csid] = colorspace elif k == 'ProcSet': self.rsrcmgr.get_procset(list_value(v)) elif k == 'XObject': @@ -362,130 +392,180 @@ def get_colorspace(spec): self.xobjmap[xobjid] = xobjstrm return - def init_state(self, ctm): + def init_state(self, ctm: Matrix) -> None: """Initialize the text and graphic states for rendering a page.""" - self.gstack = [] # stack for graphical states. + # gstack: stack for graphical states. + self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = [] self.ctm = ctm self.device.set_ctm(self.ctm) self.textstate = PDFTextState() self.graphicstate = PDFGraphicState() - self.curpath = [] + self.curpath: List[PathSegment] = [] # argstack: stack for command arguments. 
- self.argstack = [] + self.argstack: List[PDFStackT] = [] # set some global states. - self.scs = self.ncs = None + self.scs: Optional[PDFColorSpace] = None + self.ncs: Optional[PDFColorSpace] = None if self.csmap: self.scs = self.ncs = next(iter(self.csmap.values())) return - def push(self, obj): + def push(self, obj: PDFStackT) -> None: self.argstack.append(obj) return - def pop(self, n): + def pop(self, n: int) -> List[PDFStackT]: if n == 0: return [] x = self.argstack[-n:] self.argstack = self.argstack[:-n] return x - def get_current_state(self): + def get_current_state( + self + ) -> Tuple[Matrix, PDFTextState, PDFGraphicState]: return (self.ctm, self.textstate.copy(), self.graphicstate.copy()) - def set_current_state(self, state): + def set_current_state( + self, + state: Tuple[Matrix, PDFTextState, PDFGraphicState] + ) -> None: (self.ctm, self.textstate, self.graphicstate) = state self.device.set_ctm(self.ctm) return - def do_q(self): + def do_q(self) -> None: """Save graphics state""" self.gstack.append(self.get_current_state()) return - def do_Q(self): + def do_Q(self) -> None: """Restore graphics state""" if self.gstack: self.set_current_state(self.gstack.pop()) return - def do_cm(self, a1, b1, c1, d1, e1, f1): + def do_cm( + self, + a1: PDFStackT, + b1: PDFStackT, + c1: PDFStackT, + d1: PDFStackT, + e1: PDFStackT, + f1: PDFStackT + ) -> None: """Concatenate matrix to current transformation matrix""" - self.ctm = mult_matrix((a1, b1, c1, d1, e1, f1), self.ctm) + self.ctm = \ + mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm) self.device.set_ctm(self.ctm) return - def do_w(self, linewidth): + def do_w(self, linewidth: PDFStackT) -> None: """Set line width""" - self.graphicstate.linewidth = linewidth + self.graphicstate.linewidth = cast(float, linewidth) return - def do_J(self, linecap): + def do_J(self, linecap: PDFStackT) -> None: """Set line cap style""" self.graphicstate.linecap = linecap return - def do_j(self, linejoin): + def do_j(self, 
linejoin: PDFStackT) -> None: """Set line join style""" self.graphicstate.linejoin = linejoin return - def do_M(self, miterlimit): + def do_M(self, miterlimit: PDFStackT) -> None: """Set miter limit""" self.graphicstate.miterlimit = miterlimit return - def do_d(self, dash, phase): + def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None: """Set line dash pattern""" self.graphicstate.dash = (dash, phase) return - def do_ri(self, intent): + def do_ri(self, intent: PDFStackT) -> None: """Set color rendering intent""" self.graphicstate.intent = intent return - def do_i(self, flatness): + def do_i(self, flatness: PDFStackT) -> None: """Set flatness tolerance""" self.graphicstate.flatness = flatness return - def do_gs(self, name): + def do_gs(self, name: PDFStackT) -> None: """Set parameters from graphics state parameter dictionary""" # todo return - def do_m(self, x, y): + def do_m(self, x: PDFStackT, y: PDFStackT) -> None: """Begin new subpath""" - self.curpath.append(('m', x, y)) + self.curpath.append(('m', cast(float, x), cast(float, y))) return - def do_l(self, x, y): + def do_l(self, x: PDFStackT, y: PDFStackT) -> None: """Append straight line segment to path""" - self.curpath.append(('l', x, y)) - return - - def do_c(self, x1, y1, x2, y2, x3, y3): + self.curpath.append(('l', cast(float, x), cast(float, y))) + return + + def do_c( + self, + x1: PDFStackT, + y1: PDFStackT, + x2: PDFStackT, + y2: PDFStackT, + x3: PDFStackT, + y3: PDFStackT + ) -> None: """Append curved segment to path (three control points)""" - self.curpath.append(('c', x1, y1, x2, y2, x3, y3)) - return - - def do_v(self, x2, y2, x3, y3): + self.curpath.append(('c', cast(float, x1), cast(float, y1), + cast(float, x2), cast(float, y2), + cast(float, x3), cast(float, y3))) + return + + def do_v( + self, + x2: PDFStackT, + y2: PDFStackT, + x3: PDFStackT, + y3: PDFStackT + ) -> None: """Append curved segment to path (initial point replicated)""" - self.curpath.append(('v', x2, y2, x3, y3)) + 
self.curpath.append(('v', cast(float, x2), cast(float, y2), + cast(float, x3), cast(float, y3))) return - def do_y(self, x1, y1, x3, y3): + def do_y( + self, + x1: PDFStackT, + y1: PDFStackT, + x3: PDFStackT, + y3: PDFStackT + ) -> None: """Append curved segment to path (final point replicated)""" - self.curpath.append(('y', x1, y1, x3, y3)) + self.curpath.append(('y', cast(float, x1), cast(float, y1), + cast(float, x3), cast(float, y3))) return - def do_h(self): + def do_h(self) -> None: """Close subpath""" self.curpath.append(('h',)) return - def do_re(self, x, y, w, h): + def do_re( + self, + x: PDFStackT, + y: PDFStackT, + w: PDFStackT, + h: PDFStackT + ) -> None: """Append rectangle to path""" + x = cast(float, x) + y = cast(float, y) + w = cast(float, w) + h = cast(float, h) self.curpath.append(('m', x, y)) self.curpath.append(('l', x+w, y)) self.curpath.append(('l', x+w, y+h)) @@ -493,77 +573,77 @@ def do_re(self, x, y, w, h): self.curpath.append(('h',)) return - def do_S(self): + def do_S(self) -> None: """Stroke path""" self.device.paint_path(self.graphicstate, True, False, False, self.curpath) self.curpath = [] return - def do_s(self): + def do_s(self) -> None: """Close and stroke path""" self.do_h() self.do_S() return - def do_f(self): + def do_f(self) -> None: """Fill path using nonzero winding number rule""" self.device.paint_path(self.graphicstate, False, True, False, self.curpath) self.curpath = [] return - def do_F(self): + def do_F(self) -> None: """Fill path using nonzero winding number rule (obsolete)""" return self.do_f() - def do_f_a(self): + def do_f_a(self) -> None: """Fill path using even-odd rule""" self.device.paint_path(self.graphicstate, False, True, True, self.curpath) self.curpath = [] return - def do_B(self): + def do_B(self) -> None: """Fill and stroke path using nonzero winding number rule""" self.device.paint_path(self.graphicstate, True, True, False, self.curpath) self.curpath = [] return - def do_B_a(self): + def do_B_a(self) -> 
None: """Fill and stroke path using even-odd rule""" self.device.paint_path(self.graphicstate, True, True, True, self.curpath) self.curpath = [] return - def do_b(self): + def do_b(self) -> None: """Close, fill, and stroke path using nonzero winding number rule""" self.do_h() self.do_B() return - def do_b_a(self): + def do_b_a(self) -> None: """Close, fill, and stroke path using even-odd rule""" self.do_h() self.do_B_a() return - def do_n(self): + def do_n(self) -> None: """End path without filling or stroking""" self.curpath = [] return - def do_W(self): + def do_W(self) -> None: """Set clipping path using nonzero winding number rule""" return - def do_W_a(self): + def do_W_a(self) -> None: """Set clipping path using even-odd rule""" return - def do_CS(self, name): + def do_CS(self, name: PDFStackT) -> None: """Set color space for stroking operations Introduced in PDF 1.1 @@ -575,7 +655,7 @@ def do_CS(self, name): raise PDFInterpreterError('Undefined ColorSpace: %r' % name) return - def do_cs(self, name): + def do_cs(self, name: PDFStackT) -> None: """Set color space for nonstroking operations""" try: self.ncs = self.csmap[literal_name(name)] @@ -584,37 +664,53 @@ def do_cs(self, name): raise PDFInterpreterError('Undefined ColorSpace: %r' % name) return - def do_G(self, gray): + def do_G(self, gray: PDFStackT) -> None: """Set gray level for stroking operations""" - self.graphicstate.scolor = gray + self.graphicstate.scolor = cast(float, gray) return - def do_g(self, gray): + def do_g(self, gray: PDFStackT) -> None: """Set gray level for nonstroking operations""" - self.graphicstate.ncolor = gray + self.graphicstate.ncolor = cast(float, gray) return - def do_RG(self, r, g, b): + def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None: """Set RGB color for stroking operations""" - self.graphicstate.scolor = (r, g, b) + self.graphicstate.scolor = \ + (cast(float, r), cast(float, g), cast(float, b)) return - def do_rg(self, r, g, b): + def do_rg(self, r: 
PDFStackT, g: PDFStackT, b: PDFStackT) -> None: """Set RGB color for nonstroking operations""" - self.graphicstate.ncolor = (r, g, b) + self.graphicstate.ncolor = \ + (cast(float, r), cast(float, g), cast(float, b)) return - def do_K(self, c, m, y, k): + def do_K( + self, + c: PDFStackT, + m: PDFStackT, + y: PDFStackT, + k: PDFStackT + ) -> None: """Set CMYK color for stroking operations""" - self.graphicstate.scolor = (c, m, y, k) + self.graphicstate.scolor = \ + (cast(float, c), cast(float, m), cast(float, y), cast(float, k)) return - def do_k(self, c, m, y, k): + def do_k( + self, + c: PDFStackT, + m: PDFStackT, + y: PDFStackT, + k: PDFStackT + ) -> None: """Set CMYK color for nonstroking operations""" - self.graphicstate.ncolor = (c, m, y, k) + self.graphicstate.ncolor = \ + (cast(float, c), cast(float, m), cast(float, y), cast(float, k)) return - def do_SCN(self): + def do_SCN(self) -> None: """Set color for stroking operations.""" if self.scs: n = self.scs.ncomponents @@ -622,10 +718,10 @@ def do_SCN(self): if settings.STRICT: raise PDFInterpreterError('No colorspace specified!') n = 1 - self.graphicstate.scolor = self.pop(n) + self.graphicstate.scolor = cast(Color, self.pop(n)) return - def do_scn(self): + def do_scn(self) -> None: """Set color for nonstroking operations""" if self.ncs: n = self.ncs.ncomponents @@ -633,24 +729,24 @@ def do_scn(self): if settings.STRICT: raise PDFInterpreterError('No colorspace specified!') n = 1 - self.graphicstate.ncolor = self.pop(n) + self.graphicstate.ncolor = cast(Color, self.pop(n)) return - def do_SC(self): + def do_SC(self) -> None: """Set color for stroking operations""" self.do_SCN() return - def do_sc(self): + def do_sc(self) -> None: """Set color for nonstroking operations""" self.do_scn() return - def do_sh(self, name): + def do_sh(self, name: object) -> None: """Paint area defined by shading pattern""" return - def do_BT(self): + def do_BT(self) -> None: """Begin text object Initializing the text matrix, Tm, 
and the text line matrix, Tlm, to @@ -660,82 +756,82 @@ def do_BT(self): self.textstate.reset() return - def do_ET(self): + def do_ET(self) -> None: """End a text object""" return - def do_BX(self): + def do_BX(self) -> None: """Begin compatibility section""" return - def do_EX(self): + def do_EX(self) -> None: """End compatibility section""" return - def do_MP(self, tag): + def do_MP(self, tag: PDFStackT) -> None: """Define marked-content point""" - self.device.do_tag(tag) + self.device.do_tag(cast(PSLiteral, tag)) return - def do_DP(self, tag, props): + def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None: """Define marked-content point with property list""" - self.device.do_tag(tag, props) + self.device.do_tag(cast(PSLiteral, tag), props) return - def do_BMC(self, tag): + def do_BMC(self, tag: PDFStackT) -> None: """Begin marked-content sequence""" - self.device.begin_tag(tag) + self.device.begin_tag(cast(PSLiteral, tag)) return - def do_BDC(self, tag, props): + def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None: """Begin marked-content sequence with property list""" - self.device.begin_tag(tag, props) + self.device.begin_tag(cast(PSLiteral, tag), props) return - def do_EMC(self): + def do_EMC(self) -> None: """End marked-content sequence""" self.device.end_tag() return - def do_Tc(self, space): + def do_Tc(self, space: PDFStackT) -> None: """Set character spacing. Character spacing is used by the Tj, TJ, and ' operators. :param space: a number expressed in unscaled text space units. """ - self.textstate.charspace = space + self.textstate.charspace = cast(float, space) return - def do_Tw(self, space): + def do_Tw(self, space: PDFStackT) -> None: """Set the word spacing. Word spacing is used by the Tj, TJ, and ' operators. 
:param space: a number expressed in unscaled text space units """ - self.textstate.wordspace = space + self.textstate.wordspace = cast(float, space) return - def do_Tz(self, scale): + def do_Tz(self, scale: PDFStackT) -> None: """Set the horizontal scaling. :param scale: is a number specifying the percentage of the normal width """ - self.textstate.scaling = scale + self.textstate.scaling = cast(float, scale) return - def do_TL(self, leading): + def do_TL(self, leading: PDFStackT) -> None: """Set the text leading. Text leading is used only by the T*, ', and " operators. :param leading: a number expressed in unscaled text space units """ - self.textstate.leading = -leading + self.textstate.leading = -cast(float, leading) return - def do_Tf(self, fontid, fontsize): + def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None: """Set the text font :param fontid: the name of a font resource in the Font subdictionary @@ -748,44 +844,56 @@ def do_Tf(self, fontid, fontsize): if settings.STRICT: raise PDFInterpreterError('Undefined Font id: %r' % fontid) self.textstate.font = self.rsrcmgr.get_font(None, {}) - self.textstate.fontsize = fontsize + self.textstate.fontsize = cast(float, fontsize) return - def do_Tr(self, render): + def do_Tr(self, render: PDFStackT) -> None: """Set the text rendering mode""" - self.textstate.render = render + self.textstate.render = cast(int, render) return - def do_Ts(self, rise): + def do_Ts(self, rise: PDFStackT) -> None: """Set the text rise :param rise: a number expressed in unscaled text space units """ - self.textstate.rise = rise + self.textstate.rise = cast(float, rise) return - def do_Td(self, tx, ty): + def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None: """Move text position""" + tx = cast(float, tx) + ty = cast(float, ty) (a, b, c, d, e, f) = self.textstate.matrix self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f) self.textstate.linematrix = (0, 0) return - def do_TD(self, tx, ty): + def do_TD(self, tx: 
PDFStackT, ty: PDFStackT) -> None: """Move text position and set leading""" + tx = cast(float, tx) + ty = cast(float, ty) (a, b, c, d, e, f) = self.textstate.matrix self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f) self.textstate.leading = ty self.textstate.linematrix = (0, 0) return - def do_Tm(self, a, b, c, d, e, f): + def do_Tm( + self, + a: PDFStackT, + b: PDFStackT, + c: PDFStackT, + d: PDFStackT, + e: PDFStackT, + f: PDFStackT + ) -> None: """Set text matrix and text line matrix""" - self.textstate.matrix = (a, b, c, d, e, f) + self.textstate.matrix = cast(Matrix, (a, b, c, d, e, f)) self.textstate.linematrix = (0, 0) return - def do_T_a(self): + def do_T_a(self) -> None: """Move to start of next text line""" (a, b, c, d, e, f) = self.textstate.matrix self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e, @@ -793,22 +901,23 @@ def do_T_a(self): self.textstate.linematrix = (0, 0) return - def do_TJ(self, seq): + def do_TJ(self, seq: PDFStackT) -> None: """Show text, allowing individual glyph positioning""" if self.textstate.font is None: if settings.STRICT: raise PDFInterpreterError('No font specified!') return - self.device.render_string(self.textstate, seq, self.ncs, - self.graphicstate.copy()) + assert self.ncs is not None + self.device.render_string(self.textstate, cast(PDFTextSeq, seq), + self.ncs, self.graphicstate.copy()) return - def do_Tj(self, s): + def do_Tj(self, s: PDFStackT) -> None: """Show text""" self.do_TJ([s]) return - def do__q(self, s): + def do__q(self, s: PDFStackT) -> None: """Move to next line and show text The ' (single quote) operator. @@ -817,7 +926,7 @@ def do__q(self, s): self.do_TJ([s]) return - def do__w(self, aw, ac, s): + def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None: """Set word and character spacing, move to next line, and show text The " (double quote) operator. 
@@ -827,15 +936,15 @@ def do__w(self, aw, ac, s): self.do_TJ([s]) return - def do_BI(self): + def do_BI(self) -> None: """Begin inline image object""" return - def do_ID(self): + def do_ID(self) -> None: """Begin inline image data""" return - def do_EI(self, obj): + def do_EI(self, obj: PDFStackT) -> None: """End inline image object""" if isinstance(obj, PDFStream) and 'W' in obj and 'H' in obj: iobjid = str(id(obj)) @@ -844,9 +953,9 @@ def do_EI(self, obj): self.device.end_figure(iobjid) return - def do_Do(self, xobjid): + def do_Do(self, xobjid_arg: PDFStackT) -> None: """Invoke named XObject""" - xobjid = literal_name(xobjid) + xobjid = cast(str, literal_name(xobjid_arg)) try: xobj = stream_value(self.xobjmap[xobjid]) except KeyError: @@ -857,8 +966,9 @@ def do_Do(self, xobjid): subtype = xobj.get('Subtype') if subtype is LITERAL_FORM and 'BBox' in xobj: interpreter = self.dup() - bbox = list_value(xobj['BBox']) - matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY)) + bbox = cast(Rect, list_value(xobj['BBox'])) + matrix = cast(Matrix, list_value( + xobj.get('Matrix', MATRIX_IDENTITY))) # According to PDF reference 1.7 section 4.9.1, XObjects in # earlier PDFs (prior to v1.2) use the page's Resources entry # instead of having their own Resources entry. @@ -880,7 +990,7 @@ def do_Do(self, xobjid): pass return - def process_page(self, page): + def process_page(self, page: PDFPage) -> None: log.info('Processing page: %r', page) (x0, y0, x1, y1) = page.mediabox if page.rotate == 90: @@ -896,7 +1006,12 @@ def process_page(self, page): self.device.end_page(page) return - def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): + def render_contents( + self, + resources: Dict[object, object], + streams: Sequence[object], + ctm: Matrix = MATRIX_IDENTITY + ) -> None: """Render the content streams. This method may be called recursively. 
@@ -908,7 +1023,7 @@ def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): self.execute(list_value(streams)) return - def execute(self, streams): + def execute(self, streams: Sequence[object]) -> None: try: parser = PDFContentParser(streams) except PSEOF: diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index 48da18c9..8380c239 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -1,4 +1,6 @@ import logging +from pdfminer.utils import Rect +from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple import warnings from . import settings from .psparser import LIT @@ -32,7 +34,7 @@ class PDFPage: attrs: a dictionary of page attributes. contents: a list of PDFStream objects that represents the page content. lastmod: the last modified time of the page. - resources: a list of resources used by the page. + resources: a dictionary of resources used by the page. mediabox: the physical size of the page. cropbox: the crop rectangle of the page. rotate: the page rotation (in degree). @@ -40,7 +42,12 @@ class PDFPage: beads: a chain that represents natural reading order. """ - def __init__(self, doc, pageid, attrs): + def __init__( + self, + doc: PDFDocument, + pageid: object, + attrs: object + ) -> None: """Initialize a page object. doc: a PDFDocument object. 
@@ -51,10 +58,11 @@ def __init__(self, doc, pageid, attrs): self.pageid = pageid self.attrs = dict_value(attrs) self.lastmod = resolve1(self.attrs.get('LastModified')) - self.resources = resolve1(self.attrs.get('Resources', dict())) - self.mediabox = resolve1(self.attrs['MediaBox']) + self.resources: Dict[object, object] = \ + resolve1(self.attrs.get('Resources', dict())) + self.mediabox: Rect = resolve1(self.attrs['MediaBox']) if 'CropBox' in self.attrs: - self.cropbox = resolve1(self.attrs['CropBox']) + self.cropbox: Rect = resolve1(self.attrs['CropBox']) else: self.cropbox = self.mediabox self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360 @@ -66,23 +74,28 @@ def __init__(self, doc, pageid, attrs): contents = [] if not isinstance(contents, list): contents = [contents] - self.contents = contents + self.contents: List[object] = contents return - def __repr__(self): + def __repr__(self) -> str: return ''\ .format(self.resources, self.mediabox) INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'} @classmethod - def create_pages(cls, document): - def search(obj, parent): + def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]: + def search( + obj: object, + parent: Dict[str, object] + ) -> Iterator[Tuple[int, Dict[object, Dict[object, object]]]]: if isinstance(obj, int): objid = obj tree = dict_value(document.getobj(objid)).copy() else: - objid = obj.objid + # This looks broken. obj.objid means obj could be either + # PDFObjRef or PDFStream, but neither is valid for dict_value. 
+ objid = obj.objid # type: ignore[attr-defined] tree = dict_value(obj).copy() for (k, v) in parent.items(): if k in cls.INHERITABLE_ATTRS and k not in tree: @@ -119,9 +132,15 @@ def search(obj, parent): return @classmethod - def get_pages(cls, fp, - pagenos=None, maxpages=0, password='', - caching=True, check_extractable=False): + def get_pages( + cls, + fp: BinaryIO, + pagenos: Optional[Container[int]] = None, + maxpages: int = 0, + password: str = '', + caching: bool = True, + check_extractable: bool = False + ) -> Iterator["PDFPage"]: # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index b604b9dd..18ad9ebd 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -1,6 +1,8 @@ import logging from io import BytesIO +from typing import BinaryIO, TYPE_CHECKING, Optional, Union from .psparser import PSStackParser +from .psparser import PSKeyword from .psparser import PSSyntaxError from .psparser import PSEOF from .psparser import KWD @@ -11,6 +13,9 @@ from .pdftypes import int_value from .pdftypes import dict_value +if TYPE_CHECKING: + from .pdfdocument import PDFDocument + log = logging.getLogger(__name__) @@ -18,7 +23,8 @@ class PDFSyntaxError(PDFException): pass -class PDFParser(PSStackParser): +# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None +class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]): """ PDFParser fetch PDF objects from a file stream. 
It can handle indirect references by referring to @@ -35,13 +41,13 @@ class PDFParser(PSStackParser): """ - def __init__(self, fp): + def __init__(self, fp: BinaryIO) -> None: PSStackParser.__init__(self, fp) - self.doc = None + self.doc: Optional["PDFDocument"] = None self.fallback = False return - def set_document(self, doc): + def set_document(self, doc: "PDFDocument") -> None: """Associates the parser with a PDFDocument object.""" self.doc = doc return @@ -53,7 +59,7 @@ def set_document(self, doc): KEYWORD_XREF = KWD(b'xref') KEYWORD_STARTXREF = KWD(b'startxref') - def do_keyword(self, pos, token): + def do_keyword(self, pos: int, token: PSKeyword) -> None: """Handles PDF-related keywords.""" if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): @@ -71,7 +77,9 @@ def do_keyword(self, pos, token): if len(self.curstack) >= 2: try: ((_, objid), (_, genno)) = self.pop(2) - (objid, genno) = (int(objid), int(genno)) + (objid, genno) = ( + int(objid), int(genno)) # type: ignore[arg-type] + assert self.doc is not None obj = PDFObjRef(self.doc, objid, genno) self.push((pos, obj)) except PSSyntaxError: @@ -114,13 +122,13 @@ def do_keyword(self, pos, token): objlen += len(line) if self.fallback: data += line - data = bytes(data) self.seek(pos+objlen) # XXX limit objlen not to exceed object boundary log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10]) - obj = PDFStream(dic, data, self.doc.decipher) - self.push((pos, obj)) + assert self.doc is not None + stream = PDFStream(dic, bytes(data), self.doc.decipher) + self.push((pos, stream)) else: # others @@ -138,22 +146,23 @@ class PDFStreamParser(PDFParser): indirect references to other objects in the same document. 
""" - def __init__(self, data): + def __init__(self, data: bytes) -> None: PDFParser.__init__(self, BytesIO(data)) return - def flush(self): + def flush(self) -> None: self.add_results(*self.popall()) return KEYWORD_OBJ = KWD(b'obj') - def do_keyword(self, pos, token): + def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_R: # reference to indirect object try: ((_, objid), (_, genno)) = self.pop(2) - (objid, genno) = (int(objid), int(genno)) + (objid, genno) = ( + int(objid), int(genno)) # type: ignore[arg-type] obj = PDFObjRef(self.doc, objid, genno) self.push((pos, obj)) except PSSyntaxError: diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 14c729b8..6190ea99 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -1,5 +1,8 @@ import zlib import logging +import sys +from typing import (TYPE_CHECKING, Any, Dict, Iterable, Optional, Union, List, + Tuple, cast) from .lzw import lzwdecode from .ascii85 import ascii85decode from .ascii85 import asciihexdecode @@ -10,7 +13,9 @@ from .psparser import LIT from . 
import settings from .utils import apply_png_predictor -from .utils import isnumber + +if TYPE_CHECKING: + from .pdfdocument import PDFDocument log = logging.getLogger(__name__) @@ -28,6 +33,21 @@ LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),) +if sys.version_info >= (3, 8): + from typing import Protocol + + class DecipherCallable(Protocol): + """Fully typed a decipher callback, with optional parameter.""" + def __call__(self, objid: int, genno: int, data: bytes, + attrs: Optional[Dict[str, Any]] = None) -> bytes: + raise NotImplementedError + +else: # Fallback for older Python + from typing import Callable + + DecipherCallable = Callable[..., bytes] + + class PDFObject(PSObject): pass @@ -54,7 +74,12 @@ class PDFNotImplementedError(PDFException): class PDFObjRef(PDFObject): - def __init__(self, doc, objid, _): + def __init__( + self, + doc: Optional["PDFDocument"], + objid: int, + _: object + ) -> None: if objid == 0: if settings.STRICT: raise PDFValueError('PDF object id cannot be 0.') @@ -62,17 +87,18 @@ def __init__(self, doc, objid, _): self.objid = objid return - def __repr__(self): + def __repr__(self) -> str: return '' % (self.objid) - def resolve(self, default=None): + def resolve(self, default: object = None) -> Any: + assert self.doc is not None try: return self.doc.getobj(self.objid) except PDFObjectNotFound: return default -def resolve1(x, default=None): +def resolve1(x: object, default: object = None) -> Any: """Resolves an object. If this is an array or dictionary, it may still contains @@ -83,7 +109,7 @@ def resolve1(x, default=None): return x -def resolve_all(x, default=None): +def resolve_all(x: object, default: object = None) -> Any: """Recursively resolves the given object and all the internals. Make sure there is no indirect reference within the nested object. 
@@ -99,7 +125,12 @@ def resolve_all(x, default=None): return x -def decipher_all(decipher, objid, genno, x): +def decipher_all( + decipher: DecipherCallable, + objid: int, + genno: int, + x: object +) -> Any: """Recursively deciphers the given object. """ if isinstance(x, bytes): @@ -112,7 +143,7 @@ def decipher_all(decipher, objid, genno, x): return x -def int_value(x): +def int_value(x: object) -> int: x = resolve1(x) if not isinstance(x, int): if settings.STRICT: @@ -121,7 +152,7 @@ def int_value(x): return x -def float_value(x): +def float_value(x: object) -> float: x = resolve1(x) if not isinstance(x, float): if settings.STRICT: @@ -130,34 +161,34 @@ def float_value(x): return x -def num_value(x): +def num_value(x: object) -> float: x = resolve1(x) - if not isnumber(x): + if not isinstance(x, (int, float)): # == utils.isnumber(x) if settings.STRICT: raise PDFTypeError('Int or Float required: %r' % x) return 0 return x -def uint_value(x, n_bits): +def uint_value(x: object, n_bits: int) -> int: """Resolve number and interpret it as a two's-complement unsigned number""" - x = int_value(x) - if x > 0: - return x + xi = int_value(x) + if xi > 0: + return xi else: - return x + 2**n_bits + return xi + cast(int, 2**n_bits) -def str_value(x): +def str_value(x: object) -> bytes: x = resolve1(x) if not isinstance(x, bytes): if settings.STRICT: raise PDFTypeError('String required: %r' % x) - return '' + return b'' return x -def list_value(x): +def list_value(x: object) -> Union[List[Any], Tuple[Any, ...]]: x = resolve1(x) if not isinstance(x, (list, tuple)): if settings.STRICT: @@ -166,7 +197,7 @@ def list_value(x): return x -def dict_value(x): +def dict_value(x: object) -> Dict[Any, Any]: x = resolve1(x) if not isinstance(x, dict): if settings.STRICT: @@ -176,7 +207,7 @@ def dict_value(x): return x -def stream_value(x): +def stream_value(x: object) -> "PDFStream": x = resolve1(x) if not isinstance(x, PDFStream): if settings.STRICT: @@ -187,22 +218,27 @@ def 
stream_value(x): class PDFStream(PDFObject): - def __init__(self, attrs, rawdata, decipher=None): + def __init__( + self, + attrs: Dict[str, Any], + rawdata: bytes, + decipher: Optional[DecipherCallable] = None + ) -> None: assert isinstance(attrs, dict), str(type(attrs)) self.attrs = attrs - self.rawdata = rawdata + self.rawdata: Optional[bytes] = rawdata self.decipher = decipher - self.data = None - self.objid = None - self.genno = None + self.data: Optional[bytes] = None + self.objid: Optional[int] = None + self.genno: Optional[int] = None return - def set_objid(self, objid, genno): + def set_objid(self, objid: int, genno: int) -> None: self.objid = objid self.genno = genno return - def __repr__(self): + def __repr__(self) -> str: if self.data is None: assert self.rawdata is not None return '' % \ @@ -212,22 +248,22 @@ def __repr__(self): return '' % \ (self.objid, len(self.data), self.attrs) - def __contains__(self, name): + def __contains__(self, name: object) -> bool: return name in self.attrs - def __getitem__(self, name): + def __getitem__(self, name: str) -> Any: return self.attrs[name] - def get(self, name, default=None): + def get(self, name: str, default: object = None) -> Any: return self.attrs.get(name, default) - def get_any(self, names, default=None): + def get_any(self, names: Iterable[str], default: object = None) -> Any: for name in names: if name in self.attrs: return self.attrs[name] return default - def get_filters(self): + def get_filters(self) -> List[Tuple[Any, Any]]: filters = self.get_any(('F', 'Filter')) params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {}) if not filters: @@ -248,12 +284,14 @@ def get_filters(self): # return list solves https://github.com/pdfminer/pdfminer.six/issues/15 return list(zip(_filters, params)) - def decode(self): + def decode(self) -> None: assert self.data is None \ and self.rawdata is not None, str((self.data, self.rawdata)) data = self.rawdata if self.decipher: # Handle encryption + assert 
self.objid is not None + assert self.genno is not None data = self.decipher(self.objid, self.genno, data, self.attrs) filters = self.get_filters() if not filters: @@ -314,10 +352,11 @@ def decode(self): self.rawdata = None return - def get_data(self): + def get_data(self) -> bytes: if self.data is None: self.decode() + assert self.data is not None return self.data - def get_rawdata(self): + def get_rawdata(self) -> Optional[bytes]: return self.rawdata diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 10cf05a7..a05009e4 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -4,7 +4,8 @@ import re import logging - +from typing import (Any, BinaryIO, Dict, Generic, Iterator, List, + Optional, Tuple, Type, TypeVar, Union) from . import settings from .utils import choplist @@ -51,10 +52,12 @@ class PSLiteral(PSObject): Always use PSLiteralTable.intern(). """ - def __init__(self, name): + NameType = Union[str, bytes] + + def __init__(self, name: NameType) -> None: self.name = name - def __repr__(self): + def __repr__(self) -> str: name = self.name return '/%r' % name @@ -71,31 +74,36 @@ class PSKeyword(PSObject): Always use PSKeywordTable.intern(). """ - def __init__(self, name): + def __init__(self, name: bytes) -> None: self.name = name return - def __repr__(self): + def __repr__(self) -> str: name = self.name return '/%r' % name -class PSSymbolTable: +_SymbolT = TypeVar('_SymbolT', PSLiteral, PSKeyword) + + +class PSSymbolTable(Generic[_SymbolT]): """A utility class for storing PSLiteral/PSKeyword objects. Interned objects can be checked its identity with "is" operator. 
""" - def __init__(self, klass): - self.dict = {} - self.klass = klass + def __init__(self, klass: Type[_SymbolT]) -> None: + self.dict: Dict[PSLiteral.NameType, _SymbolT] = {} + self.klass: Type[_SymbolT] = klass return - def intern(self, name): + def intern(self, name: PSLiteral.NameType) -> _SymbolT: if name in self.dict: lit = self.dict[name] else: - lit = self.klass(name) + # Type confusion issue: PSKeyword always takes bytes as name + # PSLiteral uses either str or bytes + lit = self.klass(name) # type: ignore[arg-type] self.dict[name] = lit return lit @@ -112,7 +120,7 @@ def intern(self, name): KEYWORD_DICT_END = KWD(b'>>') -def literal_name(x): +def literal_name(x: object) -> Any: if not isinstance(x, PSLiteral): if settings.STRICT: raise PSTypeError('Literal required: {!r}'.format(x)) @@ -120,14 +128,15 @@ def literal_name(x): name = x else: name = x.name - try: - name = str(name, 'utf-8') - except Exception: - pass + if not isinstance(name, str): + try: + name = str(name, 'utf-8') + except Exception: + pass return name -def keyword_name(x): +def keyword_name(x: object) -> Any: if not isinstance(x, PSKeyword): if settings.STRICT: raise PSTypeError('Keyword required: %r' % x) @@ -161,32 +170,35 @@ def keyword_name(x): } +PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes] + + class PSBaseParser: """Most basic PostScript parser that performs only tokenization. 
""" BUFSIZ = 4096 - def __init__(self, fp): + def __init__(self, fp: BinaryIO) -> None: self.fp = fp self.seek(0) return - def __repr__(self): + def __repr__(self) -> str: return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp, self.bufpos) - def flush(self): + def flush(self) -> None: return - def close(self): + def close(self) -> None: self.flush() return - def tell(self): + def tell(self) -> int: return self.bufpos+self.charpos - def poll(self, pos=None, n=80): + def poll(self, pos: Optional[int] = None, n: int = 80) -> None: pos0 = self.fp.tell() if not pos: pos = self.bufpos+self.charpos @@ -195,7 +207,7 @@ def poll(self, pos=None, n=80): self.fp.seek(pos0) return - def seek(self, pos): + def seek(self, pos: int) -> None: """Seeks the parser to the given position. """ log.debug('seek: %r', pos) @@ -208,10 +220,10 @@ def seek(self, pos): self._parse1 = self._parse_main self._curtoken = b'' self._curtokenpos = 0 - self._tokens = [] + self._tokens: List[Tuple[int, PSBaseParserToken]] = [] return - def fillbuf(self): + def fillbuf(self) -> None: if self.charpos < len(self.buf): return # fetch next chunk. @@ -222,7 +234,7 @@ def fillbuf(self): self.charpos = 0 return - def nextline(self): + def nextline(self) -> Tuple[int, bytes]: """Fetches a next line that ends either with \\r or \\n. """ linebuf = b'' @@ -252,7 +264,7 @@ def nextline(self): return (linepos, linebuf) - def revreadlines(self): + def revreadlines(self) -> Iterator[bytes]: """Fetches a next line backword. This is used to locate the trailers at the end of a file. 
@@ -277,7 +289,7 @@ def revreadlines(self): buf = b'' return - def _parse_main(self, s, i): + def _parse_main(self, s: bytes, i: int) -> int: m = NONSPC.search(s, i) if not m: return len(s) @@ -321,11 +333,11 @@ def _parse_main(self, s, i): self._add_token(KWD(c)) return j+1 - def _add_token(self, obj): + def _add_token(self, obj: PSBaseParserToken) -> None: self._tokens.append((self._curtokenpos, obj)) return - def _parse_comment(self, s, i): + def _parse_comment(self, s: bytes, i: int) -> int: m = EOL.search(s, i) if not m: self._curtoken += s[i:] @@ -337,7 +349,7 @@ def _parse_comment(self, s, i): # self._tokens.append(self._curtoken) return j - def _parse_literal(self, s, i): + def _parse_literal(self, s: bytes, i: int) -> int: m = END_LITERAL.search(s, i) if not m: self._curtoken += s[i:] @@ -350,14 +362,14 @@ def _parse_literal(self, s, i): self._parse1 = self._parse_literal_hex return j+1 try: - self._curtoken = str(self._curtoken, 'utf-8') + name: Union[str, bytes] = str(self._curtoken, 'utf-8') except Exception: - pass - self._add_token(LIT(self._curtoken)) + name = self._curtoken + self._add_token(LIT(name)) self._parse1 = self._parse_main return j - def _parse_literal_hex(self, s, i): + def _parse_literal_hex(self, s: bytes, i: int) -> int: c = s[i:i+1] if HEX.match(c) and len(self.hex) < 2: self.hex += c @@ -367,7 +379,7 @@ def _parse_literal_hex(self, s, i): self._parse1 = self._parse_literal return i - def _parse_number(self, s, i): + def _parse_number(self, s: bytes, i: int) -> int: m = END_NUMBER.search(s, i) if not m: self._curtoken += s[i:] @@ -386,7 +398,7 @@ def _parse_number(self, s, i): self._parse1 = self._parse_main return j - def _parse_float(self, s, i): + def _parse_float(self, s: bytes, i: int) -> int: m = END_NUMBER.search(s, i) if not m: self._curtoken += s[i:] @@ -400,7 +412,7 @@ def _parse_float(self, s, i): self._parse1 = self._parse_main return j - def _parse_keyword(self, s, i): + def _parse_keyword(self, s: bytes, i: int) -> int: 
m = END_KEYWORD.search(s, i) if not m: self._curtoken += s[i:] @@ -408,7 +420,7 @@ def _parse_keyword(self, s, i): j = m.start(0) self._curtoken += s[i:j] if self._curtoken == b'true': - token = True + token: Union[bool, PSKeyword] = True elif self._curtoken == b'false': token = False else: @@ -417,7 +429,7 @@ def _parse_keyword(self, s, i): self._parse1 = self._parse_main return j - def _parse_string(self, s, i): + def _parse_string(self, s: bytes, i: int) -> int: m = END_STRING.search(s, i) if not m: self._curtoken += s[i:] @@ -443,7 +455,7 @@ def _parse_string(self, s, i): self._parse1 = self._parse_main return j+1 - def _parse_string_1(self, s, i): + def _parse_string_1(self, s: bytes, i: int) -> int: """Parse literal strings PDF Reference 3.2.3 @@ -470,7 +482,7 @@ def _parse_string_1(self, s, i): self._parse1 = self._parse_string return i+1 - def _parse_wopen(self, s, i): + def _parse_wopen(self, s: bytes, i: int) -> int: c = s[i:i+1] if c == b'<': self._add_token(KEYWORD_DICT_BEGIN) @@ -480,7 +492,7 @@ def _parse_wopen(self, s, i): self._parse1 = self._parse_hexstring return i - def _parse_wclose(self, s, i): + def _parse_wclose(self, s: bytes, i: int) -> int: c = s[i:i+1] if c == b'>': self._add_token(KEYWORD_DICT_END) @@ -488,7 +500,7 @@ def _parse_wclose(self, s, i): self._parse1 = self._parse_main return i - def _parse_hexstring(self, s, i): + def _parse_hexstring(self, s: bytes, i: int) -> int: m = END_HEX_STRING.search(s, i) if not m: self._curtoken += s[i:] @@ -501,7 +513,7 @@ def _parse_hexstring(self, s, i): self._parse1 = self._parse_main return j - def nexttoken(self): + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: while not self._tokens: self.fillbuf() self.charpos = self._parse1(self.buf, self.charpos) @@ -510,39 +522,51 @@ def nexttoken(self): return token -class PSStackParser(PSBaseParser): - def __init__(self, fp): +# Stack slots may by occupied by any of: +# * the PSBaseParserToken types +# * list (via KEYWORD_ARRAY) +# * dict (via 
KEYWORD_DICT) +# * subclass-specific extensions (e.g. PDFStream, PDFObjRef) via ExtraT +ExtraT = TypeVar("ExtraT") +PSStackType = Union[float, bool, PSLiteral, bytes, List, Dict, ExtraT] +PSStackEntry = Tuple[int, PSStackType[ExtraT]] + + +class PSStackParser(PSBaseParser, Generic[ExtraT]): + + def __init__(self, fp: BinaryIO) -> None: PSBaseParser.__init__(self, fp) self.reset() return - def reset(self): - self.context = [] - self.curtype = None - self.curstack = [] - self.results = [] + def reset(self) -> None: + self.context: List[Tuple[int, Optional[str], + List[PSStackEntry[ExtraT]]]] = [] + self.curtype: Optional[str] = None + self.curstack: List[PSStackEntry[ExtraT]] = [] + self.results: List[PSStackEntry[ExtraT]] = [] return - def seek(self, pos): + def seek(self, pos: int) -> None: PSBaseParser.seek(self, pos) self.reset() return - def push(self, *objs): + def push(self, *objs: PSStackEntry[ExtraT]) -> None: self.curstack.extend(objs) return - def pop(self, n): + def pop(self, n: int) -> List[PSStackEntry[ExtraT]]: objs = self.curstack[-n:] self.curstack[-n:] = [] return objs - def popall(self): + def popall(self) -> List[PSStackEntry[ExtraT]]: objs = self.curstack self.curstack = [] return objs - def add_results(self, *objs): + def add_results(self, *objs: PSStackEntry[ExtraT]) -> None: try: log.debug('add_results: %r', objs) except Exception: @@ -550,13 +574,13 @@ def add_results(self, *objs): self.results.extend(objs) return - def start_type(self, pos, type): + def start_type(self, pos: int, type: str) -> None: self.context.append((pos, self.curtype, self.curstack)) (self.curtype, self.curstack) = (type, []) log.debug('start_type: pos=%r, type=%r', pos, type) return - def end_type(self, type): + def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]: if self.curtype != type: raise PSTypeError('Type mismatch: {!r} != {!r}' .format(self.curtype, type)) @@ -565,10 +589,10 @@ def end_type(self, type): log.debug('end_type: pos=%r, type=%r, 
objs=%r', pos, type, objs) return (pos, objs) - def do_keyword(self, pos, token): + def do_keyword(self, pos: int, token: PSKeyword) -> None: return - def nextobject(self): + def nextobject(self) -> PSStackEntry[ExtraT]: """Yields a list of objects. Arrays and dictionaries are represented as Python lists and diff --git a/pdfminer/runlength.py b/pdfminer/runlength.py index f8ea228d..b79e18e6 100644 --- a/pdfminer/runlength.py +++ b/pdfminer/runlength.py @@ -6,7 +6,7 @@ # -def rldecode(data): +def rldecode(data: bytes) -> bytes: """ RunLength decoder (Adobe version) implementation based on PDF Reference version 1.4 section 3.3.4: diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 4aabb52d..a5cf0334 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -4,8 +4,15 @@ import io import pathlib import struct +from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator, + List, Optional, Set, TextIO, Tuple, TypeVar, Union, + TYPE_CHECKING, cast) +from typing_extensions import Literal from html import escape +if TYPE_CHECKING: + from .layout import LTComponent + import chardet # For str encoding detection # from sys import maxint as INF doesn't work anymore under Python3, but PDF @@ -13,40 +20,54 @@ INF = (1 << 31) - 1 +FileOrName = Union[pathlib.PurePath, str, io.IOBase] +AnyIO = Union[TextIO, BinaryIO] + + class open_filename(object): """ Context manager that allows opening a filename (str or pathlib.PurePath type is supported) and closes it on exit, (just like `open`), but does nothing for file-like objects. 
""" - def __init__(self, filename, *args, **kwargs): + def __init__( + self, + filename: FileOrName, + *args: Any, + **kwargs: Any + ) -> None: if isinstance(filename, pathlib.PurePath): filename = str(filename) if isinstance(filename, str): - self.file_handler = open(filename, *args, **kwargs) + self.file_handler: AnyIO = open(filename, *args, **kwargs) self.closing = True elif isinstance(filename, io.IOBase): - self.file_handler = filename + self.file_handler = cast(AnyIO, filename) self.closing = False else: raise TypeError('Unsupported input type: %s' % type(filename)) - def __enter__(self): + def __enter__(self) -> AnyIO: return self.file_handler - def __exit__(self, exc_type, exc_val, exc_tb): + def __exit__( + self, + exc_type: object, + exc_val: object, + exc_tb: object + ) -> Literal[False]: if self.closing: self.file_handler.close() return False -def make_compat_bytes(in_str): +def make_compat_bytes(in_str: str) -> bytes: "Converts to bytes, encoding to unicode." assert isinstance(in_str, str), str(type(in_str)) return in_str.encode() -def make_compat_str(o): +def make_compat_str(o: object) -> str: """Converts everything to string, if bytes guessing the encoding.""" if isinstance(o, bytes): enc = chardet.detect(o) @@ -55,7 +76,7 @@ def make_compat_str(o): return str(o) -def shorten_str(s, size): +def shorten_str(s: str, size: int) -> str: if size < 7: return s[:size] if len(s) > size: @@ -65,8 +86,11 @@ def shorten_str(s, size): return s -def compatible_encode_method(bytesorstring, encoding='utf-8', - erraction='ignore'): +def compatible_encode_method( + bytesorstring: Union[bytes, str], + encoding: str = 'utf-8', + erraction: str = 'ignore' +) -> str: """When Py2 str.encode is called, it often means bytes.encode in Py3. This does either. 
@@ -77,7 +101,7 @@ def compatible_encode_method(bytesorstring, encoding='utf-8', return bytesorstring.decode(encoding, erraction) -def paeth_predictor(left, above, upper_left): +def paeth_predictor(left: int, above: int, upper_left: int) -> int: # From http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html # Initial estimate p = left + above - upper_left @@ -95,7 +119,13 @@ def paeth_predictor(left, above, upper_left): return upper_left -def apply_png_predictor(pred, colors, columns, bitspercomponent, data): +def apply_png_predictor( + pred: int, + colors: int, + columns: int, + bitspercomponent: int, + data: bytes +) -> bytes: """Reverse the effect of the PNG predictor Documentation: http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html @@ -190,11 +220,20 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data): return buf +Point = Tuple[float, float] +Rect = Tuple[float, float, float, float] +Matrix = Tuple[float, float, float, float, float, float] +PathSegment = Union[ + Tuple[str], # Literal['h'] + Tuple[str, float, float], # Literal['m', 'l'] + Tuple[str, float, float, float, float], # Literal['v', 'y'] + Tuple[str, float, float, float, float, float, float]] # Literal['c'] + # Matrix operations -MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0) +MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0) -def mult_matrix(m1, m0): +def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix: (a1, b1, c1, d1, e1, f1) = m1 (a0, b0, c0, d0, e0, f0) = m0 """Returns the multiplication of two matrices.""" @@ -203,21 +242,21 @@ def mult_matrix(m1, m0): a0 * e1 + c0 * f1 + e0, b0 * e1 + d0 * f1 + f0) -def translate_matrix(m, v): +def translate_matrix(m: Matrix, v: Point) -> Matrix: """Translates a matrix by (x, y).""" (a, b, c, d, e, f) = m (x, y) = v return a, b, c, d, x * a + y * c + e, x * b + y * d + f -def apply_matrix_pt(m, v): +def apply_matrix_pt(m: Matrix, v: Point) -> Point: (a, b, c, d, e, f) = m (x, y) = v """Applies a matrix to a point.""" return a * x + c * y + e, b * x + 
d * y + f -def apply_matrix_norm(m, v): +def apply_matrix_norm(m: Matrix, v: Point) -> Point: """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))""" (a, b, c, d, e, f) = m (p, q) = v @@ -226,11 +265,14 @@ def apply_matrix_norm(m, v): # Utility functions -def isnumber(x): +def isnumber(x: object) -> bool: return isinstance(x, (int, float)) -def uniq(objs): +_T = TypeVar('_T') + + +def uniq(objs: Iterable[_T]) -> Iterator[_T]: """Eliminates duplicated elements.""" done = set() for obj in objs: @@ -241,7 +283,10 @@ def uniq(objs): return -def fsplit(pred, objs): +def fsplit( + pred: Callable[[_T], bool], + objs: Iterable[_T] +) -> Tuple[List[_T], List[_T]]: """Split a list into two classes according to the predicate.""" t = [] f = [] @@ -253,14 +298,15 @@ def fsplit(pred, objs): return t, f -def drange(v0, v1, d): +def drange(v0: float, v1: float, d: int) -> range: """Returns a discrete range.""" return range(int(v0) // d, int(v1 + d) // d) -def get_bound(pts): +def get_bound(pts: Iterable[Point]) -> Rect: """Compute a minimal rectangle that covers all the points.""" - (x0, y0, x1, y1) = (INF, INF, -INF, -INF) + limit: Rect = (INF, INF, -INF, -INF) + (x0, y0, x1, y1) = limit for (x, y) in pts: x0 = min(x0, x) y0 = min(y0, y) @@ -269,7 +315,11 @@ def get_bound(pts): return x0, y0, x1, y1 -def pick(seq, func, maxobj=None): +def pick( + seq: Iterable[_T], + func: Callable[[_T], float], + maxobj: Optional[_T] = None +) -> Optional[_T]: """Picks the object obj where func(obj) has the highest value.""" maxscore = None for obj in seq: @@ -279,7 +329,7 @@ def pick(seq, func, maxobj=None): return maxobj -def choplist(n, seq): +def choplist(n: int, seq: Iterable[_T]) -> Iterator[Tuple[_T, ...]]: """Groups every n elements of the list.""" r = [] for x in seq: @@ -290,7 +340,7 @@ def choplist(n, seq): return -def nunpack(s, default=0): +def nunpack(s: bytes, default: int = 0) -> int: """Unpacks 1 to 4 or 8 byte integers (big endian).""" length = len(s) if not 
length: @@ -298,13 +348,13 @@ def nunpack(s, default=0): elif length == 1: return ord(s) elif length == 2: - return struct.unpack('>H', s)[0] + return cast(int, struct.unpack('>H', s)[0]) elif length == 3: - return struct.unpack('>L', b'\x00' + s)[0] + return cast(int, struct.unpack('>L', b'\x00' + s)[0]) elif length == 4: - return struct.unpack('>L', s)[0] + return cast(int, struct.unpack('>L', s)[0]) elif length == 8: - return struct.unpack('>Q', s)[0] + return cast(int, struct.unpack('>Q', s)[0]) else: raise TypeError('invalid length: %d' % length) @@ -345,7 +395,7 @@ def nunpack(s, default=0): )) -def decode_text(s): +def decode_text(s: bytes) -> str: """Decodes a PDFDocEncoding string to Unicode.""" if s.startswith(b'\xfe\xff'): return str(s[2:], 'utf-16be', 'ignore') @@ -353,25 +403,25 @@ def decode_text(s): return ''.join(PDFDocEncoding[c] for c in s) -def enc(x): +def enc(x: str) -> str: """Encodes a string for SGML/XML/HTML""" if isinstance(x, bytes): return '' return escape(x) -def bbox2str(bbox): +def bbox2str(bbox: Rect) -> str: (x0, y0, x1, y1) = bbox return '{:.3f},{:.3f},{:.3f},{:.3f}'.format(x0, y0, x1, y1) -def matrix2str(m): +def matrix2str(m: Matrix) -> str: (a, b, c, d, e, f) = m return '[{:.2f},{:.2f},{:.2f},{:.2f}, ({:.2f},{:.2f})]'\ .format(a, b, c, d, e, f) -def vecBetweenBoxes(obj1, obj2): +def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point: """A distance function between two TextBoxes. Consider the bounding rectangle for obj1 and obj2. @@ -397,7 +447,10 @@ def vecBetweenBoxes(obj1, obj2): return max(0, iw), max(0, ih) -class Plane: +LTComponentT = TypeVar('LTComponentT', bound='LTComponent') + + +class Plane(Generic[LTComponentT]): """A set-like data structure for objects placed on a plane. Can efficiently find objects in a certain rectangular area. @@ -405,26 +458,26 @@ class Plane: which is sorted by its x or y coordinate. """ - def __init__(self, bbox, gridsize=50): - self._seq = [] # preserve the object order. 
- self._objs = set() - self._grid = {} + def __init__(self, bbox: Rect, gridsize: int = 50) -> None: + self._seq: List[LTComponentT] = [] # preserve the object order. + self._objs: Set[LTComponentT] = set() + self._grid: Dict[Point, List[LTComponentT]] = {} self.gridsize = gridsize (self.x0, self.y0, self.x1, self.y1) = bbox - def __repr__(self): + def __repr__(self) -> str: return '' % list(self) - def __iter__(self): + def __iter__(self) -> Iterator[LTComponentT]: return (obj for obj in self._seq if obj in self._objs) - def __len__(self): + def __len__(self) -> int: return len(self._objs) - def __contains__(self, obj): + def __contains__(self, obj: object) -> bool: return obj in self._objs - def _getrange(self, bbox): + def _getrange(self, bbox: Rect) -> Iterator[Point]: (x0, y0, x1, y1) = bbox if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0: return @@ -436,15 +489,15 @@ def _getrange(self, bbox): for grid_x in drange(x0, x1, self.gridsize): yield (grid_x, grid_y) - def extend(self, objs): + def extend(self, objs: Iterable[LTComponentT]) -> None: for obj in objs: self.add(obj) - def add(self, obj): + def add(self, obj: LTComponentT) -> None: """place an object.""" for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)): if k not in self._grid: - r = [] + r: List[LTComponentT] = [] self._grid[k] = r else: r = self._grid[k] @@ -452,7 +505,7 @@ def add(self, obj): self._seq.append(obj) self._objs.add(obj) - def remove(self, obj): + def remove(self, obj: LTComponentT) -> None: """displace an object.""" for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)): try: @@ -461,7 +514,7 @@ def remove(self, obj): pass self._objs.remove(obj) - def find(self, bbox): + def find(self, bbox: Rect) -> Iterator[LTComponentT]: """finds objects that are in a certain area.""" (x0, y0, x1, y1) = bbox done = set() diff --git a/setup.py b/setup.py index 941a3548..ce94b5f0 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ 'cryptography', ], extras_require={ - 
"dev": ["nose", "tox"], + "dev": ["nose", "tox", "mypy == 0.910"], "docs": ["sphinx", "sphinx-argparse"], }, description='PDF parser and analyzer', diff --git a/tests/test_tools_dumppdf.py b/tests/test_tools_dumppdf.py index 8fcb7691..df1dc25e 100644 --- a/tests/test_tools_dumppdf.py +++ b/tests/test_tools_dumppdf.py @@ -1,5 +1,5 @@ import warnings - +from nose.tools import raises from helpers import absolute_sample_path from tempfilepath import TemporaryFilePath from pdfminer.pdfdocument import PDFNoValidXRefWarning @@ -51,3 +51,13 @@ def test_5(self): def test_6(self): run('nonfree/naacl06-shinyama.pdf', '-t -a') + + @raises(TypeError) + def test_simple1_raw(self): + """Known issue: crash in dumpxml writing binary to text stream.""" + run('simple1.pdf', '-r -a') + + @raises(TypeError) + def test_simple1_binary(self): + """Known issue: crash in dumpxml writing binary to text stream.""" + run('simple1.pdf', '-b -a') diff --git a/tools/conv_afm.py b/tools/conv_afm.py index 32cea90c..07f7ebfe 100755 --- a/tools/conv_afm.py +++ b/tools/conv_afm.py @@ -42,4 +42,4 @@ def main(argv): if __name__ == '__main__': - sys.exit(main(sys.argv)) + sys.exit(main(sys.argv)) # type: ignore[no-untyped-call] diff --git a/tools/conv_cmap.py b/tools/conv_cmap.py index 3f782c8d..7ce0aef1 100755 --- a/tools/conv_cmap.py +++ b/tools/conv_cmap.py @@ -199,4 +199,4 @@ def usage(): if __name__ == '__main__': - sys.exit(main(sys.argv)) + sys.exit(main(sys.argv)) # type: ignore[no-untyped-call] diff --git a/tools/conv_glyphlist.py b/tools/conv_glyphlist.py index f94dcc8c..dc65f509 100755 --- a/tools/conv_glyphlist.py +++ b/tools/conv_glyphlist.py @@ -24,4 +24,4 @@ def main(argv): if __name__ == '__main__': - sys.exit(main(sys.argv)) + sys.exit(main(sys.argv)) # type: ignore[no-untyped-call] diff --git a/tools/dumppdf.py b/tools/dumppdf.py index 8724c815..ffdf4241 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -4,6 +4,8 @@ import os.path import re import sys +from typing import Any, 
Container, Dict, Iterable, List, Optional, TextIO, \ + Union, cast import warnings from argparse import ArgumentParser @@ -22,13 +24,15 @@ ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]') -def escape(s): +def escape(s: Union[str, bytes]) -> str: if isinstance(s, bytes): - s = str(s, 'latin-1') - return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), s) + us = str(s, 'latin-1') + else: + us = s + return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), us) -def dumpxml(out, obj, codec=None): +def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None: if obj is None: out.write('') return @@ -51,15 +55,17 @@ def dumpxml(out, obj, codec=None): out.write('') return - if isinstance(obj, ((str,), bytes)): + if isinstance(obj, (str, bytes)): out.write('%s' % (len(obj), escape(obj))) return if isinstance(obj, PDFStream): if codec == 'raw': - out.write(obj.get_rawdata()) + # Bug: writing bytes to text I/O. This will raise TypeError. + out.write(obj.get_rawdata()) # type: ignore [arg-type] elif codec == 'binary': - out.write(obj.get_data()) + # Bug: writing bytes to text I/O. This will raise TypeError. 
+ out.write(obj.get_data()) # type: ignore [arg-type] else: out.write('\n\n') dumpxml(out, obj.attrs) @@ -76,11 +82,15 @@ def dumpxml(out, obj, codec=None): return if isinstance(obj, PSKeyword): - out.write('%s' % obj.name) + # Likely bug: obj.name is bytes, not str + out.write('%s' + % obj.name) # type: ignore [str-bytes-safe] return if isinstance(obj, PSLiteral): - out.write('%s' % obj.name) + # Likely bug: obj.name may be bytes, not str + out.write('%s' + % obj.name) # type: ignore [str-bytes-safe] return if isnumber(obj): @@ -90,11 +100,15 @@ def dumpxml(out, obj, codec=None): raise TypeError(obj) -def dumptrailers(out, doc, show_fallback_xref=False): +def dumptrailers( + out: TextIO, + doc: PDFDocument, + show_fallback_xref: bool = False +) -> None: for xref in doc.xrefs: if not isinstance(xref, PDFXRefFallback) or show_fallback_xref: out.write('\n') - dumpxml(out, xref.trailer) + dumpxml(out, xref.get_trailer()) out.write('\n\n\n') no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs) if no_xrefs and not show_fallback_xref: @@ -105,7 +119,12 @@ def dumptrailers(out, doc, show_fallback_xref=False): return -def dumpallobjs(out, doc, codec=None, show_fallback_xref=False): +def dumpallobjs( + out: TextIO, + doc: PDFDocument, + codec: Optional[str] = None, + show_fallback_xref: bool = False +) -> None: visited = set() out.write('') for xref in doc.xrefs: @@ -127,15 +146,23 @@ def dumpallobjs(out, doc, codec=None, show_fallback_xref=False): return -def dumpoutline(outfp, fname, objids, pagenos, password='', - dumpall=False, codec=None, extractdir=None): +def dumpoutline( + outfp: TextIO, + fname: str, + objids: Any, + pagenos: Container[int], + password: str = '', + dumpall: bool = False, + codec: Optional[str] = None, + extractdir: Optional[str] = None +) -> None: fp = open(fname, 'rb') parser = PDFParser(fp) doc = PDFDocument(parser, password) pages = {page.pageid: pageno for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)} - def 
resolve_dest(dest): + def resolve_dest(dest: object) -> Any: if isinstance(dest, (str, bytes)): dest = resolve1(doc.get_dest(dest)) elif isinstance(dest, PSLiteral): @@ -183,10 +210,10 @@ def resolve_dest(dest): LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile') -def extractembedded(outfp, fname, objids, pagenos, password='', - dumpall=False, codec=None, extractdir=None): - def extract1(objid, obj): - filename = os.path.basename(obj.get('UF') or obj.get('F').decode()) +def extractembedded(fname: str, password: str, extractdir: str) -> None: + def extract1(objid: int, obj: Dict[str, Any]) -> None: + filename = os.path.basename(obj.get('UF') or + cast(bytes, obj.get('F')).decode()) fileref = obj['EF'].get('UF') or obj['EF'].get('F') fileobj = doc.getobj(fileref.objid) if not isinstance(fileobj, PDFStream): @@ -221,8 +248,17 @@ def extract1(objid, obj): return -def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, - codec=None, extractdir=None, show_fallback_xref=False): +def dumppdf( + outfp: TextIO, + fname: str, + objids: Iterable[int], + pagenos: Container[int], + password: str = '', + dumpall: bool = False, + codec: Optional[str] = None, + extractdir: Optional[str] = None, + show_fallback_xref: bool = False +) -> None: fp = open(fname, 'rb') parser = PDFParser(fp) doc = PDFDocument(parser, password) @@ -249,7 +285,7 @@ def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, return -def create_parser(): +def create_parser() -> ArgumentParser: parser = ArgumentParser(description=__doc__, add_help=True) parser.add_argument('files', type=str, default=None, nargs='+', help='One or more paths to PDF files.') @@ -313,7 +349,7 @@ def create_parser(): return parser -def main(argv=None): +def main(argv: Optional[List[str]] = None) -> None: parser = create_parser() args = parser.parse_args(args=argv) @@ -340,7 +376,7 @@ def main(argv=None): password = args.password if args.raw_stream: - codec = 'raw' + codec: Optional[str] = 'raw' elif 
args.binary_stream: codec = 'binary' elif args.text_stream: @@ -356,8 +392,7 @@ def main(argv=None): ) elif args.extract_embedded: extractembedded( - outfp, fname, objids, pagenos, password=password, - dumpall=args.all, codec=codec, extractdir=args.extract_embedded + fname, password=password, extractdir=args.extract_embedded ) else: dumppdf( @@ -370,4 +405,4 @@ def main(argv=None): if __name__ == '__main__': - sys.exit(main()) + main() diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index dcaef0e6..47e2c79d 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -4,9 +4,12 @@ import argparse import logging import sys +from typing import Any, Container, Iterable, List, Optional, Union +from typing_extensions import Literal import pdfminer.high_level -import pdfminer.layout +from pdfminer.layout import LAParams +from pdfminer.utils import AnyIO logging.basicConfig() @@ -15,24 +18,42 @@ (".xml", "xml"), (".tag", "tag")) +FloatOrDisabled = Union[float, Literal["disabled"]] -def float_or_disabled(x): + +def float_or_disabled(x: str) -> FloatOrDisabled: if x.lower().strip() == "disabled": - return x + return "disabled" try: - x = float(x) + return float(x) except ValueError: raise argparse.ArgumentTypeError("invalid float value: {}".format(x)) -def extract_text(files=[], outfile='-', - no_laparams=False, all_texts=None, detect_vertical=None, - word_margin=None, char_margin=None, line_margin=None, - boxes_flow=None, output_type='text', codec='utf-8', - strip_control=False, maxpages=0, page_numbers=None, - password="", scale=1.0, rotation=0, layoutmode='normal', - output_dir=None, debug=False, disable_caching=False, - **kwargs): +def extract_text( + files: Iterable[str] = [], + outfile: str = '-', + no_laparams: bool = False, + all_texts: Optional[bool] = None, + detect_vertical: Optional[bool] = None, + word_margin: Optional[float] = None, + char_margin: Optional[float] = None, + line_margin: Optional[float] = None, + boxes_flow: Optional[FloatOrDisabled] = None, + 
output_type: str = 'text', + codec: str = 'utf-8', + strip_control: bool = False, + maxpages: int = 0, + page_numbers: Optional[Container[int]] = None, + password: str = "", + scale: float = 1.0, + rotation: int = 0, + layoutmode: str = 'normal', + output_dir: Optional[str] = None, + debug: bool = False, + disable_caching: bool = False, + **kwargs: Any +) -> AnyIO: if not files: raise ValueError("Must provide files to work upon!") @@ -40,7 +61,7 @@ def extract_text(files=[], outfile='-', # create an LAParams object and # populate with given args. Otherwise, set it to None. if not no_laparams: - laparams = pdfminer.layout.LAParams() + laparams: Optional[LAParams] = LAParams() for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"): paramv = locals().get(param, None) @@ -55,8 +76,8 @@ def extract_text(files=[], outfile='-', output_type = alttype if outfile == "-": - outfp = sys.stdout - if outfp.encoding is not None: + outfp: AnyIO = sys.stdout + if sys.stdout.encoding is not None: codec = 'utf-8' else: outfp = open(outfile, "wb") @@ -67,7 +88,7 @@ def extract_text(files=[], outfile='-', return outfp -def maketheparser(): +def maketheparser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description=__doc__, add_help=True) parser.add_argument( "files", type=str, default=None, nargs="+", @@ -180,7 +201,7 @@ def maketheparser(): # main -def main(args=None): +def main(args: Optional[List[str]] = None) -> int: P = maketheparser() A = P.parse_args(args=args) diff --git a/tools/pdfdiff.py b/tools/pdfdiff.py index 68478fb7..1be0723a 100644 --- a/tools/pdfdiff.py +++ b/tools/pdfdiff.py @@ -6,6 +6,7 @@ import io import logging import sys +from typing import Any, Iterable, List, Optional import pdfminer.settings from pdfminer import high_level, layout @@ -16,7 +17,7 @@ logging.basicConfig() -def compare(file1, file2, **kwargs): +def compare(file1: str, file2: str, **kwargs: Any) -> Iterable[str]: # If any LAParams 
group arguments were passed, # create an LAParams object and # populate with given args. Otherwise, set it to None. @@ -26,7 +27,7 @@ def compare(file1, file2, **kwargs): "char_margin", "line_margin", "boxes_flow"): paramv = kwargs.get(param, None) if paramv is not None: - laparams[param] = paramv + setattr(laparams, param, paramv) kwargs['laparams'] = laparams s1 = io.StringIO() @@ -40,20 +41,20 @@ def compare(file1, file2, **kwargs): import difflib s1.seek(0) s2.seek(0) - s1, s2 = s1.readlines(), s2.readlines() + s1_lines, s2_lines = s1.readlines(), s2.readlines() import os.path try: extension = os.path.splitext(kwargs['outfile'])[1][1:4] if extension.lower() == 'htm': - return difflib.HtmlDiff().make_file(s1, s2) + return difflib.HtmlDiff().make_file(s1_lines, s2_lines) except KeyError: pass - return difflib.unified_diff(s1, s2, n=kwargs['context_lines']) + return difflib.unified_diff(s1_lines, s2_lines, n=kwargs['context_lines']) # main -def main(args=None): +def main(args: Optional[List[str]] = None) -> int: import argparse P = argparse.ArgumentParser(description=__doc__) P.add_argument("file1", type=str, default=None, help="File 1 to compare.") diff --git a/tools/pdfstats.py b/tools/pdfstats.py index 943574d8..9bf34720 100755 --- a/tools/pdfstats.py +++ b/tools/pdfstats.py @@ -7,10 +7,11 @@ import sys import os import collections +from typing import Any, Counter, Iterator, List from pdfminer.pdfparser import PDFParser -from pdfminer.pdfdocument import PDFDocument -from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed +from pdfminer.pdfdocument import PDFDocument, PDFTextExtractionNotAllowed +from pdfminer.pdfpage import PDFPage from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LAParams, LTContainer @@ -19,18 +20,18 @@ _, SCRIPT = os.path.split(__file__) -def msg(*args, **kwargs): +def msg(*args: object, **kwargs: Any) -> None: print(' '.join(map(str, 
args)), **kwargs) # noqa E999 -def flat_iter(obj): +def flat_iter(obj: object) -> Iterator[object]: yield obj if isinstance(obj, LTContainer): for ob in obj: yield from flat_iter(ob) -def main(args): +def main(args: List[str]) -> int: msg(SCRIPT, args) if len(args) != 1: @@ -40,7 +41,7 @@ def main(args): infilename, = args - lt_types = collections.Counter() + lt_types: Counter[str] = collections.Counter() with open(infilename, 'rb') as pdf_file: @@ -77,6 +78,8 @@ def main(args): msg('page_count', page_count) msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items())) + return 0 + if __name__ == '__main__': sys.exit(main(sys.argv[1:])) diff --git a/tools/prof.py b/tools/prof.py index 1654a985..18803a7c 100644 --- a/tools/prof.py +++ b/tools/prof.py @@ -1,11 +1,12 @@ #!/usr/bin/env python3 import sys +from typing import List -def prof_main(argv): - import hotshot.stats +def prof_main(argv: List[str]) -> int: + import hotshot.stats # type: ignore[import] - def usage(): + def usage() -> int: print('usage: %s module.function [args ...]' % argv[0]) return 100 args = argv[1:] @@ -15,19 +16,24 @@ def usage(): prof = name+'.prof' i = name.rindex('.') (modname, funcname) = (name[:i], name[i+1:]) - module = __import__(modname, fromlist=1) + + # Type error: fromlist expects sequence of strings; presumably the intent + # is to retrieve the named module rather than a top-level package (as in + # "when a non-empty fromlist argument is given..."). 
+ module = __import__(modname, fromlist=1) # type: ignore[arg-type] + func = getattr(module, funcname) if args: args.insert(0, argv[0]) - prof = hotshot.Profile(prof) - prof.runcall(lambda: func(args)) - prof.close() + profile = hotshot.Profile(prof) + profile.runcall(lambda: func(args)) + profile.close() else: stats = hotshot.stats.load(prof) stats.strip_dirs() stats.sort_stats('time', 'calls') stats.print_stats(1000) - return + return 0 if __name__ == '__main__': diff --git a/tox.ini b/tox.ini index 1908d96d..2a25d505 100644 --- a/tox.ini +++ b/tox.ini @@ -9,6 +9,7 @@ whitelist_externals = flake8 commands = flake8 pdfminer/ tools/ tests/ --count --statistics + mypy --install-types --non-interactive --show-error-codes . nosetests --nologcapture python -m sphinx -b html docs/source docs/build/html python -m sphinx -b doctest docs/source docs/build/doctest