From ff787a93986c60361536a97182a41774f4a53ac3 Mon Sep 17 00:00:00 2001 From: Andrew Baumann Date: Sat, 21 Aug 2021 21:46:14 -0700 Subject: [PATCH] be more precise about types on ps/pdf stacks, remove most of the Any annotations --- pdfminer/cmapdb.py | 3 +- pdfminer/converter.py | 7 +- pdfminer/pdfdevice.py | 6 +- pdfminer/pdffont.py | 3 +- pdfminer/pdfinterp.py | 158 +++++++++++++++++++++--------------------- pdfminer/pdfparser.py | 33 +++++---- pdfminer/psparser.py | 39 +++++++---- pdfminer/utils.py | 6 +- 8 files changed, 139 insertions(+), 116 deletions(-) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 0987447f..8716a6a3 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -264,7 +264,8 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap: return cls._umap_cache[name][vertical] -class CMapParser(PSStackParser): +# int here means that we're not extending PSStackParser with additional types. +class CMapParser(PSStackParser[int]): def __init__(self, cmap, fp): PSStackParser.__init__(self, fp) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 0b978876..b2d9c7ec 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -1,7 +1,7 @@ import io import logging from pdfminer.pdfcolor import PDFColorSpace -from typing import Any, List, Optional, Sequence +from typing import Any, List, Optional, Sequence, cast import re import sys @@ -27,7 +27,7 @@ from .pdfinterp import PDFGraphicState, PDFResourceManager from .pdfpage import PDFPage from .pdftypes import PDFStream -from .utils import Matrix, Rect, PathSegment +from .utils import Point, Matrix, Rect, PathSegment from .utils import apply_matrix_pt from .utils import bbox2str from .utils import enc @@ -104,7 +104,8 @@ def paint_path(self, gstate: PDFGraphicState, stroke: bool, fill: bool, # And, per Section 4.4's Table 4.9, all other path commands place # their point-position in their final two arguments. 
(Any preceding # arguments represent control points on Bézier curves.) - raw_pts = [p[-2:] if p[0] != 'h' else path[0][-2:] for p in path] + raw_pts = [cast(Point, p[-2:] if p[0] != 'h' else path[0][-2:]) + for p in path] pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts] if shape in {'mlh', 'ml'}: diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index a2cb3487..ab814921 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -1,7 +1,7 @@ -from typing import (Any, BinaryIO, Iterable, List, Optional, Sequence, Tuple, +from typing import (Any, BinaryIO, Iterable, List, Optional, Sequence, TYPE_CHECKING) from . import utils -from .utils import Matrix, Point, Rect +from .utils import Matrix, Point, Rect, PathSegment from .pdfcolor import PDFColorSpace from .pdffont import PDFFont from .pdffont import PDFUnicodeNotDefined @@ -62,7 +62,7 @@ def end_figure(self, name: str) -> None: def paint_path(self, graphicstate: "PDFGraphicState", stroke: bool, fill: bool, evenodd: bool, - path: Sequence[Tuple[str, float, float]]) -> None: + path: Sequence[PathSegment]) -> None: return def render_image(self, name: str, stream: PDFStream) -> None: diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 756d6b91..8e2a39ad 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -80,7 +80,8 @@ def get_metrics(cls, fontname): return FONT_METRICS[fontname] -class Type1FontHeaderParser(PSStackParser): +# int here means that we're not extending PSStackParser with additional types. 
+class Type1FontHeaderParser(PSStackParser[int]): KEYWORD_BEGIN = KWD(b'begin') KEYWORD_END = KWD(b'end') diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 01902971..6ce48530 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -1,13 +1,13 @@ import re import logging from typing import (Any, Dict, Iterable, List, Mapping, Optional, Sequence, - Tuple, Union) + Tuple, Union, cast) from io import BytesIO from .cmapdb import CMapDB from .cmapdb import CMap from .cmapdb import CMapBase -from .psparser import PSParserToken from .psparser import PSTypeError +from .psparser import PSStackType from .psparser import PSEOF from .psparser import PSKeyword from .psparser import literal_name @@ -68,7 +68,7 @@ def __init__(self): self.wordspace: float = 0 self.scaling: float = 100 self.leading: float = 0 - self.render: float = 0 + self.render: int = 0 self.rise: float = 0 self.reset() # self.matrix is set @@ -116,7 +116,7 @@ def __init__(self): self.linecap = None self.linejoin = None self.miterlimit = None - self.dash = None + self.dash: Optional[Tuple[Any, Any]] = None self.intent = None self.flatness = None @@ -225,7 +225,7 @@ def get_font(self, objid: Any, spec: Mapping[str, Any]) -> PDFFont: return font -class PDFContentParser(PSStackParser): +class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): def __init__(self, streams: List[Any]): self.streams = streams @@ -303,7 +303,7 @@ def flush(self) -> None: KEYWORD_ID = KWD(b'ID') KEYWORD_EI = KWD(b'EI') - def do_keyword(self, pos: int, token: PSParserToken) -> None: + def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_BI: # inline image within a content stream self.start_type(pos, 'inline') @@ -394,7 +394,7 @@ def init_state(self, ctm: Matrix) -> None: self.graphicstate = PDFGraphicState() self.curpath: List[PathSegment] = [] # argstack: stack for command arguments. 
- self.argstack: List[Any] = [] + self.argstack: List[PSStackType[PDFStream]] = [] # set some global states. self.scs: Optional[PDFColorSpace] = None self.ncs: Optional[PDFColorSpace] = None @@ -402,11 +402,11 @@ def init_state(self, ctm: Matrix) -> None: self.scs = self.ncs = next(iter(self.csmap.values())) return - def push(self, obj: Any) -> None: + def push(self, obj: PSStackType[PDFStream]) -> None: self.argstack.append(obj) return - def pop(self, n: int) -> Any: + def pop(self, n: int) -> List[PSStackType[PDFStream]]: if n == 0: return [] x = self.argstack[-n:] @@ -441,77 +441,78 @@ def do_cm(self, a1: float, b1: float, c1: float, d1: float, e1: float, self.device.set_ctm(self.ctm) return - def do_w(self, linewidth): + def do_w(self, linewidth: float) -> None: """Set line width""" self.graphicstate.linewidth = linewidth return - def do_J(self, linecap): + def do_J(self, linecap: Any) -> None: """Set line cap style""" self.graphicstate.linecap = linecap return - def do_j(self, linejoin): + def do_j(self, linejoin: Any) -> None: """Set line join style""" self.graphicstate.linejoin = linejoin return - def do_M(self, miterlimit): + def do_M(self, miterlimit: Any) -> None: """Set miter limit""" self.graphicstate.miterlimit = miterlimit return - def do_d(self, dash, phase): + def do_d(self, dash: Any, phase: Any) -> None: """Set line dash pattern""" self.graphicstate.dash = (dash, phase) return - def do_ri(self, intent): + def do_ri(self, intent: Any) -> None: """Set color rendering intent""" self.graphicstate.intent = intent return - def do_i(self, flatness): + def do_i(self, flatness: Any) -> None: """Set flatness tolerance""" self.graphicstate.flatness = flatness return - def do_gs(self, name): + def do_gs(self, name: Any) -> None: """Set parameters from graphics state parameter dictionary""" # todo return - def do_m(self, x, y): + def do_m(self, x: float, y: float) -> None: """Begin new subpath""" self.curpath.append(('m', x, y)) return - def do_l(self, x, y): 
+ def do_l(self, x: float, y: float) -> None: """Append straight line segment to path""" self.curpath.append(('l', x, y)) return - def do_c(self, x1, y1, x2, y2, x3, y3): + def do_c(self, x1: float, y1: float, x2: float, y2: float, x3: float, + y3: float) -> None: """Append curved segment to path (three control points)""" self.curpath.append(('c', x1, y1, x2, y2, x3, y3)) return - def do_v(self, x2, y2, x3, y3): + def do_v(self, x2: float, y2: float, x3: float, y3: float) -> None: """Append curved segment to path (initial point replicated)""" self.curpath.append(('v', x2, y2, x3, y3)) return - def do_y(self, x1, y1, x3, y3): + def do_y(self, x1: float, y1: float, x3: float, y3: float) -> None: """Append curved segment to path (final point replicated)""" self.curpath.append(('y', x1, y1, x3, y3)) return - def do_h(self): + def do_h(self) -> None: """Close subpath""" self.curpath.append(('h',)) return - def do_re(self, x, y, w, h): + def do_re(self, x: float, y: float, w: float, h: float) -> None: """Append rectangle to path""" self.curpath.append(('m', x, y)) self.curpath.append(('l', x+w, y)) @@ -520,77 +521,77 @@ def do_re(self, x, y, w, h): self.curpath.append(('h',)) return - def do_S(self): + def do_S(self) -> None: """Stroke path""" self.device.paint_path(self.graphicstate, True, False, False, self.curpath) self.curpath = [] return - def do_s(self): + def do_s(self) -> None: """Close and stroke path""" self.do_h() self.do_S() return - def do_f(self): + def do_f(self) -> None: """Fill path using nonzero winding number rule""" self.device.paint_path(self.graphicstate, False, True, False, self.curpath) self.curpath = [] return - def do_F(self): + def do_F(self) -> None: """Fill path using nonzero winding number rule (obsolete)""" return self.do_f() - def do_f_a(self): + def do_f_a(self) -> None: """Fill path using even-odd rule""" self.device.paint_path(self.graphicstate, False, True, True, self.curpath) self.curpath = [] return - def do_B(self): + def do_B(self) 
-> None: """Fill and stroke path using nonzero winding number rule""" self.device.paint_path(self.graphicstate, True, True, False, self.curpath) self.curpath = [] return - def do_B_a(self): + def do_B_a(self) -> None: """Fill and stroke path using even-odd rule""" self.device.paint_path(self.graphicstate, True, True, True, self.curpath) self.curpath = [] return - def do_b(self): + def do_b(self) -> None: """Close, fill, and stroke path using nonzero winding number rule""" self.do_h() self.do_B() return - def do_b_a(self): + def do_b_a(self) -> None: """Close, fill, and stroke path using even-odd rule""" self.do_h() self.do_B_a() return - def do_n(self): + def do_n(self) -> None: """End path without filling or stroking""" self.curpath = [] return - def do_W(self): + def do_W(self) -> None: """Set clipping path using nonzero winding number rule""" return - def do_W_a(self): + def do_W_a(self) -> None: """Set clipping path using even-odd rule""" return - def do_CS(self, name): + def do_CS(self, name: Any) -> None: """Set color space for stroking operations Introduced in PDF 1.1 @@ -602,7 +603,7 @@ def do_CS(self, name): raise PDFInterpreterError('Undefined ColorSpace: %r' % name) return - def do_cs(self, name): + def do_cs(self, name: Any) -> None: """Set color space for nonstroking operations""" try: self.ncs = self.csmap[literal_name(name)] @@ -611,37 +612,37 @@ def do_cs(self, name): raise PDFInterpreterError('Undefined ColorSpace: %r' % name) return - def do_G(self, gray): + def do_G(self, gray: float) -> None: """Set gray level for stroking operations""" self.graphicstate.scolor = gray return - def do_g(self, gray): + def do_g(self, gray: float) -> None: """Set gray level for nonstroking operations""" self.graphicstate.ncolor = gray return - def do_RG(self, r, g, b): + def do_RG(self, r: float, g: float, b: float) -> None: """Set RGB color for stroking operations""" self.graphicstate.scolor = (r, g, b) return - def do_rg(self, r, g, b): + def do_rg(self, r: 
float, g: float, b: float) -> None: """Set RGB color for nonstroking operations""" self.graphicstate.ncolor = (r, g, b) return - def do_K(self, c, m, y, k): + def do_K(self, c: float, m: float, y: float, k: float) -> None: """Set CMYK color for stroking operations""" self.graphicstate.scolor = (c, m, y, k) return - def do_k(self, c, m, y, k): + def do_k(self, c: float, m: float, y: float, k: float) -> None: """Set CMYK color for nonstroking operations""" self.graphicstate.ncolor = (c, m, y, k) return - def do_SCN(self): + def do_SCN(self) -> None: """Set color for stroking operations.""" if self.scs: n = self.scs.ncomponents @@ -649,10 +650,10 @@ def do_SCN(self): if settings.STRICT: raise PDFInterpreterError('No colorspace specified!') n = 1 - self.graphicstate.scolor = self.pop(n) + self.graphicstate.scolor = cast(Color, self.pop(n)) # Unchecked cast! return - def do_scn(self): + def do_scn(self) -> None: """Set color for nonstroking operations""" if self.ncs: n = self.ncs.ncomponents @@ -660,24 +661,24 @@ def do_scn(self): if settings.STRICT: raise PDFInterpreterError('No colorspace specified!') n = 1 - self.graphicstate.ncolor = self.pop(n) + self.graphicstate.ncolor = cast(Color, self.pop(n)) # Unchecked cast! 
return - def do_SC(self): + def do_SC(self) -> None: """Set color for stroking operations""" self.do_SCN() return - def do_sc(self): + def do_sc(self) -> None: """Set color for nonstroking operations""" self.do_scn() return - def do_sh(self, name): + def do_sh(self, name: Any) -> None: """Paint area defined by shading pattern""" return - def do_BT(self): + def do_BT(self) -> None: """Begin text object Initializing the text matrix, Tm, and the text line matrix, Tlm, to @@ -687,44 +688,44 @@ def do_BT(self): self.textstate.reset() return - def do_ET(self): + def do_ET(self) -> None: """End a text object""" return - def do_BX(self): + def do_BX(self) -> None: """Begin compatibility section""" return - def do_EX(self): + def do_EX(self) -> None: """End compatibility section""" return - def do_MP(self, tag): + def do_MP(self, tag: Any) -> None: """Define marked-content point""" self.device.do_tag(tag) return - def do_DP(self, tag, props): + def do_DP(self, tag: Any, props: Any) -> None: """Define marked-content point with property list""" self.device.do_tag(tag, props) return - def do_BMC(self, tag): + def do_BMC(self, tag: Any) -> None: """Begin marked-content sequence""" self.device.begin_tag(tag) return - def do_BDC(self, tag, props): + def do_BDC(self, tag: Any, props: Any) -> None: """Begin marked-content sequence with property list""" self.device.begin_tag(tag, props) return - def do_EMC(self): + def do_EMC(self) -> None: """End marked-content sequence""" self.device.end_tag() return - def do_Tc(self, space): + def do_Tc(self, space: float) -> None: """Set character spacing. Character spacing is used by the Tj, TJ, and ' operators. @@ -734,7 +735,7 @@ def do_Tc(self, space): self.textstate.charspace = space return - def do_Tw(self, space): + def do_Tw(self, space: float) -> None: """Set the word spacing. Word spacing is used by the Tj, TJ, and ' operators. 
@@ -744,7 +745,7 @@ def do_Tw(self, space): self.textstate.wordspace = space return - def do_Tz(self, scale): + def do_Tz(self, scale: float) -> None: """Set the horizontal scaling. :param scale: is a number specifying the percentage of the normal width @@ -752,7 +753,7 @@ def do_Tz(self, scale): self.textstate.scaling = scale return - def do_TL(self, leading): + def do_TL(self, leading: float) -> None: """Set the text leading. Text leading is used only by the T*, ', and " operators. @@ -762,7 +763,7 @@ def do_TL(self, leading): self.textstate.leading = -leading return - def do_Tf(self, fontid, fontsize): + def do_Tf(self, fontid: Any, fontsize: float) -> None: """Set the text font :param fontid: the name of a font resource in the Font subdictionary @@ -778,12 +779,12 @@ def do_Tf(self, fontid, fontsize): self.textstate.fontsize = fontsize return - def do_Tr(self, render): + def do_Tr(self, render: int) -> None: """Set the text rendering mode""" self.textstate.render = render return - def do_Ts(self, rise): + def do_Ts(self, rise: float) -> None: """Set the text rise :param rise: a number expressed in unscaled text space units @@ -791,14 +792,14 @@ def do_Ts(self, rise): self.textstate.rise = rise return - def do_Td(self, tx, ty): + def do_Td(self, tx: float, ty: float) -> None: """Move text position""" (a, b, c, d, e, f) = self.textstate.matrix self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f) self.textstate.linematrix = (0, 0) return - def do_TD(self, tx, ty): + def do_TD(self, tx: float, ty: float) -> None: """Move text position and set leading""" (a, b, c, d, e, f) = self.textstate.matrix self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f) @@ -806,13 +807,14 @@ def do_TD(self, tx, ty): self.textstate.linematrix = (0, 0) return - def do_Tm(self, a, b, c, d, e, f): + def do_Tm(self, a: float, b: float, c: float, d: float, e: float, f: float + ) -> None: """Set text matrix and text line matrix""" self.textstate.matrix = (a, b, c, d, e, f) 
self.textstate.linematrix = (0, 0) return - def do_T_a(self): + def do_T_a(self) -> None: """Move to start of next text line""" (a, b, c, d, e, f) = self.textstate.matrix self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e, @@ -831,12 +833,12 @@ def do_TJ(self, seq: Iterable[Any]) -> None: self.graphicstate.copy()) return - def do_Tj(self, s): + def do_Tj(self, s: Any) -> None: """Show text""" self.do_TJ([s]) return - def do__q(self, s): + def do__q(self, s: Any) -> None: """Move to next line and show text The ' (single quote) operator. @@ -845,7 +847,7 @@ def do__q(self, s): self.do_TJ([s]) return - def do__w(self, aw, ac, s): + def do__w(self, aw: float, ac: float, s: Any) -> None: """Set word and character spacing, move to next line, and show text The " (double quote) operator. @@ -855,15 +857,15 @@ def do__w(self, aw, ac, s): self.do_TJ([s]) return - def do_BI(self): + def do_BI(self) -> None: """Begin inline image object""" return - def do_ID(self): + def do_ID(self) -> None: """Begin inline image data""" return - def do_EI(self, obj): + def do_EI(self, obj: Any) -> None: """End inline image object""" if isinstance(obj, PDFStream) and 'W' in obj and 'H' in obj: iobjid = str(id(obj)) @@ -872,7 +874,7 @@ def do_EI(self, obj): self.device.end_figure(iobjid) return - def do_Do(self, xobjid): + def do_Do(self, xobjid: Any) -> None: """Invoke named XObject""" xobjid = literal_name(xobjid) try: diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 18c04272..0ad92285 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -1,7 +1,8 @@ import logging from io import BytesIO -from typing import BinaryIO +from typing import BinaryIO, TYPE_CHECKING, Optional, Union from .psparser import PSStackParser +from .psparser import PSKeyword from .psparser import PSSyntaxError from .psparser import PSEOF from .psparser import KWD @@ -12,6 +13,9 @@ from .pdftypes import int_value from .pdftypes import dict_value +if TYPE_CHECKING: + from .pdfdocument 
import PDFDocument + log = logging.getLogger(__name__) @@ -19,7 +23,8 @@ class PDFSyntaxError(PDFException): pass -class PDFParser(PSStackParser): +# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None +class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]): """ PDFParser fetch PDF objects from a file stream. It can handle indirect references by referring to @@ -38,11 +43,11 @@ class PDFParser(PSStackParser): def __init__(self, fp: BinaryIO): PSStackParser.__init__(self, fp) - self.doc = None + self.doc: Optional["PDFDocument"] = None self.fallback = False return - def set_document(self, doc): + def set_document(self, doc: "PDFDocument") -> None: """Associates the parser with a PDFDocument object.""" self.doc = doc return @@ -54,7 +59,7 @@ def set_document(self, doc): KEYWORD_XREF = KWD(b'xref') KEYWORD_STARTXREF = KWD(b'startxref') - def do_keyword(self, pos, token): + def do_keyword(self, pos: int, token: PSKeyword) -> None: """Handles PDF-related keywords.""" if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): @@ -71,7 +76,7 @@ def do_keyword(self, pos, token): # reference to indirect object try: ((_, objid), (_, genno)) = self.pop(2) - (objid, genno) = (int(objid), int(genno)) + (objid, genno) = (int(objid), int(genno)) # type: ignore obj = PDFObjRef(self.doc, objid, genno) self.push((pos, obj)) except PSSyntaxError: @@ -84,7 +89,7 @@ def do_keyword(self, pos, token): objlen = 0 if not self.fallback: try: - objlen = int_value(dic['Length']) + objlen = int_value(dic['Length']) # type: ignore except KeyError: if settings.STRICT: raise PDFSyntaxError('/Length is undefined: %r' % dic) @@ -115,13 +120,13 @@ def do_keyword(self, pos, token): objlen += len(line) if self.fallback: data += line - data = bytes(data) self.seek(pos+objlen) # XXX limit objlen not to exceed object boundary log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10]) - obj = PDFStream(dic, data, self.doc.decipher) 
- self.push((pos, obj)) + assert self.doc is not None + stream = PDFStream(dic, bytes(data), self.doc.decipher) + self.push((pos, stream)) else: # others @@ -139,22 +144,22 @@ class PDFStreamParser(PDFParser): indirect references to other objects in the same document. """ - def __init__(self, data): + def __init__(self, data: bytes): PDFParser.__init__(self, BytesIO(data)) return - def flush(self): + def flush(self) -> None: self.add_results(*self.popall()) return KEYWORD_OBJ = KWD(b'obj') - def do_keyword(self, pos, token): + def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_R: # reference to indirect object try: ((_, objid), (_, genno)) = self.pop(2) - (objid, genno) = (int(objid), int(genno)) + (objid, genno) = (int(objid), int(genno)) # type: ignore obj = PDFObjRef(self.doc, objid, genno) self.push((pos, obj)) except PSSyntaxError: diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 5668edca..3ac72e57 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -170,7 +170,7 @@ def keyword_name(x: Any) -> Any: } -PSParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes] +PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes] class PSBaseParser: @@ -220,7 +220,7 @@ def seek(self, pos: int) -> None: self._parse1 = self._parse_main self._curtoken = b'' self._curtokenpos = 0 - self._tokens: List[Tuple[int, Any]] = [] + self._tokens: List[Tuple[int, PSBaseParserToken]] = [] return def fillbuf(self) -> None: @@ -333,7 +333,7 @@ def _parse_main(self, s: bytes, i: int) -> int: self._add_token(KWD(c)) return j+1 - def _add_token(self, obj: PSParserToken) -> None: + def _add_token(self, obj: PSBaseParserToken) -> None: self._tokens.append((self._curtokenpos, obj)) return @@ -500,7 +500,7 @@ def _parse_hexstring(self, s: bytes, i: int) -> int: self._parse1 = self._parse_main return j - def nexttoken(self) -> Tuple[int, PSParserToken]: + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: while not 
self._tokens: self.fillbuf() self.charpos = self._parse1(self.buf, self.charpos) @@ -509,20 +509,29 @@ def nexttoken(self) -> Tuple[int, PSParserToken]: return token -PSStackEntry = Tuple[int, Any] +# Stack slots may be occupied by any of: +# * the PSBaseParserToken types +# * list (via KEYWORD_ARRAY) +# * dict (via KEYWORD_DICT) +# * subclass-specific extensions (e.g. PDFStream, PDFObjRef) via ExtraT +ExtraT = TypeVar("ExtraT") +PSStackType = Union[float, bool, PSLiteral, bytes, List, Dict, ExtraT] +PSStackEntry = Tuple[int, PSStackType[ExtraT]] -class PSStackParser(PSBaseParser): +class PSStackParser(PSBaseParser, Generic[ExtraT]): + def __init__(self, fp: BinaryIO): PSBaseParser.__init__(self, fp) self.reset() return def reset(self) -> None: - self.context: List[Tuple[int, Optional[str], List[PSStackEntry]]] = [] + self.context: List[Tuple[int, Optional[str], + List[PSStackEntry[ExtraT]]]] = [] self.curtype: Optional[str] = None - self.curstack: List[PSStackEntry] = [] - self.results: List[PSStackEntry] = [] + self.curstack: List[PSStackEntry[ExtraT]] = [] + self.results: List[PSStackEntry[ExtraT]] = [] return def seek(self, pos: int) -> None: @@ -530,21 +539,21 @@ def seek(self, pos: int) -> None: self.reset() return - def push(self, *objs: PSStackEntry) -> None: + def push(self, *objs: PSStackEntry[ExtraT]) -> None: self.curstack.extend(objs) return - def pop(self, n: int) -> List[PSStackEntry]: + def pop(self, n: int) -> List[PSStackEntry[ExtraT]]: objs = self.curstack[-n:] self.curstack[-n:] = [] return objs - def popall(self) -> List[PSStackEntry]: + def popall(self) -> List[PSStackEntry[ExtraT]]: objs = self.curstack self.curstack = [] return objs - def add_results(self, *objs: PSStackEntry) -> None: + def add_results(self, *objs: PSStackEntry[ExtraT]) -> None: try: log.debug('add_results: %r', objs) except Exception: @@ -558,7 +567,7 @@ def start_type(self, pos: int, type: str) -> None: log.debug('start_type: pos=%r, type=%r', pos, type) return - def
end_type(self, type: str) -> Tuple[int, List[PSParserToken]]: + def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]: if self.curtype != type: raise PSTypeError('Type mismatch: {!r} != {!r}' .format(self.curtype, type)) @@ -570,7 +579,7 @@ def end_type(self, type: str) -> Tuple[int, List[PSParserToken]]: def do_keyword(self, pos: int, token: PSKeyword) -> None: return - def nextobject(self) -> PSStackEntry: + def nextobject(self) -> PSStackEntry[ExtraT]: """Yields a list of objects. Arrays and dictionaries are represented as Python lists and diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 25f09728..0173fe97 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -128,7 +128,11 @@ def apply_png_predictor(pred: Any, colors: int, columns: int, Point = Tuple[float, float] Rect = Tuple[float, float, float, float] Matrix = Tuple[float, float, float, float, float, float] -PathSegment = Tuple[str, float, float] +PathSegment = Union[ + Tuple[str], # Literal['h'] + Tuple[str, float, float], # Literal['m', 'l'] + Tuple[str, float, float, float, float], # Literal['v', 'y'] + Tuple[str, float, float, float, float, float, float]] # Literal['c'] # Matrix operations MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)