diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 27ec0190..28163378 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -42,8 +42,12 @@ class PDFLayoutAnalyzer(PDFTextDevice): cur_item: LTLayoutContainer ctm: Matrix - def __init__(self, rsrcmgr: PDFResourceManager, pageno: int = 1, - laparams: Optional[LAParams] = None) -> None: + def __init__( + self, + rsrcmgr: PDFResourceManager, + pageno: int = 1, + laparams: Optional[LAParams] = None + ) -> None: PDFTextDevice.__init__(self, rsrcmgr) self.pageno = pageno self.laparams = laparams @@ -87,8 +91,14 @@ def render_image(self, name: str, stream: PDFStream) -> None: self.cur_item.add(item) return - def paint_path(self, gstate: PDFGraphicState, stroke: bool, fill: bool, - evenodd: bool, path: Sequence[PathSegment]) -> None: + def paint_path( + self, + gstate: PDFGraphicState, + stroke: bool, + fill: bool, + evenodd: bool, + path: Sequence[PathSegment] + ) -> None: """Paint paths described in section 4.4 of the PDF reference manual""" shape = ''.join(x[0] for x in path) @@ -140,9 +150,17 @@ def paint_path(self, gstate: PDFGraphicState, stroke: bool, fill: bool, gstate.scolor, gstate.ncolor) self.cur_item.add(curve) - def render_char(self, matrix: Matrix, font: PDFFont, fontsize: float, - scaling: float, rise: float, cid: int, ncs: PDFColorSpace, - graphicstate: PDFGraphicState) -> float: + def render_char( + self, + matrix: Matrix, + font: PDFFont, + fontsize: float, + scaling: float, + rise: float, + cid: int, + ncs: PDFColorSpace, + graphicstate: PDFGraphicState + ) -> float: try: text = font.to_unichr(cid) assert isinstance(text, str), str(type(text)) @@ -164,10 +182,12 @@ def receive_layout(self, ltpage: LTPage) -> None: class PDFPageAggregator(PDFLayoutAnalyzer): - def __init__(self, - rsrcmgr: PDFResourceManager, - pageno: int = 1, - laparams: Optional[LAParams] = None) -> None: + def __init__( + self, + rsrcmgr: PDFResourceManager, + pageno: int = 1, + laparams: Optional[LAParams] = None + ) -> None: PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) self.result: Optional[LTPage] = None @@ -187,12 +207,14 @@ def get_result(self) -> LTPage: class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]): - def __init__(self, - rsrcmgr: PDFResourceManager, - outfp: IOType, - codec: str = 'utf-8', - pageno: int = 1, - laparams: Optional[LAParams] = None) -> None: + def __init__( + self, + rsrcmgr: PDFResourceManager, + outfp: IOType, + codec: str = 'utf-8', + pageno: int = 1, + laparams: Optional[LAParams] = None + ) -> None: PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) self.outfp: IOType = outfp @@ -216,14 +238,16 @@ def _is_binary_stream(outfp: AnyIO) -> bool: class TextConverter(PDFConverter[AnyIO]): - def __init__(self, - rsrcmgr: PDFResourceManager, - outfp: AnyIO, - codec: str = 'utf-8', - pageno: int = 1, - laparams: Optional[LAParams] = None, - showpageno: bool = False, - imagewriter: Optional[ImageWriter] = None) -> None: + def __init__( + self, + rsrcmgr: PDFResourceManager, + outfp: AnyIO, + codec: str = 'utf-8', + pageno: int = 1, + laparams: Optional[LAParams] = None, + showpageno: bool = False, + imagewriter: Optional[ImageWriter] = None + ) -> None: super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) self.showpageno = showpageno @@ -265,8 +289,14 @@ def render_image(self, name: str, stream: PDFStream) -> None: PDFConverter.render_image(self, name, stream) return - def paint_path(self, gstate: PDFGraphicState, stroke: bool, fill: bool, - evenodd: bool, path: Sequence[PathSegment]) -> None: + def paint_path( + self, + gstate: PDFGraphicState, + stroke: bool, + fill: bool, + evenodd: bool, + path: Sequence[PathSegment] + ) -> None: return @@ -285,21 +315,23 @@ class HTMLConverter(PDFConverter[AnyIO]): 'char': 'black', } - def __init__(self, - rsrcmgr: PDFResourceManager, - outfp: AnyIO, - codec: str = 'utf-8', - pageno: int = 1, - laparams: Optional[LAParams] = None, - scale: float = 1, - fontscale: float = 1.0, - layoutmode: str = 'normal', - showpageno: bool = True, - pagemargin: int = 50, - imagewriter: Optional[ImageWriter] = None, - debug: int = 0, - rect_colors: Optional[Dict[str, str]] = None, - text_colors: Optional[Dict[str, str]] = None) -> None: + def __init__( + self, + rsrcmgr: PDFResourceManager, + outfp: AnyIO, + codec: str = 'utf-8', + pageno: int = 1, + laparams: Optional[LAParams] = None, + scale: float = 1, + fontscale: float = 1.0, + layoutmode: str = 'normal', + showpageno: bool = True, + pagemargin: int = 50, + imagewriter: Optional[ImageWriter] = None, + debug: int = 0, + rect_colors: Optional[Dict[str, str]] = None, + text_colors: Optional[Dict[str, str]] = None + ) -> None: PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) @@ -360,8 +392,15 @@ def write_text(self, text: str) -> None: self.write(enc(text)) return - def place_rect(self, color: str, borderwidth: int, x: float, y: float, - w: float, h: float) -> None: + def place_rect( + self, + color: str, + borderwidth: int, + x: float, + y: float, + w: float, + h: float + ) -> None: color2 = self.rect_colors.get(color) if color2 is not None: s = '\n') return - def begin_div(self, color: str, borderwidth: int, x: float, y: float, - w: float, h: float, writing_mode: str = 'False') -> None: + def begin_div( + self, + color: str, + borderwidth: int, + x: float, + y: float, + w: float, + h: float, + writing_mode: str = 'False' + ) -> None: self._fontstack.append(self._font) self._font = None s = '
None: +def extract_text_to_fp( + inf: BinaryIO, + outfp: AnyIO, + output_type: str = 'text', + codec: str = 'utf-8', + laparams: Optional[LAParams] = None, + maxpages: int = 0, + page_numbers: Optional[Container[int]] = None, + password: str = "", + scale: float = 1.0, + rotation: int = 0, + layoutmode: str = 'normal', + output_dir: Optional[str] = None, + strip_control: bool = False, + debug: bool = False, + disable_caching: bool = False, + **kwargs: Any +) -> None: """Parses text from inf-file and writes to outfp file-like object. Takes loads of optional arguments but the defaults are somewhat sane. @@ -109,13 +111,15 @@ def extract_text_to_fp(inf: BinaryIO, device.close() -def extract_text(pdf_file: FileOrName, - password: str = '', - page_numbers: Optional[Container[int]] = None, - maxpages: int = 0, - caching: bool = True, - codec: str = 'utf-8', - laparams: Optional[LAParams] = None) -> str: +def extract_text( + pdf_file: FileOrName, + password: str = '', + page_numbers: Optional[Container[int]] = None, + maxpages: int = 0, + caching: bool = True, + codec: str = 'utf-8', + laparams: Optional[LAParams] = None +) -> str: """Parse and return the text contained in a PDF file. :param pdf_file: Either a file path or a file-like object for the PDF file @@ -151,12 +155,14 @@ def extract_text(pdf_file: FileOrName, return output_string.getvalue() -def extract_pages(pdf_file: FileOrName, - password: str = '', - page_numbers: Optional[Container[int]] = None, - maxpages: int = 0, - caching: bool = True, - laparams: Optional[LAParams] = None) -> Iterator[LTPage]: +def extract_pages( + pdf_file: FileOrName, + password: str = '', + page_numbers: Optional[Container[int]] = None, + maxpages: int = 0, + caching: bool = True, + laparams: Optional[LAParams] = None +) -> Iterator[LTPage]: """Extract and yield LTPage objects :param pdf_file: Either a file path or a file-like object for the PDF file diff --git a/pdfminer/image.py b/pdfminer/image.py index 45f1de68..83f9a7aa 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -17,8 +17,13 @@ def align32(x: int) -> int: class BMPWriter: - def __init__(self, fp: BinaryIO, bits: int, width: int, height: int - ) -> None: + def __init__( + self, + fp: BinaryIO, + bits: int, + width: int, + height: int + ) -> None: self.fp = fp self.bits = bits self.width = width @@ -142,8 +147,12 @@ def is_jbig2_image(image: LTImage) -> bool: return is_jbig2 @staticmethod - def _get_image_extension(image: LTImage, width: int, height: int, - is_jbig2: bool) -> str: + def _get_image_extension( + image: LTImage, + width: int, + height: int, + is_jbig2: bool + ) -> str: filters = image.stream.get_filters() if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: ext = '.jpg' @@ -159,8 +168,11 @@ def _get_image_extension(image: LTImage, width: int, height: int, return ext @staticmethod - def _create_unique_image_name(dirname: str, image_name: str, ext: str - ) -> Tuple[str, str]: + def _create_unique_image_name( + dirname: str, + image_name: str, + ext: str + ) -> Tuple[str, str]: name = image_name + ext path = os.path.join(dirname, name) img_index = 0 diff --git a/pdfminer/jbig2.py b/pdfminer/jbig2.py index 7af31b68..10ee7e6f 100644 --- a/pdfminer/jbig2.py +++ b/pdfminer/jbig2.py @@ -103,16 +103,24 @@ def is_eof(self) -> bool: self.stream.seek(-1, os.SEEK_CUR) return False - def parse_flags(self, segment: JBIG2Segment, flags: int, field: bytes - ) -> JBIG2SegmentFlags: + def parse_flags( + self, + segment: JBIG2Segment, + flags: int, + field: bytes + ) -> JBIG2SegmentFlags: return { "deferred": check_flag(HEADER_FLAG_DEFERRED, flags), "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags), "type": masked_value(SEG_TYPE_MASK, flags) } - def parse_retention_flags(self, segment: JBIG2Segment, flags: int, - field: bytes) -> JBIG2RetentionFlags: + def parse_retention_flags( + self, + segment: JBIG2Segment, + flags: int, + field: bytes + ) -> JBIG2RetentionFlags: ref_count = masked_value(REF_COUNT_SHORT_MASK, flags) retain_segments = [] ref_segments = [] @@ -152,15 +160,23 @@ def parse_retention_flags(self, segment: JBIG2Segment, flags: int, "ref_segments": ref_segments, } - def parse_page_assoc(self, segment: JBIG2Segment, page: int, field: bytes - ) -> int: + def parse_page_assoc( + self, + segment: JBIG2Segment, + page: int, + field: bytes + ) -> int: if cast(JBIG2SegmentFlags, segment["flags"])["page_assoc_long"]: field += self.stream.read(3) page = unpack_int(">L", field) return page - def parse_data_length(self, segment: JBIG2Segment, length: int, - field: bytes) -> int: + def parse_data_length( + self, + segment: JBIG2Segment, + length: int, + field: bytes + ) -> int: if length: if (cast(JBIG2SegmentFlags, segment["flags"])["type"] == SEG_TYPE_IMMEDIATE_GEN_REGION) \ @@ -188,8 +204,11 @@ class JBIG2StreamWriter: def __init__(self, stream: BinaryIO) -> None: self.stream = stream - def write_segments(self, segments: Iterable[JBIG2Segment], - fix_last_page: bool = True) -> int: + def write_segments( + self, + segments: Iterable[JBIG2Segment], + fix_last_page: bool = True + ) -> int: data_len = 0 current_page: Optional[int] = None seg_num: Optional[int] = None @@ -218,8 +237,11 @@ def write_segments(self, segments: Iterable[JBIG2Segment], return data_len - def write_file(self, segments: Iterable[JBIG2Segment], - fix_last_page: bool = True) -> int: + def write_file( + self, + segments: Iterable[JBIG2Segment], + fix_last_page: bool = True + ) -> int: header = FILE_HEADER_ID header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN header += pack(">B", header_flags) @@ -269,8 +291,11 @@ def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment return pack(">B", flags) - def encode_retention_flags(self, value: JBIG2RetentionFlags, - segment: JBIG2Segment) -> bytes: + def encode_retention_flags( + self, + value: JBIG2RetentionFlags, + segment: JBIG2Segment + ) -> bytes: flags = [] flags_format = ">B" ref_count = value["ref_count"] @@ -320,8 +345,11 @@ def encode_data_length(self, value: int, segment: JBIG2Segment) -> bytes: data += cast(bytes, segment["raw_data"]) return data - def get_eop_segment(self, seg_number: int, page_number: int - ) -> JBIG2Segment: + def get_eop_segment( + self, + seg_number: int, + page_number: int + ) -> JBIG2Segment: return { 'data_length': 0, 'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_PAGE}, diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 618fcddd..b9f3d105 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -68,14 +68,16 @@ class LAParams: figures. """ - def __init__(self, - line_overlap: float = 0.5, - char_margin: float = 2.0, - line_margin: float = 0.5, - word_margin: float = 0.1, - boxes_flow: Optional[float] = 0.5, - detect_vertical: bool = False, - all_texts: bool = False) -> None: + def __init__( + self, + line_overlap: float = 0.5, + char_margin: float = 2.0, + line_margin: float = 0.5, + word_margin: float = 0.1, + boxes_flow: Optional[float] = 0.5, + detect_vertical: bool = False, + all_texts: bool = False + ) -> None: self.line_overlap = line_overlap self.char_margin = char_margin self.line_margin = line_margin @@ -203,10 +205,16 @@ def voverlap(self, obj: "LTComponent") -> float: class LTCurve(LTComponent): """A generic Bezier curve""" - def __init__(self, linewidth: float, pts: List[Point], - stroke: bool = False, fill: bool = False, - evenodd: bool = False, stroking_color: Optional[Color] = None, - non_stroking_color: Optional[Color] = None) -> None: + def __init__( + self, + linewidth: float, + pts: List[Point], + stroke: bool = False, + fill: bool = False, + evenodd: bool = False, + stroking_color: Optional[Color] = None, + non_stroking_color: Optional[Color] = None + ) -> None: LTComponent.__init__(self, get_bound(pts)) self.pts = pts self.linewidth = linewidth @@ -227,10 +235,17 @@ class LTLine(LTCurve): Could be used for separating text or figures. """ - def __init__(self, linewidth: float, p0: Point, p1: Point, - stroke: bool = False, fill: bool = False, - evenodd: bool = False, stroking_color: Optional[Color] = None, - non_stroking_color: Optional[Color] = None) -> None: + def __init__( + self, + linewidth: float, + p0: Point, + p1: Point, + stroke: bool = False, + fill: bool = False, + evenodd: bool = False, + stroking_color: Optional[Color] = None, + non_stroking_color: Optional[Color] = None + ) -> None: LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd, stroking_color, non_stroking_color) return @@ -242,10 +257,16 @@ class LTRect(LTCurve): Could be used for framing another pictures or figures. """ - def __init__(self, linewidth: float, bbox: Rect, - stroke: bool = False, fill: bool = False, - evenodd: bool = False, stroking_color: Optional[Color] = None, - non_stroking_color: Optional[Color] = None) -> None: + def __init__( + self, + linewidth: float, + bbox: Rect, + stroke: bool = False, + fill: bool = False, + evenodd: bool = False, + stroking_color: Optional[Color] = None, + non_stroking_color: Optional[Color] = None + ) -> None: (x0, y0, x1, y1) = bbox LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)], stroke, @@ -297,10 +318,19 @@ def get_text(self) -> str: class LTChar(LTComponent, LTText): """Actual letter in the text as a Unicode string.""" - def __init__(self, matrix: Matrix, font: PDFFont, fontsize: float, - scaling: float, rise: float, text: str, textwidth: float, - textdisp: Union[float, Tuple[Optional[float], float]], - ncs: PDFColorSpace, graphicstate: PDFGraphicState) -> None: + def __init__( + self, + matrix: Matrix, + font: PDFFont, + fontsize: float, + scaling: float, + rise: float, + text: str, + textwidth: float, + textdisp: Union[float, Tuple[Optional[float], float]], + ncs: PDFColorSpace, + graphicstate: PDFGraphicState + ) -> None: LTText.__init__(self) self._text = text self.matrix = matrix @@ -458,8 +488,11 @@ def add(self, obj: LTComponent) -> None: # type: ignore[override] super().add(obj) return - def find_neighbors(self, plane: Plane[LTComponentT], ratio: float - ) -> List[LTTextLine]: + def find_neighbors( + self, + plane: Plane[LTComponentT], + ratio: float + ) -> List[LTTextLine]: """ Finds neighboring LTTextLineHorizontals in the plane. @@ -477,30 +510,42 @@ def find_neighbors(self, plane: Plane[LTComponentT], ratio: float self._is_right_aligned_with(obj, tolerance=d) or self._is_centrally_aligned_with(obj, tolerance=d)))] - def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0 - ) -> bool: + def _is_left_aligned_with( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: """ Whether the left-hand edge of `other` is within `tolerance`. """ return abs(other.x0 - self.x0) <= tolerance - def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0 - ) -> bool: + def _is_right_aligned_with( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: """ Whether the right-hand edge of `other` is within `tolerance`. """ return abs(other.x1 - self.x1) <= tolerance - def _is_centrally_aligned_with(self, other: LTComponent, - tolerance: float = 0) -> bool: + def _is_centrally_aligned_with( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: """ Whether the horizontal center of `other` is within `tolerance`. """ return abs( (other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance - def _is_same_height_as(self, other: LTComponent, tolerance: float = 0 - ) -> bool: + def _is_same_height_as( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: return abs(other.height - self.height) <= tolerance @@ -521,8 +566,11 @@ def add(self, obj: LTComponent) -> None: # type: ignore[override] super().add(obj) return - def find_neighbors(self, plane: Plane[LTComponentT], ratio: float - ) -> List[LTTextLine]: + def find_neighbors( + self, + plane: Plane[LTComponentT], + ratio: float + ) -> List[LTTextLine]: """ Finds neighboring LTTextLineVerticals in the plane. @@ -540,22 +588,31 @@ def find_neighbors(self, plane: Plane[LTComponentT], ratio: float self._is_upper_aligned_with(obj, tolerance=d) or self._is_centrally_aligned_with(obj, tolerance=d)))] - def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0 - ) -> bool: + def _is_lower_aligned_with( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: """ Whether the lower edge of `other` is within `tolerance`. """ return abs(other.y0 - self.y0) <= tolerance - def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0 - ) -> bool: + def _is_upper_aligned_with( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: """ Whether the upper edge of `other` is within `tolerance`. """ return abs(other.y1 - self.y1) <= tolerance - def _is_centrally_aligned_with(self, other: LTComponent, - tolerance: float = 0) -> bool: + def _is_centrally_aligned_with( + self, + other: LTComponent, + tolerance: float = 0 + ) -> bool: """ Whether the vertical center of `other` is within `tolerance`. """ @@ -649,8 +706,11 @@ def __init__(self, bbox: Rect) -> None: return # group_objects: group text object to textlines. - def group_objects(self, laparams: LAParams, objs: Iterable[LTComponent] - ) -> Iterator[LTTextLine]: + def group_objects( + self, + laparams: LAParams, + objs: Iterable[LTComponent] + ) -> Iterator[LTTextLine]: obj0 = None line = None for obj1 in objs: @@ -725,8 +785,11 @@ def group_objects(self, laparams: LAParams, objs: Iterable[LTComponent] yield line return - def group_textlines(self, laparams: LAParams, lines: Iterable[LTTextLine] - ) -> Iterator[LTTextBox]: + def group_textlines( + self, + laparams: LAParams, + lines: Iterable[LTTextLine] + ) -> Iterator[LTTextBox]: """Group neighboring lines to textboxes""" plane: Plane[LTTextLine] = Plane(self.bbox) plane.extend(lines) @@ -757,8 +820,11 @@ def group_textlines(self, laparams: LAParams, lines: Iterable[LTTextLine] yield box return - def group_textboxes(self, laparams: LAParams, boxes: Sequence[LTTextBox] - ) -> List[LTTextGroup]: + def group_textboxes( + self, + laparams: LAParams, + boxes: Sequence[LTTextBox] + ) -> List[LTTextGroup]: """Group textboxes hierarchically. Get pair-wise distances, via dist func defined below, and then merge diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index c5b998b6..0a370633 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -34,8 +34,12 @@ def __repr__(self) -> str: def __enter__(self) -> "PDFDevice": return self - def __exit__(self, exc_type: object, exc_val: object, exc_tb: object - ) -> None: + def __exit__( + self, + exc_type: object, + exc_val: object, + exc_tb: object + ) -> None: self.close() def close(self) -> None: @@ -45,15 +49,21 @@ def set_ctm(self, ctm: Matrix) -> None: self.ctm = ctm return - def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None - ) -> None: + def begin_tag( + self, + tag: PSLiteral, + props: Optional["PDFStackT"] = None + ) -> None: return def end_tag(self) -> None: return - def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None - ) -> None: + def do_tag( + self, + tag: PSLiteral, + props: Optional["PDFStackT"] = None + ) -> None: return def begin_page(self, page: PDFPage, ctm: Matrix) -> None: @@ -68,25 +78,38 @@ def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None: def end_figure(self, name: str) -> None: return - def paint_path(self, graphicstate: "PDFGraphicState", stroke: bool, - fill: bool, evenodd: bool, - path: Sequence[PathSegment]) -> None: + def paint_path( + self, + graphicstate: "PDFGraphicState", + stroke: bool, + fill: bool, + evenodd: bool, + path: Sequence[PathSegment] + ) -> None: return def render_image(self, name: str, stream: PDFStream) -> None: return - def render_string(self, textstate: "PDFTextState", seq: PDFTextSeq, - ncs: PDFColorSpace, graphicstate: "PDFGraphicState" - ) -> None: + def render_string( + self, + textstate: "PDFTextState", + seq: PDFTextSeq, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState" + ) -> None: return class PDFTextDevice(PDFDevice): - def render_string(self, textstate: "PDFTextState", seq: PDFTextSeq, - ncs: PDFColorSpace, graphicstate: "PDFGraphicState" - ) -> None: + def render_string( + self, + textstate: "PDFTextState", + seq: PDFTextSeq, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState" + ) -> None: assert self.ctm is not None matrix = utils.mult_matrix(textstate.matrix, self.ctm) font = textstate.font @@ -111,12 +134,21 @@ def render_string(self, textstate: "PDFTextState", seq: PDFTextSeq, graphicstate) return - def render_string_horizontal(self, seq: PDFTextSeq, matrix: Matrix, - pos: Point, font: PDFFont, fontsize: float, - scaling: float, charspace: float, - wordspace: float, rise: float, dxscale: float, - ncs: PDFColorSpace, - graphicstate: "PDFGraphicState") -> Point: + def render_string_horizontal( + self, + seq: PDFTextSeq, + matrix: Matrix, + pos: Point, + font: PDFFont, + fontsize: float, + scaling: float, + charspace: float, + wordspace: float, + rise: float, + dxscale: float, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState" + ) -> Point: (x, y) = pos needcharspace = False for obj in seq: @@ -135,12 +167,21 @@ def render_string_horizontal(self, seq: PDFTextSeq, matrix: Matrix, needcharspace = True return (x, y) - def render_string_vertical(self, seq: PDFTextSeq, matrix: Matrix, - pos: Point, font: PDFFont, fontsize: float, - scaling: float, charspace: float, - wordspace: float, rise: float, dxscale: float, - ncs: PDFColorSpace, - graphicstate: "PDFGraphicState") -> Point: + def render_string_vertical( + self, + seq: PDFTextSeq, + matrix: Matrix, + pos: Point, + font: PDFFont, + fontsize: float, + scaling: float, + charspace: float, + wordspace: float, + rise: float, + dxscale: float, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState" + ) -> Point: (x, y) = pos needcharspace = False for obj in seq: @@ -159,16 +200,28 @@ def render_string_vertical(self, seq: PDFTextSeq, matrix: Matrix, needcharspace = True return (x, y) - def render_char(self, matrix: Matrix, font: PDFFont, fontsize: float, - scaling: float, rise: float, cid: int, ncs: PDFColorSpace, - graphicstate: "PDFGraphicState") -> float: + def render_char( + self, + matrix: Matrix, + font: PDFFont, + fontsize: float, + scaling: float, + rise: float, + cid: int, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState" + ) -> float: return 0 class TagExtractor(PDFDevice): - def __init__(self, rsrcmgr: "PDFResourceManager", outfp: BinaryIO, - codec: str = 'utf-8') -> None: + def __init__( + self, + rsrcmgr: "PDFResourceManager", + outfp: BinaryIO, + codec: str = 'utf-8' + ) -> None: PDFDevice.__init__(self, rsrcmgr) self.outfp = outfp self.codec = codec @@ -176,9 +229,13 @@ def __init__(self, rsrcmgr: "PDFResourceManager", outfp: BinaryIO, self._stack: List[PSLiteral] = [] return - def render_string(self, textstate: "PDFTextState", seq: PDFTextSeq, - ncs: PDFColorSpace, graphicstate: "PDFGraphicState" - ) -> None: + def render_string( + self, + textstate: "PDFTextState", + seq: PDFTextSeq, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState" + ) -> None: font = textstate.font assert font is not None text = '' diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index c96d5bfc..bbeaf20e 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -304,8 +304,12 @@ class PDFStandardSecurityHandler: b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz') supported_revisions: Tuple[int, ...] = (2, 3) - def __init__(self, docid: Sequence[bytes], param: Dict[str, Any], - password: str = '') -> None: + def __init__( + self, + docid: Sequence[bytes], + param: Dict[str, Any], + password: str = '' + ) -> None: self.docid = docid self.param = param self.password = password @@ -419,8 +423,13 @@ def authenticate_owner_password(self, password: bytes) -> Optional[bytes]: user_password = Arcfour(k).decrypt(user_password) return self.authenticate_user_password(user_password) - def decrypt(self, objid: int, genno: int, data: bytes, - attrs: Optional[Dict[str, Any]] = None) -> bytes: + def decrypt( + self, + objid: int, + genno: int, + data: bytes, + attrs: Optional[Dict[str, Any]] = None + ) -> bytes: return self.decrypt_rc4(objid, genno, data) def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes: @@ -460,8 +469,10 @@ def init_params(self) -> None: raise PDFEncryptionError(error_msg) return - def get_cfm(self, name: str - ) -> Optional[Callable[[int, int, bytes], bytes]]: + def get_cfm( + self, + name: str + ) -> Optional[Callable[[int, int, bytes], bytes]]: if name == 'V2': return self.decrypt_rc4 elif name == 'AESV2': @@ -469,9 +480,14 @@ def get_cfm(self, name: str else: return None - def decrypt(self, objid: int, genno: int, data: bytes, - attrs: Optional[Dict[str, Any]] = None, - name: Optional[str] = None) -> bytes: + def decrypt( + self, + objid: int, + genno: int, + data: bytes, + attrs: Optional[Dict[str, Any]] = None, + name: Optional[str] = None + ) -> bytes: if not self.encrypt_metadata and attrs is not None: t = attrs.get('Type') if t is not None and literal_name(t) == 'Metadata': @@ -514,8 +530,10 @@ def init_params(self) -> None: self.u_key_salt = self.u[40:] return - def get_cfm(self, name: str - ) -> Optional[Callable[[int, int, bytes], bytes]]: + def get_cfm( + self, + name: str + ) -> Optional[Callable[[int, int, bytes], bytes]]: if name == 'AESV3': return self.decrypt_aes256 else: @@ -575,8 +593,13 @@ class PDFDocument: 5: PDFStandardSecurityHandlerV5, } - def __init__(self, parser: PDFParser, password: str = '', - caching: bool = True, fallback: bool = True) -> None: + def __init__( + self, + parser: PDFParser, + password: str = '', + caching: bool = True, + fallback: bool = True + ) -> None: "Set the document to use a given PDFParser object." self.caching = caching self.xrefs: List[PDFBaseXRef] = [] @@ -653,8 +676,12 @@ def _initialize_password(self, password: str = '') -> None: self._parser.fallback = False # need to read streams with exact length return - def _getobj_objstm(self, stream: PDFStream, index: int, objid: int - ) -> object: + def _getobj_objstm( + self, + stream: PDFStream, + index: int, + objid: int + ) -> object: if stream.objid in self._parsed_objs: (objs, n) = self._parsed_objs[stream.objid] else: @@ -781,7 +808,11 @@ def search(entry: object, level: int return return search(self.catalog['Outlines'], 0) - def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any: + def lookup_name( + self, + cat: str, + key: Union[str, bytes] + ) -> Any: try: names = dict_value(self.catalog['Names']) except (PDFTypeError, KeyError): @@ -840,8 +871,12 @@ def find_xref(self, parser: PDFParser) -> int: return int(prev) # read xref table - def read_xref_from(self, parser: PDFParser, start: int, - xrefs: List[PDFBaseXRef]) -> None: + def read_xref_from( + self, + parser: PDFParser, + start: int, + xrefs: List[PDFBaseXRef] + ) -> None: """Reads XRefs from the given location.""" parser.seek(start) parser.reset() diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index b9f8f500..df0813d5 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -525,8 +525,12 @@ class PDFUnicodeNotDefined(PDFFontError): class PDFFont: - def __init__(self, descriptor: Mapping[str, Any], widths: FontWidthDict, - default_width: Optional[float] = None) -> None: + def __init__( + self, + descriptor: Mapping[str, Any], + widths: FontWidthDict, + default_width: Optional[float] = None + ) -> None: self.descriptor = descriptor self.widths: FontWidthDict = resolve_all(widths) self.fontname = resolve1(descriptor.get('FontName', 'unknown')) @@ -597,8 +601,10 @@ def char_width(self, cid: int) -> float: except (KeyError, PDFUnicodeNotDefined): return self.default_width * self.hscale - def char_disp(self, cid: int - ) -> Union[float, Tuple[Optional[float], float]]: + def char_disp( + self, + cid: int + ) -> Union[float, Tuple[Optional[float], float]]: "Returns an integer for horizontal fonts, a tuple for vertical fonts." return 0 @@ -611,8 +617,12 @@ def to_unichr(self, cid: int) -> str: class PDFSimpleFont(PDFFont): - def __init__(self, descriptor: Mapping[str, Any], widths: FontWidthDict, - spec: Mapping[str, Any]) -> None: + def __init__( + self, + descriptor: Mapping[str, Any], + widths: FontWidthDict, + spec: Mapping[str, Any] + ) -> None: # Font encoding is specified either by a name of # built-in encoding or a dictionary that describes # the differences. @@ -649,8 +659,11 @@ def to_unichr(self, cid: int) -> str: class PDFType1Font(PDFSimpleFont): - def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any] - ) -> None: + def __init__( + self, + rsrcmgr: "PDFResourceManager", + spec: Mapping[str, Any] + ) -> None: try: self.basefont = literal_name(spec['BaseFont']) except KeyError: @@ -690,8 +703,11 @@ def __repr__(self) -> str: class PDFType3Font(PDFSimpleFont): - def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any] - ) -> None: + def __init__( + self, + rsrcmgr: "PDFResourceManager", + spec: Mapping[str, Any] + ) -> None: firstchar = int_value(spec.get('FirstChar', 0)) # lastchar = int_value(spec.get('LastChar', 0)) width_list = list_value(spec.get('Widths', [0]*256)) @@ -714,8 +730,12 @@ def __repr__(self) -> str: class PDFCIDFont(PDFFont): default_disp: Union[float, Tuple[Optional[float], float]] - def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any], - strict: bool = settings.STRICT) -> None: + def __init__( + self, + rsrcmgr: "PDFResourceManager", + spec: Mapping[str, Any], + strict: bool = settings.STRICT + ) -> None: try: self.basefont = literal_name(spec['BaseFont']) except KeyError: @@ -778,8 +798,11 @@ def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any], PDFFont.__init__(self, descriptor, widths, default_width=default_width) return - def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool - ) -> CMapBase: + def get_cmap_from_spec( + self, + spec: Mapping[str, Any], + strict: bool + ) -> CMapBase: """Get cmap from font specification For certain PDFs, Encoding Type isn't mentioned as an attribute of @@ -835,8 +858,10 @@ def is_multibyte(self) -> bool: def decode(self, bytes: bytes) -> Iterable[int]: return self.cmap.decode(bytes) - def char_disp(self, cid: int - ) -> Union[float, Tuple[Optional[float], float]]: + def char_disp( + self, + cid: int + ) -> Union[float, Tuple[Optional[float], float]]: "Returns an integer for horizontal fonts, a tuple for vertical fonts." return self.disps.get(cid, self.default_disp) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 4056a1e4..6387b42b 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -264,8 +264,11 @@ def fillbuf(self) -> None: self.charpos = 0 return - def get_inline_data(self, pos: int, target: bytes = b'EI' - ) -> Tuple[int, bytes]: + def get_inline_data( + self, + pos: int, + target: bytes = b'EI' + ) -> Tuple[int, bytes]: self.seek(pos) i = 0 data = b'' @@ -418,12 +421,15 @@ def pop(self, n: int) -> List[PDFStackT]: self.argstack = self.argstack[:-n] return x - def get_current_state(self) -> Tuple[Matrix, PDFTextState, - PDFGraphicState]: + def get_current_state( + self + ) -> Tuple[Matrix, PDFTextState, PDFGraphicState]: return (self.ctm, self.textstate.copy(), self.graphicstate.copy()) - def set_current_state(self, state: Tuple[Matrix, PDFTextState, - PDFGraphicState]) -> None: + def set_current_state( + self, + state: Tuple[Matrix, PDFTextState, PDFGraphicState] + ) -> None: (self.ctm, self.textstate, self.graphicstate) = state self.device.set_ctm(self.ctm) return @@ -439,8 +445,15 @@ def do_Q(self) -> None: self.set_current_state(self.gstack.pop()) return - def do_cm(self, a1: PDFStackT, b1: PDFStackT, c1: PDFStackT, d1: PDFStackT, - e1: PDFStackT, f1: PDFStackT) -> None: + def do_cm( + self, + a1: PDFStackT, + b1: PDFStackT, + c1: PDFStackT, + d1: PDFStackT, + e1: PDFStackT, + f1: PDFStackT + ) -> None: """Concatenate matrix to current transformation matrix""" self.ctm = \ mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm) @@ -497,23 +510,40 @@ def do_l(self, x: PDFStackT, y: PDFStackT) -> None: self.curpath.append(('l', cast(float, x), cast(float, y))) return - def do_c(self, x1: PDFStackT, y1: PDFStackT, x2: PDFStackT, - y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None: + def do_c( + self, + x1: PDFStackT, + y1: PDFStackT, + x2: PDFStackT, + y2: PDFStackT, + x3: PDFStackT, + y3: PDFStackT + ) -> None: """Append curved segment to path (three control points)""" self.curpath.append(('c', cast(float, x1), cast(float, y1), cast(float, x2), cast(float, y2), cast(float, x3), cast(float, y3))) return - def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, - y3: PDFStackT) -> None: + def do_v( + self, + x2: PDFStackT, + y2: PDFStackT, + x3: PDFStackT, + y3: PDFStackT + ) -> None: """Append curved segment to path (initial point replicated)""" self.curpath.append(('v', cast(float, x2), cast(float, y2), cast(float, x3), cast(float, y3))) return - def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, - y3: PDFStackT) -> None: + def do_y( + self, + x1: PDFStackT, + y1: PDFStackT, + x3: PDFStackT, + y3: PDFStackT + ) -> None: """Append curved segment to path (final point replicated)""" self.curpath.append(('y', cast(float, x1), cast(float, y1), cast(float, x3), cast(float, y3))) @@ -524,8 +554,13 @@ def do_h(self) -> None: self.curpath.append(('h',)) return - def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, - h: PDFStackT) -> None: + def do_re( + self, + x: PDFStackT, + y: PDFStackT, + w: PDFStackT, + h: PDFStackT + ) -> None: """Append rectangle to path""" x = cast(float, x) y = cast(float, y) @@ -651,15 +686,25 @@ def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None: (cast(float, r), cast(float, g), cast(float, b)) return - def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT - ) -> None: + def do_K( + self, + c: PDFStackT, + m: PDFStackT, + y: PDFStackT, + k: PDFStackT + ) -> None: """Set CMYK color for stroking operations""" self.graphicstate.scolor = \ (cast(float, c), cast(float, m), cast(float, y), cast(float, k)) return - def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT - ) -> None: + def do_k( + self, + c: PDFStackT, + m: PDFStackT, + y: PDFStackT, + k: PDFStackT + ) -> None: """Set CMYK color for nonstroking operations""" self.graphicstate.ncolor = \ (cast(float, c), cast(float, m), cast(float, y), cast(float, k)) @@ -834,8 +879,15 @@ def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None: self.textstate.linematrix = (0, 0) return - def do_Tm(self, a: PDFStackT, b: PDFStackT, c: PDFStackT, d: PDFStackT, - e: PDFStackT, f: PDFStackT) -> None: + def do_Tm( + self, + a: PDFStackT, + b: PDFStackT, + c: PDFStackT, + d: PDFStackT, + e: PDFStackT, + f: PDFStackT + ) -> None: """Set text matrix and text line matrix""" self.textstate.matrix = cast(Matrix, (a, b, c, d, e, f)) self.textstate.linematrix = (0, 0) @@ -954,9 +1006,12 @@ def process_page(self, page: PDFPage) -> None: self.device.end_page(page) return - def render_contents(self, resources: Dict[object, object], - streams: Sequence[object], - ctm: Matrix = MATRIX_IDENTITY) -> None: + def render_contents( + self, + resources: Dict[object, object], + streams: Sequence[object], + ctm: Matrix = MATRIX_IDENTITY + ) -> None: """Render the content streams. This method may be called recursively. diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index 9d08bc5a..8380c239 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -42,8 +42,12 @@ class PDFPage: beads: a chain that represents natural reading order. """ - def __init__(self, doc: PDFDocument, pageid: object, attrs: object - ) -> None: + def __init__( + self, + doc: PDFDocument, + pageid: object, + attrs: object + ) -> None: """Initialize a page object. doc: a PDFDocument object. @@ -81,9 +85,10 @@ def __repr__(self) -> str: @classmethod def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]: - def search(obj: object, parent: Dict[str, object] - ) -> Iterator[Tuple[int, - Dict[object, Dict[object, object]]]]: + def search( + obj: object, + parent: Dict[str, object] + ) -> Iterator[Tuple[int, Dict[object, Dict[object, object]]]]: if isinstance(obj, int): objid = obj tree = dict_value(document.getobj(objid)).copy() @@ -127,10 +132,15 @@ def search(obj: object, parent: Dict[str, object] return @classmethod - def get_pages(cls, fp: BinaryIO, - pagenos: Optional[Container[int]] = None, maxpages: int = 0, - password: str = '', caching: bool = True, - check_extractable: bool = False) -> Iterator["PDFPage"]: + def get_pages( + cls, + fp: BinaryIO, + pagenos: Optional[Container[int]] = None, + maxpages: int = 0, + password: str = '', + caching: bool = True, + check_extractable: bool = False + ) -> Iterator["PDFPage"]: # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 8a8baf86..6190ea99 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -74,8 +74,12 @@ class PDFNotImplementedError(PDFException): class PDFObjRef(PDFObject): - def __init__(self, doc: Optional["PDFDocument"], objid: int, _: object - ) -> None: + def __init__( + self, + doc: Optional["PDFDocument"], + objid: int, + _: object + ) -> None: if objid == 0: if settings.STRICT: raise PDFValueError('PDF object id cannot be 0.') @@ -121,8 +125,12 @@ def resolve_all(x: object, default: object = None) -> Any: return x -def decipher_all(decipher: DecipherCallable, objid: int, genno: int, x: object - ) -> Any: +def decipher_all( + decipher: DecipherCallable, + objid: int, + genno: int, + x: object +) -> Any: """Recursively deciphers the given object. """ if isinstance(x, bytes): @@ -210,8 +218,12 @@ def stream_value(x: object) -> "PDFStream": class PDFStream(PDFObject): - def __init__(self, attrs: Dict[str, Any], rawdata: bytes, decipher: - Optional[DecipherCallable] = None): + def __init__( + self, + attrs: Dict[str, Any], + rawdata: bytes, + decipher: Optional[DecipherCallable] = None + ) -> None: assert isinstance(attrs, dict), str(type(attrs)) self.attrs = attrs self.rawdata: Optional[bytes] = rawdata diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 3f21857a..a5cf0334 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -30,8 +30,12 @@ class open_filename(object): (str or pathlib.PurePath type is supported) and closes it on exit, (just like `open`), but does nothing for file-like objects. """ - def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any - ) -> None: + def __init__( + self, + filename: FileOrName, + *args: Any, + **kwargs: Any + ) -> None: if isinstance(filename, pathlib.PurePath): filename = str(filename) if isinstance(filename, str): @@ -46,8 +50,12 @@ def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any def __enter__(self) -> AnyIO: return self.file_handler - def __exit__(self, exc_type: object, exc_val: object, exc_tb: object - ) -> Literal[False]: + def __exit__( + self, + exc_type: object, + exc_val: object, + exc_tb: object + ) -> Literal[False]: if self.closing: self.file_handler.close() return False @@ -78,9 +86,11 @@ def shorten_str(s: str, size: int) -> str: return s -def compatible_encode_method(bytesorstring: Union[bytes, str], - encoding: str = 'utf-8', - erraction: str = 'ignore') -> str: +def compatible_encode_method( + bytesorstring: Union[bytes, str], + encoding: str = 'utf-8', + erraction: str = 'ignore' +) -> str: """When Py2 str.encode is called, it often means bytes.encode in Py3. This does either. @@ -109,8 +119,13 @@ def paeth_predictor(left: int, above: int, upper_left: int) -> int: return upper_left -def apply_png_predictor(pred: int, colors: int, columns: int, - bitspercomponent: int, data: bytes) -> bytes: +def apply_png_predictor( + pred: int, + colors: int, + columns: int, + bitspercomponent: int, + data: bytes +) -> bytes: """Reverse the effect of the PNG predictor Documentation: http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html @@ -268,8 +283,10 @@ def uniq(objs: Iterable[_T]) -> Iterator[_T]: return -def fsplit(pred: Callable[[_T], bool], objs: Iterable[_T] - ) -> Tuple[List[_T], List[_T]]: +def fsplit( + pred: Callable[[_T], bool], + objs: Iterable[_T] +) -> Tuple[List[_T], List[_T]]: """Split a list into two classes according to the predicate.""" t = [] f = [] @@ -298,8 +315,11 @@ def get_bound(pts: Iterable[Point]) -> Rect: return x0, y0, x1, y1 -def pick(seq: Iterable[_T], func: Callable[[_T], float], - maxobj: Optional[_T] = None) -> Optional[_T]: +def pick( + seq: Iterable[_T], + func: Callable[[_T], float], + maxobj: Optional[_T] = None +) -> Optional[_T]: """Picks the object obj where func(obj) has the highest value.""" maxscore = None for obj in seq: diff --git a/tools/dumppdf.py b/tools/dumppdf.py index 06bbd625..ffdf4241 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -100,8 +100,11 @@ def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None: raise TypeError(obj) -def dumptrailers(out: TextIO, doc: PDFDocument, - show_fallback_xref: bool = False) -> None: +def dumptrailers( + out: TextIO, + doc: PDFDocument, + show_fallback_xref: bool = False +) -> None: for xref in doc.xrefs: if not isinstance(xref, PDFXRefFallback) or show_fallback_xref: out.write('\n') @@ -116,8 +119,12 @@ def dumptrailers(out: TextIO, doc: PDFDocument, return -def dumpallobjs(out: TextIO, doc: PDFDocument, codec: Optional[str] = None, - show_fallback_xref: bool = False) -> None: +def dumpallobjs( + out: TextIO, + doc: PDFDocument, + codec: Optional[str] = None, + show_fallback_xref: bool = False +) -> None: visited = set() out.write('') for xref in doc.xrefs: @@ -139,10 +146,16 @@ def dumpallobjs(out: TextIO, doc: PDFDocument, codec: Optional[str] = None, return -def dumpoutline(outfp: TextIO, fname: str, objids: Any, - pagenos: Container[int], password: str = '', - dumpall: bool = False, codec: Optional[str] = None, - extractdir: Optional[str] = None) -> None: +def dumpoutline( + outfp: TextIO, + fname: str, + objids: Any, + pagenos: Container[int], + password: str = '', + dumpall: bool = False, + codec: Optional[str] = None, + extractdir: Optional[str] = None +) -> None: fp = open(fname, 'rb') parser = PDFParser(fp) doc = PDFDocument(parser, password) @@ -235,10 +248,17 @@ def extract1(objid: int, obj: Dict[str, Any]) -> None: return -def dumppdf(outfp: TextIO, fname: str, objids: Iterable[int], - pagenos: Container[int], password: str = '', dumpall: bool = False, - codec: Optional[str] = None, extractdir: Optional[str] = None, - show_fallback_xref: bool = False) -> None: +def dumppdf( + outfp: TextIO, + fname: str, + objids: Iterable[int], + pagenos: Container[int], + password: str = '', + dumpall: bool = False, + codec: Optional[str] = None, + extractdir: Optional[str] = None, + show_fallback_xref: bool = False +) -> None: fp = open(fname, 'rb') parser = PDFParser(fp) doc = PDFDocument(parser, password) diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 457ddd72..47e2c79d 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -30,21 +30,30 @@ def float_or_disabled(x: str) -> FloatOrDisabled: raise argparse.ArgumentTypeError("invalid float value: {}".format(x)) -def extract_text(files: Iterable[str] = [], outfile: str = '-', - no_laparams: bool = False, all_texts: Optional[bool] = None, - detect_vertical: Optional[bool] = None, - word_margin: Optional[float] = None, - char_margin: Optional[float] = None, - line_margin: Optional[float] = None, - boxes_flow: Optional[FloatOrDisabled] = None, - output_type: str = 'text', codec: str = 'utf-8', - strip_control: bool = False, maxpages: int = 0, - page_numbers: Optional[Container[int]] = None, - password: str = "", scale: float = 1.0, rotation: int = 0, - layoutmode: str = 'normal', - output_dir: Optional[str] = None, debug: bool = False, - disable_caching: bool = False, - **kwargs: Any) -> AnyIO: +def extract_text( + files: Iterable[str] = [], + outfile: str = '-', + no_laparams: bool = False, + all_texts: Optional[bool] = None, + detect_vertical: Optional[bool] = None, + word_margin: Optional[float] = None, + char_margin: Optional[float] = None, + line_margin: Optional[float] = None, + boxes_flow: Optional[FloatOrDisabled] = None, + output_type: str = 'text', + codec: str = 'utf-8', + strip_control: bool = False, + maxpages: int = 0, + page_numbers: Optional[Container[int]] = None, + password: str = "", + scale: float = 1.0, + rotation: int = 0, + layoutmode: str = 'normal', + output_dir: Optional[str] = None, + debug: bool = False, + disable_caching: bool = False, + **kwargs: Any +) -> AnyIO: if not files: raise ValueError("Must provide files to work upon!")