def float_or_disabled(x):
    """Parse an argparse value that is either a float or the word "disabled".

    Intended for use as an argparse ``type=`` callable (e.g. for
    ``--boxes-flow``).

    :param x: raw command-line string.
    :return: ``x`` unchanged when it equals "disabled" (compared
        case-insensitively, ignoring surrounding whitespace); otherwise the
        value converted to ``float``.
    :raises argparse.ArgumentTypeError: if ``x`` is neither "disabled" nor a
        valid float literal.
    """
    if x.lower().strip() == "disabled":
        return x
    try:
        # Return the converted value; previously the result was assigned to a
        # local and dropped, so every numeric argument was parsed as None.
        return float(x)
    except ValueError:
        raise argparse.ArgumentTypeError("invalid float value: {}".format(x))
You can also " - "pass `None` to disable advanced layout analysis, and instead " - "return text based on the position of the bottom left corner of " - "the text box.") + "pass `disabled` to disable advanced layout analysis, and " + "instead return text based on the position of the bottom left " + "corner of the text box.") la_params.add_argument( "--all-texts", "-A", default=False, action="store_true", help="If layout analysis should be performed on text in figures.")