Skip to content

Commit

Permalink
Merge branch 'feature/pagesegmode' into develop
Browse files: browse the repository at this point in the history
  • Loading branch information
James R. Barlow committed Jan 12, 2016
2 parents b485a1e + 3b53e9a commit 8d323ae
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 10 deletions.
9 changes: 7 additions & 2 deletions ocrmypdf/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,13 +170,16 @@ def check_pil_encoder(codec_name, friendly_name):
"Advanced",
"Advanced options for power users")
advanced.add_argument(
'--tesseract-config', default=[], type=list, action='append',
'--tesseract-config', action='append', metavar='CFG', default=[],
help="additional Tesseract configuration files")
advanced.add_argument(
'--tesseract-pagesegmode', action='store', type=int, metavar='PSM',
help="set Tesseract page segmentation mode (see tesseract --help)")
advanced.add_argument(
'--pdf-renderer', choices=['auto', 'tesseract', 'hocr'], default='auto',
help='choose OCR PDF renderer')
advanced.add_argument(
'--tesseract-timeout', default=180.0, type=float,
'--tesseract-timeout', default=180.0, type=float, metavar='SECONDS',
help='give up on OCR after the timeout, but copy the preprocessed page '
'into the final output')

Expand Down Expand Up @@ -529,6 +532,7 @@ def ocr_tesseract_hocr(
timeout=options.tesseract_timeout,
pageinfo_getter=partial(get_pageinfo, input_file, pdfinfo,
pdfinfo_lock),
pagesegmode=options.tesseract_pagesegmode,
log=log
)

Expand Down Expand Up @@ -635,6 +639,7 @@ def tesseract_ocr_and_render_pdf(
language=options.language,
tessconfig=options.tesseract_config,
timeout=options.tesseract_timeout,
pagesegmode=options.tesseract_pagesegmode,
log=log)


Expand Down
25 changes: 19 additions & 6 deletions ocrmypdf/tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,17 +77,24 @@ def languages():


def generate_hocr(input_file, output_hocr, language: list, tessconfig: list,
timeout: float, pageinfo_getter, log):
timeout: float, pageinfo_getter, pagesegmode: int, log):

badxml = os.path.splitext(output_hocr)[0] + '.badxml'

args_tesseract = [
get_program('tesseract'),
'-l', '+'.join(language),
'-l', '+'.join(language)
]

if pagesegmode is not None:
args_tesseract.extend(['-psm', str(pagesegmode)])

args_tesseract.extend([
input_file,
badxml,
'hocr'
] + tessconfig
] + tessconfig)
print(args_tesseract)
p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
universal_newlines=True)
try:
Expand Down Expand Up @@ -135,7 +142,7 @@ def generate_hocr(input_file, output_hocr, language: list, tessconfig: list,


def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
tessconfig: list, timeout: float, log):
tessconfig: list, timeout: float, pagesegmode: int, log):
'''Use Tesseract to render a PDF.
input_image -- image to analyze
Expand All @@ -148,11 +155,17 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list,

args_tesseract = [
get_program('tesseract'),
'-l', '+'.join(language),
'-l', '+'.join(language)
]

if pagesegmode is not None:
args_tesseract.extend(['-psm', str(pagesegmode)])

args_tesseract.extend([
input_image,
os.path.splitext(output_pdf)[0], # Tesseract appends suffix
'pdf'
] + tessconfig
] + tessconfig)
p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
universal_newlines=True)

Expand Down
5 changes: 5 additions & 0 deletions tests/spoof/tesseract_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ def main():
m.update(lang.encode())
except ValueError:
pass
try:
psm = sys.argv[sys.argv.index('-psm') + 1]
m.update(psm.encode())
except ValueError:
pass

input_file = sys.argv[-3]
output_file = sys.argv[-2]
Expand Down
19 changes: 17 additions & 2 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ def check_ocrmypdf(input_basename, output_basename, *args, env=None):
input_file = _make_input(input_basename)
output_file = _make_output(output_basename)

sh, _, err = run_ocrmypdf_sh(input_file, output_file, *args, env=env)
assert sh.returncode == 0, err
sh, out, err = run_ocrmypdf_sh(input_file, output_file, *args, env=env)
assert sh.returncode == 0, dict(stdout=out, stderr=err)
assert os.path.exists(output_file), "Output file not created"
assert os.stat(output_file).st_size > 100, "PDF too small or empty"
return output_file
Expand Down Expand Up @@ -348,3 +348,18 @@ def test_encrypted():
p, out, err = run_ocrmypdf_env('skew-encrypted.pdf', 'wont_be_created.pdf')
assert p.returncode == ExitCode.input_file
assert out.find('password')


@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
def test_pagesegmode(renderer, spoof_tesseract_cache):
    """Run ocrmypdf with an explicit page segmentation mode per renderer.

    Invokes ``check_ocrmypdf`` on ``skew.pdf`` with
    ``--tesseract-pagesegmode 7`` once for each parametrized PDF renderer,
    using the ``spoof_tesseract_cache`` fixture as the process environment.
    """
    # One output file per renderer so the parametrized runs do not collide.
    output_basename = 'test_psm_%s.pdf' % renderer
    cli_args = (
        '--tesseract-pagesegmode', '7',
        '-v', '1',
        '--pdf-renderer', renderer,
    )
    check_ocrmypdf(
        'skew.pdf', output_basename, *cli_args, env=spoof_tesseract_cache)



0 comments on commit 8d323ae

Please sign in to comment.