/
__main__.py
executable file
·846 lines (721 loc) · 32.8 KB
/
__main__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
#!/usr/bin/env python3
# © 2015-17 James R. Barlow: github.com/jbarlow83
from tempfile import mkdtemp
from collections.abc import Sequence
import sys
import os
import re
import warnings
import multiprocessing
import atexit
import textwrap
import logging
import argparse
import PyPDF2 as pypdf
import PIL
import ruffus.ruffus_exceptions as ruffus_exceptions
import ruffus.cmdline as cmdline
import ruffus.proxy_logger as proxy_logger
from .pipeline import JobContext, JobContextManager, \
cleanup_working_files, build_pipeline
from .pdfa import file_claims_pdfa
from .helpers import is_iterable_notstr, re_symlink, is_file_writable
from .exec import tesseract, qpdf, ghostscript
from . import PROGRAM_NAME, VERSION
from .exceptions import ExitCode, ExitCodeException, MissingDependencyError
from . import exceptions as ocrmypdf_exceptions
from ._unicodefun import verify_python3_env
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
# -------------
# External dependencies
MINIMUM_TESS_VERSION = '3.04'
HOCR_OK_LANGS = frozenset([
'eng', 'deu', 'spa', 'ita', 'por'
])
def complain(message):
print(*textwrap.wrap(message), file=sys.stderr)
# Hack to help debugger context find /usr/local/bin
if 'IDE_PROJECT_ROOTS' in os.environ:
os.environ['PATH'] = '/usr/local/bin:' + os.environ['PATH']
# --------
# Critical environment tests
verify_python3_env()
if tesseract.version() < MINIMUM_TESS_VERSION:
complain(
"Please install tesseract {0} or newer "
"(currently installed version is {1})".format(
MINIMUM_TESS_VERSION, tesseract.version()))
sys.exit(ExitCode.missing_dependency)
# -------------
# Parser
parser = argparse.ArgumentParser(
prog=PROGRAM_NAME,
fromfile_prefix_chars='@',
formatter_class=argparse.RawDescriptionHelpFormatter,
description="""\
Generates a searchable PDF or PDF/A from a regular PDF.
OCRmyPDF rasterizes each page of the input PDF, optionally corrects page
rotation and performs image processing, runs the Tesseract OCR engine on the
image, and then creates a PDF from the OCR information.
""",
epilog="""\
OCRmyPDF attempts to keep the output file at about the same size. If a file
contains losslessly compressed images, and output file will be losslessly
compressed as well.
PDF is a page description file that attempts to preserve a layout exactly.
A PDF can contain vector objects (such as text or lines) and raster objects
(images). A page might have multiple images. OCRmyPDF is prepared to deal
with the wide variety of PDFs that exist in the wild.
When a PDF page contains text, OCRmyPDF assumes that the page has already
been OCRed or is a "born digital" page that should not be OCRed. The default
behavior is to exit in this case without producing a file. You can use the
option --skip-text to ignore pages with text, or --force-ocr to rasterize
all objects on the page and produce an image-only PDF as output.
ocrmypdf --skip-text file_with_some_text_pages.pdf output.pdf
ocrmypdf --force-ocr word_document.pdf output.pdf
If you are concerned about long-term archiving of PDFs, use the default option
--output-type pdfa which converts the PDF to a standardized PDF/A-2b. This
converts images to sRGB colorspace, removes some features from the PDF such
as Javascript or forms. If you want to minimize the number of changes made to
your PDF, use --output-type pdf.
If OCRmyPDF is given an image file as input, it will attempt to convert the
image to a PDF before processing. For more control over the conversion of
images to PDF, use the Python package img2pdf or other image to PDF software.
For example, this command uses img2pdf to convert all .png files beginning
with the 'page' prefix to a PDF, fitting each image on A4-sized paper, and
sending the result to OCRmyPDF through a pipe. img2pdf is a dependency of
ocrmypdf so it is already installed.
img2pdf --pagesize A4 page*.png | ocrmypdf - myfile.pdf
Online documentation is located at:
https://ocrmypdf.readthedocs.io/en/latest/introduction.html
""")
parser.add_argument(
'input_file', metavar="input_pdf_or_image",
help="PDF file containing the images to be OCRed (or '-' to read from "
"standard input)")
parser.add_argument(
'output_file', metavar="output_pdf",
help="Output searchable PDF file (or '-' to write to standard output). "
"Existing files will be ovewritten. If same as input file, the "
"input file will be updated only if processing is successful.")
parser.add_argument(
'-l', '--language', action='append',
help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
"all language packs installed in your system). Use -l eng+deu for "
"multiple languages.")
parser.add_argument(
'--image-dpi', metavar='DPI', type=int,
help="For input image instead of PDF, use this DPI instead of file's.")
parser.add_argument(
'--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2'],
default='pdfa',
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
"long term archiving (default, recommended) but may not suitable "
"for users who want their file altered as little as possible. 'pdfa' "
"also has problems with full Unicode text. 'pdf' attempts to "
"preserve file contents as much as possible. 'pdf-a1' creates a "
"PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'."
)
# Use null string '\0' as sentinel to indicate the user supplied no argument,
# since that is the only invalid character for filepaths on all platforms
# bool('\0') is True in Python
parser.add_argument(
'--sidecar', nargs='?', const='\0', default=None, metavar='FILE',
help="Generate sidecar text files that contain the same text recognized "
"by Tesseract. This may be useful for building a OCR text database. "
"If FILE is omitted, the sidecar file be named {output_file}.txt "
"If FILE is set to '-', the sidecar is written to stdout (a "
"convenient way to preview OCR quality). The output file and sidecar "
"may not both use stdout at the same time.")
parser.add_argument(
'--version', action='version', version=VERSION,
help="Print program version and exit")
jobcontrol = parser.add_argument_group(
"Job control options")
jobcontrol.add_argument(
'-j', '--jobs', metavar='N', type=int,
help="Use up to N CPU cores simultaneously (default: use all).")
jobcontrol.add_argument(
'-q', '--quiet', action='store_true', help="Suppress INFO messages")
jobcontrol.add_argument(
'-v', '--verbose', const="+", default=[], nargs='?', action="append",
help="Print more verbose messages for each additional verbose level")
metadata = parser.add_argument_group(
"Metadata options",
"Set output PDF/A metadata (default: copy input document's metadata)")
metadata.add_argument(
'--title', type=str,
help="Set document title (place multiple words in quotes)")
metadata.add_argument(
'--author', type=str,
help="Set document author")
metadata.add_argument(
'--subject', type=str,
help="Set document subject description")
metadata.add_argument(
'--keywords', type=str,
help="Set document keywords")
preprocessing = parser.add_argument_group(
"Image preprocessing options",
"Options to improve the quality of the final PDF and OCR")
preprocessing.add_argument(
'-r', '--rotate-pages', action='store_true',
help="Automatically rotate pages based on detected text orientation")
preprocessing.add_argument(
'--remove-background', action='store_true',
help="Attempt to remove background from gray or color pages, setting it "
"to white ")
preprocessing.add_argument(
'-d', '--deskew', action='store_true',
help="Deskew each page before performing OCR")
preprocessing.add_argument(
'-c', '--clean', action='store_true',
help="Clean pages from scanning artifacts before performing OCR, and send "
"the cleaned page to OCR, but do not include the cleaned page in "
"the output")
preprocessing.add_argument(
'-i', '--clean-final', action='store_true',
help="Clean page as above, and incorporate the cleaned image in the final "
"PDF. Might remove desired content.")
preprocessing.add_argument(
'--oversample', metavar='DPI', type=int, default=0,
help="Oversample images to at least the specified DPI, to improve OCR "
"results slightly")
ocrsettings = parser.add_argument_group(
"OCR options",
"Control how OCR is applied")
ocrsettings.add_argument(
'-f', '--force-ocr', action='store_true',
help="Rasterize any fonts or vector objects on each page, apply OCR, and "
"save the rastered output (this rewrites the PDF)")
ocrsettings.add_argument(
'-s', '--skip-text', action='store_true',
help="Skip OCR on any pages that already contain text, but include the "
"page in final output; useful for PDFs that contain a mix of "
"images, text pages, and/or previously OCRed pages")
# ocrsettings.add_argument(
# '--redo-ocr', action='store_true',
# help="removing any existing OCR text, but otherwise preserve mixed PDF "
# "pages")
ocrsettings.add_argument(
'--skip-big', type=float, metavar='MPixels',
help="Skip OCR on pages larger than the specified amount of megapixels, "
"but include skipped pages in final output")
advanced = parser.add_argument_group(
"Advanced",
"Advanced options to control Tesseract's OCR behavior")
advanced.add_argument(
'--max-image-mpixels', action='store', type=float, metavar='MPixels',
help="Set maximum number of pixels to unpack before treating an image as a "
"decompression bomb",
default=128.0)
advanced.add_argument(
'--tesseract-config', action='append', metavar='CFG', default=[],
help="Additional Tesseract configuration files -- see documentation")
advanced.add_argument(
'--tesseract-pagesegmode', action='store', type=int, metavar='PSM',
choices=range(0, 14),
help="Set Tesseract page segmentation mode (see tesseract --help)")
advanced.add_argument(
'--tesseract-oem', action='store', type=int, metavar='MODE',
choices=range(0, 4),
help=("Set Tesseract 4.0 OCR engine mode: "
"0 - original Tesseract only; "
"1 - neural nets LSTM only; "
"2 - Tesseract + LSTM; "
"3 - default.")
)
advanced.add_argument(
'--pdf-renderer',
choices=['auto', 'tesseract', 'hocr', 'tess4', 'sandwich'], default='auto',
help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
"choose."
"auto - let OCRmyPDF choose; "
"sandwich - default renderer for Tesseract 3.05.01 and newer; "
"hocr - default renderer for older versions of Tesseract; "
"tesseract - gives better results for non-Latin languages and "
"Tesseract older than 3.05.01 but has problems with some versions "
" of Ghostscript; deprecated"
"tess4 - deprecated alias for 'sandwich'"
)
advanced.add_argument(
'--tesseract-timeout', default=180.0, type=float, metavar='SECONDS',
help='Give up on OCR after the timeout, but copy the preprocessed page '
'into the final output')
advanced.add_argument(
'--rotate-pages-threshold', default=14.0, type=float, metavar='CONFIDENCE',
help="Only rotate pages when confidence is above this value (arbitrary "
"units reported by tesseract)")
advanced.add_argument(
'--pdfa-image-compression', choices=['auto', 'jpeg', 'lossless'],
default='auto',
help="Specify how to compress images in the output PDF/A. 'auto' lets "
"OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
"JPEG compression. 'lossless' uses PNG-style lossless compression "
"for all images. Monochrome images are always compressed using a "
"lossless codec. Compression settings "
"are applied to all pages, including those for which OCR was "
"skipped. Not supported for --output-type=pdf ; that setting "
"preserves the original compression of all images.")
advanced.add_argument(
'--user-words', metavar='FILE',
help="Specify the location of the Tesseract user words file. This is a "
"list of words Tesseract should consider while performing OCR in "
"addition to its standard language dictionaries. This can improve "
"OCR quality especially for specialized and technical documents.")
advanced.add_argument(
'--user-patterns', metavar='FILE',
help="Specify the location of the Tesseract user patterns file.")
advanced.add_argument(
'--interword-spaces', action='store_true',
help="Add spaces between words with HOCR transformation.")
debugging = parser.add_argument_group(
"Debugging",
"Arguments to help with troubleshooting and debugging")
debugging.add_argument(
'-k', '--keep-temporary-files', action='store_true',
help="Keep temporary files (helpful for debugging)")
debugging.add_argument(
'-g', '--debug-rendering', action='store_true',
help="Render each page twice with debug information on second page")
debugging.add_argument(
'--flowchart', type=str,
help="Generate the pipeline execution flowchart")
def check_options_languages(options, _log):
if not options.language:
options.language = ['eng'] # Enforce English hegemony
# Support v2.x "eng+deu" language syntax
if '+' in options.language[0]:
options.language = options.language[0].split('+')
languages = set(options.language)
if not languages.issubset(tesseract.languages()):
msg = (
"The installed version of tesseract does not have language "
"data for the following requested languages: \n")
for lang in (languages - tesseract.languages()):
msg += lang + '\n'
raise MissingDependencyError(msg)
def check_options_output(options, log):
if options.pdf_renderer == 'auto':
if tesseract.has_textonly_pdf():
options.pdf_renderer = 'sandwich'
else:
options.pdf_renderer = 'hocr'
if options.pdf_renderer == 'sandwich' and not tesseract.has_textonly_pdf():
raise MissingDependencyError(
"The 'sandwich' renderer requires Tesseract 3.05.01 or newer; "
"or Tesseract 4.00 alpha newer than February 2017.")
if options.pdf_renderer == 'tess4':
log.warning("The 'tess4' PDF renderer has been renamed to 'sandwich'. "
"Please use --pdf-renderer=sandwich.")
options.pdf_renderer = 'sandwich'
if options.pdf_renderer == 'tesseract':
if tesseract.version() < '3.05' and \
options.output_type.startswith('pdfa'):
log.warning(
"For best results use --pdf-renderer=tesseract "
"--output-type=pdf to disable PDF/A generation via "
"Ghostscript, which is known to corrupt the OCR text of "
"some PDFs produced your version of Tesseract.")
elif tesseract.has_textonly_pdf():
log.warning(
"The argument --pdf-renderer=tesseract provides support for "
"versions of tesseract older than your version. For best "
"results omit this argument and let OCRmyPDF choose the "
"best available renderer.")
if options.debug_rendering and options.pdf_renderer != 'hocr':
log.info(
"Ignoring --debug-rendering because it requires --pdf-renderer=hocr")
lossless_reconstruction = False
if options.pdf_renderer in ('hocr', 'sandwich'):
if not any((options.deskew, options.clean_final, options.force_ocr,
options.remove_background)):
lossless_reconstruction = True
options.lossless_reconstruction = lossless_reconstruction
def check_options_sidecar(options, log):
if options.sidecar == '\0':
if options.output_file == '-':
raise argparse.ArgumentError(
None,
"--sidecar filename must be specified when output file is "
"stdout.")
options.sidecar = options.output_file + '.txt'
def check_options_preprocessing(options, log):
if any((options.clean, options.clean_final)):
from .exec import unpaper
try:
if unpaper.version() < '6.1':
raise MissingDependencyError(
"The installed 'unpaper' is not supported. "
"Install version 6.1 or newer.")
except FileNotFoundError:
raise MissingDependencyError(
"Install the 'unpaper' program to use --clean, --clean-final.")
if options.clean and \
not options.clean_final and \
options.pdf_renderer == 'tesseract':
log.info(
"Tesseract PDF renderer cannot render --clean pages without "
"also performing --clean-final, so --clean-final is assumed.")
def check_options_ocr_behavior(options, log):
if options.force_ocr and options.skip_text:
raise argparse.ArgumentError(
None,
"Error: --force-ocr and --skip-text are mutually incompatible.")
# if options.redo_ocr and (options.skip_text or options.force_ocr):
# raise argparse.ArgumentError(
# "Error: --redo-ocr and other OCR options are incompatible.")
languages = set(options.language)
if options.pdf_renderer == 'hocr' and \
not languages.issubset(HOCR_OK_LANGS):
msg = (
"The 'hocr' PDF renderer is known to cause problems with one "
"or more of the languages in your document. ")
if tesseract.has_textonly_pdf():
msg += (
"Use --pdf-renderer auto (the default) to avoid this issue.")
else:
msg += (
"Use --pdf-renderer tesseract --output-type pdf to avoid "
"this issue")
log.warning(msg)
elif ghostscript.version() < '9.20' and \
not languages.issubset(HOCR_OK_LANGS) \
and options.output_type != 'pdf':
msg = (
"The installed version of Ghostscript does not work correctly "
"with the OCR languages you specified. Use --output-type pdf or "
"upgrade to Ghostscript 9.20 or later to avoid this issue.")
msg += "Found Ghostscript {}".format(ghostscript.version())
log.warning(msg)
def check_options_advanced(options, log):
if tesseract.v4():
log.info(
"Tesseract v4.x.alpha found.")
if options.tesseract_oem and not tesseract.v4():
log.warning(
"--tesseract-oem requires Tesseract 4.x -- argument ignored")
if options.pdf_renderer == 'tess4' and not tesseract.has_textonly_pdf():
raise MissingDependencyError(
"--pdf-renderer tess4 requires Tesseract 4.x "
"commit 3d9fb3b or later")
if options.pdfa_image_compression != 'auto' and \
options.output_type.startswith('pdfa'):
log.warning(
"--pdfa-image-compression argument has no effect when "
"--output-type is not 'pdfa', 'pdfa-1', or 'pdfa-2'"
)
if options.interword_spaces and options.pdf_renderer != 'hocr':
log.warning(
"--interword-spaces argument has no effect when "
"--pdf-renderer is not 'hocr'"
)
def check_options_metadata(options, log):
import unicodedata
docinfo = [options.title, options.author, options.keywords,
options.subject]
for s in (m for m in docinfo if m):
for c in s:
if unicodedata.category(c) == 'Co' or ord(c) >= 0x10000:
raise ValueError(
"One of the metadata strings contains "
"an unsupported Unicode character: '{}' (U+{})".format(
c, hex(ord(c))[2:].upper()
))
def check_options(options, log):
try:
check_options_languages(options, log)
check_options_metadata(options, log)
check_options_output(options, log)
check_options_sidecar(options, log)
check_options_preprocessing(options, log)
check_options_ocr_behavior(options, log)
check_options_advanced(options, log)
except ValueError as e:
log.error(e)
sys.exit(ExitCode.bad_args)
except argparse.ArgumentError as e:
log.error(e)
sys.exit(ExitCode.bad_args)
except MissingDependencyError as e:
log.error(e)
sys.exit(ExitCode.missing_dependency)
# ----------
# Logging
def logging_factory(logger_name, logger_args):
verbose = logger_args['verbose']
quiet = logger_args['quiet']
root_logger = logging.getLogger(logger_name)
root_logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stderr)
formatter_ = logging.Formatter("%(levelname)7s - %(message)s")
handler.setFormatter(formatter_)
if verbose:
handler.setLevel(logging.DEBUG)
elif quiet:
handler.setLevel(logging.WARNING)
else:
handler.setLevel(logging.INFO)
root_logger.addHandler(handler)
return root_logger
def available_cpu_count():
try:
return multiprocessing.cpu_count()
except NotImplementedError:
pass
try:
import psutil
return psutil.cpu_count()
except (ImportError, AttributeError):
pass
complain(
"Could not get CPU count. Assuming one (1) CPU."
"Use -j N to set manually.")
return 1
def cleanup_ruffus_error_message(msg):
msg = re.sub(r'\s+', r' ', msg)
msg = re.sub(r"\((.+?)\)", r'\1', msg)
msg = msg.strip()
return msg
def do_ruffus_exception(ruffus_five_tuple, options, log):
"""Replace the elaborate ruffus stack trace with a user friendly
description of the error message that occurred."""
exit_code = None
task_name, job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple
job_name = job_name # unused
if exc_name == 'builtins.SystemExit':
match = re.search(r"\.(.+?)\)", exc_value)
exit_code_name = match.groups()[0]
exit_code = getattr(ExitCode, exit_code_name, 'other_error')
elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
log.error(cleanup_ruffus_error_message(exc_value))
exit_code = ExitCode.input_file
elif exc_name == 'builtins.TypeError':
# Even though repair_pdf will fail, ruffus will still try
# to call split_pages with no input files, likely due to a bug
if task_name == 'split_pages':
log.error("Input file '{0}' is not a valid PDF".format(
options.input_file))
exit_code = ExitCode.input_file
elif exc_name == 'builtins.KeyboardInterrupt':
log.error("Interrupted by user")
exit_code = ExitCode.ctrl_c
elif exc_name == 'subprocess.CalledProcessError':
# It's up to the subprocess handler to report something useful
msg = "Error occurred while running this command:"
log.error(msg + '\n' + exc_value)
exit_code = ExitCode.child_process_error
elif (exc_name == 'PyPDF2.utils.PdfReadError' and \
'not been decrypted' in exc_value) or \
(exc_name == 'ocrmypdf.exceptions.EncryptedPdfError'):
log.error(textwrap.dedent("""\
Input PDF is encrypted. The encryption must be removed to
perform OCR.
For information about this PDF's security use
qpdf --show-encryption infilename
You can remove the encryption using
qpdf --decrypt [--password=[password]] infilename
"""))
exit_code = ExitCode.encrypted_pdf
elif exc_name == 'ocrmypdf.exceptions.PdfMergeFailedError':
log.error(textwrap.dedent("""\
Failed to merge PDF image layer with OCR layer
Usually this happens because the input PDF file is mal-formed and
ocrmypdf cannot automatically correct the problem on its own.
Try using
ocrmypdf --pdf-renderer tesseract [..other args..]
"""))
exit_code = ExitCode.input_file
elif exc_name.startswith('ocrmypdf.exceptions.'):
base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
exc_class = getattr(ocrmypdf_exceptions, base_exc_name)
exit_code = exc_class.exit_code
elif exc_name == 'PIL.Image.DecompressionBombError':
msg = cleanup_ruffus_error_message(exc_value)
msg += ("\nUse the --max-image-mpixels argument to set increase the "
"maximum number of megapixels to accept.")
log.error(msg)
exit_code = ExitCode.input_file
if exit_code is not None:
return exit_code
if not options.verbose:
log.error(exc_stack)
return ExitCode.other_error
def traverse_ruffus_exception(e_args, options, log):
"""Walk through a RethrownJobError and find the first exception.
The exit code will be based on this, even if multiple exceptions occurred
at the same time."""
if isinstance(e_args, Sequence) and isinstance(e_args[0], str) and \
len(e_args) == 5:
return do_ruffus_exception(e_args, options, log)
elif is_iterable_notstr(e_args):
for exc in e_args:
return traverse_ruffus_exception(exc, options, log)
def check_closed_streams(options):
"""Work around Python issue with multiprocessing forking on closed streams
https://bugs.python.org/issue28326
Attempting to a fork/exec a new Python process when any of std{in,out,err}
are closed or not flushable for some reason may raise an exception.
Fix this by opening devnull if the handle seems to be closed. Do this
globally to avoid tracking places all places that fork.
Seems to be specific to multiprocessing.Process not all Python process
forkers.
The error actually occurs when the stream object is not flushable,
but replacing an open stream object that is not flushable with
/dev/null is a bad idea since it will create a silent failure. Replacing
a closed handle with /dev/null seems safe.
"""
if sys.stderr is None:
sys.stderr = open(os.devnull, 'w')
if sys.stdin is None:
if options.input_file == '-':
print("Trying to read from stdin but stdin seems closed",
file=sys.stderr)
return False
sys.stdin = open(os.devnull, 'r')
if sys.stdout is None:
if options.output_file == '-':
# Can't replace stdout if the user is piping
# If this case can even happen, it must be some kind of weird
# stream.
print(textwrap.dedent("""\
Output was set to stdout '-' but the stream attached to
stdout does not support the flush() system call. This
will fail."""), file=sys.stderr)
return False
sys.stdout = open(os.devnull, 'w')
return True
def log_page_orientations(pdfinfo, _log):
direction = {0: 'n', 90: 'e',
180: 's', 270: 'w'}
orientations = []
for n, page in enumerate(pdfinfo):
angle = page.rotation or 0
if angle != 0:
orientations.append('{0}{1}'.format(
n + 1,
direction.get(angle, '')))
if orientations:
_log.info('Page orientations detected: ' + ' '.join(orientations))
def run_pipeline():
options = parser.parse_args()
options.verbose_abbreviated_path = 1
if not check_closed_streams(options):
return ExitCode.bad_args
logger_args = {'verbose': options.verbose, 'quiet': options.quiet}
_log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
logging_factory, __name__, logger_args)
_log.debug('ocrmypdf ' + VERSION)
_log.debug('tesseract ' + tesseract.version())
_log.debug('qpdf ' + qpdf.version())
check_options(options, _log)
PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1000000)
if PIL.Image.MAX_IMAGE_PIXELS == 0:
PIL.Image.MAX_IMAGE_PIXELS = None
# Complain about qpdf version < 7.0.0
# Suppress the warning if in the test suite, since there are no PPAs
# for qpdf 7.0.0 for Ubuntu trusty (i.e. Travis)
if qpdf.version() < '7.0.0' and not os.environ.get('PYTEST_CURRENT_TEST'):
complain(
"You are using qpdf version {0} which has known issues including "
"security vulnerabilities with certain malformed PDFs. Consider "
"upgrading to version 7.0.0 or newer.".format(qpdf.version()))
# Any changes to options will not take effect for options that are already
# bound to function parameters in the pipeline. (For example
# options.input_file, options.pdf_renderer are already bound.)
if not options.jobs:
options.jobs = available_cpu_count()
try:
work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
options.history_file = os.path.join(
work_folder, 'ruffus_history.sqlite')
start_input_file = os.path.join(
work_folder, 'origin')
if options.input_file == '-':
# stdin
_log.info('reading file from standard input')
with open(start_input_file, 'wb') as stream_buffer:
from shutil import copyfileobj
copyfileobj(sys.stdin.buffer, stream_buffer)
else:
try:
re_symlink(options.input_file, start_input_file, _log)
except FileNotFoundError:
_log.error("File not found - " + options.input_file)
return ExitCode.input_file
if options.output_file == '-':
if sys.stdout.isatty():
_log.error(textwrap.dedent("""\
Output was set to stdout '-' but it looks like stdout
is connected to a terminal. Please redirect stdout to a
file."""))
return ExitCode.bad_args
elif not is_file_writable(options.output_file):
_log.error(
"Output file location (" + options.output_file + ") " +
"is not a writable file.")
return ExitCode.file_access_error
manager = JobContextManager()
manager.register('JobContext', JobContext) # pylint: disable=no-member
manager.start()
context = manager.JobContext() # pylint: disable=no-member
context.set_options(options)
context.set_work_folder(work_folder)
build_pipeline(options, work_folder, _log, context)
atexit.register(cleanup_working_files, work_folder, options)
cmdline.run(options)
except ruffus_exceptions.RethrownJobError as e:
if options.verbose:
_log.debug(str(e)) # stringify exception so logger doesn't have to
# Ruffus flattens exception to 5 element tuples. Because of a bug
# in <= 2.6.3 it may present either the single:
# (task, job, exc, value, stack)
# or something like:
# [[(task, job, exc, value, stack)]]
#
# Generally cross-process exception marshalling doesn't work well
# and ruffus doesn't support because BaseException has its own
# implementation of __reduce__ that attempts to reconstruct the
# exception based on e.__init__(e.args).
#
# Attempting to log the exception directly marshalls it to the logger
# which is probably in another process, so it's better to log only
# data from the exception at this point.
exitcode = traverse_ruffus_exception(e.args, options, _log)
if exitcode is None:
_log.error("Unexpected ruffus exception: " + str(e))
_log.error(repr(e))
return ExitCode.other_error
return exitcode
except ExitCodeException as e:
return e.exit_code
except Exception as e:
_log.error(e)
return ExitCode.other_error
if options.flowchart:
_log.info("Flowchart saved to {}".format(options.flowchart))
elif options.output_file == '-':
_log.info("Output sent to stdout")
elif os.path.samefile(options.output_file, os.devnull):
pass # Say nothing when sending to dev null
else:
if options.output_type.startswith('pdfa'):
pdfa_info = file_claims_pdfa(options.output_file)
if pdfa_info['pass']:
msg = 'Output file is a {} (as expected)'
_log.info(msg.format(pdfa_info['conformance']))
else:
msg = 'Output file is okay but is not PDF/A (seems to be {})'
_log.warning(msg.format(pdfa_info['conformance']))
return ExitCode.invalid_output_pdf
if not qpdf.check(options.output_file, _log):
_log.warning('Output file: The generated PDF is INVALID')
return ExitCode.invalid_output_pdf
pdfinfo = context.get_pdfinfo()
if options.verbose:
from pprint import pformat
_log.debug(pformat(pdfinfo))
log_page_orientations(pdfinfo, _log)
return ExitCode.ok
if __name__ == '__main__':
sys.exit(run_pipeline())