ocrmypdf · ianalexander · Dec 23, 2019 · Dec 23, 2019
diff --git a/.docker/Dockerfile b/.docker/Dockerfile
@@ -39,6 +39,7 @@ RUN pip3 install --no-cache-dir \
   -r requirements/main.txt \
   -r requirements/webservice.txt \
   -r requirements/test.txt \
+  -r requirements/watcher.txt \
   .
 
 FROM base
@@ -69,6 +70,7 @@ COPY --from=builder /usr/local/lib/ /usr/local/lib/
 COPY --from=builder /usr/local/bin/ /usr/local/bin/
 
 COPY --from=builder /app/misc/webservice.py /app/
+COPY --from=builder /app/misc/watcher.py /app/
 
 # Copy minimal project files to get the test suite.
 COPY --from=builder /app/setup.cfg /app/setup.py /app/README.md /app/

diff --git a/docs/batch.rst b/docs/batch.rst
@@ -198,6 +198,42 @@ and all inquiries are appreciated.
 Hot (watched) folders
 =====================
 
+Watched folders with Docker
+---------------------------
+
+The OCRmyPDF Docker image includes a watcher service. This service can
+be launched as follows:
+
+.. code-block:: bash
+
+    docker run \
+        -v <path to files to convert>:/input \
+        -v <path to store results>:/output \
+        -e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \
+        -it --entrypoint python3 \
+        jbarlow83/ocrmypdf \
+        watcher.py
+
+This service watches for files that match /input/\*.pdf and converts
+each one to an OCR'ed PDF in /output/. The parameters to this image are:
+
++--------------------------------------+------------------------------------+
+| Parameter                            | Function                           |
++======================================+====================================+
+| -v <path to files to convert>:/input | Files placed in this location will |
+|                                      | be OCR'ed                          |
++--------------------------------------+------------------------------------+
+| -v <path to store results>:/output   | This is where OCR'ed files will be |
+|                                      | stored                             |
++--------------------------------------+------------------------------------+
+| -e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 | This will place files in the output|
+|                                      | folder in {output_directory}/      |
+|                                      | {year}/{month}/{filename}          |
++--------------------------------------+------------------------------------+
+
+Watched folders with CLI
+------------------------
+
 To set up a "hot folder" that will trigger OCR for every file inserted,
 use a program like Python
 `watchdog <https://pypi.python.org/pypi/watchdog>`__ (supports all major
@@ -225,12 +261,12 @@ told to run ``ocrmypdf`` on any .pdf added to the current directory
        --command='ocrmypdf "${watch_src_path}" "out/${watch_src_path}" ' \
        .  # don't forget the final dot
 
-For more complex behavior you can write a Python script around to use
-the watchdog API.
-
 On file servers, you could configure watchmedo as a system service so it
 will run all the time.
 
+For more complex behavior you can write a Python script that uses
+the watchdog API. See ``misc/watcher.py`` for an example.
+
 Caveats
 -------
 

diff --git a/misc/watcher.py b/misc/watcher.py
@@ -0,0 +1,56 @@
+import sys
+import time
+import os
+import ntpath
+from pathlib import Path
+from datetime import datetime
+import ocrmypdf
+from watchdog.observers import Observer
+from watchdog.events import LoggingEventHandler, PatternMatchingEventHandler
+
+def _env_flag(name, default=False):
+    """Return the boolean meaning of environment variable *name*.
+
+    Environment values are always strings, so ``bool()`` would treat "0" or
+    "false" as enabled. An unset variable yields *default*; an empty value
+    or one of "0", "false", "no", "off" (case-insensitive) yields ``False``;
+    any other value yields ``True``.
+    """
+    raw = os.getenv(name)
+    if raw is None:
+        return default
+    return raw.strip().lower() not in ('', '0', 'false', 'no', 'off')
+
+
+# Folder watched for incoming PDFs; override with OCR_INPUT_DIRECTORY.
+INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')
+# Folder that receives the OCR'ed PDFs; override with OCR_OUTPUT_DIRECTORY.
+OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')
+# When enabled, results are written to <output>/<year>/<month>/.
+OUTPUT_DIRECTORY_YEAR_MONTH = _env_flag('OCR_OUTPUT_DIRECTORY_YEAR_MONTH')
+# Filename globs that trigger OCR.
+PATTERNS = ['*.pdf']
+
+
+def execute_ocrmypdf(file_path):
+    """Run OCRmyPDF on *file_path* and write the result to the output folder.
+
+    The output keeps the input's file name. With OUTPUT_DIRECTORY_YEAR_MONTH
+    enabled it is placed in ``<OUTPUT_DIRECTORY>/<year>/<month>/`` (today's
+    date, month not zero-padded); otherwise directly in OUTPUT_DIRECTORY.
+    The destination folder is created when missing.
+
+    Exceptions raised by ``mkdir`` or ``ocrmypdf.ocr`` propagate to the caller.
+    """
+    filename = Path(file_path).name
+    if OUTPUT_DIRECTORY_YEAR_MONTH:
+        today = datetime.today()
+        output_directory = f'{OUTPUT_DIRECTORY}/{today.year}/{today.month}'
+    else:
+        output_directory = OUTPUT_DIRECTORY
+    # exist_ok makes a separate exists() pre-check unnecessary (and avoids the
+    # race between checking and creating); do it for both layouts so a missing
+    # plain output folder is handled the same way as a missing dated one.
+    Path(output_directory).mkdir(parents=True, exist_ok=True)
+    output_path = f'{output_directory}/{filename}'
+    print(f'New file: {file_path}.\nAttempting to OCRmyPDF to: {output_path}')
+    ocrmypdf.ocr(file_path, output_path)
+
+
+class HandleObserverEvent(PatternMatchingEventHandler):
+    """Run OCR on files matching the handler's patterns when they appear or change."""
+
+    def on_any_event(self, event):
+        """OCR the file behind a 'created' or 'modified' event.
+
+        Directory events are skipped, and failures are reported on stderr
+        instead of being raised, so a single bad file does not escape into
+        the thread that dispatches watchdog events.
+        """
+        if event.is_directory or event.event_type not in ('created', 'modified'):
+            return
+        try:
+            execute_ocrmypdf(event.src_path)
+        except Exception as exc:  # boundary: report and keep watching
+            print(f'OCR failed for {event.src_path}: {exc!r}', file=sys.stderr)
+
+
+if __name__ == "__main__":
+    # Echo the effective configuration so container logs show what is watched.
+    print(f"Starting OCRmyPDF watcher with config:\n"
+          f"Input Directory: {INPUT_DIRECTORY}\n"
+          f"Output Directory: {OUTPUT_DIRECTORY}\n"
+          f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}")
+    handler = HandleObserverEvent(patterns=PATTERNS)
+    observer = Observer()
+    # recursive=True: PDFs dropped into subfolders of the input are handled too.
+    observer.schedule(handler, INPUT_DIRECTORY, recursive=True)
+    observer.start()
+    try:
+        # Poll the observer instead of sleeping blindly, so the process exits
+        # if the observer thread stops on its own rather than idling forever.
+        while observer.is_alive():
+            observer.join(1)
+    except KeyboardInterrupt:
+        pass
+    finally:
+        # Always shut the observer down, whatever ended the loop.
+        observer.stop()
+        observer.join()
diff --git a/requirements/watcher.txt b/requirements/watcher.txt
@@ -0,0 +1 @@
+watchdog >= 0.8.2