New feature: `--modified_pdf`, which exports a PDF containing only the annotated pages

lucasrla · Dec 27, 2020 · e56c4fd
1 parent 51383c3
commit e56c4fd
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 7 deletions.
diff --git a/remarks/__main__.py b/remarks/__main__.py
@@ -43,6 +43,12 @@ def main():
         action="store_true",
         help="Create a '*_remarks.pdf' file with all annotated pages merged into the original (unannotated) PDF",
     )
+    parser.add_argument(
+        "--modified_pdf",
+        dest="modified_pdf",
+        action="store_true",
+        help="Create a '*_remarks-only.pdf' file with all annotated pages",
+    )    
     parser.add_argument(
         "-v",
         "--version",
@@ -54,7 +60,7 @@ def main():
         "-h", "--help", action="help", help="Show this help message",
     )
 
-    parser.set_defaults(combined_pdf=False)
+    parser.set_defaults(combined_pdf=False, modified_pdf=False)
 
     args = parser.parse_args()
     args_dict = vars(args)

diff --git a/remarks/remarks.py b/remarks/remarks.py
@@ -37,6 +37,7 @@ def run_remarks(
     pdf_name=None,
     ann_type=None,
     combined_pdf=False,
+    modified_pdf=False,
 ):
     for path in pathlib.Path(f"{input_dir}/").glob("*.pdf"):
         pages = list_pages_uuids(path)
@@ -52,11 +53,14 @@ def run_remarks(
         page_magnitude = math.floor(math.log10(len(pages))) + 1
         in_device_path = get_ui_path(path)
 
-        _dir = pathlib.Path(f"{output_dir}/{in_device_path}/{name}/")
-        _dir.mkdir(parents=True, exist_ok=True)
+        out_path = pathlib.Path(f"{output_dir}/{in_device_path}/{name}/")
+        out_path.mkdir(parents=True, exist_ok=True)
 
         pdf_src = fitz.open(path)
 
+        if modified_pdf:
+            mod_pdf = fitz.open()
+
         print(f"Working on PDF file: {path}")
         print(f'PDF visibleName: "{name}"')
         print(f"PDF in-device directory: {in_device_path}")
@@ -84,7 +88,7 @@ def run_remarks(
             if "svg" in targets:
                 svg_str = draw_svg(parsed_data)
 
-                subdir = prepare_subdir(_dir, "svg")
+                subdir = prepare_subdir(out_path, "svg")
                 with open(f"{subdir}/{page_idx:0{page_magnitude}}.svg", "w") as f:
                     f.write(svg_str)
 
@@ -123,15 +127,15 @@ def run_remarks(
             ann_page = draw_pdf(parsed_data, ann_page)
 
             if "pdf" in targets:
-                subdir = prepare_subdir(_dir, "pdf")
+                subdir = prepare_subdir(out_path, "pdf")
                 ann_doc.save(f"{subdir}/{page_idx:0{page_magnitude}}.pdf")
 
             if "png" in targets:
                 # (2, 2) is a short-hand for 2x zoom on x and y
                 # ref: https://pymupdf.readthedocs.io/en/latest/page.html#Page.getPixmap
                 pixmap = ann_page.getPixmap(matrix=fitz.Matrix(2, 2))
 
-                subdir = prepare_subdir(_dir, "png")
+                subdir = prepare_subdir(out_path, "png")
                 pixmap.writePNG(f"{subdir}/{page_idx:0{page_magnitude}}.png")
 
             if "md" in targets:
@@ -142,7 +146,7 @@ def run_remarks(
 
                     # TODO: maybe also add highlighted image (pixmap) extraction?
 
-                    subdir = prepare_subdir(_dir, "md")
+                    subdir = prepare_subdir(out_path, "md")
                     with open(f"{subdir}/{page_idx:0{page_magnitude}}.md", "w") as f:
                         f.write(md_str)
 
@@ -157,6 +161,9 @@ def run_remarks(
                         f"Found highlighted text but couldn't create markdown from page #{page_idx}"
                     )
 
+            if modified_pdf:
+                mod_pdf.insertPDF(ann_doc, start_at=-1)
+
             if combined_pdf:
                 x_max, y_max = get_ann_max_bound(parsed_data)
                 ann_outside = (x_max > pdf_w_adj) or (y_max > pdf_h_adj)
@@ -177,4 +184,7 @@ def run_remarks(
         if combined_pdf:
             pdf_src.save(f"{output_dir}/{name} _remarks.pdf")
 
+        if modified_pdf:
+            mod_pdf.save(f"{output_dir}/{name} _remarks-only.pdf")
+
         pdf_src.close()