fix pdfinfo.get_pagecount bug. Add unit tests. Bump version.

papermerge · Jul 25, 2020 · 8a8835d · 8a8835d
1 parent 14370d3
commit 8a8835d
Show file tree

Hide file tree

Showing 8 changed files with 94 additions and 3 deletions.
diff --git a/changelog.md b/changelog.md
@@ -1,5 +1,15 @@
 # Changelog
 
+## [1.2.2] - 25 July 2020
+
+### Changed
+
+- bugfix: get_pagecount handles non-UTF-8 encoded documents
+
+### Added
+
+- unit tests for get_pagecount
+
 ## [1.2.1] - 16 July 2020
 
 ### Added

diff --git a/mglib/pdfinfo.py b/mglib/pdfinfo.py
@@ -38,7 +38,7 @@ def get_tiff_pagecount(filepath):
 
         raise Exception("Error occured while getting document page count.")
 
-    lines = compl.stdout.decode('utf-8').split('\n')
+    lines = _split(stdout=compl.stdout)
     # look up for the line containing "Pages: 11"
     for line in lines:
         x = re.match(r"(\d+)", line.strip())
@@ -100,11 +100,27 @@ def get_pagecount(filepath):
 
         raise Exception("Error occured while getting document page count.")
 
-    lines = compl.stdout.decode('utf-8').split('\n')
+    lines = _split(stdout=compl.stdout)
     # look up for the line containing "Pages: 11"
     for line in lines:
-        x = re.match("Pages:\W+(\d+)$", line.strip())
+        x = re.match(r"Pages:\W+(\d+)$", line.strip())
         if x:
             return int(x.group(1))
 
     return 0
+
+
+def _split(stdout):
+    """
+    Decode stdout (the bytes in result.stdout, where result is
+    whatever subprocess.run returned) and split it into lines.
+    """
+    decoded_text = stdout.decode(
+        'utf-8',
+        # in case there are decoding issues, just replace
+        # problematic characters. We don't need text verbatim.
+        'replace'
+    )
+    lines = decoded_text.split('\n')
+
+    return lines
diff --git a/test/data/berlin.jpeg b/test/data/berlin.jpeg
diff --git a/test/data/berlin.jpg b/test/data/berlin.jpg
diff --git a/test/data/berlin.pdf b/test/data/berlin.pdf
diff --git a/test/data/berlin.png b/test/data/berlin.png
diff --git a/test/data/text.tiff b/test/data/text.tiff
diff --git a/test/test_pdfinfo.py b/test/test_pdfinfo.py
@@ -0,0 +1,61 @@
+import os
+import unittest
+
+from mglib.pdfinfo import get_pagecount
+
+BASE_DIR = os.path.dirname(
+    os.path.abspath(__file__)
+)
+
+DATA_DIR = os.path.join(
+    BASE_DIR, "data"
+)
+
+
+def get_filepath(filename):
+    return os.path.join(DATA_DIR, filename)
+
+
+class TestPDFinfo(unittest.TestCase):
+
+    def test_basic_pdf(self):
+        page_count = get_pagecount(get_filepath("berlin.pdf"))
+
+        self.assertEqual(
+            page_count,
+            2
+        )
+
+    def test_basic_jpeg(self):
+        page_count = get_pagecount(get_filepath("berlin.jpeg"))
+
+        self.assertEqual(
+            page_count,
+            1
+        )
+
+    def test_basic_jpg(self):
+        page_count = get_pagecount(get_filepath("berlin.jpg"))
+
+        self.assertEqual(
+            page_count,
+            1
+        )
+
+    def test_basic_png(self):
+        page_count = get_pagecount(get_filepath("berlin.png"))
+
+        self.assertEqual(
+            page_count,
+            1
+        )
+
+    def test_basic_tiff(self):
+        # when the input file has a tiff extension, get_pagecount
+        # internally calls get_tiff_pagecount
+        page_count = get_pagecount(get_filepath("text.tiff"))
+
+        self.assertEqual(
+            page_count,
+            2
+        )