Skip to content

Commit

Permalink
fix pdfinfo.get_pagecount bug. Add unit tests. Bump version.
Browse files Browse the repository at this point in the history
  • Loading branch information
ciur committed Jul 25, 2020
1 parent 14370d3 commit 8a8835d
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 3 deletions.
10 changes: 10 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Changelog

## [1.2.2] - 25 July 2020

### Changed

- bugfix - get_pagecount handles non-UTF-8-encoded documents

### Added

- unit tests for get_pagecount

## [1.2.1] - 16 July 2020

### Added
Expand Down
22 changes: 19 additions & 3 deletions mglib/pdfinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def get_tiff_pagecount(filepath):

raise Exception("Error occured while getting document page count.")

lines = compl.stdout.decode('utf-8').split('\n')
lines = _split(stdout=compl.stdout)
# look up for the line containing "Pages: 11"
for line in lines:
x = re.match(r"(\d+)", line.strip())
Expand Down Expand Up @@ -100,11 +100,27 @@ def get_pagecount(filepath):

raise Exception("Error occured while getting document page count.")

lines = compl.stdout.decode('utf-8').split('\n')
lines = _split(stdout=compl.stdout)
# look up for the line containing "Pages: 11"
for line in lines:
x = re.match("Pages:\W+(\d+)$", line.strip())
x = re.match(r"Pages:\W+(\d+)$", line.strip())
if x:
return int(x.group(1))

return 0


def _split(stdout):
"""
stdout is result.stdout where result
is whatever is returned by subprocess.run
"""
decoded_text = stdout.decode(
'utf-8',
# in case there are decoding issues, just replace
# problematic characters. We don't need text verbatim.
'replace'
)
lines = decoded_text.split('\n')

return lines
2 changes: 2 additions & 0 deletions test/data/berlin.jpeg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions test/data/berlin.jpg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added test/data/berlin.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions test/data/berlin.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added test/data/text.tiff
Binary file not shown.
61 changes: 61 additions & 0 deletions test/test_pdfinfo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import os
import unittest

from mglib.pdfinfo import get_pagecount

# Absolute path of the directory that holds this test module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Fixture files are looked up in the "data" directory next to this module.
DATA_DIR = os.path.join(BASE_DIR, "data")


def get_filepath(filename):
    """Return the path of ``filename`` joined onto DATA_DIR."""
    path = os.path.join(DATA_DIR, filename)
    return path


class TestPDFinfo(unittest.TestCase):
    """Check the page count get_pagecount reports for each fixture file."""

    def test_basic_pdf(self):
        path = get_filepath("berlin.pdf")
        self.assertEqual(get_pagecount(path), 2)

    def test_basic_jpeg(self):
        path = get_filepath("berlin.jpeg")
        self.assertEqual(get_pagecount(path), 1)

    def test_basic_jpg(self):
        path = get_filepath("berlin.jpg")
        self.assertEqual(get_pagecount(path), 1)

    def test_basic_png(self):
        path = get_filepath("berlin.png")
        self.assertEqual(get_pagecount(path), 1)

    def test_basic_tiff(self):
        # When the input file has a tiff extension, get_pagecount
        # internally calls the get_tiff_pagecount function.
        path = get_filepath("text.tiff")
        self.assertEqual(get_pagecount(path), 2)

0 comments on commit 8a8835d

Please sign in to comment.