Skip to content

Commit

Permalink
Fix #791 (#806)
Browse files: browse the repository at this point in the history
* Skip non-Unicode cmaps in TrueType fonts

* Prefer normal space when it has the same cid as a non-breaking one

* Add test

* Update CHANGELOG.md

* Add brackets to make operator order explicit

---------

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
  • Loading branch information
NAZADOTH and pietermarsman committed Nov 25, 2023
1 parent 40b200d commit 997424d
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Color "convenience operators" now (per spec) also set color space ([#794](https://github.com/pdfminer/pdfminer.six/pull/794))
- `ValueError` when extracting images, due to breaking changes in Pillow ([#827](https://github.com/pdfminer/pdfminer.six/pull/827))
- Small typos and issues in the documentation ([#828](https://github.com/pdfminer/pdfminer.six/pull/828))
- Ignore non-Unicode cmaps in TrueType fonts ([#806](https://github.com/pdfminer/pdfminer.six/pull/806))

### Deprecated

Expand Down
11 changes: 8 additions & 3 deletions pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,15 +195,20 @@ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
if isinstance(code, PSLiteral):
# Interpret as an Adobe glyph name.
assert isinstance(code.name, str)
self.cid2unichr[cid] = name2unicode(code.name)
unichr = name2unicode(code.name)
elif isinstance(code, bytes):
# Interpret as UTF-16BE.
self.cid2unichr[cid] = code.decode("UTF-16BE", "ignore")
unichr = code.decode("UTF-16BE", "ignore")
elif isinstance(code, int):
self.cid2unichr[cid] = chr(code)
unichr = chr(code)
else:
raise TypeError(code)

# U+00A0 is the non-breaking space. Some fonts map it to the same cid as a regular space; keep the regular space in that case.
if unichr == "\u00A0" and self.cid2unichr.get(cid) == " ":
return
self.cid2unichr[cid] = unichr


class PyCMap(CMap):
def __init__(self, name: str, module: Any) -> None:
Expand Down
8 changes: 7 additions & 1 deletion pdfminer/pdffont.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,7 +755,11 @@ def create_unicode_map(self) -> FileUnicodeMap:
)
char2gid: Dict[int, int] = {}
# Only supports subtable type 0, 2 and 4.
for (_1, _2, st_offset) in subtables:
for (platform_id, encoding_id, st_offset) in subtables:
# Skip non-Unicode cmaps.
# https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
continue
fp.seek(base_offset + st_offset)
(fmttype, fmtlen, fmtlang) = cast(
Tuple[int, int, int], struct.unpack(">HHH", fp.read(6))
Expand Down Expand Up @@ -824,6 +828,8 @@ def create_unicode_map(self) -> FileUnicodeMap:
char2gid[c] = (c + idd) & 0xFFFF
else:
assert False, str(("Unhandled", fmttype))
if not char2gid:
raise TrueTypeFont.CMapNotFound
# create unicode map
unicode_map = FileUnicodeMap()
for (char, gid) in char2gid.items():
Expand Down
Binary file added samples/contrib/issue-791-non-unicode-cmap.pdf
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/test_highlevel_extracttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def run_with_file(sample_path):
"contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣",
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
"contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
"contrib/issue-791-non-unicode-cmap.pdf": "Peněžní prostředky na účtech",
}


Expand Down Expand Up @@ -120,6 +121,11 @@ def test_issue_625_identity_cmap(self):

self.assertEqual(lines[6], test_strings[test_file])

def test_issue_791_non_unicode_cmap(self):
test_file = "contrib/issue-791-non-unicode-cmap.pdf"
s = run_with_file(test_file)
self.assertEqual(s.strip(), test_strings[test_file])


class TestExtractPages(unittest.TestCase):
def _get_test_file_path(self):
Expand Down

0 comments on commit 997424d

Please sign in to comment.