Skip to content

Commit

Permalink
Add even more non-standard ligatures (PR 15517 follow-up)
Browse files Browse the repository at this point in the history
Given that we already create multi-byte ToUnicode entries in other cases, see e.g. the `getNormalizedUnicodes` table, this is hopefully fine.
  • Loading branch information
Snuffleupagus committed Mar 22, 2023
1 parent f39ff20 commit 137a2d6
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 18 deletions.
10 changes: 10 additions & 0 deletions src/core/evaluator.js
Expand Up @@ -3550,6 +3550,16 @@ class PartialEvaluator {
code = unicode;
}
break;
default:
// Support (some) non-standard ligatures.
switch (glyphName) {
case "f_h":
case "f_t":
case "T_h":
toUnicode[charcode] = glyphName.replaceAll("_", "");
continue;
}
break;
}
if (code > 0 && code <= 0x10ffff && Number.isInteger(code)) {
// If `baseEncodingName` is one of the predefined encodings, and `code`
Expand Down
9 changes: 8 additions & 1 deletion test/test_manifest.json
Expand Up @@ -230,7 +230,14 @@
"link": true,
"type": "eq",
"annotations": true
},
},
{ "id": "issue15516",
"file": "pdfs/issue15516_reduced.pdf",
"md5": "a30be78c73d13aa6ff890834ce69adc1",
"rounds": 1,
"link": false,
"type": "text"
},
{ "id": "bug946506",
"file": "pdfs/bug946506.pdf",
"md5": "c28911b5c31bdc337c2ce404c5971cfc",
Expand Down
17 changes: 0 additions & 17 deletions test/unit/api_spec.js
Expand Up @@ -2607,23 +2607,6 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
await loadingTask.destroy();
});

// TODO: Change this to a `text` reference test instead.
// Currently that doesn't work, since the `XMLSerializer` fails on
// the ASCII "control characters" found in the text-content.
//
// Unit-test: extracts the text-content of page 1 of the reduced PDF for
// issue 15516 and compares it against the expected ligature string.
it("gets text content with non-standard ligatures (issue issue15516)", async function () {
// Start loading the reduced test-case document.
const loadingTask = getDocument(
buildGetDocumentParams("issue15516_reduced.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
// Fetch the text items of the page and combine them via `mergeText`
// (helper defined elsewhere in this spec file) into one string.
const { items } = await pdfPage.getTextContent();
const text = mergeText(items);

// Note that the expected string contains the raw \x07 and \x08 control
// characters, which is what the TODO above refers to.
expect(text).toEqual("ffi fi ffl ff fl \x07 \x08 Ý");

// Clean up the loading task once the assertion has run.
await loadingTask.destroy();
});

it("gets text content with multi-byte entries, using predefined CMaps (issue 16176)", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("issue16176.pdf", {
Expand Down

0 comments on commit 137a2d6

Please sign in to comment.