Skip to content

Commit

Permalink
Fix issues in text selection
Browse files Browse the repository at this point in the history
  - PR #13257 fixed a lot of issues, but not all of them; this patch aims to fix almost all of the remaining ones.
  - the idea in this new patch is to compare the position of the new glyph with the last position where a glyph was drawn;
    - spaces are not "drawn": a space just moves the cursor and is not added to the chunk;
    - this way, a space followed by a cursor move can be treated as a single space, which helps to merge all spaces into one.
  - to distinguish real spaces from tracking ones, we previously used a factor of the space width (from the font)
    - it was a pretty good idea in general, but it fails with some fonts where the space is too big;
    - Poppler uses a factor of the font size instead, which is an excellent idea (<= 0.1 * fontSize implies a tracking space).
  • Loading branch information
calixteman committed May 23, 2021
1 parent 3538ef0 commit a1c46de
Show file tree
Hide file tree
Showing 10 changed files with 208 additions and 122 deletions.
198 changes: 84 additions & 114 deletions src/core/evaluator.js

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions src/core/fonts.js
Expand Up @@ -2884,7 +2884,7 @@ class Font {
/**
* @private
*/
_charToGlyph(charcode, isSpace = false) {
_charToGlyph(charcode) {
let fontCharCode, width, operatorListId;

let widthCode = charcode;
Expand All @@ -2903,6 +2903,7 @@ class Font {
unicode = String.fromCharCode(unicode);
}

const isSpace = unicode === " ";
let isInFont = charcode in this.toFontChar;
// First try the toFontChar map, if it's not there then try falling
// back to the char code.
Expand Down Expand Up @@ -3006,15 +3007,13 @@ class Font {
charcode = c.charcode;
const length = c.length;
i += length;
// Space is char with code 0x20 and length 1 in multiple-byte codes.
const isSpace = length === 1 && chars.charCodeAt(i - 1) === 0x20;
glyph = this._charToGlyph(charcode, isSpace);
glyph = this._charToGlyph(charcode);
glyphs.push(glyph);
}
} else {
for (i = 0, ii = chars.length; i < ii; ++i) {
charcode = chars.charCodeAt(i);
glyph = this._charToGlyph(charcode, charcode === 0x20);
glyph = this._charToGlyph(charcode);
glyphs.push(glyph);
}
}
Expand Down
5 changes: 5 additions & 0 deletions test/pdfs/.gitignore
Expand Up @@ -12,6 +12,7 @@
!xref_command_missing.pdf
!issue1155r.pdf
!issue2017r.pdf
!issue11913.pdf
!issue2391-1.pdf
!issue2391-2.pdf
!issue3214.pdf
Expand Down Expand Up @@ -171,6 +172,7 @@
!issue11931.pdf
!issue1655r.pdf
!issue6541.pdf
!issue10640.pdf
!issue2948.pdf
!issue6231_1.pdf
!issue10402.pdf
Expand Down Expand Up @@ -269,6 +271,7 @@
!issue2840.pdf
!issue4061.pdf
!issue4668.pdf
!issue13226.pdf
!PDFJS-7562-reduced.pdf
!issue11768_reduced.pdf
!issue5039.pdf
Expand Down Expand Up @@ -420,6 +423,7 @@
!annotation-fileattachment.pdf
!annotation-text-widget.pdf
!annotation-choice-widget.pdf
!issue10900.pdf
!annotation-button-widget.pdf
!annotation-polyline-polygon.pdf
!annotation-polyline-polygon-without-appearance.pdf
Expand All @@ -440,6 +444,7 @@
!issue9972-3.pdf
!tiling-pattern-box.pdf
!tiling-pattern-large-steps.pdf
!issue13201.pdf
!issue11555.pdf
!issue12337.pdf
!pr12564.pdf
Expand Down
Binary file added test/pdfs/issue10640.pdf
Binary file not shown.
Binary file added test/pdfs/issue10900.pdf
Binary file not shown.
Binary file added test/pdfs/issue11913.pdf
Binary file not shown.
Binary file added test/pdfs/issue13201.pdf
Binary file not shown.
Binary file added test/pdfs/issue13226.pdf
Binary file not shown.
116 changes: 114 additions & 2 deletions test/unit/api_spec.js
Expand Up @@ -70,6 +70,10 @@ describe("api", function () {
}, WAIT_TIMEOUT);
}

/**
 * Flattens a list of text-content items into one string.
 *
 * Each item contributes its `str`; when an item's `hasEOL` flag is truthy,
 * a "\n" is appended right after that item's text.
 *
 * @param {Array<{str: string, hasEOL?: boolean}>} items - Text chunks to merge.
 * @returns {string} The concatenated text.
 */
function mergeText(items) {
  return items.reduce((text, { str, hasEOL }) => {
    const lineEnd = hasEOL ? "\n" : "";
    return text + str + lineEnd;
  }, "");
}

describe("getDocument", function () {
it("creates pdf doc from URL-string", async function () {
const urlStr = TEST_PDFS_PATH + basicApiFileName;
Expand Down Expand Up @@ -1500,13 +1504,20 @@ describe("api", function () {
});

const data = await Promise.all([defaultPromise, parametersPromise]);
// console.log(data);window.alert("");

expect(!!data[0].items).toEqual(true);
expect(data[0].items.length).toEqual(12);
expect(data[0].items.length).toEqual(11);
expect(!!data[0].styles).toEqual(true);

const page1 = mergeText(data[0].items);
expect(page1).toEqual(`Table Of Content
Chapter 1 .......................................................... 2
Paragraph 1.1 ...................................................... 3
page 1 / 3`);

expect(!!data[1].items).toEqual(true);
expect(data[1].items.length).toEqual(7);
expect(data[1].items.length).toEqual(6);
expect(!!data[1].styles).toEqual(true);
});

Expand Down Expand Up @@ -1539,6 +1550,107 @@ describe("api", function () {
await loadingTask.destroy();
});

// Regression test for issue 13226: loads issue13226.pdf, reads the text
// content of page 1 and checks that the merged text is exactly the expected
// sentence (any additional space in the output would make the comparison fail).
it("gets text content, with no extra spaces (issue 13226)", async function () {
const loadingTask = getDocument(buildGetDocumentParams("issue13226.pdf"));
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const { items } = await pdfPage.getTextContent();
// mergeText joins every item's `str`, adding "\n" after items with `hasEOL`.
const text = mergeText(items);

expect(text).toEqual(
"Mitarbeiterinnen und Mitarbeiter arbeiten in über 100 Ländern engagiert im Dienste"
);

// Destroy the loading task before the test finishes.
await loadingTask.destroy();
});

// Regression test for issue 13201: loads issue13201.pdf, merges the text
// items of page 1 and checks that three full lines of the document appear
// verbatim (with single spaces between words) in the merged text.
it("gets text content, with merged spaces (issue 13201)", async function () {
const loadingTask = getDocument(buildGetDocumentParams("issue13201.pdf"));
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const { items } = await pdfPage.getTextContent();
// mergeText joins every item's `str`, adding "\n" after items with `hasEOL`.
const text = mergeText(items);

// Only substrings are checked, so the rest of the page is not constrained.
expect(
text.includes(
"Abstract. A purely peer-to-peer version of electronic cash would allow online"
)
).toEqual(true);
expect(
text.includes(
"avoid mediating disputes. The cost of mediation increases transaction costs, limiting the"
)
).toEqual(true);
expect(
text.includes(
"system is secure as long as honest nodes collectively control more CPU power than any"
)
).toEqual(true);

// Destroy the loading task before the test finishes.
await loadingTask.destroy();
});

// Regression test for issue 11913: loads issue11913.pdf, merges the text
// items of page 1 and checks that two long sentences appear verbatim, i.e.
// that words are not broken up by spaces between their letters.
it("gets text content, with no spaces between letters of words (issue 11913)", async function () {
const loadingTask = getDocument(buildGetDocumentParams("issue11913.pdf"));
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const { items } = await pdfPage.getTextContent();
// mergeText joins every item's `str`, adding "\n" after items with `hasEOL`.
const text = mergeText(items);

// Only substrings are checked, so the rest of the page is not constrained.
expect(
text.includes(
"1. The first of these cases arises from the tragic handicap which has blighted the life of the Plaintiff, and from the response of the"
)
).toEqual(true);
expect(
text.includes(
"argued in this Court the appeal raises narrower, but important, issues which may be summarised as follows:-"
)
).toEqual(true);
// Destroy the loading task before the test finishes.
await loadingTask.destroy();
});

// Regression test for issue 10900: loads issue10900.pdf, merges the text
// items of page 1 and checks that a six-line block of numeric columns appears
// verbatim, with values separated by single spaces and rows by "\n".
// NOTE(review): the test title reads "with merged" and looks like it is
// missing a word (presumably "spaces") — confirm before renaming.
it("gets text content, with merged (issue 10900)", async function () {
const loadingTask = getDocument(buildGetDocumentParams("issue10900.pdf"));
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const { items } = await pdfPage.getTextContent();
// mergeText joins every item's `str`, adding "\n" after items with `hasEOL`.
const text = mergeText(items);

// The expected value is a template literal: its line breaks and the absence
// of leading indentation are part of the string being searched for.
expect(
text.includes(`3 3 3 3
851.5 854.9 839.3 837.5
633.6 727.8 789.9 796.2
1,485.1 1,582.7 1,629.2 1,633.7
114.2 121.7 125.3 130.7
13.0x 13.0x 13.0x 12.5x`)
).toEqual(true);

// Destroy the loading task before the test finishes.
await loadingTask.destroy();
});

// Loads issue10640.pdf, merges the text items of page 1 and checks that a
// whole paragraph appears verbatim, including the spaces between words and
// the line breaks between lines.
// NOTE(review): the \x80, \x81 and \x82 escapes in the expected text
// presumably stand for the digits 0, 1 and 2 ("version \x82.\x80", "Type \x81")
// that this document's font does not map to regular characters — confirm.
it("gets text content, with spaces", async function () {
const loadingTask = getDocument(buildGetDocumentParams("issue10640.pdf"));
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const { items } = await pdfPage.getTextContent();
// mergeText joins every item's `str`, adding "\n" after items with `hasEOL`.
const text = mergeText(items);

// The expected value is a template literal: its line breaks and the absence
// of leading indentation are part of the string being searched for.
expect(
text.includes(`Open Sans is a humanist sans serif typeface designed by Steve Matteson.
Open Sans was designed with an upright stress, open forms and a neu-
tral, yet friendly appearance. It was optimized for print, web, and mobile
interfaces, and has excellent legibility characteristics in its letterforms (see
figure \x81 on the following page). This font is available from the Google Font
Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
This package provides support for this font in LATEX. It includes Type \x81
versions of the fonts, converted for this package using FontForge from its
sources, for full support with Dvips.`)
).toEqual(true);

// Destroy the loading task before the test finishes.
await loadingTask.destroy();
});

it("gets empty structure tree", async function () {
const tree = await page.getStructTree();

Expand Down
2 changes: 1 addition & 1 deletion test/unit/pdf_find_controller_spec.js
Expand Up @@ -268,7 +268,7 @@ describe("pdf_find_controller", function () {
pageIndex: 0,
matchIndex: 0,
},
pageMatches: [[19, 48, 66]],
pageMatches: [[19, 46, 62]],
pageMatchesLength: [[8, 8, 8]],
});
});
Expand Down

0 comments on commit a1c46de

Please sign in to comment.