Skip to content

Commit

Permalink
Merge pull request #13424 from calixteman/chunks2
Browse files Browse the repository at this point in the history
[api-minor] Fix issues in text selection
  • Loading branch information
calixteman committed Oct 18, 2021
2 parents 52fce0d + 61d1063 commit bbb6436
Show file tree
Hide file tree
Showing 10 changed files with 274 additions and 142 deletions.
292 changes: 154 additions & 138 deletions src/core/evaluator.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/display/text_layer.js
Expand Up @@ -188,7 +188,7 @@ function appendText(task, geom, styles, ctx) {
(task._enhanceTextSelection && AllWhitespaceRegexp.test(geom.str))
) {
shouldScaleText = true;
} else if (geom.transform[0] !== geom.transform[3]) {
} else if (geom.str !== " " && geom.transform[0] !== geom.transform[3]) {
const absScaleX = Math.abs(geom.transform[0]),
absScaleY = Math.abs(geom.transform[3]);
// When the horizontal/vertical scaling differs significantly, also scale
Expand Down
5 changes: 5 additions & 0 deletions test/pdfs/.gitignore
Expand Up @@ -13,6 +13,7 @@
!issue1155r.pdf
!issue2017r.pdf
!bug1727053.pdf
!issue11913.pdf
!issue2391-1.pdf
!issue2391-2.pdf
!issue14046.pdf
Expand Down Expand Up @@ -182,6 +183,7 @@
!issue11931.pdf
!issue1655r.pdf
!issue6541.pdf
!issue10640.pdf
!issue2948.pdf
!issue6231_1.pdf
!issue10402.pdf
Expand Down Expand Up @@ -285,6 +287,7 @@
!issue2840.pdf
!issue4061.pdf
!issue4668.pdf
!issue13226.pdf
!PDFJS-7562-reduced.pdf
!issue11768_reduced.pdf
!issue5039.pdf
Expand Down Expand Up @@ -441,6 +444,7 @@
!annotation-fileattachment.pdf
!annotation-text-widget.pdf
!annotation-choice-widget.pdf
!issue10900.pdf
!annotation-button-widget.pdf
!annotation-polyline-polygon.pdf
!annotation-polyline-polygon-without-appearance.pdf
Expand All @@ -463,6 +467,7 @@
!issue9972-3.pdf
!tiling-pattern-box.pdf
!tiling-pattern-large-steps.pdf
!issue13201.pdf
!issue11555.pdf
!issue12337.pdf
!pr12564.pdf
Expand Down
Binary file added test/pdfs/issue10640.pdf
Binary file not shown.
Binary file added test/pdfs/issue10900.pdf
Binary file not shown.
Binary file added test/pdfs/issue11913.pdf
Binary file not shown.
Binary file added test/pdfs/issue13201.pdf
Binary file not shown.
Binary file added test/pdfs/issue13226.pdf
Binary file not shown.
115 changes: 113 additions & 2 deletions test/unit/api_spec.js
Expand Up @@ -73,6 +73,10 @@ describe("api", function () {
}, WAIT_TIMEOUT);
}

/**
 * Flatten a list of text-content items into a single string.
 *
 * Each item's `str` is appended in order; an item whose `hasEOL` flag is
 * truthy is followed by a newline character.
 *
 * @param {Array<Object>} items - Items exposing `str` and (optionally) `hasEOL`.
 * @returns {string} The concatenated text.
 */
function mergeText(items) {
  return items.reduce((merged, { str, hasEOL }) => {
    const lineBreak = hasEOL ? "\n" : "";
    return merged + (str + lineBreak);
  }, "");
}

describe("getDocument", function () {
it("creates pdf doc from URL-string", async function () {
const urlStr = TEST_PDFS_PATH + basicApiFileName;
Expand Down Expand Up @@ -1604,11 +1608,17 @@ describe("api", function () {
const data = await Promise.all([defaultPromise, parametersPromise]);

expect(!!data[0].items).toEqual(true);
expect(data[0].items.length).toEqual(12);
expect(data[0].items.length).toEqual(11);
expect(!!data[0].styles).toEqual(true);

const page1 = mergeText(data[0].items);
expect(page1).toEqual(`Table Of Content
Chapter 1 .......................................................... 2
Paragraph 1.1 ...................................................... 3
page 1 / 3`);

expect(!!data[1].items).toEqual(true);
expect(data[1].items.length).toEqual(7);
expect(data[1].items.length).toEqual(6);
expect(!!data[1].styles).toEqual(true);
});

Expand Down Expand Up @@ -1643,6 +1653,107 @@ describe("api", function () {
await loadingTask.destroy();
});

it("gets text content, with no extra spaces (issue 13226)", async function () {
  // The merged text of page 1 of issue13226.pdf must match this sentence
  // exactly.
  const expectedText =
    "Mitarbeiterinnen und Mitarbeiter arbeiten in über 100 Ländern engagiert im Dienste";

  const task = getDocument(buildGetDocumentParams("issue13226.pdf"));
  const doc = await task.promise;
  const firstPage = await doc.getPage(1);
  const content = await firstPage.getTextContent();
  const merged = mergeText(content.items);

  expect(merged).toEqual(expectedText);

  await task.destroy();
});

it("gets text content, with merged spaces (issue 13201)", async function () {
  // Each of these lines must appear verbatim in the merged text of page 1 of
  // issue13201.pdf.
  const expectedLines = [
    "Abstract. A purely peer-to-peer version of electronic cash would allow online",
    "avoid mediating disputes. The cost of mediation increases transaction costs, limiting the",
    "system is secure as long as honest nodes collectively control more CPU power than any",
  ];

  const task = getDocument(buildGetDocumentParams("issue13201.pdf"));
  const doc = await task.promise;
  const firstPage = await doc.getPage(1);
  const content = await firstPage.getTextContent();
  const merged = mergeText(content.items);

  for (const line of expectedLines) {
    expect(merged.includes(line)).toEqual(true);
  }

  await task.destroy();
});

it("gets text content, with no spaces between letters of words (issue 11913)", async function () {
  // Both sentences must appear verbatim in the merged text of page 1 of
  // issue11913.pdf.
  const expectedSentences = [
    "1. The first of these cases arises from the tragic handicap which has blighted the life of the Plaintiff, and from the response of the",
    "argued in this Court the appeal raises narrower, but important, issues which may be summarised as follows:-",
  ];

  const task = getDocument(buildGetDocumentParams("issue11913.pdf"));
  const doc = await task.promise;
  const firstPage = await doc.getPage(1);
  const content = await firstPage.getTextContent();
  const merged = mergeText(content.items);

  for (const sentence of expectedSentences) {
    expect(merged.includes(sentence)).toEqual(true);
  }

  await task.destroy();
});

it("gets text content, with merged spaces (issue 10900)", async function () {
  // Multi-line excerpt that must appear verbatim (newlines included) in the
  // merged text of page 1 of issue10900.pdf. The continuation lines start at
  // column 0 because they are part of the string.
  const expectedTable = `3 3 3 3
851.5 854.9 839.3 837.5
633.6 727.8 789.9 796.2
1,485.1 1,582.7 1,629.2 1,633.7
114.2 121.7 125.3 130.7
13.0x 13.0x 13.0x 12.5x`;

  const task = getDocument(buildGetDocumentParams("issue10900.pdf"));
  const doc = await task.promise;
  const firstPage = await doc.getPage(1);
  const content = await firstPage.getTextContent();
  const merged = mergeText(content.items);

  expect(merged.includes(expectedTable)).toEqual(true);

  await task.destroy();
});

it("gets text content, with spaces (issue 10640)", async function () {
  // Multi-line excerpt that must appear verbatim (newlines included) in the
  // merged text of page 1 of issue10640.pdf. The continuation lines start at
  // column 0 because they are part of the string.
  const expectedParagraph = `Open Sans is a humanist sans serif typeface designed by Steve Matteson.
Open Sans was designed with an upright stress, open forms and a neu-
tral, yet friendly appearance. It was optimized for print, web, and mobile
interfaces, and has excellent legibility characteristics in its letterforms (see
figure \x81 on the following page). This font is available from the Google Font
Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
This package provides support for this font in LATEX. It includes Type \x81
versions of the fonts, converted for this package using FontForge from its
sources, for full support with Dvips.`;

  const task = getDocument(buildGetDocumentParams("issue10640.pdf"));
  const doc = await task.promise;
  const firstPage = await doc.getPage(1);
  const content = await firstPage.getTextContent();
  const merged = mergeText(content.items);

  expect(merged.includes(expectedParagraph)).toEqual(true);

  await task.destroy();
});

it("gets empty structure tree", async function () {
const tree = await page.getStructTree();

Expand Down
2 changes: 1 addition & 1 deletion test/unit/pdf_find_controller_spec.js
Expand Up @@ -261,7 +261,7 @@ describe("pdf_find_controller", function () {
pageIndex: 0,
matchIndex: 0,
},
pageMatches: [[19, 48, 66]],
pageMatches: [[19, 46, 62]],
pageMatchesLength: [[8, 8, 8]],
});
});
Expand Down

0 comments on commit bbb6436

Please sign in to comment.